diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/minicpm.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/minicpm.py
index a0d92a38..79786167 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/minicpm.py
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/minicpm.py
@@ -37,6 +37,12 @@ if __name__ == "__main__":
         help="The huggingface repo id for the Llama2 model to be downloaded"
         ", or the path to the huggingface checkpoint folder",
     )
+    parser.add_argument("--lowbit-path", type=str,
+        default="",
+        help="The path to the low-bit model folder; leave blank if you do not want to save. \
+            If the path does not exist, the low-bit model will be saved there; \
+            otherwise, the low-bit model will be loaded from it.",
+    )
     parser.add_argument('--prompt', type=str, default="What is AI?",
                         help='Prompt to infer')
     parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
@@ -48,23 +54,38 @@ if __name__ == "__main__":
     args = parser.parse_args()
     model_path = args.repo_id_or_model_path
-
-    model = AutoModelForCausalLM.from_pretrained(
-        model_path,
-        torch_dtype=torch.float16,
-        trust_remote_code=True,
-        attn_implementation="eager",
-        load_in_low_bit="sym_int4",
-        optimize_model=True,
-        max_output_len=args.max_output_len,
-        max_prompt_len=args.max_prompt_len,
-        intra_pp=args.intra_pp,
-        inter_pp=args.inter_pp,
-        transpose_value_cache=not args.disable_transpose_value_cache,
-    )
-
+    if not args.lowbit_path or not os.path.exists(args.lowbit_path):
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            torch_dtype=torch.float16,
+            trust_remote_code=True,
+            attn_implementation="eager",
+            load_in_low_bit="sym_int4",
+            optimize_model=True,
+            max_output_len=args.max_output_len,
+            max_prompt_len=args.max_prompt_len,
+            intra_pp=args.intra_pp,
+            inter_pp=args.inter_pp,
+            transpose_value_cache=not args.disable_transpose_value_cache,
+        )
+    else:
+        model = AutoModelForCausalLM.load_low_bit(
+            args.lowbit_path,
+            attn_implementation="eager",
+            torch_dtype=torch.float16,
+            optimize_model=True,
+            max_output_len=args.max_output_len,
+            max_prompt_len=args.max_prompt_len,
+            intra_pp=args.intra_pp,
+            inter_pp=args.inter_pp,
+            transpose_value_cache=not args.disable_transpose_value_cache,
+            trust_remote_code=True,
+        )
     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
+    if args.lowbit_path and not os.path.exists(args.lowbit_path):
+        model.save_low_bit(args.lowbit_path)
+
     print("-" * 80)
     print("done")
     with torch.inference_mode():
diff --git a/python/llm/src/ipex_llm/transformers/npu_model.py b/python/llm/src/ipex_llm/transformers/npu_model.py
index f04f1317..09940348 100644
--- a/python/llm/src/ipex_llm/transformers/npu_model.py
+++ b/python/llm/src/ipex_llm/transformers/npu_model.py
@@ -270,7 +270,7 @@ class _BaseAutoModelClass:
         invalidInputError(
             qtype in ["sym_int8_rtn", "sym_int4_rtn"],
             f"Unknown bigdl_transformers_low_bit value: {qtype},"
-            f" expected: sym_int4, asym_int4, sym_int5, asym_int5 or sym_int8.",
+            f" expected: sym_int8_rtn or sym_int4_rtn.",
         )
 
     has_remote_code = hasattr(config, "auto_map") and cls.HF_Model.__name__ in config.auto_map
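
For context, the flow this patch introduces boils down to the sketch below. It is a simplified illustration rather than the example script itself: the repo id and low-bit folder are placeholders, the extra NPU arguments from the example (max_output_len, max_prompt_len, intra_pp, inter_pp, transpose_value_cache) are omitted for brevity, and the save step is folded into the first branch instead of running after tokenizer loading as in the patched script.

```python
# Simplified sketch of the save/load low-bit flow added in minicpm.py,
# assuming the NPU AutoModelForCausalLM shown in this diff.
import os

import torch
from ipex_llm.transformers.npu_model import AutoModelForCausalLM

model_path = "openbmb/MiniCPM-1B-sft-bf16"  # placeholder HF repo id
lowbit_path = "./minicpm-1b-sym-int4"       # placeholder save folder

if not lowbit_path or not os.path.exists(lowbit_path):
    # First run: quantize the original checkpoint to sym_int4 on load ...
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.float16,
        trust_remote_code=True,
        attn_implementation="eager",
        load_in_low_bit="sym_int4",
        optimize_model=True,
    )
    # ... and persist the converted weights so later runs can skip conversion.
    model.save_low_bit(lowbit_path)
else:
    # Later runs: load the already-converted low-bit model directly.
    model = AutoModelForCausalLM.load_low_bit(
        lowbit_path,
        attn_implementation="eager",
        torch_dtype=torch.float16,
        optimize_model=True,
        trust_remote_code=True,
    )
```

This is also why the npu_model.py hunk updates the error message: load_low_bit only accepts checkpoints saved with the RTN quantization types (sym_int8_rtn, sym_int4_rtn), so the message now lists those instead of the older low-bit names.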