Support qwen2.5 3B for NPU & update related examples (#12438)
* update qwen2.5-3B
* update convert
* small fix
* replace load_in_low_bit with low_bit
* small fix
parent b633fbf26c
commit b9abb8a285
8 changed files with 48 additions and 19 deletions
@@ -6,7 +6,7 @@ In this directory, you will find a C++ example on how to run LLM models on Intel
 | Model | Model Link |
 |------------|----------------------------------------------------------------|
 | Qwen2 | [Qwen/Qwen2-7B-Instruct](https://huggingface.co/Qwen/Qwen2-7B-Instruct), [Qwen/Qwen2-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2-1.5B-Instruct) |
-| Qwen2.5 | [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) |
+| Qwen2.5 | [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct), [Qwen/Qwen2.5-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct) |
 | Llama2 | [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) |
 | Llama3 | [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) |
 | MiniCPM | [openbmb/MiniCPM-1B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-1B-sft-bf16), [openbmb/MiniCPM-2B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-2B-sft-bf16) |
@@ -35,9 +35,26 @@ pip install transformers==4.45.0 accelerate==0.33.0
 We provide a [convert script](convert.py) under the current directory; by running it, you can obtain the whole set of weights and configuration files required to run the C++ example.
 
 ```cmd
-:: to convert Qwen2.5-7b-Instruct
+:: to convert Qwen2.5-7B-Instruct
 python convert.py --repo-id-or-model-path Qwen/Qwen2.5-7B-Instruct --save-directory <converted_model_path>
+
+:: to convert Qwen2-1.5B-Instruct
+python convert.py --repo-id-or-model-path Qwen/Qwen2-1.5B-Instruct --save-directory <converted_model_path>
+
+:: to convert Qwen2.5-3B-Instruct
+python convert.py --repo-id-or-model-path Qwen/Qwen2.5-3B-Instruct --save-directory <converted_model_path> --low_bit "sym_int8"
+
+:: to convert Llama-2-7b-chat-hf
+python convert.py --repo-id-or-model-path meta-llama/Llama-2-7b-chat-hf --save-directory <converted_model_path>
+
+:: to convert Meta-Llama-3-8B-Instruct
+python convert.py --repo-id-or-model-path meta-llama/Meta-Llama-3-8B-Instruct --save-directory <converted_model_path>
+
+:: to convert MiniCPM-1B-sft-bf16
+python convert.py --repo-id-or-model-path openbmb/MiniCPM-1B-sft-bf16 --save-directory <converted_model_path>
+
+:: to convert MiniCPM-2B-sft-bf16
+python convert.py --repo-id-or-model-path openbmb/MiniCPM-2B-sft-bf16 --save-directory <converted_model_path>
 ```
 
 Arguments info:
@@ -45,6 +62,7 @@ Arguments info:
 - `--save-directory SAVE_DIRECTORY`: argument defining the path to save the converted model. If it is a non-existing path, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded, and the converted model will be saved into `SAVE_DIRECTORY`.
 - `--max-context-len MAX_CONTEXT_LEN`: Defines the maximum sequence length for both input and output tokens. It defaults to `1024`.
 - `--max-prompt-len MAX_PROMPT_LEN`: Defines the maximum number of tokens that the input prompt can contain. It defaults to `960`.
+- `--low_bit LOW_BIT`: Defines the low-bit precision used to quantize the model. It defaults to `sym_int4` (see the combined example below).
 - `--disable-transpose-value-cache`: Disable the optimization of transposing value cache.
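
For illustration, the documented flags can be combined in a single invocation. The sketch below simply restates the defaults and the `sym_int8` setting recommended above for Qwen2.5-3B-Instruct; it is not a separately verified configuration:

```cmd
:: convert Qwen2.5-3B-Instruct with the context/prompt limits and low-bit precision spelled out
python convert.py --repo-id-or-model-path Qwen/Qwen2.5-3B-Instruct --save-directory <converted_model_path> --max-context-len 1024 --max-prompt-len 960 --low_bit "sym_int8"
```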
## 3. Build C++ Example `llm-npu-cli`
@@ -43,8 +43,8 @@ if __name__ == "__main__":
     parser.add_argument("--max-context-len", type=int, default=1024)
     parser.add_argument("--max-prompt-len", type=int, default=960)
     parser.add_argument("--quantization_group_size", type=int, default=0)
-    parser.add_argument('--load_in_low_bit', type=str, default="sym_int4",
-                        help='Load in low bit to use')
+    parser.add_argument('--low_bit', type=str, default="sym_int4",
+                        help='Low bit precision to quantize the model')
     parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
 
     args = parser.parse_args()
@@ -54,7 +54,7 @@ if __name__ == "__main__":
     model = AutoModelForCausalLM.from_pretrained(model_path,
                                                  optimize_model=True,
                                                  pipeline=True,
-                                                 load_in_low_bit=args.load_in_low_bit,
+                                                 load_in_low_bit=args.low_bit,
                                                  max_context_len=args.max_context_len,
                                                  max_prompt_len=args.max_prompt_len,
                                                  quantization_group_size=args.quantization_group_size,
@@ -10,7 +10,7 @@ In this directory, you will find examples on how to directly run HuggingFace `tr
 | Llama3 | [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) |
 | Llama3.2 | [meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct), [meta-llama/Llama-3.2-3B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct) |
 | Qwen2 | [Qwen/Qwen2-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2-1.5B-Instruct) |
-| Qwen2.5 | [Qwen/Qwen2.5-7b-Instruct](https://huggingface.co/Qwen/Qwen2.5-7b-Instruct) |
+| Qwen2.5 | [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct), [Qwen/Qwen2.5-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct) |
 | Baichuan2 | [baichuan-inc/Baichuan2-7B-Chat](https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat) |
 | MiniCPM | [openbmb/MiniCPM-1B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-1B-sft-bf16), [openbmb/MiniCPM-2B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-2B-sft-bf16) |
@@ -58,11 +58,14 @@ python llama3.py --repo-id-or-model-path "meta-llama/Llama-3.2-1B-Instruct"
 :: to run Llama-3.2-3B-Instruct
 python llama3.py --repo-id-or-model-path "meta-llama/Llama-3.2-3B-Instruct"
 
-:: to run Qwen2.5-7b-Instruct
+:: to run Qwen2.5-7B-Instruct
 python qwen.py
 
-:: to run Qwen2-1.5b-Instruct
-python qwen.py --repo-id-or-model-path "Qwen/Qwen2-1.5B-Instruct" --load_in_low_bit "sym_int8"
+:: to run Qwen2-1.5B-Instruct
+python qwen.py --repo-id-or-model-path "Qwen/Qwen2-1.5B-Instruct" --low_bit "sym_int8"
 
+:: to run Qwen2.5-3B-Instruct
+python qwen.py --repo-id-or-model-path "Qwen/Qwen2.5-3B-Instruct" --low_bit "sym_int8"
+
 :: to run Baichuan2-7B-Chat
 python baichuan2.py
@@ -48,8 +48,8 @@ if __name__ == "__main__":
     parser.add_argument("--max-context-len", type=int, default=1024)
     parser.add_argument("--max-prompt-len", type=int, default=960)
    parser.add_argument("--quantization_group_size", type=int, default=0)
-    parser.add_argument('--load_in_low_bit', type=str, default="sym_int4",
-                        help='Load in low bit to use')
+    parser.add_argument('--low_bit', type=str, default="sym_int4",
+                        help='Low bit precision to quantize the model')
     parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
     parser.add_argument("--disable-streaming", action="store_true", default=False)
@@ -60,7 +60,7 @@ if __name__ == "__main__":
     model = AutoModelForCausalLM.from_pretrained(model_path,
                                                  optimize_model=True,
                                                  pipeline=True,
-                                                 load_in_low_bit=args.load_in_low_bit,
+                                                 load_in_low_bit=args.low_bit,
                                                  max_context_len=args.max_context_len,
                                                  max_prompt_len=args.max_prompt_len,
                                                  quantization_group_size=args.quantization_group_size,
@@ -70,7 +70,7 @@ Arguments info:
 - `--lowbit-path LOWBIT_MODEL_PATH`: argument defining the path to save/load the lowbit version of the model. If it is an empty string, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded. If it is an existing path, the lowbit model in `LOWBIT_MODEL_PATH` will be loaded. If it is a non-existing path, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded, and the converted lowbit version will be saved into `LOWBIT_MODEL_PATH`. It defaults to `''`, i.e. an empty string.
 - `--prompt PROMPT`: argument defining the prompt to be inferred. It defaults to `'Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun'`.
 - `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It defaults to `32`.
-- `--load_in_low_bit`: argument defining the `load_in_low_bit` format used. It defaults to `sym_int8`; `sym_int4` can also be used.
+- `--low_bit`: argument defining the `low_bit` format used. It defaults to `sym_int8`; `sym_int4` can also be used (see the combined example below).
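
As a rough sketch of how these flags compose (the script name `generate.py` is an assumption here, since this hunk does not show it; substitute the actual example script):

```cmd
:: load or create a lowbit checkpoint and run a short generation with sym_int4 precision
python generate.py --repo-id-or-model-path meta-llama/Llama-2-7b-chat-hf --lowbit-path <lowbit_model_path> --low_bit sym_int4 --n-predict 32
```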
### Sample Output
#### [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)
@@ -90,6 +90,7 @@ The examples below show how to run the **_optimized HuggingFace model implementa
 - [Llama3.2-1B](./llama.py)
 - [Llama3.2-3B](./llama.py)
 - [Qwen2-1.5B](./qwen.py)
+- [Qwen2.5-3B](./qwen.py)
 - [Qwen2.5-7B](./qwen.py)
 - [MiniCPM-1B](./minicpm.py)
 - [MiniCPM-2B](./minicpm.py)
@@ -122,6 +123,9 @@ python llama.py --repo-id-or-model-path meta-llama/Llama-3.2-3B-Instruct
 :: to run Qwen2-1.5B-Instruct (LNL driver version: 32.0.101.2715)
 python qwen.py
 
+:: to run Qwen2.5-3B-Instruct (LNL driver version: 32.0.101.2715)
+python qwen.py --repo-id-or-model-path Qwen/Qwen2.5-3B-Instruct --low_bit sym_int8
+
 :: to run Qwen2.5-7B-Instruct (LNL driver version: 32.0.101.2715)
 python qwen.py --repo-id-or-model-path Qwen/Qwen2.5-7B-Instruct
@@ -47,7 +47,10 @@ if __name__ == "__main__":
                         help='Prompt to infer')
     parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
     parser.add_argument("--max-context-len", type=int, default=1024)
-    parser.add_argument("--max-prompt-len", type=int, default=512)
+    parser.add_argument("--max-prompt-len", type=int, default=960)
+    parser.add_argument("--quantization_group_size", type=int, default=0)
+    parser.add_argument('--low_bit', type=str, default="sym_int4",
+                        help='Load in low bit to use')
     parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
     parser.add_argument("--intra-pp", type=int, default=None)
     parser.add_argument("--inter-pp", type=int, default=None)
@@ -62,14 +65,15 @@ if __name__ == "__main__":
             torch_dtype=torch.float16,
             trust_remote_code=True,
             attn_implementation="eager",
-            load_in_low_bit="sym_int4",
+            load_in_low_bit=args.low_bit,
             optimize_model=True,
             max_context_len=args.max_context_len,
             max_prompt_len=args.max_prompt_len,
             intra_pp=args.intra_pp,
             inter_pp=args.inter_pp,
             transpose_value_cache=not args.disable_transpose_value_cache,
-            mixed_precision=args.mixed_precision
+            mixed_precision=args.mixed_precision,
+            quantization_group_size=args.quantization_group_size,
         )
     else:
         model = AutoModelForCausalLM.load_low_bit(
@@ -64,7 +64,7 @@ def replace_with_QuantizedLinear(layer, qtype, device, modules_to_not_convert,
     iqtype = ggml_tensor_qtype[qtype]
     if isinstance(layer, torch.nn.Linear) and not hasattr(layer, "qtype"):
         if qtype == "sym_int4_rtn":
-            # workaround for qwen2 & int4
+            # workaround for qwen2-7B & int4
             if (layer.in_features == 3584 and layer.out_features == 152064) or \
                     (layer.in_features == 18944 and layer.out_features == 3584):
                 qtype = "sym_int8_rtn"
@@ -428,8 +428,8 @@ def optimize_llm(
                      intra_pp=intra_pp,
                      decoder=True,
                      transpose_value_cache=transpose_value_cache)
-    elif model.config.model_type == "qwen2" and model.config.num_hidden_layers == 28:
-        # for qwen2-1.5B and qwen2-7B
+    elif model.config.model_type == "qwen2":
+        # for qwen2-1.5B, qwen2-7B, qwen2.5-3B
         if intra_pp is None:
             intra_pp = 2
         if inter_pp is None: