Support qwen2.5 3B for NPU & update related examples (#12438)
* update qwen2.5-3B
* update convert
* small fix
* replace load_in_low_bit with low_bit
* small fix
parent b633fbf26c
commit b9abb8a285

8 changed files with 48 additions and 19 deletions
@@ -6,7 +6,7 @@ In this directory, you will find a C++ example on how to run LLM models on Intel
 | Model      | Model Link                                                    |
 |------------|----------------------------------------------------------------|
 | Qwen2 | [Qwen/Qwen2-7B-Instruct](https://huggingface.co/Qwen/Qwen2-7B-Instruct), [Qwen/Qwen2-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2-1.5B-Instruct) |
-| Qwen2.5 | [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) |
+| Qwen2.5 | [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct), [Qwen/Qwen2.5-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct) |
 | Llama2 | [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) |
 | Llama3 | [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) |
 | MiniCPM | [openbmb/MiniCPM-1B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-1B-sft-bf16), [openbmb/MiniCPM-2B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-2B-sft-bf16) |
@@ -35,9 +35,26 @@ pip install transformers==4.45.0 accelerate==0.33.0
 We provide a [convert script](convert.py) under current directory, by running it, you can obtain the whole weights and configuration files which are required to run C++ example.
 
 ```cmd
-:: to convert Qwen2.5-7b-Instruct
+:: to convert Qwen2.5-7B-Instruct
 python convert.py --repo-id-or-model-path Qwen/Qwen2.5-7B-Instruct --save-directory <converted_model_path>
 
+:: to convert Qwen2-1.5B-Instruct
+python convert.py --repo-id-or-model-path Qwen/Qwen2-1.5B-Instruct --save-directory <converted_model_path>
+
+:: to convert Qwen2.5-3B-Instruct
+python convert.py --repo-id-or-model-path Qwen/Qwen2.5-3B-Instruct --save-directory <converted_model_path> --low_bit "sym_int8"
+
+:: to convert Llama-2-7b-chat-hf
+python convert.py --repo-id-or-model-path meta-llama/Llama-2-7b-chat-hf --save-directory <converted_model_path>
+
+:: to convert Meta-Llama-3-8B-Instruct
+python convert.py --repo-id-or-model-path meta-llama/Meta-Llama-3-8B-Instruct --save-directory <converted_model_path>
+
+:: to convert MiniCPM-1B-sft-bf16
+python convert.py --repo-id-or-model-path openbmb/MiniCPM-1B-sft-bf16 --save-directory <converted_model_path>
+
+:: to convert MiniCPM-2B-sft-bf16
+python convert.py --repo-id-or-model-path openbmb/MiniCPM-2B-sft-bf16 --save-directory <converted_model_path>
 ```
 
 Arguments info:
@@ -45,6 +62,7 @@ Arguments info:
 - `--save-directory SAVE_DIRECTORY`: argument defining the path to save converted model. If it is a non-existing path, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded, and the converted model will be saved into `SAVE_DIRECTORY`.
 - `--max-context-len MAX_CONTEXT_LEN`: Defines the maximum sequence length for both input and output tokens. It is default to be `1024`.
 - `--max-prompt-len MAX_PROMPT_LEN`: Defines the maximum number of tokens that the input prompt can contain. It is default to be `960`.
+- `--low_bit LOW_BIT`: Defines the low bit precision to quantize the model. It is default to be `sym_int4`.
 - `--disable-transpose-value-cache`: Disable the optimization of transposing value cache.
 
 ## 3. Build C++ Example `llm-npu-cli`
@@ -43,8 +43,8 @@ if __name__ == "__main__":
     parser.add_argument("--max-context-len", type=int, default=1024)
     parser.add_argument("--max-prompt-len", type=int, default=960)
     parser.add_argument("--quantization_group_size", type=int, default=0)
-    parser.add_argument('--load_in_low_bit', type=str, default="sym_int4",
-                        help='Load in low bit to use')
+    parser.add_argument('--low_bit', type=str, default="sym_int4",
+                        help='Low bit precision to quantize the model')
     parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
 
     args = parser.parse_args()
@@ -54,7 +54,7 @@ if __name__ == "__main__":
     model = AutoModelForCausalLM.from_pretrained(model_path,
                                                  optimize_model=True,
                                                  pipeline=True,
-                                                 load_in_low_bit=args.load_in_low_bit,
+                                                 load_in_low_bit=args.low_bit,
                                                  max_context_len=args.max_context_len,
                                                  max_prompt_len=args.max_prompt_len,
                                                  quantization_group_size=args.quantization_group_size,
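
For reference, the sketch below consolidates the two convert.py hunks above into one runnable shape: the renamed `--low_bit` flag is parsed and passed straight through as `load_in_low_bit`. The `ipex_llm.transformers.npu_model` import path and the final `save_low_bit` call follow the ipex-llm NPU examples and should be read as assumptions, not as part of this diff.

```python
# Minimal sketch of the convert flow after the --load_in_low_bit -> --low_bit rename.
import argparse

from ipex_llm.transformers.npu_model import AutoModelForCausalLM  # import path assumed

parser = argparse.ArgumentParser()
parser.add_argument("--repo-id-or-model-path", type=str, default="Qwen/Qwen2.5-3B-Instruct")
parser.add_argument("--save-directory", type=str, required=True)
parser.add_argument("--max-context-len", type=int, default=1024)
parser.add_argument("--max-prompt-len", type=int, default=960)
parser.add_argument("--quantization_group_size", type=int, default=0)
parser.add_argument("--low_bit", type=str, default="sym_int4",
                    help="Low bit precision to quantize the model")
args = parser.parse_args()

# e.g. python convert.py --repo-id-or-model-path Qwen/Qwen2.5-3B-Instruct \
#          --save-directory ./qwen2.5-3b-npu --low_bit sym_int8
model = AutoModelForCausalLM.from_pretrained(
    args.repo_id_or_model_path,
    optimize_model=True,
    pipeline=True,
    load_in_low_bit=args.low_bit,          # formerly args.load_in_low_bit
    max_context_len=args.max_context_len,
    max_prompt_len=args.max_prompt_len,
    quantization_group_size=args.quantization_group_size,
)

# Persist the converted weights; save_low_bit mirrors the --lowbit-path handling in the
# NPU Python examples (assumption: the real convert.py may persist the files differently).
model.save_low_bit(args.save_directory)
```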
				
			
@@ -10,7 +10,7 @@ In this directory, you will find examples on how to directly run HuggingFace `tr
 | Llama3 | [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) |
 | Llama3.2 | [meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct), [meta-llama/Llama-3.2-3B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct) |
 | Qwen2 | [Qwen/Qwen2-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2-1.5B-Instruct) |
-| Qwen2.5 | [Qwen/Qwen2.5-7b-Instruct](https://huggingface.co/Qwen/Qwen2.5-7b-Instruct) |
+| Qwen2.5 | [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct), [Qwen/Qwen2.5-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct) |
 | Baichuan2 | [baichuan-inc/Baichuan2-7B-Chat](https://huggingface.co/baichuan-inc/Baichuan-7B-Chat) |
 | MiniCPM | [openbmb/MiniCPM-1B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-1B-sft-bf16), [openbmb/MiniCPM-2B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-2B-sft-bf16) |
 
@@ -58,11 +58,14 @@ python llama3.py --repo-id-or-model-path "meta-llama/Llama-3.2-1B-Instruct"
 :: to run Llama-3.2-3B-Instruct
 python llama3.py --repo-id-or-model-path "meta-llama/Llama-3.2-3B-Instruct"
 
-:: to run Qwen2.5-7b-Instruct
+:: to run Qwen2.5-7B-Instruct
 python qwen.py
 
-:: to run Qwen2-1.5b-Instruct
-python qwen.py --repo-id-or-model-path "Qwen/Qwen2-1.5B-Instruct" --load_in_low_bit "sym_int8"
+:: to run Qwen2-1.5B-Instruct
+python qwen.py --repo-id-or-model-path "Qwen/Qwen2-1.5B-Instruct" --low_bit "sym_int8"
+
+:: to run Qwen2.5-3B-Instruct
+python qwen.py --repo-id-or-model-path "Qwen/Qwen2.5-3B-Instruct" --low_bit "sym_int8"
 
 :: to run Baichuan2-7B-Chat
 python baichuan2.py
@@ -48,8 +48,8 @@ if __name__ == "__main__":
     parser.add_argument("--max-context-len", type=int, default=1024)
     parser.add_argument("--max-prompt-len", type=int, default=960)
     parser.add_argument("--quantization_group_size", type=int, default=0)
-    parser.add_argument('--load_in_low_bit', type=str, default="sym_int4",
-                        help='Load in low bit to use')
+    parser.add_argument('--low_bit', type=str, default="sym_int4",
+                        help='Low bit precision to quantize the model')
     parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
     parser.add_argument("--disable-streaming", action="store_true", default=False)
 
@@ -60,7 +60,7 @@ if __name__ == "__main__":
         model = AutoModelForCausalLM.from_pretrained(model_path,
                                                      optimize_model=True,
                                                      pipeline=True,
-                                                     load_in_low_bit=args.load_in_low_bit,
+                                                     load_in_low_bit=args.low_bit,
                                                      max_context_len=args.max_context_len,
                                                      max_prompt_len=args.max_prompt_len,
                                                      quantization_group_size=args.quantization_group_size,
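
To put the renamed flag in context, here is a minimal end-to-end sketch of how an example script like the one above might load a Qwen2.5-3B checkpoint on the NPU and generate a reply. The tokenizer and `generate` calls are standard `transformers` usage; the ipex-llm import path and keyword set are taken from the hunks in this commit, so treat this as a sketch rather than the exact script.

```python
# Hedged sketch: load Qwen2.5-3B in sym_int8 on the NPU and run a short generation.
import torch
from transformers import AutoTokenizer

from ipex_llm.transformers.npu_model import AutoModelForCausalLM  # import path assumed

model_path = "Qwen/Qwen2.5-3B-Instruct"
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             optimize_model=True,
                                             pipeline=True,
                                             load_in_low_bit="sym_int8",  # precision the README suggests for the 3B model
                                             max_context_len=1024,
                                             max_prompt_len=960,
                                             torch_dtype=torch.float16,
                                             attn_implementation="eager",
                                             trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# Build a chat-formatted prompt and decode the model's answer.
messages = [{"role": "user", "content": "What is AI?"}]
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer([text], return_tensors="pt")
output = model.generate(inputs.input_ids, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```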
				
			
@@ -70,7 +70,7 @@ Arguments info:
 - `--lowbit-path LOWBIT_MODEL_PATH`: argument defining the path to save/load lowbit version of the model. If it is an empty string, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded. If it is an existing path, the lowbit model in `LOWBIT_MODEL_PATH` will be loaded. If it is a non-existing path, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded, and the converted lowbit version will be saved into `LOWBIT_MODEL_PATH`. It is default to be `''`, i.e. an empty string.
 - `--prompt PROMPT`: argument defining the prompt to be infered. It is default to be `'Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun'`.
 - `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It is default to be `32`.
-- `--load_in_low_bit`: argument defining the `load_in_low_bit` format used. It is default to be `sym_int8`, `sym_int4` can also be used.
+- `--low_bit`: argument defining the `low_bit` format used. It is default to be `sym_int8`, `sym_int4` can also be used.
 
 ### Sample Output
 #### [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)
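
The `--lowbit-path` description above implies a convert-once, reload-later flow; a minimal sketch of that logic follows. `load_low_bit` appears in the hunks of this commit, while `save_low_bit` and the exact keyword set are assumptions based on the ipex-llm NPU examples.

```python
# Hedged sketch of the --lowbit-path convention: reuse a previously converted
# low-bit model when the path exists, otherwise convert once and save it there.
import os

from ipex_llm.transformers.npu_model import AutoModelForCausalLM  # import path assumed


def load_npu_model(model_path: str, lowbit_path: str = "", low_bit: str = "sym_int8"):
    if lowbit_path and os.path.isdir(lowbit_path):
        # Reload the already-converted low-bit checkpoint.
        return AutoModelForCausalLM.load_low_bit(lowbit_path)

    model = AutoModelForCausalLM.from_pretrained(model_path,
                                                 load_in_low_bit=low_bit,
                                                 optimize_model=True,
                                                 attn_implementation="eager",
                                                 trust_remote_code=True)
    if lowbit_path:
        model.save_low_bit(lowbit_path)  # assumed helper, mirroring the ipex-llm API
    return model
```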
				
			
@@ -90,6 +90,7 @@ The examples below show how to run the **_optimized HuggingFace model implementa
 - [Llama3.2-1B](./llama.py)
 - [Llama3.2-3B](./llama.py)
 - [Qwen2-1.5B](./qwen.py)
+- [Qwen2.5-3B](./qwen.py)
 - [Qwen2.5-7B](./qwen.py)
 - [MiniCPM-1B](./minicpm.py)
 - [MiniCPM-2B](./minicpm.py)
@@ -122,6 +123,9 @@ python llama.py --repo-id-or-model-path meta-llama/Llama-3.2-3B-Instruct
 :: to run Qwen2-1.5B-Instruct (LNL driver version: 32.0.101.2715)
 python qwen.py
 
+:: to run Qwen2.5-3B-Instruct (LNL driver version: 32.0.101.2715)
+python qwen.py --repo-id-or-model-path Qwen/Qwen2.5-3B-Instruct --low_bit sym_int8
+
 :: to run Qwen2.5-7B-Instruct (LNL driver version: 32.0.101.2715)
 python qwen.py --repo-id-or-model-path Qwen/Qwen2.5-7B-Instruct
 
@@ -47,7 +47,10 @@ if __name__ == "__main__":
                         help='Prompt to infer')
     parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
     parser.add_argument("--max-context-len", type=int, default=1024)
-    parser.add_argument("--max-prompt-len", type=int, default=512)
+    parser.add_argument("--max-prompt-len", type=int, default=960)
+    parser.add_argument("--quantization_group_size", type=int, default=0)
+    parser.add_argument('--low_bit', type=str, default="sym_int4",
+                        help='Load in low bit to use')
     parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
     parser.add_argument("--intra-pp", type=int, default=None)
     parser.add_argument("--inter-pp", type=int, default=None)
@@ -62,14 +65,15 @@ if __name__ == "__main__":
             torch_dtype=torch.float16,
             trust_remote_code=True,
             attn_implementation="eager",
-            load_in_low_bit="sym_int4",
+            load_in_low_bit=args.low_bit,
             optimize_model=True,
             max_context_len=args.max_context_len,
             max_prompt_len=args.max_prompt_len,
             intra_pp=args.intra_pp,
             inter_pp=args.inter_pp,
             transpose_value_cache=not args.disable_transpose_value_cache,
-            mixed_precision=args.mixed_precision
+            mixed_precision=args.mixed_precision,
+            quantization_group_size=args.quantization_group_size,
         )
     else:
         model = AutoModelForCausalLM.load_low_bit(
@@ -64,7 +64,7 @@ def replace_with_QuantizedLinear(layer, qtype, device, modules_to_not_convert,
     iqtype = ggml_tensor_qtype[qtype]
     if isinstance(layer, torch.nn.Linear) and not hasattr(layer, "qtype"):
         if qtype == "sym_int4_rtn":
-            # workaround for qwen2 & int4
+            # workaround for qwen2-7B & int4
             if (layer.in_features == 3584 and layer.out_features == 152064) or \
                (layer.in_features == 18944 and layer.out_features == 3584):
                 qtype = "sym_int8_rtn"
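
The comment change above narrows the stated scope of the workaround to Qwen2-7B: only linear layers with those exact shapes are bumped from `sym_int4_rtn` to `sym_int8_rtn`. A simplified, self-contained illustration of that shape check (not the library code) is given below.

```python
# Simplified illustration of the shape-based precision override shown above.
import torch


def pick_rtn_qtype(layer: torch.nn.Linear, qtype: str) -> str:
    """Return the qtype to use for one linear layer when converting with RTN."""
    if qtype == "sym_int4_rtn":
        # Shape pairs copied from the hunk above; they correspond to specific
        # Qwen2-7B projection layers that the library forces to int8.
        qwen2_7b_shapes = {(3584, 152064), (18944, 3584)}
        if (layer.in_features, layer.out_features) in qwen2_7b_shapes:
            return "sym_int8_rtn"
    return qtype


# Example: a layer with Qwen2-7B's (18944 -> 3584) shape is promoted to sym_int8_rtn.
# The meta device avoids allocating real weights for this large layer.
layer = torch.nn.Linear(18944, 3584, bias=False, device="meta")
print(pick_rtn_qtype(layer, "sym_int4_rtn"))  # -> sym_int8_rtn
```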
				
			
@@ -428,8 +428,8 @@ def optimize_llm(
                       intra_pp=intra_pp,
                       decoder=True,
                       transpose_value_cache=transpose_value_cache)
-    elif model.config.model_type == "qwen2" and model.config.num_hidden_layers == 28:
-        # for qwen2-1.5B and qwen2-7B
+    elif model.config.model_type == "qwen2":
+        # for qwen2-1.5B, qwen2-7B, qwen2.5-3B
         if intra_pp is None:
             intra_pp = 2
         if inter_pp is None:
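
This is the change that actually unlocks Qwen2.5-3B on the NPU path: the dispatch now keys on `model_type` alone instead of also requiring `num_hidden_layers == 28`, so Qwen2-family checkpoints with other depths take the same optimized branch. A hedged sketch of the relaxed check, written against a plain HuggingFace config, is shown below.

```python
# Minimal sketch (not the library code): decide whether a checkpoint takes the
# qwen2 NPU optimization branch based on its HuggingFace config.
from transformers import AutoConfig


def uses_qwen2_npu_branch(repo_id_or_path: str) -> bool:
    config = AutoConfig.from_pretrained(repo_id_or_path)
    # The old guard also required config.num_hidden_layers == 28, which matched
    # Qwen2-1.5B/7B but excluded other Qwen2-family depths such as Qwen2.5-3B.
    return config.model_type == "qwen2"


# Example (downloads only the config file):
# print(uses_qwen2_npu_branch("Qwen/Qwen2.5-3B-Instruct"))  # expected: True
```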
			
		|||