[NPU C++] Update model support & examples & benchmark (#12466)

parent 14d8d3d8af
commit c911026f03

8 changed files with 176 additions and 79 deletions
@@ -40,7 +40,7 @@ test_api:
   # - "transformers_int4_npu_pipeline_win"  # on Intel NPU for Windows,  transformer-like API, (qtype=int4)
 cpu_embedding: False # whether put embedding to CPU
 streaming: False # whether output in streaming way (only available now for gpu win related test_api)
-optimize_model: False # whether apply further optimization on NPU (only available now for transformers_int4_npu_win test_api)
+optimize_model: True # whether apply further optimization on NPU (only available now for transformers_int4_npu_win test_api)
 use_fp16_torch_dtype: True # whether use fp16 for non-linear layer (only available now for "pipeline_parallel_gpu" test_api)
 task: 'continuation' # task can be 'continuation', 'QA' and 'summarize'
 transpose_value_cache: True # whether apply transposed v_cache optimization on NPU (only available now for transformers_int4_npu_win test_api)

@@ -626,6 +626,7 @@ def transformers_int4_npu_win(repo_id,
     model_path = get_model_path(repo_id, local_model_hub)
     in_out_len = in_out_pairs[0].split("-")
     max_context_len = max(int(in_out_len[0]) + int(in_out_len[1]), 1024)
+    save_directory = "./save_converted_model_dir"
     # Load model in 4 bit,
     # which convert the relevant layers in the model into INT4 format
     st = time.perf_counter()

@@ -640,13 +641,14 @@ def transformers_int4_npu_win(repo_id,
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True, torch_dtype=torch.float16,
                                                      optimize_model=optimize_model, max_context_len=max_context_len, max_prompt_len=int(in_out_len[0]),
                                                      quantization_group_size=npu_group_size, transpose_value_cache=transpose_value_cache,
-                                                     use_cache=True, attn_implementation="eager").eval()
+                                                     save_directory=save_directory, use_cache=True, attn_implementation="eager").eval()
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     end = time.perf_counter()
     load_time = end - st
     print(">> loading of model costs {}s".format(load_time))

-    model = BenchmarkWrapper(model)
+    if not hasattr(model, "model_ptr"):
+        model = BenchmarkWrapper(model)

     result = {}
     with torch.inference_mode():

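As an aside, the gist of the benchmark change above is that the harness now wraps the loaded model for latency bookkeeping only when it is a regular Python-side model. A minimal, self-contained sketch of that guard; the `wrapper_cls` parameter is a stand-in for the harness's `BenchmarkWrapper`, and the `model_ptr` check mirrors the diff:

```python
def wrap_for_benchmark(model, wrapper_cls):
    """Wrap `model` for benchmark bookkeeping unless it exposes `model_ptr`
    (the attribute checked in the updated run.py above, which the models
    converted through the NPU pipeline appear to carry)."""
    if hasattr(model, "model_ptr"):
        return model           # leave converted NPU models unwrapped
    return wrapper_cls(model)  # wrap ordinary Python-side models
```
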
@@ -13,7 +13,7 @@ In this directory, you will find examples on how to directly run HuggingFace `tr
 | Chatglm2 | [THUDM/chatglm2-6b](https://huggingface.co/THUDM/chatglm2-6b) |
 | Qwen2 | [Qwen/Qwen2-7B-Instruct](https://huggingface.co/Qwen/Qwen2-7B-Instruct), [Qwen/Qwen2-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2-1.5B-Instruct) |
 | Qwen2.5 | [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) |
-| MiniCPM | [openbmb/MiniCPM-2B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-2B-sft-bf16) |
+| MiniCPM | [openbmb/MiniCPM-1B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-1B-sft-bf16), [openbmb/MiniCPM-2B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-2B-sft-bf16) |
 | Phi-3 | [microsoft/Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) |
 | Stablelm | [stabilityai/stablelm-zephyr-3b](https://huggingface.co/stabilityai/stablelm-zephyr-3b) |
 | Baichuan2 | [baichuan-inc/Baichuan2-7B-Chat](https://huggingface.co/baichuan-inc/Baichuan-7B-Chat) |

@@ -85,10 +85,10 @@ done

 ## 4. Run Optimized Models (Experimental)
 The examples below show how to run the **_optimized HuggingFace model implementations_** on Intel NPU, including
-- [Llama2-7B](./llama.py)
-- [Llama3-8B](./llama.py)
-- [Llama3.2-1B](./llama.py)
-- [Llama3.2-3B](./llama.py)
+- [Llama2-7B](./llama2.py)
+- [Llama3-8B](./llama3.py)
+- [Llama3.2-1B](./llama3.py)
+- [Llama3.2-3B](./llama3.py)
 - [Qwen2-1.5B](./qwen.py)
 - [Qwen2.5-3B](./qwen.py)
 - [Qwen2.5-7B](./qwen.py)

@@ -96,44 +96,34 @@ The examples below show how to run the **_optimized HuggingFace model implementa
 - [MiniCPM-2B](./minicpm.py)
 - [Baichuan2-7B](./baichuan2.py)

-### Recommended NPU Driver Version for MTL Users
-#### 32.0.100.2540
-Supported models: Llama2-7B, Llama3-8B, Qwen2-1.5B, MiniCPM-1B, MiniCPM-2B, Baichuan2-7B
-
-### Recommended NPU Driver Version for LNL Users
-#### 32.0.100.2625
-Supported models: Llama2-7B, MiniCPM-1B, Baichuan2-7B
-#### 32.0.101.2715
-Supported models: Llama3-8B, MiniCPM-2B, Qwen2-1.5B, Qwen2.5-7B
-
 ### Run
 ```cmd
 :: to run Llama-2-7b-chat-hf
-python llama.py
+python llama2.py --repo-id-or-model-path meta-llama/Llama-2-7b-chat-hf --save-directory <converted_model_path>

-:: to run Meta-Llama-3-8B-Instruct (LNL driver version: 32.0.101.2715)
-python llama.py --repo-id-or-model-path meta-llama/Meta-Llama-3-8B-Instruct
+:: to run Meta-Llama-3-8B-Instruct
+python llama3.py --repo-id-or-model-path meta-llama/Meta-Llama-3-8B-Instruct --save-directory <converted_model_path>

 :: to run Llama-3.2-1B-Instruct
-python llama.py --repo-id-or-model-path meta-llama/Llama-3.2-1B-Instruct
+python llama3.py --repo-id-or-model-path meta-llama/Llama-3.2-1B-Instruct --save-directory <converted_model_path>

 :: to run Llama-3.2-3B-Instruct
-python llama.py --repo-id-or-model-path meta-llama/Llama-3.2-3B-Instruct
+python llama3.py --repo-id-or-model-path meta-llama/Llama-3.2-3B-Instruct --save-directory <converted_model_path>

-:: to run Qwen2-1.5B-Instruct (LNL driver version: 32.0.101.2715)
-python qwen.py
+:: to run Qwen2-1.5B-Instruct
+python qwen.py --repo-id-or-model-path Qwen/Qwen2-1.5B-Instruct --low_bit sym_int8 --save-directory <converted_model_path>

-:: to run Qwen2.5-3B-Instruct (LNL driver version: 32.0.101.2715)
-python qwen.py --repo-id-or-model-path Qwen/Qwen2.5-3B-Instruct --low_bit sym_int8
+:: to run Qwen2.5-3B-Instruct
+python qwen.py --repo-id-or-model-path Qwen/Qwen2.5-3B-Instruct --low_bit sym_int8 --save-directory <converted_model_path>

-:: to run Qwen2.5-7B-Instruct (LNL driver version: 32.0.101.2715)
-python qwen.py --repo-id-or-model-path Qwen/Qwen2.5-7B-Instruct
+:: to run Qwen2.5-7B-Instruct
+python qwen.py --repo-id-or-model-path Qwen/Qwen2.5-7B-Instruct --save-directory <converted_model_path>

 :: to run MiniCPM-1B-sft-bf16
-python minicpm.py
+python minicpm.py --repo-id-or-model-path openbmb/MiniCPM-1B-sft-bf16 --save-directory <converted_model_path>

-:: to run MiniCPM-2B-sft-bf16 (LNL driver version: 32.0.101.2715)
-python minicpm.py --repo-id-or-model-path openbmb/MiniCPM-2B-sft-bf16
+:: to run MiniCPM-2B-sft-bf16
+python minicpm.py --repo-id-or-model-path openbmb/MiniCPM-2B-sft-bf16 --save-directory <converted_model_path>

 :: to run Baichuan2-7B-Chat
 python baichuan2.py

@@ -147,6 +137,7 @@ Arguments info:
 - `--max-context-len MAX_CONTEXT_LEN`: Defines the maximum sequence length for both input and output tokens. It is default to be `1024`.
 - `--max-prompt-len MAX_PROMPT_LEN`: Defines the maximum number of tokens that the input prompt can contain. It is default to be `512`.
 - `--disable-transpose-value-cache`: Disable the optimization of transposing value cache.
+- `--save-directory SAVE_DIRECTORY`: argument defining the path to save converted model.

 ### Troubleshooting

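For reference, a condensed Python sketch of what these example scripts now do when `--save-directory` is supplied, based on the `llama3.py` added later in this commit; the repo id, prompt, and output folder below are placeholders:

```python
import torch
from transformers import AutoTokenizer
from ipex_llm.transformers.npu_model import AutoModelForCausalLM

model_path = "meta-llama/Meta-Llama-3-8B-Instruct"  # placeholder repo id or local checkpoint
save_directory = "./llama3-npu-converted"           # placeholder; folder to receive the converted model

# Convert the checkpoint for Intel NPU and save the converted model into
# save_directory (keyword arguments mirror the examples in this commit).
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16,
    trust_remote_code=True,
    attn_implementation="eager",
    load_in_low_bit="sym_int4",
    optimize_model=True,
    max_context_len=1024,
    max_prompt_len=512,
    transpose_value_cache=True,
    save_directory=save_directory,
)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

with torch.inference_mode():
    input_ids = tokenizer.encode("What is AI?", return_tensors="pt")
    output = model.generate(input_ids, num_beams=1, do_sample=False, max_new_tokens=32)
    print(tokenizer.decode(output[0], skip_special_tokens=False))
```
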
@@ -154,40 +145,10 @@ Arguments info:
 If you encounter `TypeError: can't convert meta device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.` error when loading lowbit model, please try re-saving the lowbit model with the example script you are currently using. Please note that lowbit models saved by `qwen.py`, `llama.py`, etc. cannot be loaded by `generate.py`.

 #### Output Problem
-If you encounter output problem, please try to disable the optimization of transposing value cache with following command:
+If you encounter output problem, please try to disable the optimization of transposing value cache such as the following command:
 ```cmd
 :: to run Llama-2-7b-chat-hf
-python llama.py --disable-transpose-value-cache
-
-:: to run Meta-Llama-3-8B-Instruct (LNL driver version: 32.0.101.2715)
-python llama.py --repo-id-or-model-path meta-llama/Meta-Llama-3-8B-Instruct --disable-transpose-value-cache
-
-:: to run Llama-3.2-1B-Instruct
-python llama.py --repo-id-or-model-path meta-llama/Llama-3.2-1B-Instruct --disable-transpose-value-cache
-
-:: to run Llama-3.2-3B-Instruct
-python llama.py --repo-id-or-model-path meta-llama/Llama-3.2-3B-Instruct --disable-transpose-value-cache
-
-:: to run Qwen2-1.5B-Instruct (LNL driver version: 32.0.101.2715)
-python qwen.py --disable-transpose-value-cache
-
-:: to run Qwen2.5-7B-Instruct LNL driver version: 32.0.101.2715)
-python qwen.py --repo-id-or-model-path Qwen/Qwen2.5-7B-Instruct --disable-transpose-value-cache
-
-:: to run MiniCPM-1B-sft-bf16
-python minicpm.py --disable-transpose-value-cache
-
-:: to run MiniCPM-2B-sft-bf16 (LNL driver version: 32.0.101.2715)
-python minicpm.py --repo-id-or-model-path openbmb/MiniCPM-2B-sft-bf16 --disable-transpose-value-cache
-
-:: to run Baichuan2-7B-Chat
-python baichuan2.py --disable-transpose-value-cache
-```
-
-For [Qwen2.5-7B](./qwen.py), you could also try to enable mixed precision optimization when encountering output problems:
-
-```cmd
-python qwen.py --repo-id-or-model-path Qwen/Qwen2.5-7B-Instruct --mixed-precision
+python llama2.py --save-directory <converted_model_path> --disable-transpose-value-cache
 ```

 #### Better Performance with High CPU Utilization

@@ -62,8 +62,12 @@ if __name__ == "__main__":
     parser.add_argument("--max-context-len", type=int, default=1024)
     parser.add_argument("--max-prompt-len", type=int, default=512)
     parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
-    parser.add_argument("--intra-pp", type=int, default=2)
-    parser.add_argument("--inter-pp", type=int, default=2)
+    parser.add_argument("--save-directory", type=str,
+        required=True,
+        help="The path of folder to save converted model, "
+             "If path not exists, lowbit model will be saved there. "
+             "Else, program will raise error.",
+    )

     args = parser.parse_args()
     model_path = args.repo_id_or_model_path

@@ -78,9 +82,8 @@ if __name__ == "__main__":
             optimize_model=True,
             max_context_len=args.max_context_len,
             max_prompt_len=args.max_prompt_len,
-            intra_pp=args.intra_pp,
-            inter_pp=args.inter_pp,
             transpose_value_cache=not args.disable_transpose_value_cache,
+            save_directory=args.save_directory
         )
     else:
         model = AutoModelForCausalLM.load_low_bit(

python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama3.py (new file, 133 lines)
@@ -0,0 +1,133 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import os
+import torch
+import time
+import argparse
+
+from ipex_llm.transformers.npu_model import AutoModelForCausalLM
+from transformers import AutoTokenizer
+
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+def get_prompt(user_input: str, chat_history: list[tuple[str, str]],
+               system_prompt: str) -> str:
+    prompt_texts = [f'<|begin_of_text|>']
+
+    if system_prompt != '':
+        prompt_texts.append(f'<|start_header_id|>system<|end_header_id|>\n\n{system_prompt}<|eot_id|>')
+
+    for history_input, history_response in chat_history:
+        prompt_texts.append(f'<|start_header_id|>user<|end_header_id|>\n\n{history_input.strip()}<|eot_id|>')
+        prompt_texts.append(f'<|start_header_id|>assistant<|end_header_id|>\n\n{history_response.strip()}<|eot_id|>')
+
+    prompt_texts.append(f'<|start_header_id|>user<|end_header_id|>\n\n{user_input.strip()}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n')
+    return ''.join(prompt_texts)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Predict Tokens using `generate()` API for npu model"
+    )
+    parser.add_argument(
+        "--repo-id-or-model-path",
+        type=str,
+        default="meta-llama/Meta-Llama-3-8B-Instruct",
+        help="The huggingface repo id for the Llama3 model to be downloaded"
+        ", or the path to the huggingface checkpoint folder",
+    )
+    parser.add_argument("--lowbit-path", type=str,
+        default="",
+        help="The path to the lowbit model folder, leave blank if you do not want to save. \
+            If path not exists, lowbit model will be saved there. \
+            Else, lowbit model will be loaded.",
+    )
+    parser.add_argument('--prompt', type=str, default="What is AI?",
+                        help='Prompt to infer')
+    parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
+    parser.add_argument("--max-context-len", type=int, default=1024)
+    parser.add_argument("--max-prompt-len", type=int, default=512)
+    parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
+    parser.add_argument("--save-directory", type=str,
+        required=True,
+        help="The path of folder to save converted model, "
+             "If path not exists, lowbit model will be saved there. "
+             "Else, program will raise error.",
+    )
+
+    args = parser.parse_args()
+    model_path = args.repo_id_or_model_path
+
+    if not args.lowbit_path or not os.path.exists(args.lowbit_path):
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            torch_dtype=torch.float16,
+            trust_remote_code=True,
+            attn_implementation="eager",
+            load_in_low_bit="sym_int4",
+            optimize_model=True,
+            max_context_len=args.max_context_len,
+            max_prompt_len=args.max_prompt_len,
+            transpose_value_cache=not args.disable_transpose_value_cache,
+            save_directory=args.save_directory
+        )
+    else:
+        model = AutoModelForCausalLM.load_low_bit(
+            args.lowbit_path,
+            attn_implementation="eager",
+            torch_dtype=torch.float16,
+            optimize_model=True,
+            max_context_len=args.max_context_len,
+            max_prompt_len=args.max_prompt_len,
+            intra_pp=args.intra_pp,
+            inter_pp=args.inter_pp,
+            transpose_value_cache=not args.disable_transpose_value_cache,
+        )
+
+    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+
+    if args.lowbit_path and not os.path.exists(args.lowbit_path):
+        model.save_low_bit(args.lowbit_path)
+
+    DEFAULT_SYSTEM_PROMPT = """\
+    """
+
+    print("-" * 80)
+    print("done")
+    with torch.inference_mode():
+        print("finish to load")
+        for i in range(5):
+            prompt = get_prompt(args.prompt, [], system_prompt=DEFAULT_SYSTEM_PROMPT)
+            _input_ids = tokenizer.encode(prompt, return_tensors="pt")
+            print("input length:", len(_input_ids[0]))
+            st = time.time()
+            output = model.generate(
+                _input_ids, num_beams=1, do_sample=False, max_new_tokens=args.n_predict
+            )
+            end = time.time()
+            print(f"Inference time: {end-st} s")
+            input_str = tokenizer.decode(_input_ids[0], skip_special_tokens=False)
+            print("-" * 20, "Input", "-" * 20)
+            print(input_str)
+            output_str = tokenizer.decode(output[0], skip_special_tokens=False)
+            print("-" * 20, "Output", "-" * 20)
+            print(output_str)
+
+    print("-" * 80)
+    print("done")
+    print("success shut down")

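A note on the new example above: when `--lowbit-path` points to an already-saved low-bit model, the script reloads it via `load_low_bit` instead of reconverting. A condensed sketch of that reload path follows; the folder and repo id are placeholders, and the `intra_pp`/`inter_pp` keywords still present in the file's reload branch are omitted here, since this commit removes those flags from the other examples:

```python
import torch
from transformers import AutoTokenizer
from ipex_llm.transformers.npu_model import AutoModelForCausalLM

lowbit_path = "./llama3-lowbit"                      # placeholder: folder previously written by save_low_bit()
model_path = "meta-llama/Meta-Llama-3-8B-Instruct"   # placeholder: original checkpoint, used for the tokenizer

# Reload the previously saved low-bit model instead of converting again.
model = AutoModelForCausalLM.load_low_bit(
    lowbit_path,
    attn_implementation="eager",
    torch_dtype=torch.float16,
    optimize_model=True,
    max_context_len=1024,
    max_prompt_len=512,
    transpose_value_cache=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
```
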
@@ -49,8 +49,12 @@ if __name__ == "__main__":
     parser.add_argument("--max-context-len", type=int, default=1024)
     parser.add_argument("--max-prompt-len", type=int, default=512)
    parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
-    parser.add_argument("--intra-pp", type=int, default=2)
-    parser.add_argument("--inter-pp", type=int, default=2)
+    parser.add_argument("--save-directory", type=str,
+        required=True,
+        help="The path of folder to save converted model, "
+             "If path not exists, lowbit model will be saved there. "
+             "Else, program will raise error.",
+    )

     args = parser.parse_args()
     model_path = args.repo_id_or_model_path

@@ -64,9 +68,8 @@ if __name__ == "__main__":
             optimize_model=True,
             max_context_len=args.max_context_len,
             max_prompt_len=args.max_prompt_len,
-            intra_pp=args.intra_pp,
-            inter_pp=args.inter_pp,
             transpose_value_cache=not args.disable_transpose_value_cache,
+            save_directory=args.save_directory
         )
     else:
         model = AutoModelForCausalLM.load_low_bit(

@@ -52,9 +52,6 @@ if __name__ == "__main__":
     parser.add_argument('--low_bit', type=str, default="sym_int4",
                         help='Load in low bit to use')
     parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
-    parser.add_argument("--intra-pp", type=int, default=None)
-    parser.add_argument("--inter-pp", type=int, default=None)
-    parser.add_argument("--mixed-precision", action='store_false')
     parser.add_argument("--save-directory", type=str,
         required=True,
         help="The path of folder to save converted model, "

@@ -75,10 +72,8 @@ if __name__ == "__main__":
             optimize_model=True,
             max_context_len=args.max_context_len,
             max_prompt_len=args.max_prompt_len,
-            intra_pp=args.intra_pp,
-            inter_pp=args.inter_pp,
             transpose_value_cache=not args.disable_transpose_value_cache,
-            mixed_precision=args.mixed_precision,
+            mixed_precision=True,
             quantization_group_size=args.quantization_group_size,
             save_directory=args.save_directory
         )

@@ -266,7 +266,7 @@ class _BaseAutoModelClass:
         model.share_memory()

         if not pipeline:
-            if model.config.model_type in ["qwen2"]:
+            if model.config.model_type in ["qwen2", "llama", "minicpm"]:
                 from ipex_llm.transformers.npu_models.convert import optimize_llm_single_process
                 optimize_llm_single_process(
                     llm,

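The last hunk widens the non-pipeline optimization path in `npu_model.py` from Qwen2 only to Llama and MiniCPM as well. A small illustrative helper, not the library's actual code, showing the dispatch condition the diff encodes:

```python
# Model types that the non-pipeline path now routes through
# optimize_llm_single_process (per the hunk above).
SINGLE_PROCESS_MODEL_TYPES = ("qwen2", "llama", "minicpm")

def uses_single_process_optimization(model_type: str, pipeline: bool) -> bool:
    """Return True when a model of `model_type` should take the single-process NPU path."""
    return not pipeline and model_type in SINGLE_PROCESS_MODEL_TYPES
```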