Update npu example and all in one benchmark (#11766)
parent 57d177738d
commit 05989ad0f9

3 changed files with 10 additions and 8 deletions
all-in-one benchmark, `transformers_int4_npu_win`:

```diff
@@ -580,15 +580,16 @@ def transformers_int4_npu_win(repo_id,
     # which convert the relevant layers in the model into INT4 format
     st = time.perf_counter()
     if repo_id in CHATGLM_IDS:
-        model = AutoModel.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True, torch_dtype='auto').eval()
+        model = AutoModel.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True,
+                                          torch_dtype='auto', attn_implementation="eager").eval()
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     elif repo_id in LLAMA_IDS:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True,
-                                                     use_cache=True).eval()
+                                                     use_cache=True, attn_implementation="eager").eval()
         tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True,
-                                                     use_cache=True).eval()
+                                                     use_cache=True, attn_implementation="eager").eval()
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     end = time.perf_counter()
     load_time = end - st
```
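The substantive change in all three branches is the same: `attn_implementation="eager"` is now passed at load time, so the benchmark uses transformers' eager attention path rather than the SDPA default. A minimal sketch of the loading pattern this hunk converges on, assuming ipex-llm's Hugging Face-style `AutoModelForCausalLM`; the import, model path, and low-bit value are outside this hunk and are placeholders here:

```python
import time

from transformers import AutoTokenizer
# Assumption: the benchmark loads models through ipex-llm's HF-compatible wrapper;
# the actual import is not visible in this hunk.
from ipex_llm.transformers import AutoModelForCausalLM

model_path = "meta-llama/Llama-2-7b-chat-hf"  # placeholder repo id or local path
low_bit = "sym_int4"                          # placeholder low-bit format

st = time.perf_counter()
# attn_implementation="eager" selects eager attention instead of the SDPA default,
# matching the argument this commit adds in every branch.
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             load_in_low_bit=low_bit,
                                             trust_remote_code=True,
                                             use_cache=True,
                                             attn_implementation="eager").eval()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
load_time = time.perf_counter() - st
print(f"load time: {load_time:.2f}s")
```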
NPU example README:

````diff
@@ -29,11 +29,11 @@ In the example [generate.py](./generate.py), we show a basic use case for a Llam
 #### 1.1 Installation on Windows
 We suggest using conda to manage environment:
 ```bash
-conda create -n llm python=3.10 libuv
+conda create -n llm python=3.10
 conda activate llm
 
-# below command will install intel_extension_for_pytorch==2.1.10+xpu as default
-pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
+# install ipex-llm with 'all' option
+pip install --pre --upgrade ipex-llm[all]
 
 # below command will install intel_npu_acceleration_library
 pip install intel-npu-acceleration-library==1.3
````
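As a quick sanity check after following the updated install steps, the two packages the README installs can be probed from Python. This is only a sketch; the distribution names are taken directly from the pip commands above and nothing else is assumed:

```python
from importlib.metadata import version, PackageNotFoundError

# Distribution names as used in the README's pip commands.
for dist in ("ipex-llm", "intel-npu-acceleration-library"):
    try:
        print(f"{dist}: {version(dist)}")
    except PackageNotFoundError:
        print(f"{dist}: not installed")
```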
NPU example `generate.py`:

```diff
@@ -24,7 +24,7 @@ from transformers import AutoTokenizer
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='Predict Tokens using `generate()` API for npu model')
-    parser.add_argument('--repo-id-or-model-path', type=str, default="D:\llm-models\Llama-2-7b-chat-hf",
+    parser.add_argument('--repo-id-or-model-path', type=str, default="meta-llama/Llama-2-7b-chat-hf",
                         help='The huggingface repo id for the Llama2 model to be downloaded'
                              ', or the path to the huggingface checkpoint folder')
     parser.add_argument('--prompt', type=str, default="Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun",
@@ -40,7 +40,8 @@ if __name__ == '__main__':
     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
     model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True,
-                                                 load_in_low_bit=args.load_in_low_bit)
+                                                 load_in_low_bit=args.load_in_low_bit,
+                                                 attn_implementation="eager")
     
     print(model)
 
```
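Putting the two `generate.py` hunks together, the example's load-and-generate flow after this change looks roughly like the sketch below. Only the default model id, the prompt, and the `attn_implementation="eager"` argument come from the hunks above; the `AutoModelForCausalLM` import, the `sym_int4` low-bit value, and the generation settings are assumptions:

```python
import torch
from transformers import AutoTokenizer
# Assumption: ipex-llm's HF-compatible wrapper; the example's real import is outside these hunks.
from ipex_llm.transformers import AutoModelForCausalLM

model_path = "meta-llama/Llama-2-7b-chat-hf"  # new default from the argparse hunk
prompt = ("Once upon a time, there existed a little girl who liked to have adventures. "
          "She wanted to go to places and meet new people, and have fun")

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             trust_remote_code=True,
                                             load_in_low_bit="sym_int4",  # stands in for args.load_in_low_bit
                                             attn_implementation="eager")

# Tokenize the prompt and generate a short continuation.
with torch.inference_mode():
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    output = model.generate(input_ids, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```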