Update NPU example and all-in-one benchmark (#11766)
This commit is contained in:
parent 57d177738d
commit 05989ad0f9
3 changed files with 10 additions and 8 deletions
@@ -580,15 +580,16 @@ def transformers_int4_npu_win(repo_id,
     # which convert the relevant layers in the model into INT4 format
     st = time.perf_counter()
     if repo_id in CHATGLM_IDS:
-        model = AutoModel.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True, torch_dtype='auto').eval()
+        model = AutoModel.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True,
+                                          torch_dtype='auto', attn_implementation="eager").eval()
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     elif repo_id in LLAMA_IDS:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True,
-                                                     use_cache=True).eval()
+                                                     use_cache=True, attn_implementation="eager").eval()
         tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True,
-                                                     use_cache=True).eval()
+                                                     use_cache=True, attn_implementation="eager").eval()
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     end = time.perf_counter()
     load_time = end - st
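For context, the benchmark's load path after this change looks roughly like the sketch below. It is a minimal reconstruction from the `+` lines above, not the benchmark itself: the import location of `AutoModelForCausalLM`/`AutoTokenizer` is an assumption (the real imports are outside this hunk), and `model_path`/`low_bit` are placeholders.

```python
# Minimal sketch of the generic (non-ChatGLM, non-LLaMA) branch after this change.
# Assumption: AutoModelForCausalLM comes from ipex-llm's NPU transformers wrapper;
# the benchmark's actual imports are not shown in this hunk.
import time

from ipex_llm.transformers.npu_model import AutoModelForCausalLM  # assumed import
from transformers import AutoTokenizer

model_path = "meta-llama/Llama-2-7b-chat-hf"  # placeholder
low_bit = "sym_int4"                          # placeholder low-bit format

st = time.perf_counter()
# attn_implementation="eager" (the option added by this commit) keeps attention on the
# plain eager code path instead of SDPA.
model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit,
                                             trust_remote_code=True, use_cache=True,
                                             attn_implementation="eager").eval()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
load_time = time.perf_counter() - st
print(f"model loaded in {load_time:.2f} s")
```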
@@ -29,11 +29,11 @@ In the example [generate.py](./generate.py), we show a basic use case for a Llam
 #### 1.1 Installation on Windows
 We suggest using conda to manage environment:
 ```bash
-conda create -n llm python=3.10 libuv
+conda create -n llm python=3.10
 conda activate llm
 
-# below command will install intel_extension_for_pytorch==2.1.10+xpu as default
-pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
+# install ipex-llm with 'all' option
+pip install --pre --upgrade ipex-llm[all]
 
 # below command will install intel_npu_acceleration_library
 pip install intel-npu-acceleration-library==1.3
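A quick sanity check of the resulting environment can be run inside the activated `llm` env. This is only a sketch; it assumes the pip packages above import as `ipex_llm` and `intel_npu_acceleration_library`.

```python
# Sketch: verify the packages installed above can be imported.
# Assumption: the pip package names map to these import names.
import importlib

for pkg in ("ipex_llm", "intel_npu_acceleration_library", "transformers"):
    try:
        mod = importlib.import_module(pkg)
        print(f"{pkg}: OK ({getattr(mod, '__version__', 'unknown version')})")
    except ImportError as err:
        print(f"{pkg}: not importable ({err})")
```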
@@ -24,7 +24,7 @@ from transformers import AutoTokenizer
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='Predict Tokens using `generate()` API for npu model')
-    parser.add_argument('--repo-id-or-model-path', type=str, default="D:\llm-models\Llama-2-7b-chat-hf",
+    parser.add_argument('--repo-id-or-model-path', type=str, default="meta-llama/Llama-2-7b-chat-hf",
                         help='The huggingface repo id for the Llama2 model to be downloaded'
                              ', or the path to the huggingface checkpoint folder')
     parser.add_argument('--prompt', type=str, default="Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun",
@@ -40,7 +40,8 @@ if __name__ == '__main__':
     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
     model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True,
-                                                 load_in_low_bit=args.load_in_low_bit)
+                                                 load_in_low_bit=args.load_in_low_bit,
+                                                 attn_implementation="eager")
 
     print(model)
 
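Pieced together, the example's load-and-generate path after this commit looks roughly as follows. This is a sketch, not the full generate.py: the `AutoModelForCausalLM` import is an assumption (only the `AutoTokenizer` import is visible in the hunks above), and the `"sym_int4"` value only stands in for `args.load_in_low_bit`.

```python
# Sketch of the load/generate path in generate.py after this commit (not the full script).
# Assumption: AutoModelForCausalLM comes from ipex-llm's NPU transformers wrapper.
from ipex_llm.transformers.npu_model import AutoModelForCausalLM  # assumed import
from transformers import AutoTokenizer

# New default repo id from this commit, or a local checkpoint folder.
model_path = "meta-llama/Llama-2-7b-chat-hf"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# attn_implementation="eager" (added by this commit) keeps attention on the plain
# eager code path rather than SDPA when running on the NPU.
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True,
                                             load_in_low_bit="sym_int4",  # stands in for args.load_in_low_bit
                                             attn_implementation="eager")

prompt = "Once upon a time, there existed a little girl who liked to have adventures."
input_ids = tokenizer.encode(prompt, return_tensors="pt")
output = model.generate(input_ids, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```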