From 05989ad0f944efe8a5f2f73a836106d4c51c92d4 Mon Sep 17 00:00:00 2001
From: "Jin, Qiao" <89779290+JinBridger@users.noreply.github.com>
Date: Mon, 12 Aug 2024 16:46:46 +0800
Subject: [PATCH] Update npu example and all in one benckmark (#11766)

---
 python/llm/dev/benchmark/all-in-one/run.py                  | 7 ++++---
 .../example/NPU/HF-Transformers-AutoModels/LLM/README.md    | 6 +++---
 .../example/NPU/HF-Transformers-AutoModels/LLM/generate.py  | 5 +++--
 3 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/python/llm/dev/benchmark/all-in-one/run.py b/python/llm/dev/benchmark/all-in-one/run.py
index 9d9f16cf..c5ab2273 100644
--- a/python/llm/dev/benchmark/all-in-one/run.py
+++ b/python/llm/dev/benchmark/all-in-one/run.py
@@ -580,15 +580,16 @@ def transformers_int4_npu_win(repo_id,
     # which convert the relevant layers in the model into INT4 format
     st = time.perf_counter()
     if repo_id in CHATGLM_IDS:
-        model = AutoModel.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True, torch_dtype='auto').eval()
+        model = AutoModel.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True,
+                                          torch_dtype='auto', attn_implementation="eager").eval()
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     elif repo_id in LLAMA_IDS:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True,
-                                                     use_cache=True).eval()
+                                                     use_cache=True, attn_implementation="eager").eval()
         tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True,
-                                                     use_cache=True).eval()
+                                                     use_cache=True, attn_implementation="eager").eval()
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     end = time.perf_counter()
     load_time = end - st
diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md
index 9e75a374..4f84662e 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md
@@ -29,11 +29,11 @@ In the example [generate.py](./generate.py), we show a basic use case for a Llam
 #### 1.1 Installation on Windows
 We suggest using conda to manage environment:
 ```bash
-conda create -n llm python=3.10 libuv
+conda create -n llm python=3.10
 conda activate llm
 
-# below command will install intel_extension_for_pytorch==2.1.10+xpu as default
-pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
+# install ipex-llm with 'all' option
+pip install --pre --upgrade ipex-llm[all]
 
 # below command will install intel_npu_acceleration_library
 pip install intel-npu-acceleration-library==1.3
diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/generate.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/generate.py
index 4a9a25ef..a3536ccc 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/generate.py
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/generate.py
@@ -24,7 +24,7 @@ from transformers import AutoTokenizer
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='Predict Tokens using `generate()` API for npu model')
-    parser.add_argument('--repo-id-or-model-path', type=str, default="D:\llm-models\Llama-2-7b-chat-hf",
+    parser.add_argument('--repo-id-or-model-path', type=str, default="meta-llama/Llama-2-7b-chat-hf",
                         help='The huggingface repo id for the Llama2 model to be downloaded'
                              ', or the path to the huggingface checkpoint folder')
     parser.add_argument('--prompt', type=str, default="Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun",
@@ -40,7 +40,8 @@ if __name__ == '__main__':
 
     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
     model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True,
-                                                 load_in_low_bit=args.load_in_low_bit)
+                                                 load_in_low_bit=args.load_in_low_bit,
+                                                 attn_implementation="eager")
 
     print(model)