From ec362e61339a142c8ac31eec37067e28f7941281 Mon Sep 17 00:00:00 2001
From: binbin Deng <108676127+plusbang@users.noreply.github.com>
Date: Mon, 28 Oct 2024 09:24:51 +0800
Subject: [PATCH] Add llama3 level0 example (#12275)

---
 .../LLM/Pipeline-Models/README.md           | 77 +++++++++++++++
 .../Pipeline-Models/{llama.py => llama2.py} |  0
 .../LLM/Pipeline-Models/llama3.py           | 97 +++++++++++++++++++
 3 files changed, 174 insertions(+)
 create mode 100644 python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/README.md
 rename python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/{llama.py => llama2.py} (100%)
 create mode 100644 python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama3.py

diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/README.md b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/README.md
new file mode 100644
index 00000000..fcb79e5a
--- /dev/null
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/README.md
@@ -0,0 +1,77 @@
+# Run HuggingFace `transformers` Models with Pipeline Optimization on Intel NPU
+
+In this directory, you will find examples of how to directly run HuggingFace `transformers` models with pipeline optimization on Intel NPUs. See the table below for verified models.
+
+## Verified Models
+
+| Model | Model Link |
+|------------|----------------------------------------------------------------|
+| Llama2 | [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) |
+| Llama3 | [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) |
+
+## 0. Requirements
+To run these examples with IPEX-LLM on Intel NPUs, make sure to install the latest version of the Intel NPU driver.
+Go to https://www.intel.com/content/www/us/en/download/794734/intel-npu-driver-windows.html to download and unzip the driver.
+Then go to **Device Manager** and find **Neural Processors** -> **Intel(R) AI Boost**.
+Right-click it and select **Update Driver** -> **Browse my computer for drivers**, then manually select the unzipped driver folder to install.
+
+## 1. Install
+### 1.1 Installation on Windows
+We suggest using conda to manage the environment:
+```cmd
+conda create -n llm python=3.10
+conda activate llm
+
+:: install ipex-llm with 'npu' option
+pip install --pre --upgrade ipex-llm[npu]
+```
+
+## 2. Runtime Configurations
+
+**The following environment variable is required**:
+
+```cmd
+set BIGDL_USE_NPU=1
+```
+
+## 3. Run Models
+In the examples [llama2.py](./llama2.py) and [llama3.py](./llama3.py), we show a basic use case of predicting the next N tokens using the `generate()` API, with IPEX-LLM INT4 optimizations on Intel NPUs.
+
+```cmd
+:: to run Llama-2-7b-chat-hf
+python llama2.py
+
+:: to run Meta-Llama-3-8B-Instruct
+python llama3.py
+```
+
+Arguments info:
+- `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the Hugging Face repo id for the model (e.g. `meta-llama/Llama-2-7b-chat-hf`) to be downloaded, or the path to the Hugging Face checkpoint folder.
+- `--prompt PROMPT`: argument defining the prompt to be inferred. It defaults to `What is AI?`.
+- `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It defaults to `32`.
+- `--max-output-len MAX_OUTPUT_LEN`: argument defining the maximum sequence length for both input and output tokens. It defaults to `1024`. A full example invocation is shown below.
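+
+For example, the following invocation of the Llama3 script spells out all of the arguments above with their default values (shown here for illustration only; adjust the model path, prompt, and lengths for your own setup):
+
+```cmd
+:: example invocation with explicit arguments (the values shown are the defaults)
+python llama3.py --repo-id-or-model-path meta-llama/Meta-Llama-3-8B-Instruct --prompt "What is AI?" --n-predict 32 --max-output-len 1024
+```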
+
+### Sample Output
+#### [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)
+
+```log
+ Number of input tokens: 28
+ Generated tokens: 32
+ First token generation time: xxxx s
+ Generation average latency: xxxx ms, (xxxx token/s)
+ Generation time: xxxx s
+
+Inference time: xxxx s
+-------------------- Input --------------------
+ [INST] <<SYS>>
+
+<</SYS>>
+
+What is AI? [/INST]
+-------------------- Output --------------------
+ [INST] <<SYS>>
+
+<</SYS>>
+
+What is AI? [/INST] AI (Artificial Intelligence) is a field of computer science and technology that focuses on the development of intelligent machines that can perform
+```
diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama2.py
similarity index 100%
rename from python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama.py
rename to python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama2.py
diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama3.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama3.py
new file mode 100644
index 00000000..801efa10
--- /dev/null
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama3.py
@@ -0,0 +1,97 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# + + +import torch +import time +import argparse +from ipex_llm.transformers.npu_model import AutoModelForCausalLM +from transformers import AutoTokenizer +from transformers.utils import logging + +logger = logging.get_logger(__name__) + +# you could tune the prompt based on your own model, +# here the prompt tuning refers to https://llama.meta.com/docs/model-cards-and-prompt-formats/meta-llama-3 +DEFAULT_SYSTEM_PROMPT = """\ +""" + +def get_prompt(user_input: str, chat_history: list[tuple[str, str]], + system_prompt: str) -> str: + prompt_texts = [f'<|begin_of_text|>'] + + if system_prompt != '': + prompt_texts.append(f'<|start_header_id|>system<|end_header_id|>\n\n{system_prompt}<|eot_id|>') + + for history_input, history_response in chat_history: + prompt_texts.append(f'<|start_header_id|>user<|end_header_id|>\n\n{history_input.strip()}<|eot_id|>') + prompt_texts.append(f'<|start_header_id|>assistant<|end_header_id|>\n\n{history_response.strip()}<|eot_id|>') + + prompt_texts.append(f'<|start_header_id|>user<|end_header_id|>\n\n{user_input.strip()}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n') + return ''.join(prompt_texts) + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Predict Tokens using `generate()` API for npu model" + ) + parser.add_argument( + "--repo-id-or-model-path", + type=str, + default="meta-llama/Meta-Llama-3-8B-Instruct", + help="The huggingface repo id for the Llama3 model to be downloaded" + ", or the path to the huggingface checkpoint folder", + ) + parser.add_argument('--prompt', type=str, default="What is AI?", + help='Prompt to infer') + parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict") + parser.add_argument("--max-output-len", type=int, default=1024) + + args = parser.parse_args() + model_path = args.repo_id_or_model_path + + model = AutoModelForCausalLM.from_pretrained(model_path, + torch_dtype=torch.float16, + optimize_model=True, + pipeline=True, + max_output_len=args.max_output_len, + attn_implementation="eager") + + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + + print("-" * 80) + print("done") + with torch.inference_mode(): + print("finish to load") + for i in range(5): + prompt = get_prompt(args.prompt, [], system_prompt=DEFAULT_SYSTEM_PROMPT) + _input_ids = tokenizer.encode(prompt, return_tensors="pt") + print("input length:", len(_input_ids[0])) + st = time.time() + output = model.generate( + _input_ids, max_new_tokens=args.n_predict, do_print=True + ) + end = time.time() + print(f"Inference time: {end-st} s") + input_str = tokenizer.decode(_input_ids[0], skip_special_tokens=False) + print("-" * 20, "Input", "-" * 20) + print(input_str) + output_str = tokenizer.decode(output[0], skip_special_tokens=False) + print("-" * 20, "Output", "-" * 20) + print(output_str) + + print("-" * 80) + print("done") + print("success shut down")