From 2fbbb51e71a562b6065271ac06f88e3018fa2277 Mon Sep 17 00:00:00 2001
From: Jinhe
Date: Thu, 15 Aug 2024 15:39:24 +0800
Subject: [PATCH] transformers==4.37, yi & yuan2 & vicuna (#11805)

* transformers==4.37
* added yi model
* added yi model
* xxxx
* delete prompt template
* / and delete
---
 .../GPU/HuggingFace/LLM/vicuna/README.md | 12 +++++------
 .../GPU/HuggingFace/LLM/vicuna/generate.py | 6 +++---
 .../example/GPU/HuggingFace/LLM/yi/README.md | 20 ++++++++++++++-----
 .../GPU/HuggingFace/LLM/yi/generate.py | 14 +++----------
 4 files changed, 27 insertions(+), 25 deletions(-)

diff --git a/python/llm/example/GPU/HuggingFace/LLM/vicuna/README.md b/python/llm/example/GPU/HuggingFace/LLM/vicuna/README.md
index 852c29de..7f4b9806 100644
--- a/python/llm/example/GPU/HuggingFace/LLM/vicuna/README.md
+++ b/python/llm/example/GPU/HuggingFace/LLM/vicuna/README.md
@@ -1,5 +1,5 @@
 # Vicuna
-In this directory, you will find examples on how you could apply IPEX-LLM INT4 optimizations on Vicuna models. For illustration purposes, we utilize the [lmsys/vicuna-13b-v1.3](https://huggingface.co/lmsys/vicuna-13b-v1.3) and [eachadea/vicuna-7b-1.1](https://huggingface.co/eachadea/vicuna-7b-1.1) as reference Vicuna models.
+In this directory, you will find examples on how you could apply IPEX-LLM INT4 optimizations on Vicuna models. For illustration purposes, we utilize the [lmsys/vicuna-13b-v1.5](https://huggingface.co/lmsys/vicuna-13b-v1.5) and [lmsys/vicuna-7b-v1.5](https://huggingface.co/lmsys/vicuna-7b-v1.5) as reference Vicuna models.
 
 ## 0. Requirements
 To run these examples with IPEX-LLM, we have some recommended requirements for your machine, please refer to [here](../../../README.md#requirements) for more information.
@@ -109,7 +109,7 @@ python ./generate.py --repo-id-or-model-path REPO_ID_OR_MODEL_PATH --prompt PROM
 ```
 
 Arguments info:
-- `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the Vicuna model (e.g. `lmsys/vicuna-13b-v1.3` and `eachadea/vicuna-7b-1.1`) to be downloaded, or the path to the huggingface checkpoint folder. It is default to be `'lmsys/vicuna-13b-v1.3'`.
+- `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the Vicuna model (e.g. `lmsys/vicuna-13b-v1.5` and `lmsys/vicuna-7b-v1.5`) to be downloaded, or the path to the huggingface checkpoint folder. It is default to be `'lmsys/vicuna-13b-v1.5'`.
 - `--prompt PROMPT`: argument defining the prompt to be infered (with integrated prompt format for chat). It is default to be `'What is AI?'`.
 - `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It is default to be `32`.
 
@@ -118,7 +118,7 @@ Arguments info:
 > Please select the appropriate size of the Vicuna model based on the capabilities of your machine.
 
 #### Sample Output
-#### [lmsys/vicuna-13b-v1.3](https://huggingface.co/lmsys/vicuna-13b-v1.3)
+#### [lmsys/vicuna-13b-v1.5](https://huggingface.co/lmsys/vicuna-13b-v1.5)
 ```log
 Inference time: xxxx s
 -------------------- Prompt --------------------
@@ -130,10 +130,10 @@ What is AI?
 ### Human:
 What is AI?
 ### Assistant:
-AI, or Artificial Intelligence, refers to the development of computer systems that can perform tasks that typically require human intelligence, such as visual perception,
+AI stands for Artificial Intelligence. It refers to the development of computer systems that can perform tasks that typically require human intelligence, such as visual perception
 ```
 
-#### [eachadea/vicuna-7b-1.1](https://huggingface.co/eachadea/vicuna-7b-1.1)
+#### [lmsys/vicuna-7b-v1.5](https://huggingface.co/lmsys/vicuna-7b-v1.5)
 ```log
 Inference time: xxxx s
 -------------------- Prompt --------------------
@@ -145,5 +145,5 @@ What is AI?
 ### Human:
 What is AI?
 ### Assistant:
-AI, or artificial intelligence, refers to the ability of a machine or computer program to mimic human intelligence and perform tasks that would normally require human intelligence to
+AI stands for "Artificial Intelligence." It refers to the development of computer systems that can perform tasks that typically require human intelligence, such as visual per
 ```
diff --git a/python/llm/example/GPU/HuggingFace/LLM/vicuna/generate.py b/python/llm/example/GPU/HuggingFace/LLM/vicuna/generate.py
index 1cf63a2c..e473a2dc 100644
--- a/python/llm/example/GPU/HuggingFace/LLM/vicuna/generate.py
+++ b/python/llm/example/GPU/HuggingFace/LLM/vicuna/generate.py
@@ -27,8 +27,8 @@ Vicuna_PROMPT_FORMAT = "### Human:\n{prompt} \n ### Assistant:\n"
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='Predict Tokens using `generate()` API for Vicuna model')
-    parser.add_argument('--repo-id-or-model-path', type=str, default="lmsys/vicuna-13b-v1.3",
-                        help='The huggingface repo id for the Vicuna (e.g. `lmsys/vicuna-13b-v1.3` and `eachadea/vicuna-7b-1.1`) to be downloaded'
+    parser.add_argument('--repo-id-or-model-path', type=str, default="lmsys/vicuna-13b-v1.5",
+                        help='The huggingface repo id for the Vicuna (e.g. `lmsys/vicuna-13b-v1.5` and `lmsys/vicuna-7b-v1.5`) to be downloaded'
                              ', or the path to the huggingface checkpoint folder')
     parser.add_argument('--prompt', type=str, default="What is AI?",
                         help='Prompt to infer')
@@ -57,7 +57,7 @@ if __name__ == '__main__':
         # enabling `use_cache=True` allows the model to utilize the previous
         # key/values attentions to speed up decoding;
         # to obtain optimal performance with IPEX-LLM INT4 optimizations,
-        # it is important to set use_cache=True for vicuna-v1.3 models
+        # it is important to set use_cache=True for vicuna-v1.5 models
         output = model.generate(input_ids,
                                 use_cache=True,
                                 max_new_tokens=args.n_predict)
diff --git a/python/llm/example/GPU/HuggingFace/LLM/yi/README.md b/python/llm/example/GPU/HuggingFace/LLM/yi/README.md
index 1cda8888..1fb49f21 100644
--- a/python/llm/example/GPU/HuggingFace/LLM/yi/README.md
+++ b/python/llm/example/GPU/HuggingFace/LLM/yi/README.md
@@ -1,5 +1,5 @@
 # Yi
-In this directory, you will find examples on how you could apply IPEX-LLM INT4 optimizations on Yi models on [Intel GPUs](../../../README.md). For illustration purposes, we utilize the [01-ai/Yi-6B](https://huggingface.co/01-ai/Yi-6B) as a reference Yi model.
+In this directory, you will find examples on how you could apply IPEX-LLM INT4 optimizations on Yi models on [Intel GPUs](../../../README.md). For illustration purposes, we utilize the [01-ai/Yi-6B](https://huggingface.co/01-ai/Yi-6B) and [01-ai/Yi-6B-Chat](https://huggingface.co/01-ai/Yi-6B-Chat) as reference Yi models.
 
 ## 0. Requirements
 To run these examples with IPEX-LLM on Intel GPUs, we have some recommended requirements for your machine, please refer to [here](../../../README.md#requirements) for more information.
@@ -112,7 +112,7 @@ python ./generate.py
 
 In the example, several arguments can be passed to satisfy your requirements:
-- `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the Yi model (e.g. `01-ai/Yi-6B`) to be downloaded, or the path to the huggingface checkpoint folder. It is default to be `'01-ai/Yi-6B'`.
+- `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the Yi model (e.g. `01-ai/Yi-6B` and `01-ai/Yi-6B-Chat`) to be downloaded, or the path to the huggingface checkpoint folder. It is default to be `'01-ai/Yi-6B-Chat'`.
 - `--prompt PROMPT`: argument defining the prompt to be infered (with integrated prompt format for chat). It is default to be `'AI是什么?'`.
 - `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It is default to be `32`.
 
@@ -122,8 +122,18 @@ In the example, several arguments can be passed to satisfy your requirements:
 ```log
 Inference time: xxxx s
 -------------------- Prompt --------------------
-AI是什么?
+What is AI?
 -------------------- Output --------------------
-AI是什么?
-人工智能(Artificial Intelligence),英文缩写为AI。它是研究、开发用于模拟、延伸和扩展人的智能的理论、方法、技术及
+What is AI?
+Artificial Intelligence (AI) is the simulation of human intelligence in machines. AI is the science and engineering of making intelligent machines, especially intelligent computer programs.
 ```
+
+#### [01-ai/Yi-6B-Chat](https://huggingface.co/01-ai/Yi-6B-Chat)
+```log
+Inference time: xxxx s
+-------------------- Prompt --------------------
+What is AI?
+-------------------- Output --------------------
+What is AI?
+Artificial Intelligence (AI) refers to the simulation of human intelligence processes by machines, especially computer systems. These processes include learning, reasoning, and self-
+```
\ No newline at end of file
diff --git a/python/llm/example/GPU/HuggingFace/LLM/yi/generate.py b/python/llm/example/GPU/HuggingFace/LLM/yi/generate.py
index f9a0e544..f32f272c 100644
--- a/python/llm/example/GPU/HuggingFace/LLM/yi/generate.py
+++ b/python/llm/example/GPU/HuggingFace/LLM/yi/generate.py
@@ -21,21 +21,13 @@ import argparse
 from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import AutoTokenizer
 
-# Refer to https://huggingface.co/01-ai/Yi-6B-Chat#31-use-the-chat-model
-YI_PROMPT_FORMAT = """
-<|im_start|>system
-You are a helpful assistant. If you don't understand what the user means, ask the user to provide more information.<|im_end|>
-<|im_start|>user
-{prompt}<|im_end|>
-<|im_start|>assistant
-"""
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='Predict Tokens using `generate()` API for Yi model')
-    parser.add_argument('--repo-id-or-model-path', type=str, default="01-ai/Yi-6B",
+    parser.add_argument('--repo-id-or-model-path', type=str, default="01-ai/Yi-6B-Chat",
                         help='The huggingface repo id for the Yi model to be downloaded'
                              ', or the path to the huggingface checkpoint folder')
-    parser.add_argument('--prompt', type=str, default="AI是什么?",
+    parser.add_argument('--prompt', type=str, default="What is AI?",
                         help='Prompt to infer')
     parser.add_argument('--n-predict', type=int, default=32,
                         help='Max tokens to predict')
@@ -60,7 +52,7 @@ if __name__ == '__main__':
 
     # Generate predicted tokens
     with torch.inference_mode():
-        prompt = YI_PROMPT_FORMAT.format(prompt=args.prompt)
+        prompt = args.prompt
         input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu')
         # ipex_llm model needs a warmup, then inference time can be accurate
         output = model.generate(input_ids,
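Since this patch deletes the hard-coded `YI_PROMPT_FORMAT` and now passes the raw `--prompt` string straight to `01-ai/Yi-6B-Chat`, readers who still want chat formatting can lean on the tokenizer's own chat template instead. The sketch below is illustrative rather than part of the patch: it assumes the INT4 loading pattern used by these GPU examples and that the Yi-6B-Chat tokenizer ships a built-in chat template usable via `apply_chat_template` (supported by the transformers==4.37 pin this PR targets); the message content is just an example.

```python
# Minimal sketch (assumptions noted above): build a chat-formatted prompt for
# 01-ai/Yi-6B-Chat with the tokenizer's own template instead of the removed
# hard-coded YI_PROMPT_FORMAT string.
import torch
from transformers import AutoTokenizer
from ipex_llm.transformers import AutoModelForCausalLM

model_path = "01-ai/Yi-6B-Chat"  # or a local checkpoint folder

# Load with IPEX-LLM INT4 optimizations and move to the Intel GPU ('xpu'),
# mirroring the loading pattern of the example scripts in this directory.
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             load_in_4bit=True,
                                             trust_remote_code=True,
                                             use_cache=True)
model = model.half().to('xpu')
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# Wrap the user prompt in the model's chat format (ChatML for Yi chat models),
# assuming the checkpoint provides a chat template.
messages = [{"role": "user", "content": "What is AI?"}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False,
                                       add_generation_prompt=True)

with torch.inference_mode():
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu')
    # ipex_llm model needs a warmup, then inference time can be accurate
    output = model.generate(input_ids, use_cache=True, max_new_tokens=32)
    print(tokenizer.decode(output[0], skip_special_tokens=True))
```

Relying on the tokenizer's template rather than a literal prompt string keeps the example aligned with whatever chat markup the checkpoint actually expects.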