From 8c5c7f32ddd286b4921bfe5d859e77026c743ccb Mon Sep 17 00:00:00 2001
From: SONG Ge <38711238+sgwhat@users.noreply.github.com>
Date: Wed, 21 Aug 2024 13:45:29 +0800
Subject: [PATCH] Update doc for running npu generate example with
 ipex-llm[npu] (#11876)

* update doc for running npu generate example with ipex-llm[npu]

* switch max_prompt_len to 512 to fix compile error on mtl
---
 .../NPU/HF-Transformers-AutoModels/LLM/README.md | 11 +++--------
 .../NPU/HF-Transformers-AutoModels/LLM/llama2.py |  2 +-
 2 files changed, 4 insertions(+), 9 deletions(-)

diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md
index 728617f0..6d8fa6f8 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md
@@ -32,13 +32,8 @@ We suggest using conda to manage environment:
 conda create -n llm python=3.10
 conda activate llm
 
-# install ipex-llm with 'all' option
-pip install --pre --upgrade ipex-llm[all]
-
-# below command will install intel_npu_acceleration_library
-pip install intel-npu-acceleration-library==1.3
-
-pip install transformers==4.40
+# install ipex-llm with 'npu' option
+pip install --pre --upgrade ipex-llm[npu]
 ```
 
 ### 2. Runtime Configurations
@@ -124,7 +119,7 @@ Arguments info:
 - `--prompt PROMPT`: argument defining the prompt to be infered (with integrated prompt format for chat). It is default to be `What is AI?`.
 - `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It is default to be `32`.
 - `--max-output-len MAX_OUTPUT_LEN`: Defines the maximum sequence length for both input and output tokens. It is default to be `1024`.
-- `--max-prompt-len MAX_PROMPT_LEN`: Defines the maximum number of tokens that the input prompt can contain. It is default to be `768`.
+- `--max-prompt-len MAX_PROMPT_LEN`: Defines the maximum number of tokens that the input prompt can contain. It is default to be `512`.
 
 #### Sample Output
 
diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama2.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama2.py
index d23a6405..b9c63c8c 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama2.py
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama2.py
@@ -54,7 +54,7 @@ if __name__ == "__main__":
                         help='Prompt to infer')
     parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
     parser.add_argument("--max-output-len", type=int, default=1024)
-    parser.add_argument("--max-prompt-len", type=int, default=768)
+    parser.add_argument("--max-prompt-len", type=int, default=512)
     parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
     parser.add_argument("--intra-pp", type=int, default=2)
     parser.add_argument("--inter-pp", type=int, default=2)
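
For anyone trying the change locally, below is a minimal sketch of the updated flow after this patch. It only uses the install command and the flags that appear in the diff; the prompt value and the assumption that `llama2.py` can run with its default model path (no explicit model argument) are illustrative, not part of the patch.

```bash
# Sketch of the documented flow after this patch (assumes a machine with an Intel NPU, e.g. MTL).
conda create -n llm python=3.10
conda activate llm

# install ipex-llm with 'npu' option
# (replaces the previous ipex-llm[all] + intel-npu-acceleration-library==1.3 steps)
pip install --pre --upgrade ipex-llm[npu]

# run the NPU generate example with the new 512-token prompt cap;
# the model path is left at the script's default here (illustrative only)
python llama2.py --prompt "What is AI?" --max-prompt-len 512 --max-output-len 1024
```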