From 52a2135d83707899e2e4eddbf1b81cdd140876c8 Mon Sep 17 00:00:00 2001 From: ZehuaCao <47251317+Romanticoseu@users.noreply.github.com> Date: Thu, 28 Mar 2024 13:54:40 +0800 Subject: [PATCH] Replace ipex with ipex-llm (#10554) * fix ipex with ipex_llm * fix ipex with ipex_llm * update * update * update * update * update * update * update * update --- docker/llm/README.md | 2 +- .../llm/finetune/qlora/cpu/kubernetes/Chart.yaml | 2 +- docker/llm/serving/cpu/docker/README.md | 2 +- docker/llm/serving/cpu/kubernetes/README.md | 2 +- .../CPU/Deepspeed-AutoTP/deepspeed_autotp.py | 2 +- python/llm/example/CPU/LangChain/README.md | 11 ++++++++--- .../CPU/PyTorch-Models/Model/mixtral/generate.py | 2 +- .../CPU/QLoRA-FineTuning/alpaca-qlora/README.md | 4 ++-- .../finetune_one_node_two_sockets.sh | 2 +- .../GPU/Deepspeed-AutoTP/deepspeed_autotp.py | 2 +- .../Advanced-Quantizations/GGUF-IQ2/generate.py | 2 +- .../Model/baichuan/generate.py | 2 +- .../Model/baichuan2/generate.py | 2 +- .../Model/bluelm/generate.py | 2 +- .../Model/chatglm2/generate.py | 2 +- .../Model/chatglm2/streamchat.py | 2 +- .../Model/chatglm3/generate.py | 2 +- .../Model/chatglm3/streamchat.py | 2 +- .../Model/chinese-llama2/generate.py | 2 +- .../Model/codellama/generate.py | 2 +- .../Model/falcon/generate.py | 2 +- .../Model/flan-t5/generate.py | 2 +- .../Model/gemma/generate.py | 2 +- .../Model/gpt-j/generate.py | 2 +- .../Model/internlm/generate.py | 2 +- .../Model/internlm2/generate.py | 2 +- .../Model/llama2/generate.py | 2 +- .../Model/mistral/generate.py | 2 +- .../Model/mixtral/generate.py | 2 +- .../Model/mpt/generate.py | 2 +- .../Model/phi-1_5/generate.py | 2 +- .../Model/phi-2/generate.py | 2 +- .../Model/phixtral/generate.py | 2 +- .../Model/qwen/generate.py | 2 +- .../Model/redpajama/generate.py | 2 +- .../Model/replit/generate.py | 2 +- .../Model/rwkv4/generate.py | 2 +- .../Model/rwkv5/generate.py | 2 +- .../Model/solar/generate.py | 2 +- .../Model/starcoder/generate.py | 2 +- .../Model/yi/generate.py | 2 +- .../Save-Load/generate.py | 2 +- .../example/GPU/LLM-Finetuning/LoRA/README.md | 4 ++-- .../LoRA/lora_finetune_llama2_7b_arc_1_card.sh | 2 +- .../lora_finetune_llama2_7b_pvc_1110_4_card.sh | 2 +- .../lora_finetune_llama2_7b_pvc_1550_1_tile.sh | 2 +- .../lora_finetune_llama2_7b_pvc_1550_4_card.sh | 2 +- .../example/GPU/LLM-Finetuning/QA-LoRA/README.md | 4 ++-- .../qalora_finetune_llama2_7b_arc_1_card.sh | 2 +- .../qalora_finetune_llama2_7b_arc_2_card.sh | 2 +- .../qalora_finetune_llama2_7b_pvc_1550_1_card.sh | 2 +- .../qalora_finetune_llama2_7b_pvc_1550_1_tile.sh | 2 +- .../LLM-Finetuning/QLoRA/alpaca-qlora/README.md | 4 ++-- .../qlora_finetune_llama2_13b_pvc_1550_1_card.sh | 2 +- .../qlora_finetune_llama2_13b_pvc_1550_1_tile.sh | 2 +- .../qlora_finetune_llama2_13b_pvc_1550_4_card.sh | 2 +- .../qlora_finetune_llama2_70b_pvc_1550_1_card.sh | 2 +- .../qlora_finetune_llama2_70b_pvc_1550_4_card.sh | 2 +- .../qlora_finetune_llama2_7b_arc_1_card.sh | 2 +- .../qlora_finetune_llama2_7b_arc_2_card.sh | 2 +- .../qlora_finetune_llama2_7b_flex_170_1_card.sh | 2 +- .../qlora_finetune_llama2_7b_flex_170_3_card.sh | 2 +- .../qlora_finetune_llama2_7b_pvc_1100_1_card.sh | 2 +- .../qlora_finetune_llama2_7b_pvc_1100_4_card.sh | 2 +- .../qlora_finetune_llama2_7b_pvc_1550_1_card.sh | 2 +- .../qlora_finetune_llama2_7b_pvc_1550_4_card.sh | 2 +- .../example/GPU/LLM-Finetuning/ReLora/README.md | 4 ++-- .../relora_finetune_llama2_7b_arc_1_card.sh | 2 +- .../relora_finetune_llama2_7b_arc_2_card.sh | 2 +- 
.../relora_finetune_llama2_7b_pvc_1550_1_card.sh | 2 +- .../relora_finetune_llama2_7b_pvc_1550_4_card.sh | 2 +- .../GPU/ModelScope-Models/Save-Load/generate.py | 2 +- .../example/GPU/ModelScope-Models/generate.py | 2 +- .../GPU/Pipeline-Parallel-Inference/generate.py | 2 +- .../GPU/PyTorch-Models/Model/aquila2/generate.py | 2 +- .../PyTorch-Models/Model/baichuan/generate.py | 2 +- .../PyTorch-Models/Model/baichuan2/generate.py | 2 +- .../Model/bark/synthesize_speech.py | 2 +- .../GPU/PyTorch-Models/Model/bluelm/generate.py | 2 +- .../PyTorch-Models/Model/chatglm2/generate.py | 2 +- .../PyTorch-Models/Model/chatglm2/streamchat.py | 2 +- .../PyTorch-Models/Model/chatglm3/generate.py | 2 +- .../PyTorch-Models/Model/chatglm3/streamchat.py | 2 +- .../PyTorch-Models/Model/codellama/generate.py | 2 +- .../PyTorch-Models/Model/dolly-v1/generate.py | 2 +- .../PyTorch-Models/Model/dolly-v2/generate.py | 2 +- .../GPU/PyTorch-Models/Model/flan-t5/generate.py | 2 +- .../PyTorch-Models/Model/internlm2/generate.py | 2 +- .../GPU/PyTorch-Models/Model/llama2/generate.py | 2 +- .../GPU/PyTorch-Models/Model/mamba/generate.py | 2 +- .../GPU/PyTorch-Models/Model/mistral/generate.py | 2 +- .../GPU/PyTorch-Models/Model/mixtral/generate.py | 2 +- .../GPU/PyTorch-Models/Model/phi-1_5/generate.py | 2 +- .../GPU/PyTorch-Models/Model/phi-2/generate.py | 2 +- .../PyTorch-Models/Model/phixtral/generate.py | 2 +- .../GPU/PyTorch-Models/Model/replit/generate.py | 2 +- .../Model/speech-t5/synthesize_speech.py | 2 +- .../PyTorch-Models/Model/starcoder/generate.py | 2 +- .../GPU/PyTorch-Models/Model/yi/generate.py | 2 +- .../PyTorch-Models/More-Data-Types/generate.py | 2 +- .../GPU/PyTorch-Models/Save-Load/generate.py | 2 +- .../llm/src/ipex_llm/serving/fastchat/README.md | 16 ++++++++-------- python/llm/src/ipex_llm/transformers/convert.py | 2 +- .../src/ipex_llm/transformers/low_bit_linear.py | 2 +- .../src/ipex_llm/transformers/models/chatglm2.py | 4 ++-- .../llm/src/ipex_llm/transformers/speculative.py | 4 ++-- 106 files changed, 127 insertions(+), 122 deletions(-) diff --git a/docker/llm/README.md b/docker/llm/README.md index eba61568..8c58287d 100644 --- a/docker/llm/README.md +++ b/docker/llm/README.md @@ -62,7 +62,7 @@ After the container is booted, you could get into the container through `docker docker exec -it my_container bash ``` -To run inference using `IPEX-LLM` using cpu, you could refer to this [documentation](https://github.com/intel-analytics/IPEX/tree/main/python/llm#cpu-int4). +To run inference using `IPEX-LLM` using cpu, you could refer to this [documentation](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm#cpu-int4). #### Getting started with chat diff --git a/docker/llm/finetune/qlora/cpu/kubernetes/Chart.yaml b/docker/llm/finetune/qlora/cpu/kubernetes/Chart.yaml index 2c750b4d..b5d12a1d 100644 --- a/docker/llm/finetune/qlora/cpu/kubernetes/Chart.yaml +++ b/docker/llm/finetune/qlora/cpu/kubernetes/Chart.yaml @@ -1,5 +1,5 @@ apiVersion: v2 -name: ipex-fintune-service +name: ipex_llm-fintune-service description: A Helm chart for IPEX-LLM Finetune Service on Kubernetes type: application version: 1.1.27 diff --git a/docker/llm/serving/cpu/docker/README.md b/docker/llm/serving/cpu/docker/README.md index ec1b011d..f56b93a4 100644 --- a/docker/llm/serving/cpu/docker/README.md +++ b/docker/llm/serving/cpu/docker/README.md @@ -30,7 +30,7 @@ sudo docker run -itd \ After the container is booted, you could get into the container through `docker exec`. 
-To run model-serving using `IPEX-LLM` as backend, you can refer to this [document](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/src/ipex/llm/serving). +To run model-serving using `IPEX-LLM` as the backend, you can refer to this [document](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/src/ipex_llm/serving/fastchat). Also you can set environment variables and start arguments while running a container to get serving started initially. You may need to boot several containers to support. One controller container and at least one worker container are needed. The api server address(host and port) and controller address are set in controller container, and you need to set the same controller address as above, model path on your machine and worker address in worker container. To start a controller container: diff --git a/docker/llm/serving/cpu/kubernetes/README.md b/docker/llm/serving/cpu/kubernetes/README.md index f8d745d3..5a337b45 100644 --- a/docker/llm/serving/cpu/kubernetes/README.md +++ b/docker/llm/serving/cpu/kubernetes/README.md @@ -10,7 +10,7 @@ To deploy IPEX-LLM-serving cpu in Kubernetes environment, please use this image: In this document, we will use `vicuna-7b-v1.5` as the deployment model. -After downloading the model, please change name from `vicuna-7b-v1.5` to `vicuna-7b-v1.5-ipex` to use `ipex-llm` as the backend. The `ipex-llm` backend will be used if model path contains `ipex-llm`. Otherwise, the original transformer-backend will be used. +After downloading the model, please change the name from `vicuna-7b-v1.5` to `vicuna-7b-v1.5-ipex-llm` to use `ipex-llm` as the backend. The `ipex-llm` backend will be used if the model path contains `ipex-llm`. Otherwise, the original transformer backend will be used. You can download the model from [here](https://huggingface.co/lmsys/vicuna-7b-v1.5). diff --git a/python/llm/example/CPU/Deepspeed-AutoTP/deepspeed_autotp.py b/python/llm/example/CPU/Deepspeed-AutoTP/deepspeed_autotp.py index d42f3887..63f5660a 100644 --- a/python/llm/example/CPU/Deepspeed-AutoTP/deepspeed_autotp.py +++ b/python/llm/example/CPU/Deepspeed-AutoTP/deepspeed_autotp.py @@ -102,7 +102,7 @@ if __name__ == '__main__': # Batch tokenizing prompt = args.prompt input_ids = tokenizer.encode(prompt, return_tensors="pt").to(f'cpu:{local_rank}') - # ipex model needs a warmup, then inference time can be accurate + # ipex-llm model needs a warmup, then inference time can be accurate output = model.generate(input_ids, max_new_tokens=args.n_predict, use_cache=True) diff --git a/python/llm/example/CPU/LangChain/README.md b/python/llm/example/CPU/LangChain/README.md index 104440e4..0d5be536 100644 --- a/python/llm/example/CPU/LangChain/README.md +++ b/python/llm/example/CPU/LangChain/README.md @@ -1,8 +1,8 @@ ## Langchain Examples -This folder contains examples showcasing how to use `langchain` with `ipex`. +This folder contains examples showcasing how to use `langchain` with `ipex-llm`. -### Install IPEX +### Install IPEX-LLM Ensure `ipex-llm` is installed by following the [IPEX-LLM Installation Guide](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm#install). @@ -36,7 +36,7 @@ To run the example, execute the following command in the current directory: ```bash python transformers_int4/rag.py -m [-q ] [-i ] ``` -> Note: If `-i` is not specified, it will use a short introduction to Big-DL as input by default. if `-q` is not specified, `What is IPEX?` will be used by default.
+> Note: If `-i` is not specified, it will use a short introduction to Big-DL as input by default. if `-q` is not specified, `What is IPEX LLM?` will be used by default. ### Example: Math @@ -66,3 +66,8 @@ python transformers_int4/voiceassistant.py -m [-q training.log diff --git a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_13b_pvc_1550_1_tile.sh b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_13b_pvc_1550_1_tile.sh index 5a5f8c1d..d017f2da 100644 --- a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_13b_pvc_1550_1_tile.sh +++ b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_13b_pvc_1550_1_tile.sh @@ -18,6 +18,6 @@ python ./alpaca_qlora_finetuning.py \ --base_model "meta-llama/Llama-2-13b-hf" \ --data_path "yahma/alpaca-cleaned" \ - --output_dir "./ipex-qlora-alpaca" \ + --output_dir "./ipex-llm-qlora-alpaca" \ --micro_batch_size 8 \ --batch_size 128 diff --git a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_13b_pvc_1550_4_card.sh b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_13b_pvc_1550_4_card.sh index f03716d2..78af398b 100644 --- a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_13b_pvc_1550_4_card.sh +++ b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_13b_pvc_1550_4_card.sh @@ -23,6 +23,6 @@ mpirun -n 8 \ python -u ./alpaca_qlora_finetuning.py \ --base_model "meta-llama/Llama-2-13b-hf" \ --data_path "yahma/alpaca-cleaned" \ - --output_dir "./ipex-qlora-alpaca" \ + --output_dir "./ipex-llm-qlora-alpaca" \ --micro_batch_size 8 \ --batch_size 128 > training.log diff --git a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_70b_pvc_1550_1_card.sh b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_70b_pvc_1550_1_card.sh index 9f7bf380..ee6958cf 100644 --- a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_70b_pvc_1550_1_card.sh +++ b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_70b_pvc_1550_1_card.sh @@ -27,7 +27,7 @@ mpirun -n 2 \ python -u ./alpaca_qlora_finetuning.py \ --base_model "meta-llama/Llama-2-70b-hf" \ --data_path "yahma/alpaca-cleaned" \ - --output_dir "./ipex-qlora-alpaca" \ + --output_dir "./ipex-llm-qlora-alpaca" \ --gradient_checkpointing True \ --micro_batch_size 8 \ --batch_size 128 \ diff --git a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_70b_pvc_1550_4_card.sh b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_70b_pvc_1550_4_card.sh index 9dead743..baa98b9e 100644 --- a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_70b_pvc_1550_4_card.sh +++ b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_70b_pvc_1550_4_card.sh @@ -27,7 +27,7 @@ mpirun -n 8 \ python -u ./alpaca_qlora_finetuning.py \ --base_model "meta-llama/Llama-2-70b-hf" \ --data_path "yahma/alpaca-cleaned" \ - --output_dir "./ipex-qlora-alpaca" \ + --output_dir "./ipex-llm-qlora-alpaca" \ --gradient_checkpointing True \ --micro_batch_size 8 \ --batch_size 128 \ diff --git a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_arc_1_card.sh b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_arc_1_card.sh index 12056c20..5066645b 100644 --- 
a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_arc_1_card.sh +++ b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_arc_1_card.sh @@ -18,4 +18,4 @@ python ./alpaca_qlora_finetuning.py \ --base_model "meta-llama/Llama-2-7b-hf" \ --data_path "yahma/alpaca-cleaned" \ - --output_dir "./ipex-qlora-alpaca" + --output_dir "./ipex-llm-qlora-alpaca" diff --git a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_arc_2_card.sh b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_arc_2_card.sh index cb10a142..b8431eb6 100644 --- a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_arc_2_card.sh +++ b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_arc_2_card.sh @@ -23,4 +23,4 @@ mpirun -n 2 \ python -u ./alpaca_qlora_finetuning.py \ --base_model "meta-llama/Llama-2-7b-hf" \ --data_path "yahma/alpaca-cleaned" \ - --output_dir "./ipex-qlora-alpaca" > training.log + --output_dir "./ipex-llm-qlora-alpaca" > training.log diff --git a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_flex_170_1_card.sh b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_flex_170_1_card.sh index 316d4cc5..3c61b6a5 100644 --- a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_flex_170_1_card.sh +++ b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_flex_170_1_card.sh @@ -20,4 +20,4 @@ python ./alpaca_qlora_finetuning.py \ --batch_size 128 \ --base_model "meta-llama/Llama-2-7b-hf" \ --data_path "yahma/alpaca-cleaned" \ - --output_dir "./ipex-qlora-alpaca" + --output_dir "./ipex-llm-qlora-alpaca" diff --git a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_flex_170_3_card.sh b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_flex_170_3_card.sh index bc9b4dcf..4b911a9f 100644 --- a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_flex_170_3_card.sh +++ b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_flex_170_3_card.sh @@ -23,7 +23,7 @@ mpirun -n 3 \ python -u ./alpaca_qlora_finetuning.py \ --base_model "meta-llama/Llama-2-7b-hf" \ --data_path "yahma/alpaca-cleaned" \ - --output_dir "./ipex-qlora-alpaca" \ + --output_dir "./ipex-llm-qlora-alpaca" \ --gradient_checkpointing False \ --micro_batch_size 2 \ --batch_size 128 > training.log diff --git a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_pvc_1100_1_card.sh b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_pvc_1100_1_card.sh index 52e1a304..7deeed6a 100644 --- a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_pvc_1100_1_card.sh +++ b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_pvc_1100_1_card.sh @@ -20,4 +20,4 @@ python ./alpaca_qlora_finetuning.py \ --batch_size 128 \ --base_model "meta-llama/Llama-2-7b-hf" \ --data_path "yahma/alpaca-cleaned" \ - --output_dir "./ipex-qlora-alpaca" + --output_dir "./ipex-llm-qlora-alpaca" diff --git a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_pvc_1100_4_card.sh b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_pvc_1100_4_card.sh index 8ee4dbbd..4f465b56 100644 --- 
a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_pvc_1100_4_card.sh +++ b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_pvc_1100_4_card.sh @@ -23,6 +23,6 @@ mpirun -n 4 \ python -u ./alpaca_qlora_finetuning.py \ --base_model "meta-llama/Llama-2-7b-hf" \ --data_path "yahma/alpaca-cleaned" \ - --output_dir "./ipex-qlora-alpaca" \ + --output_dir "./ipex-llm-qlora-alpaca" \ --micro_batch_size 8 \ --batch_size 128 > training.log diff --git a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_pvc_1550_1_card.sh b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_pvc_1550_1_card.sh index 272870e9..89baa62d 100644 --- a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_pvc_1550_1_card.sh +++ b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_pvc_1550_1_card.sh @@ -23,6 +23,6 @@ mpirun -n 2 \ python -u ./alpaca_qlora_finetuning.py \ --base_model "meta-llama/Llama-2-7b-hf" \ --data_path "yahma/alpaca-cleaned" \ - --output_dir "./ipex-qlora-alpaca" \ + --output_dir "./ipex-llm-qlora-alpaca" \ --micro_batch_size 8 \ --batch_size 128 > training.log diff --git a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_pvc_1550_4_card.sh b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_pvc_1550_4_card.sh index 801d88fc..16479a8c 100644 --- a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_pvc_1550_4_card.sh +++ b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_pvc_1550_4_card.sh @@ -23,6 +23,6 @@ mpirun -n 8 \ python -u ./alpaca_qlora_finetuning.py \ --base_model "meta-llama/Llama-2-7b-hf" \ --data_path "yahma/alpaca-cleaned" \ - --output_dir "./ipex-qlora-alpaca" \ + --output_dir "./ipex-llm-qlora-alpaca" \ --micro_batch_size 8 \ --batch_size 128 > training.log diff --git a/python/llm/example/GPU/LLM-Finetuning/ReLora/README.md b/python/llm/example/GPU/LLM-Finetuning/ReLora/README.md index 4a03e162..3218948b 100644 --- a/python/llm/example/GPU/LLM-Finetuning/ReLora/README.md +++ b/python/llm/example/GPU/LLM-Finetuning/ReLora/README.md @@ -58,8 +58,8 @@ bash relora_finetune_llama2_7b_pvc_1550_4_card.sh python ./alpaca_relora_finetuning.py \ --base_model "meta-llama/Llama-2-7b-hf" \ --data_path "yahma/alpaca-cleaned" \ - --output_dir "./ipex-qlora-alpaca" \ - --resume_from_checkpoint "./ipex-qlora-alpaca/checkpoint-1100" + --output_dir "./ipex-llm-qlora-alpaca" \ + --resume_from_checkpoint "./ipex-llm-qlora-alpaca/checkpoint-1100" ``` ### 5. 
Sample Output diff --git a/python/llm/example/GPU/LLM-Finetuning/ReLora/relora_finetune_llama2_7b_arc_1_card.sh b/python/llm/example/GPU/LLM-Finetuning/ReLora/relora_finetune_llama2_7b_arc_1_card.sh index 4bb00965..dee521e3 100644 --- a/python/llm/example/GPU/LLM-Finetuning/ReLora/relora_finetune_llama2_7b_arc_1_card.sh +++ b/python/llm/example/GPU/LLM-Finetuning/ReLora/relora_finetune_llama2_7b_arc_1_card.sh @@ -18,6 +18,6 @@ python ./alpaca_relora_finetuning.py \ --base_model "meta-llama/Llama-2-7b-hf" \ --data_path "yahma/alpaca-cleaned" \ - --output_dir "./ipex-relora-alpaca" \ + --output_dir "./ipex-llm-relora-alpaca" \ --relora_steps 300 \ --relora_warmup_steps 10 diff --git a/python/llm/example/GPU/LLM-Finetuning/ReLora/relora_finetune_llama2_7b_arc_2_card.sh b/python/llm/example/GPU/LLM-Finetuning/ReLora/relora_finetune_llama2_7b_arc_2_card.sh index 3d66fd41..28bce7be 100644 --- a/python/llm/example/GPU/LLM-Finetuning/ReLora/relora_finetune_llama2_7b_arc_2_card.sh +++ b/python/llm/example/GPU/LLM-Finetuning/ReLora/relora_finetune_llama2_7b_arc_2_card.sh @@ -23,6 +23,6 @@ mpirun -n 2 \ python -u ./alpaca_relora_finetuning.py \ --base_model "meta-llama/Llama-2-7b-hf" \ --data_path "yahma/alpaca-cleaned" \ - --output_dir "./ipex-relora-alpaca" \ + --output_dir "./ipex-llm-relora-alpaca" \ --relora_steps 300 \ --relora_warmup_steps 10 > training.log diff --git a/python/llm/example/GPU/LLM-Finetuning/ReLora/relora_finetune_llama2_7b_pvc_1550_1_card.sh b/python/llm/example/GPU/LLM-Finetuning/ReLora/relora_finetune_llama2_7b_pvc_1550_1_card.sh index 042c25d5..3b2fbb38 100644 --- a/python/llm/example/GPU/LLM-Finetuning/ReLora/relora_finetune_llama2_7b_pvc_1550_1_card.sh +++ b/python/llm/example/GPU/LLM-Finetuning/ReLora/relora_finetune_llama2_7b_pvc_1550_1_card.sh @@ -23,7 +23,7 @@ mpirun -n 2 \ python -u ./alpaca_relora_finetuning.py \ --base_model "meta-llama/Llama-2-7b-hf" \ --data_path "yahma/alpaca-cleaned" \ - --output_dir "./ipex-relora-alpaca" \ + --output_dir "./ipex-llm-relora-alpaca" \ --micro_batch_size 8 \ --relora_steps 300 \ --relora_warmup_steps 10 \ diff --git a/python/llm/example/GPU/LLM-Finetuning/ReLora/relora_finetune_llama2_7b_pvc_1550_4_card.sh b/python/llm/example/GPU/LLM-Finetuning/ReLora/relora_finetune_llama2_7b_pvc_1550_4_card.sh index c2f12c90..9c152a41 100644 --- a/python/llm/example/GPU/LLM-Finetuning/ReLora/relora_finetune_llama2_7b_pvc_1550_4_card.sh +++ b/python/llm/example/GPU/LLM-Finetuning/ReLora/relora_finetune_llama2_7b_pvc_1550_4_card.sh @@ -23,7 +23,7 @@ mpirun -n 8 \ python -u ./alpaca_relora_finetuning.py \ --base_model "meta-llama/Llama-2-7b-hf" \ --data_path "yahma/alpaca-cleaned" \ - --output_dir "./ipex-relora-alpaca" \ + --output_dir "./ipex-llm-relora-alpaca" \ --micro_batch_size 8 \ --relora_steps 300 \ --relora_warmup_steps 10 \ diff --git a/python/llm/example/GPU/ModelScope-Models/Save-Load/generate.py b/python/llm/example/GPU/ModelScope-Models/Save-Load/generate.py index 8c7070ba..ee9e7193 100644 --- a/python/llm/example/GPU/ModelScope-Models/Save-Load/generate.py +++ b/python/llm/example/GPU/ModelScope-Models/Save-Load/generate.py @@ -62,7 +62,7 @@ if __name__ == '__main__': with torch.inference_mode(): prompt = BAICHUAN_PROMPT_FORMAT.format(prompt=args.prompt) input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') - # ipex model needs a warmup, then inference time can be accurate + # ipex_llm model needs a warmup, then inference time can be accurate output = model.generate(input_ids, max_new_tokens=args.n_predict) diff 
--git a/python/llm/example/GPU/ModelScope-Models/generate.py b/python/llm/example/GPU/ModelScope-Models/generate.py index b4fd6637..0df692e1 100644 --- a/python/llm/example/GPU/ModelScope-Models/generate.py +++ b/python/llm/example/GPU/ModelScope-Models/generate.py @@ -60,7 +60,7 @@ if __name__ == '__main__': with torch.inference_mode(): prompt = CHATGLM_V3_PROMPT_FORMAT.format(prompt=args.prompt) input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') - # ipex model needs a warmup, then inference time can be accurate + # ipex_llm model needs a warmup, then inference time can be accurate output = model.generate(input_ids, max_new_tokens=args.n_predict) diff --git a/python/llm/example/GPU/Pipeline-Parallel-Inference/generate.py b/python/llm/example/GPU/Pipeline-Parallel-Inference/generate.py index cc6c3c48..f9454e78 100644 --- a/python/llm/example/GPU/Pipeline-Parallel-Inference/generate.py +++ b/python/llm/example/GPU/Pipeline-Parallel-Inference/generate.py @@ -90,7 +90,7 @@ if __name__ == '__main__': with torch.inference_mode(): prompt = get_prompt(args.prompt, [], system_prompt=DEFAULT_SYSTEM_PROMPT) input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu:0') - # ipex model needs a warmup, then inference time can be accurate + # ipex_llm model needs a warmup, then inference time can be accurate output = model.generate(input_ids, max_new_tokens=args.n_predict) output = model.generate(input_ids, diff --git a/python/llm/example/GPU/PyTorch-Models/Model/aquila2/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/aquila2/generate.py index 98491948..6258dd2e 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/aquila2/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/aquila2/generate.py @@ -58,7 +58,7 @@ if __name__ == '__main__': with torch.inference_mode(): prompt = AQUILA2_PROMPT_FORMAT.format(prompt=args.prompt) input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') - # ipex model needs a warmup, then inference time can be accurate + # ipex_llm model needs a warmup, then inference time can be accurate output = model.generate(input_ids, max_new_tokens=args.n_predict) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/baichuan/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/baichuan/generate.py index 52f8adf0..9fcddbf1 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/baichuan/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/baichuan/generate.py @@ -57,7 +57,7 @@ if __name__ == '__main__': with torch.inference_mode(): prompt = BAICHUAN_PROMPT_FORMAT.format(prompt=args.prompt) input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') - # ipex model needs a warmup, then inference time can be accurate + # ipex_llm model needs a warmup, then inference time can be accurate output = model.generate(input_ids, max_new_tokens=args.n_predict) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/baichuan2/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/baichuan2/generate.py index 215370b4..9ebb56be 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/baichuan2/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/baichuan2/generate.py @@ -59,7 +59,7 @@ if __name__ == '__main__': with torch.inference_mode(): prompt = BAICHUAN2_PROMPT_FORMAT.format(prompt=args.prompt) input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') - # ipex model needs a warmup, then inference time can be accurate + # ipex_llm model needs a warmup, then inference time can be accurate output 
= model.generate(input_ids, max_new_tokens=args.n_predict) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/bark/synthesize_speech.py b/python/llm/example/GPU/PyTorch-Models/Model/bark/synthesize_speech.py index 1e830107..9bff3517 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/bark/synthesize_speech.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/bark/synthesize_speech.py @@ -52,7 +52,7 @@ if __name__ == '__main__': inputs = processor(text, voice_preset=voice_preset).to('xpu') with torch.inference_mode(): - # ipex model needs a warmup, then inference time can be accurate + # ipex_llm model needs a warmup, then inference time can be accurate audio_array = model.generate(**inputs) st = time.time() diff --git a/python/llm/example/GPU/PyTorch-Models/Model/bluelm/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/bluelm/generate.py index ac6e0842..cdb9567f 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/bluelm/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/bluelm/generate.py @@ -57,7 +57,7 @@ if __name__ == '__main__': with torch.inference_mode(): prompt = BLUELM_PROMPT_FORMAT.format(prompt=args.prompt) input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') - # ipex model needs a warmup, then inference time can be accurate + # ipex_llm model needs a warmup, then inference time can be accurate output = model.generate(input_ids, max_new_tokens=args.n_predict) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/chatglm2/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/chatglm2/generate.py index 71b6ceea..f7cc3938 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/chatglm2/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/chatglm2/generate.py @@ -58,7 +58,7 @@ if __name__ == '__main__': with torch.inference_mode(): prompt = CHATGLM_V2_PROMPT_FORMAT.format(prompt=args.prompt) input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') - # ipex model needs a warmup, then inference time can be accurate + # ipex_llm model needs a warmup, then inference time can be accurate output = model.generate(input_ids, max_new_tokens=args.n_predict) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/chatglm2/streamchat.py b/python/llm/example/GPU/PyTorch-Models/Model/chatglm2/streamchat.py index 1e860e80..4c9dbe77 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/chatglm2/streamchat.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/chatglm2/streamchat.py @@ -57,7 +57,7 @@ if __name__ == '__main__': with torch.inference_mode(): prompt = args.question input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') - # ipex model needs a warmup, then inference time can be accurate + # ipex_llm model needs a warmup, then inference time can be accurate output = model.generate(input_ids, max_new_tokens=32) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/chatglm3/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/chatglm3/generate.py index 1568e085..ab6ad290 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/chatglm3/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/chatglm3/generate.py @@ -58,7 +58,7 @@ if __name__ == '__main__': with torch.inference_mode(): prompt = CHATGLM_V3_PROMPT_FORMAT.format(prompt=args.prompt) input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') - # ipex model needs a warmup, then inference time can be accurate + # ipex_llm model needs a warmup, then inference time can be accurate output = model.generate(input_ids, 
max_new_tokens=args.n_predict) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/chatglm3/streamchat.py b/python/llm/example/GPU/PyTorch-Models/Model/chatglm3/streamchat.py index 20f8b33c..89c93edb 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/chatglm3/streamchat.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/chatglm3/streamchat.py @@ -57,7 +57,7 @@ if __name__ == '__main__': with torch.inference_mode(): prompt = args.question input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') - # ipex model needs a warmup, then inference time can be accurate + # ipex_llm model needs a warmup, then inference time can be accurate output = model.generate(input_ids, max_new_tokens=32) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/codellama/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/codellama/generate.py index 9d09c857..a9eaaace 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/codellama/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/codellama/generate.py @@ -59,7 +59,7 @@ if __name__ == '__main__': with torch.inference_mode(): prompt = CODELLAMA_PROMPT_FORMAT.format(prompt=args.prompt) input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') - # ipex model needs a warmup, then inference time can be accurate + # ipex_llm model needs a warmup, then inference time can be accurate output = model.generate(input_ids, max_new_tokens=args.n_predict) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/dolly-v1/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/dolly-v1/generate.py index a084b615..9168b46a 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/dolly-v1/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/dolly-v1/generate.py @@ -65,7 +65,7 @@ if __name__ == '__main__': prompt = DOLLY_V1_PROMPT_FORMAT.format(prompt=args.prompt) input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') end_key_token_id=tokenizer.encode("### End")[0] - # ipex model needs a warmup, then inference time can be accurate + # ipex_llm model needs a warmup, then inference time can be accurate output = model.generate(input_ids, use_cache=True, max_new_tokens=args.n_predict, diff --git a/python/llm/example/GPU/PyTorch-Models/Model/dolly-v2/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/dolly-v2/generate.py index 9445f406..82787ec9 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/dolly-v2/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/dolly-v2/generate.py @@ -65,7 +65,7 @@ if __name__ == '__main__': prompt = DOLLY_V2_PROMPT_FORMAT.format(prompt=args.prompt) input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') end_key_token_id=tokenizer.encode("### End")[0] - # ipex model needs a warmup, then inference time can be accurate + # ipex_llm model needs a warmup, then inference time can be accurate output = model.generate(input_ids, max_new_tokens=args.n_predict, pad_token_id=tokenizer.pad_token_id, diff --git a/python/llm/example/GPU/PyTorch-Models/Model/flan-t5/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/flan-t5/generate.py index 11eedd25..8c216ec4 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/flan-t5/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/flan-t5/generate.py @@ -60,7 +60,7 @@ if __name__ == '__main__': with torch.inference_mode(): prompt = FLAN_T5_PROMPT_FORMAT.format(prompt=args.prompt) input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') - # ipex model needs a warmup, then inference time can 
be accurate + # ipex_llm model needs a warmup, then inference time can be accurate output = model.generate(input_ids, max_new_tokens=args.n_predict) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/internlm2/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/internlm2/generate.py index 799bb62c..2c9ffa6c 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/internlm2/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/internlm2/generate.py @@ -57,7 +57,7 @@ if __name__ == '__main__': with torch.inference_mode(): prompt = INTERNLM_PROMPT_FORMAT.format(prompt=args.prompt) input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') - # ipex model needs a warmup, then inference time can be accurate + # ipex_llm model needs a warmup, then inference time can be accurate output = model.generate(input_ids, max_new_tokens=args.n_predict) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/llama2/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/llama2/generate.py index 3fe07715..4add8dfb 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/llama2/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/llama2/generate.py @@ -62,7 +62,7 @@ if __name__ == '__main__': with torch.inference_mode(): prompt = LLAMA2_PROMPT_FORMAT.format(prompt=args.prompt) input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') - # ipex model needs a warmup, then inference time can be accurate + # ipex_llm model needs a warmup, then inference time can be accurate output = model.generate(input_ids, max_new_tokens=args.n_predict) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/mamba/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/mamba/generate.py index e1b392b2..ac8a8dd9 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/mamba/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/mamba/generate.py @@ -54,7 +54,7 @@ if __name__ == '__main__': # Generate predicted tokens with torch.inference_mode(): input_ids = tokenizer.encode(args.prompt, return_tensors="pt").to('xpu') - # ipex model needs a warmup, then inference time can be accurate + # ipex_llm model needs a warmup, then inference time can be accurate output = model.generate(input_ids, max_new_tokens=args.n_predict) st = time.time() diff --git a/python/llm/example/GPU/PyTorch-Models/Model/mistral/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/mistral/generate.py index 459d23e5..80e1fc52 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/mistral/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/mistral/generate.py @@ -58,7 +58,7 @@ if __name__ == '__main__': with torch.inference_mode(): prompt = MISTRAL_PROMPT_FORMAT.format(prompt=args.prompt) input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') - # ipex model needs a warmup, then inference time can be accurate + # ipex_llm model needs a warmup, then inference time can be accurate output = model.generate(input_ids, max_new_tokens=args.n_predict) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/mixtral/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/mixtral/generate.py index ae8f0a97..ec8c3711 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/mixtral/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/mixtral/generate.py @@ -58,7 +58,7 @@ if __name__ == '__main__': with torch.inference_mode(): prompt = MIXTRAL_PROMPT_FORMAT.format(prompt=args.prompt) input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') - # ipex model needs a 
warmup, then inference time can be accurate + # ipex_llm model needs a warmup, then inference time can be accurate output = model.generate(input_ids, max_new_tokens=args.n_predict) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/phi-1_5/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/phi-1_5/generate.py index 827b59bd..fbdef847 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/phi-1_5/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/phi-1_5/generate.py @@ -56,7 +56,7 @@ if __name__ == '__main__': prompt = PHI_1_5_V1_PROMPT_FORMAT.format(prompt=args.prompt) input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') - # ipex model needs a warmup, then inference time can be accurate + # ipex_llm model needs a warmup, then inference time can be accurate output = model.generate(input_ids, do_sample=False, max_new_tokens=args.n_predict, generation_config = generation_config) # start inference st = time.time() diff --git a/python/llm/example/GPU/PyTorch-Models/Model/phi-2/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/phi-2/generate.py index 3c629f84..ca7499bd 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/phi-2/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/phi-2/generate.py @@ -57,7 +57,7 @@ if __name__ == '__main__': input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') model.generation_config.pad_token_id = model.generation_config.eos_token_id - # ipex model needs a warmup, then inference time can be accurate + # ipex_llm model needs a warmup, then inference time can be accurate output = model.generate(input_ids, do_sample=False, max_new_tokens=args.n_predict, generation_config = generation_config) # start inference st = time.time() diff --git a/python/llm/example/GPU/PyTorch-Models/Model/phixtral/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/phixtral/generate.py index 8287a37b..d1e7e7fa 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/phixtral/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/phixtral/generate.py @@ -61,7 +61,7 @@ if __name__ == '__main__': prompt = PHI1_5_PROMPT_FORMAT.format(prompt=args.prompt) input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') - # ipex model needs a warmup, then inference time can be accurate + # ipex_llm model needs a warmup, then inference time can be accurate output = model.generate(input_ids, max_new_tokens=args.n_predict, generation_config = generation_config) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/replit/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/replit/generate.py index 73eecb01..3edd21b7 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/replit/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/replit/generate.py @@ -57,7 +57,7 @@ if __name__ == '__main__': with torch.inference_mode(): prompt = REPLIT_PROMPT_FORMAT.format(prompt=args.prompt) input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') - # ipex model needs a warmup, then inference time can be accurate + # ipex_llm model needs a warmup, then inference time can be accurate output = model.generate(input_ids, max_new_tokens=args.n_predict) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/speech-t5/synthesize_speech.py b/python/llm/example/GPU/PyTorch-Models/Model/speech-t5/synthesize_speech.py index 9776a039..58e57c8d 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/speech-t5/synthesize_speech.py +++ 
b/python/llm/example/GPU/PyTorch-Models/Model/speech-t5/synthesize_speech.py @@ -89,7 +89,7 @@ if __name__ == '__main__': speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0).to('xpu') with torch.inference_mode(): - # ipex model needs a warmup, then inference time can be accurate + # ipex_llm model needs a warmup, then inference time can be accurate speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder) st = time.time() diff --git a/python/llm/example/GPU/PyTorch-Models/Model/starcoder/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/starcoder/generate.py index 380d63c3..1092fe43 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/starcoder/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/starcoder/generate.py @@ -57,7 +57,7 @@ if __name__ == '__main__': with torch.inference_mode(): prompt = STARCODER_PROMPT_FORMAT.format(prompt=args.prompt) input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') - # ipex model needs a warmup, then inference time can be accurate + # ipex_llm model needs a warmup, then inference time can be accurate output = model.generate(input_ids, max_new_tokens=args.n_predict) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/yi/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/yi/generate.py index d08d6087..61fe372e 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/yi/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/yi/generate.py @@ -55,7 +55,7 @@ if __name__ == '__main__': with torch.inference_mode(): prompt = YI_PROMPT_FORMAT.format(prompt=args.prompt) input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') - # ipex model needs a warmup, then inference time can be accurate + # ipex_llm model needs a warmup, then inference time can be accurate output = model.generate(input_ids, max_new_tokens=args.n_predict) diff --git a/python/llm/example/GPU/PyTorch-Models/More-Data-Types/generate.py b/python/llm/example/GPU/PyTorch-Models/More-Data-Types/generate.py index 6caec894..d3a94b07 100644 --- a/python/llm/example/GPU/PyTorch-Models/More-Data-Types/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/More-Data-Types/generate.py @@ -62,7 +62,7 @@ if __name__ == '__main__': with torch.inference_mode(): prompt = LLAMA2_PROMPT_FORMAT.format(prompt=args.prompt) input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') - # ipex model needs a warmup, then inference time can be accurate + # ipex_llm model needs a warmup, then inference time can be accurate output = model.generate(input_ids, max_new_tokens=args.n_predict) diff --git a/python/llm/example/GPU/PyTorch-Models/Save-Load/generate.py b/python/llm/example/GPU/PyTorch-Models/Save-Load/generate.py index 9a289568..fe9fff65 100644 --- a/python/llm/example/GPU/PyTorch-Models/Save-Load/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Save-Load/generate.py @@ -73,7 +73,7 @@ if __name__ == '__main__': with torch.inference_mode(): prompt = LLAMA2_PROMPT_FORMAT.format(prompt=args.prompt) input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') - # ipex model needs a warmup, then inference time can be accurate + # ipex_llm model needs a warmup, then inference time can be accurate output = model.generate(input_ids, max_new_tokens=args.n_predict) diff --git a/python/llm/src/ipex_llm/serving/fastchat/README.md b/python/llm/src/ipex_llm/serving/fastchat/README.md index 4f741792..ec905219 100644 --- a/python/llm/src/ipex_llm/serving/fastchat/README.md +++ 
b/python/llm/src/ipex_llm/serving/fastchat/README.md @@ -52,16 +52,14 @@ python3 -m fastchat.serve.controller Using IPEX-LLM in FastChat does not impose any new limitations on model usage. Therefore, all Hugging Face Transformer models can be utilized in FastChat. #### IPEX-LLM model worker (deprecated) -
-<details> > Warning: This method has been deprecated, please change to use `IPEX-LLM` [worker](#ipex-llm-worker) instead. FastChat determines the Model adapter to use through path matching. Therefore, in order to load models using IPEX-LLM, you need to make some modifications to the model's name. -For instance, assuming you have downloaded the `llama-7b-hf` from [HuggingFace](https://huggingface.co/decapoda-research/llama-7b-hf). Then, to use the `IPEX-LLM` as backend, you need to change name from `llama-7b-hf` to `ipex-7b`.The key point here is that the model's path should include "ipex" and **should not include paths matched by other model adapters**. +For instance, assume you have downloaded `llama-7b-hf` from [HuggingFace](https://huggingface.co/decapoda-research/llama-7b-hf). Then, to use `IPEX-LLM` as the backend, you need to change the name from `llama-7b-hf` to `ipex-llm-7b`. The key point here is that the model's path should include "ipex" and **should not include paths matched by other model adapters**. -Then we will use `ipex-7b` as model-path. +Then we will use `ipex-llm-7b` as the model path. > note: This is caused by the priority of name matching list. The new added `IPEX-LLM` adapter is at the tail of the name-matching list so that it has the lowest priority. If model path contains other keywords like `vicuna` which matches to another adapter with higher priority, then the `IPEX-LLM` adapter will not work. @@ -71,13 +69,13 @@ Then we can run model workers ```bash # On CPU -python3 -m ipex_llm.serving.fastchat.model_worker --model-path PATH/TO/ipex-7b --device cpu +python3 -m ipex_llm.serving.fastchat.model_worker --model-path PATH/TO/ipex-llm-7b --device cpu # On GPU -python3 -m ipex_llm.serving.fastchat.model_worker --model-path PATH/TO/ipex-7b --device xpu +python3 -m ipex_llm.serving.fastchat.model_worker --model-path PATH/TO/ipex-llm-7b --device xpu ``` -If you run successfully using `IPEX` backend, you can see the output in log like this: +If you run successfully using the `ipex_llm` backend, you will see output in the log like this: ```bash INFO - Converting the current model to sym_int4 format...... @@ -87,9 +85,11 @@ INFO - Converting the current model to sym_int4 format......
#### IPEX-LLM worker + To integrate IPEX-LLM with `FastChat` efficiently, we have provided a new model_worker implementation named `ipex_llm_worker.py`. To run the `ipex_llm_worker` on CPU, using the following code: + ```bash source ipex-llm-init -t @@ -97,8 +97,8 @@ source ipex-llm-init -t python3 -m ipex_llm.serving.fastchat.ipex_llm_worker --model-path lmsys/vicuna-7b-v1.5 --low-bit "sym_int4" --trust-remote-code --device "cpu" ``` - For GPU example: + ```bash # Available low_bit format including sym_int4, sym_int8, fp16 etc. python3 -m ipex_llm.serving.fastchat.ipex_llm_worker --model-path lmsys/vicuna-7b-v1.5 --low-bit "sym_int4" --trust-remote-code --device "xpu" diff --git a/python/llm/src/ipex_llm/transformers/convert.py b/python/llm/src/ipex_llm/transformers/convert.py index 820dbecc..4e568e7f 100644 --- a/python/llm/src/ipex_llm/transformers/convert.py +++ b/python/llm/src/ipex_llm/transformers/convert.py @@ -644,7 +644,7 @@ def ggml_convert_low_bit(model, qtype, optimize_model=True, f"format......") modules_to_not_convert = [] if modules_to_not_convert is None else modules_to_not_convert - # using ipex optimizer before changing to bigdl linear + # using ipex_llm optimizer before changing to bigdl linear _enable_ipex = get_enable_ipex() if _enable_ipex: diff --git a/python/llm/src/ipex_llm/transformers/low_bit_linear.py b/python/llm/src/ipex_llm/transformers/low_bit_linear.py index 35093106..d5e35d3e 100644 --- a/python/llm/src/ipex_llm/transformers/low_bit_linear.py +++ b/python/llm/src/ipex_llm/transformers/low_bit_linear.py @@ -85,7 +85,7 @@ Q2_K = ggml_tensor_qtype["q2_k"] # new_weight_tile is directly VNNI packed, but I did not find significant # performance improvement. # -# Note this format cannot be used directly in IPEX's mm_int4, which expects +# Note this format cannot be used directly in IPEX-LLM's mm_int4, which expects # row major but packing two consecutive columns. def q4_0_xpu_transpose(ggml_weight, weight_shape): from ipex_llm.transformers.low_bit_linear import get_block_size diff --git a/python/llm/src/ipex_llm/transformers/models/chatglm2.py b/python/llm/src/ipex_llm/transformers/models/chatglm2.py index 3308e93f..31a86245 100644 --- a/python/llm/src/ipex_llm/transformers/models/chatglm2.py +++ b/python/llm/src/ipex_llm/transformers/models/chatglm2.py @@ -227,7 +227,7 @@ def chatglm2_quantized_attention_forward_8eb45c( key_layer = key_layer.transpose(0, 1) query_layer_cur = query_layer[..., :rot_dim] key_layer_cur = key_layer[..., :rot_dim] - # ipex's apply_rotary_embedding can change the origin storage, so query_layer will get + # ipex_llm's apply_rotary_embedding can change the origin storage, so query_layer will get # the result directly. torch.ops.torch_ipex.apply_rotary_embedding(query_layer_cur, sin, cos, query_layer_cur) torch.ops.torch_ipex.apply_rotary_embedding(key_layer_cur, sin, cos, key_layer_cur) @@ -367,7 +367,7 @@ def chatglm2_attention_forward_8eb45c( key_layer = key_layer.transpose(0, 1) query_layer_cur = query_layer[..., :rot_dim] key_layer_cur = key_layer[..., :rot_dim] - # ipex's apply_rotary_embedding can change the origin storage, so query_layer will get + # ipex_llm's apply_rotary_embedding can change the origin storage, so query_layer will get # the result directly. 
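    # Illustrative note (not part of the original patch): torch.ops.torch_ipex.apply_rotary_embedding
    # appears to take the tensor passed as its last argument as the output, and here query_layer_cur
    # is passed as both input and output, so RoPE is applied in place on that slice; since
    # query_layer_cur is a view of query_layer[..., :rot_dim], query_layer is updated directly
    # without any reassignment.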
torch.ops.torch_ipex.apply_rotary_embedding(query_layer_cur, sin, cos, query_layer_cur) torch.ops.torch_ipex.apply_rotary_embedding(key_layer_cur, sin, cos, key_layer_cur) diff --git a/python/llm/src/ipex_llm/transformers/speculative.py b/python/llm/src/ipex_llm/transformers/speculative.py index 1c781e66..2585890d 100644 --- a/python/llm/src/ipex_llm/transformers/speculative.py +++ b/python/llm/src/ipex_llm/transformers/speculative.py @@ -565,7 +565,7 @@ def speculative_generate(self, ("mistral" in self.config.model_type) or ("qwen" in self.config.model_type) or ("chatglm" in self.config.model_type)): - invalidInputError(False, "BigDL Speculative Decoding with IPEX only supports \ + invalidInputError(False, "BigDL Speculative Decoding with IPEX-LLM only supports \ Llama, Baichuan2, Mistral, ChatGLM and Qwen models currently.") if "chatglm" in self.config.model_type: global query_group_size @@ -726,7 +726,7 @@ def speculative_generate(self, past_key_values=draft_past_key_values, ) else: - invalidInputError(False, "BigDL Speculative Decoding with IPEX only supports \ + invalidInputError(False, "BigDL Speculative Decoding with IPEX-LLM only supports \ Llama, Baichuan2, Mistral, ChatGLM and Qwen models currently.") draft_output = CausalLMOutputWithPast(
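For reference, the warmup comment renamed throughout the example scripts above refers to the pattern sketched below. This is a minimal illustration rather than a copy of any single example; the model id, prompt, and token budget are placeholders, and it assumes `ipex-llm` is installed with its HuggingFace-`transformers`-style API (`ipex_llm.transformers.AutoModelForCausalLM` with `load_in_4bit=True`).

```python
# Minimal sketch of the warmup-then-measure pattern used across the IPEX-LLM examples.
# The first generate() call warms the model up (weight packing, kernel setup), so only
# the second call is timed.
import time

import torch
from transformers import AutoTokenizer
from ipex_llm.transformers import AutoModelForCausalLM

model_path = "meta-llama/Llama-2-7b-chat-hf"  # placeholder model path
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             load_in_4bit=True,
                                             trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

with torch.inference_mode():
    input_ids = tokenizer.encode("What is AI?", return_tensors="pt")
    # ipex-llm model needs a warmup, then inference time can be accurate
    model.generate(input_ids, max_new_tokens=32)

    st = time.time()
    output = model.generate(input_ids, max_new_tokens=32)
    end = time.time()
    print(f"Inference time: {end - st:.2f} s")
    print(tokenizer.decode(output[0], skip_special_tokens=True))
```

The GPU examples follow the same pattern after moving the model and `input_ids` to `'xpu'`, as shown in the diffs above.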