diff --git a/python/llm/dev/benchmark/all-in-one/run-deepspeed-arc.sh b/python/llm/dev/benchmark/all-in-one/run-deepspeed-arc.sh
index 42354272..9a24eedc 100644
--- a/python/llm/dev/benchmark/all-in-one/run-deepspeed-arc.sh
+++ b/python/llm/dev/benchmark/all-in-one/run-deepspeed-arc.sh
@@ -14,5 +14,5 @@ if grep -q "Core" /proc/cpuinfo; then
   export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
 fi
 export TORCH_LLM_ALLREDUCE=0 # Different from PVC
-
+export BIGDL_IMPORT_IPEX=0
 mpirun -np $NUM_GPUS --prepend-rank python run.py
diff --git a/python/llm/dev/benchmark/all-in-one/run-deepspeed-pvc.sh b/python/llm/dev/benchmark/all-in-one/run-deepspeed-pvc.sh
index 16d14831..a0998477 100644
--- a/python/llm/dev/benchmark/all-in-one/run-deepspeed-pvc.sh
+++ b/python/llm/dev/benchmark/all-in-one/run-deepspeed-pvc.sh
@@ -13,4 +13,5 @@ source $basekit_root/ccl/latest/env/vars.sh --force
 export OMP_NUM_THREADS=$((56/$NUM_GPUS))
 export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
 export TORCH_LLM_ALLREDUCE=1
+export BIGDL_IMPORT_IPEX=0
 mpirun -np $NUM_GPUS --prepend-rank python run.py
diff --git a/python/llm/example/GPU/Deepspeed-AutoTP-FastAPI/start-deepspeed-autotp-ipex-llm-serving.sh b/python/llm/example/GPU/Deepspeed-AutoTP-FastAPI/start-deepspeed-autotp-ipex-llm-serving.sh
index c3d3bd85..9e7e818c 100644
--- a/python/llm/example/GPU/Deepspeed-AutoTP-FastAPI/start-deepspeed-autotp-ipex-llm-serving.sh
+++ b/python/llm/example/GPU/Deepspeed-AutoTP-FastAPI/start-deepspeed-autotp-ipex-llm-serving.sh
@@ -31,6 +31,6 @@
 export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
 export TORCH_LLM_ALLREDUCE=0
 export WORLD_SIZE=2
-
+export BIGDL_IMPORT_IPEX=0
 mpirun -np $NUM_GPUS --prepend-rank \
   python serving.py --repo-id-or-model-path YOUR_REPO_ID_OR_MODEL_PATH --low-bit 'fp8' --port 8000 --max-num-seqs 8 --max-num-batched-tokens 8192
diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/run_llama2_70b_pvc_1550_1_card.sh b/python/llm/example/GPU/Deepspeed-AutoTP/run_llama2_70b_pvc_1550_1_card.sh
index 4e968541..37e53545 100644
--- a/python/llm/example/GPU/Deepspeed-AutoTP/run_llama2_70b_pvc_1550_1_card.sh
+++ b/python/llm/example/GPU/Deepspeed-AutoTP/run_llama2_70b_pvc_1550_1_card.sh
@@ -29,5 +29,6 @@ source $basekit_root/ccl/latest/env/vars.sh --force
 export OMP_NUM_THREADS=$((56/$NUM_GPUS))
 export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
 export TORCH_LLM_ALLREDUCE=1
+export BIGDL_IMPORT_IPEX=0
 mpirun -np $NUM_GPUS --prepend-rank \
   python deepspeed_autotp.py --repo-id-or-model-path 'meta-llama/Llama-2-70b-chat-hf' --low-bit 'sym_int4'
diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/run_mistral_7b_instruct_flex_2_card.sh b/python/llm/example/GPU/Deepspeed-AutoTP/run_mistral_7b_instruct_flex_2_card.sh
index ed471962..177ed5d0 100644
--- a/python/llm/example/GPU/Deepspeed-AutoTP/run_mistral_7b_instruct_flex_2_card.sh
+++ b/python/llm/example/GPU/Deepspeed-AutoTP/run_mistral_7b_instruct_flex_2_card.sh
@@ -28,6 +28,6 @@ NUM_GPUS=2 # number of used GPU
 export USE_XETLA=OFF
 export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
 export TORCH_LLM_ALLREDUCE=0 # Different from PVC
-
+export BIGDL_IMPORT_IPEX=0
 mpirun -np $NUM_GPUS --prepend-rank \
   python deepspeed_autotp.py --repo-id-or-model-path 'mistralai/Mistral-7B-Instruct-v0.1' --low-bit 'sym_int4'
diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/run_qwen_14b_arc_2_card.sh b/python/llm/example/GPU/Deepspeed-AutoTP/run_qwen_14b_arc_2_card.sh
index 0b45569b..6686d3ee 100644
--- a/python/llm/example/GPU/Deepspeed-AutoTP/run_qwen_14b_arc_2_card.sh
+++ b/python/llm/example/GPU/Deepspeed-AutoTP/run_qwen_14b_arc_2_card.sh
@@ -33,6 +33,6 @@ if grep -q "Core" /proc/cpuinfo; then
   export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
 fi
 export TORCH_LLM_ALLREDUCE=0 # Different from PVC
-
+export BIGDL_IMPORT_IPEX=0
 mpirun -np $NUM_GPUS --prepend-rank \
   python deepspeed_autotp.py --repo-id-or-model-path 'Qwen/Qwen1.5-14B-Chat' --low-bit 'sym_int4'
diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/run_vicuna_33b_arc_2_card.sh b/python/llm/example/GPU/Deepspeed-AutoTP/run_vicuna_33b_arc_2_card.sh
index 1e23668f..7cf50a5e 100644
--- a/python/llm/example/GPU/Deepspeed-AutoTP/run_vicuna_33b_arc_2_card.sh
+++ b/python/llm/example/GPU/Deepspeed-AutoTP/run_vicuna_33b_arc_2_card.sh
@@ -30,6 +30,6 @@ if grep -q "Core" /proc/cpuinfo; then
   export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
 fi
 export TORCH_LLM_ALLREDUCE=0 # Different from PVC
-
+export BIGDL_IMPORT_IPEX=0
 mpirun -np $NUM_GPUS --prepend-rank \
   python deepspeed_autotp.py --repo-id-or-model-path 'lmsys/vicuna-33b-v1.3' --low-bit 'sym_int4'