Update tests for transformers 4.36 (#10858)

* update unit test

* update

* update

* update

* update

* update

* fix gpu attention test

* update

* update

* update

* update

* update

* update

* update example test

* replace replit code

* update

* update

* update

* update

* set safe_serialization false

* perf test

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* delete

* update

* update

* update

* update

* update

* update

* revert

* update
Jiao Wang 2024-05-23 19:26:38 -07:00 committed by GitHub
parent 1291165720
commit 0a06a6e1d4
28 changed files with 90 additions and 409 deletions

View file

@@ -164,12 +164,6 @@ jobs:
 shell: bash
 run: |
 pip install --upgrade datasets==2.14.6
-if [ "${{ matrix.model_name }}" = "Mistral-7B-v0.1" ]; then
-pip install --upgrade transformers==4.36
-else
-pip install --upgrade transformers==4.31
-fi
 - name: Run harness
 shell: bash

View file

@@ -144,16 +144,11 @@ jobs:
 echo "MODEL_PATH=${ORIGIN_DIR}/${{ matrix.model_name }}/" >> "$GITHUB_ENV"
 MODEL_PATH=${ORIGIN_DIR}/${{ matrix.model_name }}/
 wget -r -nH -nc --no-verbose --cut-dirs=1 ${LLM_FTP_URL}/llm/${{ matrix.model_name }} -P ${ORIGIN_DIR}
 - name: Upgrade packages
 shell: bash
 run: |
 pip install --upgrade datasets==2.14.6
-if [ "${{ matrix.model_name }}" = "Mistral-7B-v0.1" ]; then
-pip install --upgrade transformers==4.36
-else
-pip install --upgrade transformers==4.31
-fi
 - name: Run perplexity
 shell: bash

View file

@@ -87,12 +87,11 @@ jobs:
 source /opt/intel/oneapi/setvars.sh
 bash python/llm/test/run-llm-install-tests.sh
-- name: Test on xpu(transformers==4.31.0)
+- name: Test on xpu(transformers==4.36.2)
 shell: bash
 run: |
 date_for_test_version=$(date -d yesterday +%Y-%m-%d)
 sed -i "s/date.today()/\"$date_for_test_version\"/g" python/llm/dev/benchmark/all-in-one/run.py
 source /opt/intel/oneapi/setvars.sh
 export USE_XETLA=OFF
 export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
@@ -104,20 +103,6 @@ jobs:
 sed -i 's/{today}/{today}_test1/g' run.py
 python run.py
-- name: Test on xpu(transformers==4.34.0)
-shell: bash
-run: |
-source /opt/intel/oneapi/setvars.sh
-export USE_XETLA=OFF
-export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
-# upgrade transformers for model Mistral-7B-v0.1
-python -m pip install transformers==4.34.0
-cp python/llm/test/benchmark/arc-perf-transformers-434.yaml python/llm/dev/benchmark/all-in-one/config.yaml
-cd python/llm/dev/benchmark/all-in-one
-# change csv name
-sed -i 's/test1/test2/g' run.py
-python run.py
 - name: Test on xpu(transformers==4.37.0)
 shell: bash
 run: |
@@ -129,7 +114,7 @@ jobs:
 cp python/llm/test/benchmark/arc-perf-transformers-437.yaml python/llm/dev/benchmark/all-in-one/config.yaml
 cd python/llm/dev/benchmark/all-in-one
 # change csv name
-sed -i 's/test2/test3/g' run.py
+sed -i 's/test1/test2/g' run.py
 python run.py
 - name: Concat csv and generate html
@@ -151,7 +136,7 @@ jobs:
 run: |
 cd python/llm/dev/benchmark/all-in-one
 python ../../../test/benchmark/check_results.py -c test1 -y ../../../test/benchmark/arc-perf-test.yaml
-python ../../../test/benchmark/check_results.py -c test2 -y ../../../test/benchmark/arc-perf-transformers-434.yaml
+python ../../../test/benchmark/check_results.py -c test2 -y ../../../test/benchmark/arc-perf-transformers-437.yaml
 find . -name "*test*.csv" -delete
 if [ ${{ github.event_name }} == "schedule" ] || [ ${{ github.event_name }} == "workflow_dispatch" ]; then
 curl -T ./*.csv ${LLM_FTP_URL}/llm/nightly_perf/gpu/
@@ -279,6 +264,7 @@ jobs:
 exit 1
 fi
 - name: Test on core ${{ matrix.platform }}
 shell: bash
 run: |
@@ -325,8 +311,8 @@ jobs:
 # - name: Prepare for install ipex-llm from source
 # shell: bash
 # run: |
-# sed -i 's/"bigdl-core-xe-21==" + VERSION + "/"bigdl-core-xe-21/g' python/llm/setup.py
-# sed -i 's/"bigdl-core-xe-21==" + VERSION/"bigdl-core-xe-21"/g' python/llm/setup.py
+# sed -i 's/"bigdl-core-xe-21==" + CORE_XE_VERSION/"bigdl-core-xe-21"/g' python/llm/setup.py
+# sed -i 's/"bigdl-core-xe-esimd-21==" + CORE_XE_VERSION/"bigdl-core-xe-esimd-21"/g' python/llm/setup.py
 # - name: Install ipex-llm and other related packages (install from source)
 # shell: cmd
@@ -426,33 +412,10 @@ jobs:
 call conda deactivate
-- name: Prepare igpu perf test for Mistral (32-32)
-shell: bash
-run: |
-sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py
-sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/32-32_434.yaml
-- name: Test on igpu for Mistral (32-32)
-shell: cmd
-run: |
-call conda activate igpu-perf
-pip install transformers==4.34.0
-set SYCL_CACHE_PERSISTENT=1
-set BIGDL_LLM_XMX_DISABLED=1
-cd python\llm\dev\benchmark\all-in-one
-move ..\..\..\test\benchmark\igpu-perf\32-32_434.yaml config.yaml
-set PYTHONIOENCODING=utf-8
-python run.py >> %CSV_SAVE_PATH%\32-32\log\%LOG_FILE% 2>&1
-if %ERRORLEVEL% neq 0 (exit /b 1)
-call conda deactivate
 - name: Prepare igpu perf test for Qwen1.5 (32-32)
 shell: bash
 run: |
-sed -i 's/{today}_test2/{today}_test3/g' python/llm/dev/benchmark/all-in-one/run.py
+sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py
 sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/32-32_437.yaml
 - name: Test on igpu for Qwen1.5 (32-32)
@@ -495,14 +458,14 @@ jobs:
 shell: bash
 run: |
 sed -i 's/32-32/1024-128/g' python/llm/dev/benchmark/all-in-one/run.py
-sed -i 's/{today}_test3/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py
+sed -i 's/{today}_test2/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py
 sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128.yaml
 - name: Test on igpu (1024-128)
 shell: cmd
 run: |
 call conda activate igpu-perf
-pip install transformers==4.31.0
+pip install transformers==4.36.2
 set SYCL_CACHE_PERSISTENT=1
 set BIGDL_LLM_XMX_DISABLED=1
@@ -517,33 +480,10 @@ jobs:
 call conda deactivate
-- name: Prepare igpu perf test for Mistral (1024-128)
-shell: bash
-run: |
-sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py
-sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_434.yaml
-- name: Test on igpu for Mistral (1024-128)
-shell: cmd
-run: |
-call conda activate igpu-perf
-pip install transformers==4.34.0
-set SYCL_CACHE_PERSISTENT=1
-set BIGDL_LLM_XMX_DISABLED=1
-cd python\llm\dev\benchmark\all-in-one
-move ..\..\..\test\benchmark\igpu-perf\1024-128_434.yaml config.yaml
-set PYTHONIOENCODING=utf-8
-python run.py >> %CSV_SAVE_PATH%\1024-128\log\%LOG_FILE% 2>&1
-if %ERRORLEVEL% neq 0 (exit /b 1)
-call conda deactivate
 - name: Prepare igpu perf test for Qwen 1.5 (1024-128)
 shell: bash
 run: |
-sed -i 's/{today}_test2/{today}_test3/g' python/llm/dev/benchmark/all-in-one/run.py
+sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py
 sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_437.yaml
 - name: Test on igpu for Qwen 1.5 (1024-128)
@@ -585,14 +525,14 @@ jobs:
 shell: bash
 run: |
 sed -i 's/1024-128/2048-256/g' python/llm/dev/benchmark/all-in-one/run.py
-sed -i 's/{today}_test3/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py
+sed -i 's/{today}_test2/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py
 sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/2048-256.yaml
 - name: Test on igpu (2048-256)
 shell: cmd
 run: |
 call conda activate igpu-perf
-pip install transformers==4.31.0
+pip install transformers==4.36.2
 set SYCL_CACHE_PERSISTENT=1
 set BIGDL_LLM_XMX_DISABLED=1
@@ -607,33 +547,10 @@ jobs:
 call conda deactivate
-- name: Prepare igpu perf test for Mistral (2048-256)
-shell: bash
-run: |
-sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py
-sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/2048-256_434.yaml
-- name: Test on igpu for Mistral (2048-256)
-shell: cmd
-run: |
-call conda activate igpu-perf
-pip install transformers==4.34.0
-set SYCL_CACHE_PERSISTENT=1
-set BIGDL_LLM_XMX_DISABLED=1
-cd python\llm\dev\benchmark\all-in-one
-move ..\..\..\test\benchmark\igpu-perf\2048-256_434.yaml config.yaml
-set PYTHONIOENCODING=utf-8
-python run.py >> %CSV_SAVE_PATH%\2048-256\log\%LOG_FILE% 2>&1
-if %ERRORLEVEL% neq 0 (exit /b 1)
-call conda deactivate
 - name: Prepare igpu perf test for Qwen 1.5 (2048-256)
 shell: bash
 run: |
-sed -i 's/{today}_test2/{today}_test3/g' python/llm/dev/benchmark/all-in-one/run.py
+sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py
 sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/2048-256_437.yaml
 - name: Test on igpu for Qwen 1.5 (2048-256)
@@ -675,14 +592,14 @@ jobs:
 shell: bash
 run: |
 sed -i 's/2048-256/1024-128/g' python/llm/dev/benchmark/all-in-one/run.py
-sed -i 's/{today}_test3/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py
+sed -i 's/{today}_test2/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py
 sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_loadlowbit.yaml
 - name: Test on igpu (load_low_bit 1024-128)
 shell: cmd
 run: |
 call conda activate igpu-perf
-pip install transformers==4.31.0
+pip install transformers==4.36.2
 set SYCL_CACHE_PERSISTENT=1
 set BIGDL_LLM_XMX_DISABLED=1
@@ -697,33 +614,10 @@ jobs:
 call conda deactivate
-- name: Prepare igpu perf test for Mistral (load_low_bit 1024-128)
-shell: bash
-run: |
-sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py
-sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_loadlowbit_434.yaml
-- name: Test on igpu for Mistral (load_low_bit 1024-128)
-shell: cmd
-run: |
-call conda activate igpu-perf
-pip install transformers==4.34.0
-set SYCL_CACHE_PERSISTENT=1
-set BIGDL_LLM_XMX_DISABLED=1
-cd python\llm\dev\benchmark\all-in-one
-move ..\..\..\test\benchmark\igpu-perf\1024-128_loadlowbit_434.yaml config.yaml
-set PYTHONIOENCODING=utf-8
-python run.py >> %CSV_SAVE_PATH%\1024-128_loadlowbit\log\%LOG_FILE% 2>&1
-if %ERRORLEVEL% neq 0 (exit /b 1)
-call conda deactivate
 - name: Prepare igpu perf test for Qwen 1.5 (load_low_bit 1024-128)
 shell: bash
 run: |
-sed -i 's/{today}_test2/{today}_test3/g' python/llm/dev/benchmark/all-in-one/run.py
+sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py
 sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_loadlowbit_437.yaml
 - name: Test on igpu for Qwen 1.5 (load_low_bit 1024-128)
@@ -763,14 +657,14 @@ jobs:
 - name: Prepare igpu perf test (int4+fp16 1024-128)
 shell: bash
 run: |
-sed -i 's/{today}_test3/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py
+sed -i 's/{today}_test2/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py
 sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16.yaml
 - name: Test on igpu (int4+fp16 1024-128)
 shell: cmd
 run: |
 call conda activate igpu-perf
-pip install transformers==4.31.0
+pip install transformers==4.36.2
 set SYCL_CACHE_PERSISTENT=1
 set BIGDL_LLM_XMX_DISABLED=1
@@ -785,33 +679,10 @@ jobs:
 call conda deactivate
-- name: Prepare igpu perf test for Mistral (int4+fp16 1024-128)
-shell: bash
-run: |
-sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py
-sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_434.yaml
-- name: Test on igpu for Mistral (int4+fp16 1024-128)
-shell: cmd
-run: |
-call conda activate igpu-perf
-pip install transformers==4.34.0
-set SYCL_CACHE_PERSISTENT=1
-set BIGDL_LLM_XMX_DISABLED=1
-cd python\llm\dev\benchmark\all-in-one
-move ..\..\..\test\benchmark\igpu-perf\1024-128_int4_fp16_434.yaml config.yaml
-set PYTHONIOENCODING=utf-8
-python run.py >> %CSV_SAVE_PATH%\1024-128_int4_fp16\log\%LOG_FILE% 2>&1
-if %ERRORLEVEL% neq 0 (exit /b 1)
-call conda deactivate
 - name: Prepare igpu perf test for Qwen 1.5 (int4+fp16 1024-128)
 shell: bash
 run: |
-sed -i 's/{today}_test2/{today}_test3/g' python/llm/dev/benchmark/all-in-one/run.py
+sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py
 sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_437.yaml
 - name: Test on igpu for Qwen 1.5 (int4+fp16 1024-128)

View file

@@ -99,7 +99,7 @@ jobs:
 echo "LLAMA_ORIGIN_PATH=${ORIGIN_DIR}/llama-7b-hf" >> "$GITHUB_ENV"
 echo "BLOOM_ORIGIN_PATH=${ORIGIN_DIR}/bloom-7b1" >> "$GITHUB_ENV"
 echo "ORIGINAL_CHATGLM2_6B_PATH=${ORIGIN_DIR}/chatglm2-6b" >> "$GITHUB_ENV"
-echo "ORIGINAL_REPLIT_CODE_PATH=${ORIGIN_DIR}/replit-code-v1-3b" >> "$GITHUB_ENV"
+echo "ORIGINAL_CODESHELL_7B_PATH=${ORIGIN_DIR}/CodeShell-7B-Chat" >> "$GITHUB_ENV"
 echo "ORIGINAL_WHISPER_TINY_PATH=${ORIGIN_DIR}/whisper-tiny" >> "$GITHUB_ENV"
 echo "MISTRAL_ORIGIN_PATH=${ORIGIN_DIR}/Mistral-7B-v0.1" >> "$GITHUB_ENV"
 echo "LLAMA2_7B_ORIGIN_PATH=${ORIGIN_DIR}/Llama-2-7b-chat-hf" >> "$GITHUB_ENV"
@@ -157,13 +157,13 @@ jobs:
 # fi
 if [ ! -d $ORIGINAL_CHATGLM2_6B_PATH ]; then
 echo "Directory $ORIGINAL_CHATGLM2_6B_PATH not found. Downloading from FTP server..."
 echo "wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/chatglm2-6b -P $ORIGIN_DIR"
 wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/chatglm2-6b -P $ORIGIN_DIR
 fi
-if [ ! -d $ORIGINAL_REPLIT_CODE_PATH ]; then
-echo "Directory $ORIGINAL_REPLIT_CODE_PATH not found. Downloading from FTP server..."
-echo "wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/replit-code-v1-3b -P $ORIGIN_DIR"
-wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/replit-code-v1-3b -P $ORIGIN_DIR
+if [ ! -d $ORIGINAL_CODESHELL_7B_PATH ]; then
+echo "Directory $ORIGINAL_CODESHELL_7B_PATH not found. Downloading from FTP server..."
+echo "wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/CodeShell-7B-Chat -P $ORIGIN_DIR"
+wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/CodeShell-7B-Chat -P $ORIGIN_DIR
 fi
 if [ ! -d $ORIGINAL_WHISPER_TINY_PATH ]; then
 echo "Directory $ORIGINAL_WHISPER_TINY_PATH not found. Downloading from FTP server..."
@@ -226,7 +226,7 @@ jobs:
 shell: bash
 run: |
 pip install llama-index-readers-file llama-index-vector-stores-postgres llama-index-embeddings-huggingface
-pip install transformers==4.36.0
+pip install transformers==4.36.2
 pip install "pydantic>=2.0.0"
 bash python/llm/test/run-llm-llamaindex-tests.sh
 - name: Run sentence-transformers uninstallation
@@ -234,6 +234,7 @@ jobs:
 shell: bash
 run: |
 pip uninstall sentence-transformers -y || true
 llm-unit-test-on-arc:
 needs: [setup-python-version, llm-cpp-build]
 strategy:
@@ -364,8 +365,6 @@ jobs:
 fi
 python -m pip install datasets librosa soundfile einops tiktoken transformers_stream_generator
 bash python/llm/test/run-llm-inference-tests-gpu.sh
-python -m pip install transformers==4.34.0
-bash python/llm/test/run-llm-inference-tests-gpu-434.sh
 - name: Run LLM example tests
 shell: bash
@@ -428,7 +427,7 @@ jobs:
 pip install --pre --upgrade ipex-llm[xpu_2.0] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/
 source /home/arda/intel/oneapi/setvars.sh
 fi
-pip install transformers==4.36.0
+pip install transformers==4.36.2
 pip install "pydantic>=2.0.0"
 bash python/llm/test/run-llm-llamaindex-tests-gpu.sh
 - name: Run sentence-transformers uninstallation

View file

@@ -53,7 +53,7 @@ libs_dir = os.path.join(llm_home, "ipex_llm", "libs")
 cpu_torch_version = ["torch==2.1.2+cpu;platform_system=='Linux'", "torch==2.1.2;platform_system=='Windows'"]
 CONVERT_DEP = ['numpy == 1.26.4', # lastet 2.0.0b1 will cause error
-'transformers == 4.31.0', 'sentencepiece', 'tokenizers == 0.13.3',
+'transformers == 4.36.2', 'sentencepiece', 'tokenizers == 0.15.2',
 # TODO: Support accelerate 0.22.0
 'accelerate == 0.21.0', 'tabulate'] + cpu_torch_version
@@ -279,11 +279,10 @@ def setup_package():
 # Add internal requires for llama-index
 llama_index_requires = copy.deepcopy(all_requires)
-for exclude_require in ['transformers == 4.31.0', 'tokenizers == 0.13.3'] + cpu_torch_version:
+for exclude_require in cpu_torch_version:
 llama_index_requires.remove(exclude_require)
 llama_index_requires += ["setuptools<70.0.0"]
 llama_index_requires += ["torch<2.2.0",
-"transformers>=4.34.0,<4.39.0",
 "sentence-transformers~=2.6.1"]

View file

@@ -47,7 +47,8 @@ def _save_low_bit(self, save_dir, *args, **kwargs):
 if isinstance(self, PreTrainedModel):
 # We borrowed this method to adapt to Transformer model cases
 # as much as possible, and later we may merge these two situations
-self.save_pretrained(save_dir)
+kwargs['safe_serialization'] = False
+self.save_pretrained(save_dir, *args, **kwargs)
 else:
 # TODO: For the lowbit model still larger than 8GB,
 # save it into shards.
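For reference, a minimal sketch (not the ipex-llm implementation itself) of what the new safe_serialization flag changes, assuming transformers>=4.35 where save_pretrained defaults to safetensors; the tiny model id is only illustrative:

from transformers import AutoModelForCausalLM

# "sshleifer/tiny-gpt2" is just an illustrative small checkpoint.
model = AutoModelForCausalLM.from_pretrained("sshleifer/tiny-gpt2")

# safe_serialization=False falls back to torch.save (pytorch_model.bin) instead of
# model.safetensors, which is what _save_low_bit now forces for low-bit weights.
model.save_pretrained("./saved_model", safe_serialization=False)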

View file

@@ -10,13 +10,14 @@ repo_id:
 - 'databricks/dolly-v1-6b'
 - 'databricks/dolly-v2-7b'
 - 'databricks/dolly-v2-12b'
-- 'internlm/internlm-chat-7b-8k'
+- 'internlm/internlm-chat-7b'
 - 'Qwen/Qwen-7B-Chat'
 - 'BAAI/AquilaChat-7B'
 - 'baichuan-inc/Baichuan2-7B-Chat'
 - 'baichuan-inc/Baichuan2-13B-Chat-4bit'
 - 'bigscience/bloomz-7b1'
-- 'fnlp/moss-moon-003-sft-4bit'
+# - 'fnlp/moss-moon-003-sft-4bit' # moss-moon-003-sft cannot work on transformers 4.34+
+- 'mistralai/Mistral-7B-v0.1'
 local_model_hub: '/mnt/disk1/models'
 warm_up: 1
 num_trials: 3
@@ -31,7 +32,7 @@ test_api:
 - "transformer_int4_gpu" # on Intel GPU
 cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api)
 exclude:
-- 'fnlp/moss-moon-003-sft-4bit:1024'
-- 'fnlp/moss-moon-003-sft-4bit:2048'
+# - 'fnlp/moss-moon-003-sft-4bit:1024'
+# - 'fnlp/moss-moon-003-sft-4bit:2048'
 - 'baichuan-inc/Baichuan2-13B-Chat-4bit:2048'
 - 'bigscience/bloomz-7b1:2048'

View file

@@ -1,16 +0,0 @@
-# For the models that require transformers 4.34.0
-repo_id:
-- 'mistralai/Mistral-7B-v0.1'
-local_model_hub: '/mnt/disk1/models'
-warm_up: 1
-num_trials: 3
-num_beams: 1 # default to greedy search
-low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
-batch_size: 1 # default to 1
-in_out_pairs:
-- '32-32'
-- '1024-128'
-- '2048-256'
-test_api:
-- "transformer_int4_gpu" # on Intel GPU
-cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api)

View file

@@ -12,10 +12,11 @@ repo_id:
 - 'WisdomShell/CodeShell-7B-Chat'
 - 'tiiuae/falcon-7b-instruct-with-patch'
 - 'mosaicml/mpt-7b-chat'
-- 'liuhaotian/llava-v1.5-7b'
+# - 'liuhaotian/llava-v1.5-7b' # Cannot load using AutoModelForCausalLM in 4.36+
 - 'RWKV/rwkv-4-world-7b'
 - 'RWKV/rwkv-5-world-7b'
 - 'IEITYuan/Yuan2-2B-hf'
+- 'mistralai/Mistral-7B-Instruct-v0.1'
 local_model_hub: 'path to your local model hub'
 warm_up: 1
 num_trials: 3

View file

@@ -1,13 +0,0 @@
-repo_id:
-- 'mistralai/Mistral-7B-Instruct-v0.1'
-local_model_hub: 'path to your local model hub'
-warm_up: 1
-num_trials: 3
-num_beams: 1 # default to greedy search
-low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
-batch_size: 1 # default to 1
-in_out_pairs:
-- '1024-128'
-test_api:
-- "transformer_int4_gpu_win" # on Intel GPU for Windows (catch GPU peak memory)
-cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api)

View file

@@ -12,10 +12,11 @@ repo_id:
 - 'WisdomShell/CodeShell-7B-Chat'
 - 'tiiuae/falcon-7b-instruct-with-patch'
 - 'mosaicml/mpt-7b-chat'
-- 'liuhaotian/llava-v1.5-7b'
+# - 'liuhaotian/llava-v1.5-7b' # Cannot load using AutoModelForCausalLM in 4.36+
 # - 'RWKV/rwkv-4-world-7b'
 # - 'RWKV/rwkv-5-world-7b'
 - 'IEITYuan/Yuan2-2B-hf'
+- 'mistralai/Mistral-7B-Instruct-v0.1'
 local_model_hub: 'path to your local model hub'
 warm_up: 1
 num_trials: 3

View file

@@ -1,13 +0,0 @@
-repo_id:
-- 'mistralai/Mistral-7B-Instruct-v0.1'
-local_model_hub: 'path to your local model hub'
-warm_up: 1
-num_trials: 3
-num_beams: 1 # default to greedy search
-low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
-batch_size: 1 # default to 1
-in_out_pairs:
-- '1024-128'
-test_api:
-- "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows, use fp16 for non-linear layer
-cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api)

View file

@@ -12,10 +12,11 @@ repo_id:
 - 'WisdomShell/CodeShell-7B-Chat'
 - 'tiiuae/falcon-7b-instruct-with-patch'
 - 'mosaicml/mpt-7b-chat'
-- 'liuhaotian/llava-v1.5-7b'
+# - 'liuhaotian/llava-v1.5-7b' # Cannot load using AutoModelForCausalLM in 4.36+
 - 'RWKV/rwkv-4-world-7b'
 - 'RWKV/rwkv-5-world-7b'
 - 'IEITYuan/Yuan2-2B-hf'
+- 'mistralai/Mistral-7B-Instruct-v0.1'
 local_model_hub: 'path to your local model hub'
 warm_up: 1
 num_trials: 3

View file

@@ -1,13 +0,0 @@
-repo_id:
-- 'mistralai/Mistral-7B-Instruct-v0.1'
-local_model_hub: 'path to your local model hub'
-warm_up: 1
-num_trials: 3
-num_beams: 1 # default to greedy search
-low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
-batch_size: 1 # default to 1
-in_out_pairs:
-- '1024-128'
-test_api:
-- "transformer_int4_loadlowbit_gpu_win" # on Intel GPU for Windows (catch GPU peak memory)
-cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api)

View file

@@ -12,10 +12,11 @@ repo_id:
 - 'WisdomShell/CodeShell-7B-Chat'
 - 'tiiuae/falcon-7b-instruct-with-patch'
 - 'mosaicml/mpt-7b-chat'
-- 'liuhaotian/llava-v1.5-7b'
+# - 'liuhaotian/llava-v1.5-7b' # Cannot load using AutoModelForCausalLM in 4.36+
 - 'RWKV/rwkv-4-world-7b'
 - 'RWKV/rwkv-5-world-7b'
 - 'IEITYuan/Yuan2-2B-hf'
+- 'mistralai/Mistral-7B-Instruct-v0.1'
 local_model_hub: 'path to your local model hub'
 warm_up: 1
 num_trials: 3

View file

@@ -1,13 +0,0 @@
-repo_id:
-- 'mistralai/Mistral-7B-Instruct-v0.1'
-local_model_hub: 'path to your local model hub'
-warm_up: 1
-num_trials: 3
-num_beams: 1 # default to greedy search
-low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
-batch_size: 1 # default to 1
-in_out_pairs:
-- '2048-256'
-test_api:
-- "transformer_int4_gpu_win" # on Intel GPU for Windows (catch GPU peak memory)
-cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api)

View file

@@ -12,10 +12,11 @@ repo_id:
 - 'WisdomShell/CodeShell-7B-Chat'
 - 'tiiuae/falcon-7b-instruct-with-patch'
 - 'mosaicml/mpt-7b-chat'
-- 'liuhaotian/llava-v1.5-7b'
+# - 'liuhaotian/llava-v1.5-7b' # Cannot load using AutoModelForCausalLM in 4.36+
 - 'RWKV/rwkv-4-world-7b'
 - 'RWKV/rwkv-5-world-7b'
 - 'IEITYuan/Yuan2-2B-hf'
+- 'mistralai/Mistral-7B-Instruct-v0.1'
 local_model_hub: 'path to your local model hub'
 warm_up: 3
 num_trials: 5

View file

@@ -1,13 +0,0 @@
-repo_id:
-- 'mistralai/Mistral-7B-Instruct-v0.1'
-local_model_hub: 'path to your local model hub'
-warm_up: 3
-num_trials: 5
-num_beams: 1 # default to greedy search
-low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
-batch_size: 1 # default to 1
-in_out_pairs:
-- '32-32'
-test_api:
-- "transformer_int4_gpu_win" # on Intel GPU for Windows (catch GPU peak memory)
-cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api)

View file

@@ -6,7 +6,7 @@ repo_id:
 - 'baichuan-inc/Baichuan2-7B-Chat'
 - 'baichuan-inc/Baichuan2-13B-Chat'
 - 'Qwen/Qwen-14B-Chat'
-local_model_hub: '/models'
+local_model_hub: '/mnt/disk1/models'
 warm_up: 1
 num_trials: 3
 num_beams: 1 # default to greedy search

View file

@@ -6,7 +6,7 @@ repo_id:
 - 'baichuan-inc/Baichuan2-7B-Chat'
 - 'baichuan-inc/Baichuan2-13B-Chat'
 - 'Qwen/Qwen-14B-Chat'
-local_model_hub: '/models'
+local_model_hub: '/mnt/disk1/models'
 warm_up: 3
 num_trials: 50
 num_beams: 1 # default to greedy search

View file

@@ -49,16 +49,16 @@ class TestTransformersAPI(unittest.TestCase):
 print('Prompt:', input_str)
 print('Output:', output_str)
 print(f'Inference time: {end-st} s')
 res = 'Paris' in output_str
 self.assertTrue(res)
 def test_transformers_auto_model_for_causal_lm_int4(self):
-model_path = os.environ.get('ORIGINAL_REPLIT_CODE_PATH')
+model_path = os.environ.get('ORIGINAL_CODESHELL_7B_PATH')
 tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 input_str = 'def hello():\n print("hello world")\n'
 model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, load_in_4bit=True)
 with torch.inference_mode():
 st = time.time()
 input_ids = tokenizer.encode(input_str, return_tensors="pt")
 output = model.generate(input_ids, do_sample=False, max_new_tokens=32)
@@ -67,7 +67,7 @@ class TestTransformersAPI(unittest.TestCase):
 print('Prompt:', input_str)
 print('Output:', output_str)
 print(f'Inference time: {end-st} s')
 res = '\nhello()' in output_str
 self.assertTrue(res)
@@ -86,7 +86,7 @@ class TestTransformersAPI(unittest.TestCase):
 predicted_ids = model.generate(input_features)
 # decode token ids to text
 transcription = processor.batch_decode(predicted_ids, skip_special_tokens=False)
 end = time.time()
 print('Output:', transcription)
 print(f'Inference time: {end-st} s')
 res = 'Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.' in transcription[0]
@@ -108,7 +108,7 @@ class TestTransformersAPI(unittest.TestCase):
 print('Prompt:', input_str)
 print('Output:', output_str)
 print(f'Inference time: {end-st} s')
 res = 'Paris' in output_str
 self.assertTrue(res)
 @pytest.mark.parametrize('prompt, answer', [
@@ -116,6 +116,7 @@
 ])
 @pytest.mark.parametrize('Model, Tokenizer, model_path',[
 (AutoModel, AutoTokenizer, os.environ.get('ORIGINAL_CHATGLM2_6B_PATH')),
+(AutoModelForCausalLM, AutoTokenizer, os.environ.get('MISTRAL_ORIGIN_PATH')),
 ])
 def test_load_low_bit_completion(Model, Tokenizer, model_path, prompt, answer):
 tokenizer = Tokenizer.from_pretrained(model_path, trust_remote_code=True)
@@ -123,7 +124,7 @@ def test_load_low_bit_completion(Model, Tokenizer, model_path, prompt, answer):
 load_in_4bit=True,
 optimize_model=True,
 trust_remote_code=True)
 with tempfile.TemporaryDirectory() as tempdir:
 model.save_low_bit(tempdir)
 loaded_model = Model.load_low_bit(tempdir,
@@ -143,9 +144,10 @@ prompt = "Once upon a time, there existed a little girl who liked to have advent
 (AutoModelForCausalLM, LlamaTokenizer, os.environ.get('LLAMA_ORIGIN_PATH'), prompt),
 (AutoModelForCausalLM, AutoTokenizer, os.environ.get('BLOOM_ORIGIN_PATH'), prompt),
 (AutoModel, AutoTokenizer, os.environ.get('ORIGINAL_CHATGLM2_6B_PATH'), prompt),
-(AutoModelForCausalLM, AutoTokenizer, os.environ.get('ORIGINAL_REPLIT_CODE_PATH'), prompt)
+(AutoModelForCausalLM, AutoTokenizer, os.environ.get('ORIGINAL_CODESHELL_7B_PATH'), prompt),
+(AutoModelForCausalLM, AutoTokenizer, os.environ.get('MISTRAL_ORIGIN_PATH'), prompt)
 ])
 def test_optimize_model(Model, Tokenizer, model_path, prompt):
 tokenizer = Tokenizer.from_pretrained(model_path, trust_remote_code=True)
 input_ids = tokenizer.encode(prompt, return_tensors="pt")
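Roughly, the save_low_bit / load_low_bit round trip these parametrized cases exercise looks like the sketch below (distilled from the test itself; the model path is a placeholder for any supported local checkpoint):

import tempfile
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

model_path = "/path/to/local/Mistral-7B-v0.1"  # placeholder path
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             load_in_4bit=True,
                                             optimize_model=True,
                                             trust_remote_code=True)

with tempfile.TemporaryDirectory() as tempdir:
    model.save_low_bit(tempdir)  # quantized weights, now written without safetensors
    loaded_model = AutoModelForCausalLM.load_low_bit(tempdir,
                                                     optimize_model=True,
                                                     trust_remote_code=True)
    input_ids = tokenizer.encode("What is the capital of France?\n\n", return_tensors="pt")
    output = loaded_model.generate(input_ids, max_new_tokens=32)
    print(tokenizer.decode(output[0], skip_special_tokens=True))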

View file

@@ -1,80 +0,0 @@
-#
-# Copyright 2016 The BigDL Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-import os
-import pytest
-import tempfile
-import torch
-from ipex_llm.transformers import AutoModelForCausalLM
-from transformers import AutoTokenizer
-mistral_model_path = os.environ.get('MISTRAL_ORIGIN_PATH')
-prompt = "Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun"
-@pytest.mark.parametrize("Model, Tokenizer, model_path, prompt", [
-(AutoModelForCausalLM, AutoTokenizer, mistral_model_path, prompt)
-])
-def test_optimize_model(Model, Tokenizer, model_path, prompt):
-tokenizer = Tokenizer.from_pretrained(model_path, trust_remote_code=True)
-input_ids = tokenizer.encode(prompt, return_tensors="pt")
-model = Model.from_pretrained(model_path,
-load_in_4bit=True,
-optimize_model=False,
-trust_remote_code=True)
-logits_base_model = (model(input_ids)).logits
-model = Model.from_pretrained(model_path,
-load_in_4bit=True,
-optimize_model=True,
-trust_remote_code=True)
-logits_optimized_model = (model(input_ids)).logits
-diff = abs(logits_base_model - logits_optimized_model).flatten()
-assert any(diff) is False
-@pytest.mark.parametrize('prompt, answer', [
-('What is the capital of France?\n\n', 'Paris')
-])
-@pytest.mark.parametrize('Model, Tokenizer, model_path',[
-(AutoModelForCausalLM, AutoTokenizer, mistral_model_path),
-])
-def test_load_low_bit_completion(Model, Tokenizer, model_path, prompt, answer):
-tokenizer = Tokenizer.from_pretrained(model_path, trust_remote_code=True)
-model = Model.from_pretrained(model_path,
-load_in_4bit=True,
-optimize_model=True,
-trust_remote_code=True)
-with tempfile.TemporaryDirectory() as tempdir:
-model.save_low_bit(tempdir)
-loaded_model = Model.load_low_bit(tempdir,
-optimize_model=True,
-trust_remote_code=True)
-with torch.inference_mode():
-input_ids = tokenizer.encode(prompt, return_tensors="pt")
-output = loaded_model.generate(input_ids, max_new_tokens=32)
-output_str = tokenizer.decode(output[0], skip_special_tokens=True)
-assert answer in output_str
-if __name__ == '__main__':
-pytest.main([__file__])

View file

@@ -104,8 +104,8 @@ class Test_Optimize_Gpu_Model:
 if isinstance(t1, torch.Tensor) and isinstance(t2, torch.Tensor):
 # 'attn_output' is of type torch.Tensor.
 attn_output_diff.append(t1 - t2)
-else:
-# 'past_key_value'is of type tuple as default.
+elif isinstance(t1, tuple) and isinstance(t2, tuple):
+# if 'past_key_value'is of type tuple
 for i, (t3, t4) in enumerate(zip(t1, t2)):
 if model.config.architectures[0] == "ChatGLMModel" and \
 hasattr(model.config, 'padded_vocab_size') and \
@@ -114,6 +114,10 @@ class Test_Optimize_Gpu_Model:
 # We need to narrow it here.
 t4 = t4[:, :, 15:17, :]
 attn_output_diff.append(t3 - t4)
+else:
+# if 'past_key_value'is of type Cache, get last layer cache pair (key, value)
+attn_output_diff.append(t1[-1][0] - t2[-1][0])
+attn_output_diff.append(t1[-1][1] - t2[-1][1])
 max_diff_tensor = [torch.max(item).item() for item in attn_output_diff]
 print(max_diff_tensor)
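The new else branch assumes past_key_value is a transformers 4.36 Cache object rather than the legacy tuple of tuples. A minimal sketch of the indexing it relies on, assuming DynamicCache from transformers>=4.36, whose backward-compatible indexing yields per-layer (key, value) pairs:

import torch
from transformers.cache_utils import DynamicCache

cache = DynamicCache()
for layer_idx in range(2):            # pretend two decoder layers wrote their states
    key = torch.randn(1, 2, 4, 8)     # (batch, heads, seq_len, head_dim)
    value = torch.randn(1, 2, 4, 8)
    cache.update(key, value, layer_idx)

last_key, last_value = cache[-1]      # the tensors the test reads via t1[-1][0] / t1[-1][1]
assert torch.equal(last_key, cache.key_cache[-1])
assert torch.equal(last_value, cache.value_cache[-1])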

View file

@@ -96,9 +96,14 @@ class Test_Optimize_Gpu_Model:
 for i, (t1, t2) in enumerate(zip(layer_tensor, opt_layer_tensor)):
 if isinstance(t1, torch.Tensor) and isinstance(t2, torch.Tensor):
 MLP_output_diff.append(t1 - t2)
-else:
+elif isinstance(t1, tuple) and isinstance(t2, tuple):
+# if 'past_key_value'is of type tuple
 for i, (t3, t4) in enumerate(zip(t1, t2)):
 MLP_output_diff.append(t3 - t4)
+else:
+# if 'past_key_value'is of type Cache, get last layer cache pair (key, value)
+MLP_output_diff.append(t1[-1][0] - t2[-1][0])
+MLP_output_diff.append(t1[-1][1] - t2[-1][1])
 max_diff_tensor = [torch.max(item).item() for item in MLP_output_diff]
 print(max_diff_tensor)

View file

@@ -38,7 +38,7 @@ import os
 class Test_Langchain_Transformers_API(TestCase):
 def setUp(self):
 self.auto_model_path = os.environ.get('ORIGINAL_CHATGLM2_6B_PATH')
-self.auto_causal_model_path = os.environ.get('ORIGINAL_REPLIT_CODE_PATH')
+self.auto_causal_model_path = os.environ.get('ORIGINAL_CODESHELL_7B_PATH')
 self.llama_model_path = os.environ.get('LLAMA_ORIGIN_PATH')
 self.bloom_model_path = os.environ.get('BLOOM_ORIGIN_PATH')
 thread_num = os.environ.get('THREAD_NUM')
@@ -79,12 +79,12 @@ class Test_Langchain_Transformers_API(TestCase):
 def test_qa_chain(self):
 texts = '''
 AI is a machine's ability to perform the cognitive functions
 we associate with human minds, such as perceiving, reasoning,
 learning, interacting with an environment, problem solving,
 and even exercising creativity. You've probably interacted
 with AI even if you didn't realize it—voice assistants like Siri
 and Alexa are founded on AI technology, as are some customer
 service chatbots that pop up to help you navigate websites.
 '''
 text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
@@ -102,16 +102,16 @@ class Test_Langchain_Transformers_API(TestCase):
 res = "AI" in output
 self.assertTrue(res)
 """
 def test_qa_chain_causalLM(self):
 texts = '''
 AI is a machine's ability to perform the cognitive functions
 we associate with human minds, such as perceiving, reasoning,
 learning, interacting with an environment, problem solving,
 and even exercising creativity. You've probably interacted
 with AI even if you didn't realize it—voice assistants like Siri
 and Alexa are founded on AI technology, as are some customer
 service chatbots that pop up to help you navigate websites.
 '''
 text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
@@ -129,7 +129,7 @@ class Test_Langchain_Transformers_API(TestCase):
 res = "AI" in output
 self.assertTrue(res)
 """
 def test_embed_kwargs(self):
 embeddings = TransformersEmbeddings.from_model_id(model_id=self.llama_model_path)
 encode_kwargs = {"truncation": True, "max_length": 512}

View file

@@ -1,30 +0,0 @@
-#!/bin/bash
-export ANALYTICS_ZOO_ROOT=${ANALYTICS_ZOO_ROOT}
-export LLM_HOME=${ANALYTICS_ZOO_ROOT}/python/llm/src
-export LLM_INFERENCE_TEST_DIR=${ANALYTICS_ZOO_ROOT}/python/llm/test/inference_gpu
-export USE_XETLA=OFF
-export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
-export DEVICE='xpu'
-set -e
-echo "# Start testing inference"
-start=$(date "+%s")
-# if [ -z "$THREAD_NUM" ]; then
-# THREAD_NUM=2
-# fi
-# export OMP_NUM_THREADS=$THREAD_NUM
-export BIGDL_LLM_XMX_DISABLED=1
-pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api_attention.py -v -s -k "Mistral"
-pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api_mlp.py -v -s -k "Mistral"
-pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api_RMSNorm.py -v -s -k "Mistral"
-unset BIGDL_LLM_XMX_DISABLED
-now=$(date "+%s")
-time=$((now-start))
-echo "Bigdl-llm gpu inference tests for transformers 4.34.0 finished"
-echo "Time used:$time seconds"

View file

@@ -21,9 +21,9 @@ pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api.py -v -s
 pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api_layernorm.py -v -s
 export BIGDL_LLM_XMX_DISABLED=1
 pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api_final_logits.py -v -s
-pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api_attention.py -v -s -k "not Mistral"
-pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api_mlp.py -v -s -k "not Mistral"
-pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api_RMSNorm.py -v -s -k "not Mistral"
+pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api_attention.py -v -s
+pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api_mlp.py -v -s
+pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api_RMSNorm.py -v -s
 unset BIGDL_LLM_XMX_DISABLED
 now=$(date "+%s")

View file

@@ -18,10 +18,6 @@ export OMP_NUM_THREADS=$THREAD_NUM
 python -m pytest -s ${LLM_INFERENCE_TEST_DIR}/test_transformers_api.py -v
 python -m pytest -s ${LLM_INFERENCE_TEST_DIR}/test_optimize_model_api.py -v
-python -m pip install transformers==4.34.0
-python -m pytest -s ${LLM_INFERENCE_TEST_DIR}/test_transformesr_api_434.py -v
-python -m pip install transformers==4.31.0
 now=$(date "+%s")
 time=$((now-start))