Update tests for transformers 4.36 (#10858)
* update unit test * update * update * update * update * update * fix gpu attention test * update * update * update * update * update * update * update example test * replace replit code * update * update * update * update * set safe_serialization false * perf test * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * delete * update * update * update * update * update * update * revert * update
parent 1291165720
commit 0a06a6e1d4

28 changed files with 90 additions and 409 deletions

.github/workflows/llm-harness-evaluation.yml (6 changes)

@@ -164,12 +164,6 @@ jobs:
         shell: bash
         run: |
           pip install --upgrade datasets==2.14.6 
-          if [ "${{ matrix.model_name }}" = "Mistral-7B-v0.1" ]; then
-            pip install --upgrade transformers==4.36
-          else
-            pip install --upgrade transformers==4.31
-          fi
-      

       - name: Run harness
         shell: bash

.github/workflows/llm-ppl-evaluation.yml (5 changes)

@@ -149,11 +149,6 @@ jobs:
         shell: bash
         run: |
           pip install --upgrade datasets==2.14.6
-          if [ "${{ matrix.model_name }}" = "Mistral-7B-v0.1" ]; then
-            pip install --upgrade transformers==4.36
-          else
-            pip install --upgrade transformers==4.31
-          fi

       - name: Run perplexity
         shell: bash

.github/workflows/llm_performance_tests.yml (167 changes)

@@ -87,12 +87,11 @@ jobs:
           source /opt/intel/oneapi/setvars.sh
           bash python/llm/test/run-llm-install-tests.sh

-      - name: Test on xpu(transformers==4.31.0)
+      - name: Test on xpu(transformers==4.36.2)
         shell: bash
         run: |
           date_for_test_version=$(date -d yesterday +%Y-%m-%d)
           sed -i "s/date.today()/\"$date_for_test_version\"/g" python/llm/dev/benchmark/all-in-one/run.py
-
           source /opt/intel/oneapi/setvars.sh
           export USE_XETLA=OFF
           export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1

@@ -104,20 +103,6 @@ jobs:
           sed -i 's/{today}/{today}_test1/g' run.py
           python run.py

-      - name: Test on xpu(transformers==4.34.0)
-        shell: bash
-        run: |
-          source /opt/intel/oneapi/setvars.sh
-          export USE_XETLA=OFF
-          export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
-          # upgrade transformers for model Mistral-7B-v0.1
-          python -m pip install transformers==4.34.0
-          cp python/llm/test/benchmark/arc-perf-transformers-434.yaml python/llm/dev/benchmark/all-in-one/config.yaml
-          cd python/llm/dev/benchmark/all-in-one
-          # change csv name
-          sed -i 's/test1/test2/g' run.py
-          python run.py
-
       - name: Test on xpu(transformers==4.37.0)
         shell: bash
         run: |

@@ -129,7 +114,7 @@ jobs:
           cp python/llm/test/benchmark/arc-perf-transformers-437.yaml python/llm/dev/benchmark/all-in-one/config.yaml
           cd python/llm/dev/benchmark/all-in-one
           # change csv name
-          sed -i 's/test2/test3/g' run.py
+          sed -i 's/test1/test2/g' run.py
           python run.py

       - name: Concat csv and generate html

@@ -151,7 +136,7 @@ jobs:
         run: |
           cd python/llm/dev/benchmark/all-in-one
           python ../../../test/benchmark/check_results.py -c test1 -y ../../../test/benchmark/arc-perf-test.yaml
-          python ../../../test/benchmark/check_results.py -c test2 -y ../../../test/benchmark/arc-perf-transformers-434.yaml
+          python ../../../test/benchmark/check_results.py -c test2 -y ../../../test/benchmark/arc-perf-transformers-437.yaml
           find . -name "*test*.csv" -delete
           if [ ${{ github.event_name }} == "schedule" ] || [ ${{ github.event_name }} == "workflow_dispatch" ]; then
             curl -T ./*.csv ${LLM_FTP_URL}/llm/nightly_perf/gpu/

@@ -279,6 +264,7 @@ jobs:
             exit 1
           fi

+
       - name: Test on core ${{ matrix.platform }}
         shell: bash
         run: |

@@ -325,8 +311,8 @@ jobs:
       # - name: Prepare for install ipex-llm from source
       #   shell: bash
       #   run: |
-      #     sed -i 's/"bigdl-core-xe-21==" + VERSION + "/"bigdl-core-xe-21/g' python/llm/setup.py
-      #     sed -i 's/"bigdl-core-xe-21==" + VERSION/"bigdl-core-xe-21"/g' python/llm/setup.py
+      #     sed -i 's/"bigdl-core-xe-21==" + CORE_XE_VERSION/"bigdl-core-xe-21"/g' python/llm/setup.py
+      #     sed -i 's/"bigdl-core-xe-esimd-21==" + CORE_XE_VERSION/"bigdl-core-xe-esimd-21"/g' python/llm/setup.py

       # - name: Install ipex-llm and other related packages (install from source)
       #   shell: cmd

@@ -426,33 +412,10 @@ jobs:

           call conda deactivate

-      - name: Prepare igpu perf test for Mistral (32-32)
-        shell: bash
-        run: |
-          sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py
-          sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/32-32_434.yaml
-
-      - name: Test on igpu for Mistral (32-32)
-        shell: cmd
-        run: |
-          call conda activate igpu-perf
-          pip install transformers==4.34.0
-
-          set SYCL_CACHE_PERSISTENT=1
-          set BIGDL_LLM_XMX_DISABLED=1
-
-          cd python\llm\dev\benchmark\all-in-one
-          move ..\..\..\test\benchmark\igpu-perf\32-32_434.yaml config.yaml
-          set PYTHONIOENCODING=utf-8
-          python run.py >> %CSV_SAVE_PATH%\32-32\log\%LOG_FILE% 2>&1
-          if %ERRORLEVEL% neq 0 (exit /b 1)
-
-          call conda deactivate
-
       - name: Prepare igpu perf test for Qwen1.5 (32-32)
         shell: bash
         run: |
-          sed -i 's/{today}_test2/{today}_test3/g' python/llm/dev/benchmark/all-in-one/run.py
+          sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py
           sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/32-32_437.yaml

       - name: Test on igpu for Qwen1.5 (32-32)

@@ -495,14 +458,14 @@ jobs:
         shell: bash
         run: |
           sed -i 's/32-32/1024-128/g' python/llm/dev/benchmark/all-in-one/run.py
-          sed -i 's/{today}_test3/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py
+          sed -i 's/{today}_test2/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py
           sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128.yaml

       - name: Test on igpu (1024-128)
         shell: cmd
         run: |
           call conda activate igpu-perf
-          pip install transformers==4.31.0
+          pip install transformers==4.36.2

           set SYCL_CACHE_PERSISTENT=1
           set BIGDL_LLM_XMX_DISABLED=1

@@ -517,33 +480,10 @@ jobs:

           call conda deactivate

-      - name: Prepare igpu perf test for Mistral (1024-128)
-        shell: bash
-        run: |
-          sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py
-          sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_434.yaml
-
-      - name: Test on igpu for Mistral (1024-128)
-        shell: cmd
-        run: |
-          call conda activate igpu-perf
-          pip install transformers==4.34.0
-
-          set SYCL_CACHE_PERSISTENT=1
-          set BIGDL_LLM_XMX_DISABLED=1
-
-          cd python\llm\dev\benchmark\all-in-one
-          move ..\..\..\test\benchmark\igpu-perf\1024-128_434.yaml config.yaml
-          set PYTHONIOENCODING=utf-8
-          python run.py >> %CSV_SAVE_PATH%\1024-128\log\%LOG_FILE% 2>&1
-          if %ERRORLEVEL% neq 0 (exit /b 1)
-
-          call conda deactivate
-
       - name: Prepare igpu perf test for Qwen 1.5 (1024-128)
         shell: bash
         run: |
-          sed -i 's/{today}_test2/{today}_test3/g' python/llm/dev/benchmark/all-in-one/run.py
+          sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py
           sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_437.yaml

       - name: Test on igpu for Qwen 1.5 (1024-128)

@@ -585,14 +525,14 @@ jobs:
         shell: bash
         run: |
           sed -i 's/1024-128/2048-256/g' python/llm/dev/benchmark/all-in-one/run.py
-          sed -i 's/{today}_test3/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py
+          sed -i 's/{today}_test2/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py
           sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/2048-256.yaml

       - name: Test on igpu (2048-256)
         shell: cmd
         run: |
           call conda activate igpu-perf
-          pip install transformers==4.31.0
+          pip install transformers==4.36.2

           set SYCL_CACHE_PERSISTENT=1
           set BIGDL_LLM_XMX_DISABLED=1

@@ -607,33 +547,10 @@ jobs:

           call conda deactivate

-      - name: Prepare igpu perf test for Mistral (2048-256)
-        shell: bash
-        run: |
-          sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py
-          sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/2048-256_434.yaml
-
-      - name: Test on igpu for Mistral (2048-256)
-        shell: cmd
-        run: |
-          call conda activate igpu-perf
-          pip install transformers==4.34.0
-
-          set SYCL_CACHE_PERSISTENT=1
-          set BIGDL_LLM_XMX_DISABLED=1
-
-          cd python\llm\dev\benchmark\all-in-one
-          move ..\..\..\test\benchmark\igpu-perf\2048-256_434.yaml config.yaml
-          set PYTHONIOENCODING=utf-8
-          python run.py >> %CSV_SAVE_PATH%\2048-256\log\%LOG_FILE% 2>&1
-          if %ERRORLEVEL% neq 0 (exit /b 1)
-
-          call conda deactivate
-
       - name: Prepare igpu perf test for Qwen 1.5 (2048-256)
         shell: bash
         run: |
-          sed -i 's/{today}_test2/{today}_test3/g' python/llm/dev/benchmark/all-in-one/run.py
+          sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py
           sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/2048-256_437.yaml

       - name: Test on igpu for Qwen 1.5 (2048-256)

@@ -675,14 +592,14 @@ jobs:
         shell: bash
         run: |
           sed -i 's/2048-256/1024-128/g' python/llm/dev/benchmark/all-in-one/run.py
-          sed -i 's/{today}_test3/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py
+          sed -i 's/{today}_test2/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py
           sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_loadlowbit.yaml

       - name: Test on igpu (load_low_bit 1024-128)
         shell: cmd
         run: |
           call conda activate igpu-perf
-          pip install transformers==4.31.0
+          pip install transformers==4.36.2

           set SYCL_CACHE_PERSISTENT=1
           set BIGDL_LLM_XMX_DISABLED=1

@@ -697,33 +614,10 @@ jobs:

           call conda deactivate

-      - name: Prepare igpu perf test for Mistral (load_low_bit 1024-128)
-        shell: bash
-        run: |
-          sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py
-          sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_loadlowbit_434.yaml
-
-      - name: Test on igpu for Mistral (load_low_bit 1024-128)
-        shell: cmd
-        run: |
-          call conda activate igpu-perf
-          pip install transformers==4.34.0
-
-          set SYCL_CACHE_PERSISTENT=1
-          set BIGDL_LLM_XMX_DISABLED=1
-
-          cd python\llm\dev\benchmark\all-in-one
-          move ..\..\..\test\benchmark\igpu-perf\1024-128_loadlowbit_434.yaml config.yaml
-          set PYTHONIOENCODING=utf-8
-          python run.py >> %CSV_SAVE_PATH%\1024-128_loadlowbit\log\%LOG_FILE% 2>&1
-          if %ERRORLEVEL% neq 0 (exit /b 1)
-
-          call conda deactivate
-
       - name: Prepare igpu perf test for Qwen 1.5 (load_low_bit 1024-128)
         shell: bash
         run: |
-          sed -i 's/{today}_test2/{today}_test3/g' python/llm/dev/benchmark/all-in-one/run.py
+          sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py
           sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_loadlowbit_437.yaml

       - name: Test on igpu for Qwen 1.5 (load_low_bit 1024-128)

@@ -763,14 +657,14 @@ jobs:
       - name: Prepare igpu perf test (int4+fp16 1024-128)
         shell: bash
         run: |
-          sed -i 's/{today}_test3/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py
+          sed -i 's/{today}_test2/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py
           sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16.yaml

       - name: Test on igpu (int4+fp16 1024-128)
         shell: cmd
         run: |
           call conda activate igpu-perf
-          pip install transformers==4.31.0
+          pip install transformers==4.36.2

           set SYCL_CACHE_PERSISTENT=1
           set BIGDL_LLM_XMX_DISABLED=1

@@ -785,33 +679,10 @@ jobs:

           call conda deactivate

-      - name: Prepare igpu perf test for Mistral (int4+fp16 1024-128)
-        shell: bash
-        run: |
-          sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py
-          sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_434.yaml
-
-      - name: Test on igpu for Mistral (int4+fp16 1024-128)
-        shell: cmd
-        run: |
-          call conda activate igpu-perf
-          pip install transformers==4.34.0
-
-          set SYCL_CACHE_PERSISTENT=1
-          set BIGDL_LLM_XMX_DISABLED=1
-
-          cd python\llm\dev\benchmark\all-in-one
-          move ..\..\..\test\benchmark\igpu-perf\1024-128_int4_fp16_434.yaml config.yaml
-          set PYTHONIOENCODING=utf-8
-          python run.py >> %CSV_SAVE_PATH%\1024-128_int4_fp16\log\%LOG_FILE% 2>&1
-          if %ERRORLEVEL% neq 0 (exit /b 1)
-
-          call conda deactivate
-
       - name: Prepare igpu perf test for Qwen 1.5 (int4+fp16 1024-128)
         shell: bash
         run: |
-          sed -i 's/{today}_test2/{today}_test3/g' python/llm/dev/benchmark/all-in-one/run.py
+          sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py
           sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_437.yaml

       - name: Test on igpu for Qwen 1.5 (int4+fp16 1024-128)

.github/workflows/llm_unit_tests.yml (17 changes)

@@ -99,7 +99,7 @@ jobs:
          echo "LLAMA_ORIGIN_PATH=${ORIGIN_DIR}/llama-7b-hf" >> "$GITHUB_ENV"
 | 
					          echo "LLAMA_ORIGIN_PATH=${ORIGIN_DIR}/llama-7b-hf" >> "$GITHUB_ENV"
 | 
				
			||||||
          echo "BLOOM_ORIGIN_PATH=${ORIGIN_DIR}/bloom-7b1" >> "$GITHUB_ENV"
 | 
					          echo "BLOOM_ORIGIN_PATH=${ORIGIN_DIR}/bloom-7b1" >> "$GITHUB_ENV"
 | 
				
			||||||
          echo "ORIGINAL_CHATGLM2_6B_PATH=${ORIGIN_DIR}/chatglm2-6b" >> "$GITHUB_ENV"
 | 
					          echo "ORIGINAL_CHATGLM2_6B_PATH=${ORIGIN_DIR}/chatglm2-6b" >> "$GITHUB_ENV"
 | 
				
			||||||
          echo "ORIGINAL_REPLIT_CODE_PATH=${ORIGIN_DIR}/replit-code-v1-3b" >> "$GITHUB_ENV"
 | 
					          echo "ORIGINAL_CODESHELL_7B_PATH=${ORIGIN_DIR}/CodeShell-7B-Chat" >> "$GITHUB_ENV"
 | 
				
			||||||
          echo "ORIGINAL_WHISPER_TINY_PATH=${ORIGIN_DIR}/whisper-tiny" >> "$GITHUB_ENV"
 | 
					          echo "ORIGINAL_WHISPER_TINY_PATH=${ORIGIN_DIR}/whisper-tiny" >> "$GITHUB_ENV"
 | 
				
			||||||
          echo "MISTRAL_ORIGIN_PATH=${ORIGIN_DIR}/Mistral-7B-v0.1" >> "$GITHUB_ENV"
 | 
					          echo "MISTRAL_ORIGIN_PATH=${ORIGIN_DIR}/Mistral-7B-v0.1" >> "$GITHUB_ENV"
 | 
				
			||||||
          echo "LLAMA2_7B_ORIGIN_PATH=${ORIGIN_DIR}/Llama-2-7b-chat-hf" >> "$GITHUB_ENV"
 | 
					          echo "LLAMA2_7B_ORIGIN_PATH=${ORIGIN_DIR}/Llama-2-7b-chat-hf" >> "$GITHUB_ENV"
 | 
				
			||||||
| 
						 | 
					@ -160,10 +160,10 @@ jobs:
 | 
				
			||||||
            echo "wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/chatglm2-6b -P $ORIGIN_DIR"
 | 
					            echo "wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/chatglm2-6b -P $ORIGIN_DIR"
 | 
				
			||||||
            wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/chatglm2-6b -P $ORIGIN_DIR
 | 
					            wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/chatglm2-6b -P $ORIGIN_DIR
 | 
				
			||||||
          fi
 | 
					          fi
 | 
				
			||||||
          if [ ! -d $ORIGINAL_REPLIT_CODE_PATH ]; then
 | 
					          if [ ! -d $ORIGINAL_CODESHELL_7B_PATH ]; then
 | 
				
			||||||
            echo "Directory $ORIGINAL_REPLIT_CODE_PATH not found. Downloading from FTP server..."
 | 
					            echo "Directory $ORIGINAL_CODESHELL_7B_PATH not found. Downloading from FTP server..."
 | 
				
			||||||
            echo "wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/replit-code-v1-3b -P $ORIGIN_DIR"
 | 
					            echo "wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/CodeShell-7B-Chat -P $ORIGIN_DIR"
 | 
				
			||||||
            wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/replit-code-v1-3b -P $ORIGIN_DIR
 | 
					            wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/CodeShell-7B-Chat -P $ORIGIN_DIR
 | 
				
			||||||
          fi
 | 
					          fi
 | 
				
			||||||
          if [ ! -d $ORIGINAL_WHISPER_TINY_PATH ]; then
 | 
					          if [ ! -d $ORIGINAL_WHISPER_TINY_PATH ]; then
 | 
				
			||||||
            echo "Directory $ORIGINAL_WHISPER_TINY_PATH not found. Downloading from FTP server..."
 | 
					            echo "Directory $ORIGINAL_WHISPER_TINY_PATH not found. Downloading from FTP server..."
 | 
				
			||||||
| 
						 | 
					@ -226,7 +226,7 @@ jobs:
 | 
				
			||||||
        shell: bash
 | 
					        shell: bash
 | 
				
			||||||
        run: |
 | 
					        run: |
 | 
				
			||||||
          pip install llama-index-readers-file llama-index-vector-stores-postgres llama-index-embeddings-huggingface
 | 
					          pip install llama-index-readers-file llama-index-vector-stores-postgres llama-index-embeddings-huggingface
 | 
				
			||||||
          pip install transformers==4.36.0
 | 
					          pip install transformers==4.36.2
 | 
				
			||||||
          pip install "pydantic>=2.0.0"
 | 
					          pip install "pydantic>=2.0.0"
 | 
				
			||||||
          bash python/llm/test/run-llm-llamaindex-tests.sh
 | 
					          bash python/llm/test/run-llm-llamaindex-tests.sh
 | 
				
			||||||
      - name: Run sentence-transformers uninstallation
 | 
					      - name: Run sentence-transformers uninstallation
 | 
				
			||||||
| 
						 | 
					@ -234,6 +234,7 @@ jobs:
 | 
				
			||||||
        shell: bash
 | 
					        shell: bash
 | 
				
			||||||
        run: |
 | 
					        run: |
 | 
				
			||||||
          pip uninstall sentence-transformers -y || true
 | 
					          pip uninstall sentence-transformers -y || true
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  llm-unit-test-on-arc:
 | 
					  llm-unit-test-on-arc:
 | 
				
			||||||
    needs: [setup-python-version, llm-cpp-build]
 | 
					    needs: [setup-python-version, llm-cpp-build]
 | 
				
			||||||
    strategy:
 | 
					    strategy:
 | 
				
			||||||
| 
						 | 
					@ -364,8 +365,6 @@ jobs:
 | 
				
			||||||
          fi
 | 
					          fi
 | 
				
			||||||
          python -m pip install datasets librosa soundfile einops tiktoken transformers_stream_generator
 | 
					          python -m pip install datasets librosa soundfile einops tiktoken transformers_stream_generator
 | 
				
			||||||
          bash python/llm/test/run-llm-inference-tests-gpu.sh
 | 
					          bash python/llm/test/run-llm-inference-tests-gpu.sh
 | 
				
			||||||
          python -m pip install transformers==4.34.0 
 | 
					 | 
				
			||||||
          bash python/llm/test/run-llm-inference-tests-gpu-434.sh
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
      - name: Run LLM example tests
 | 
					      - name: Run LLM example tests
 | 
				
			||||||
        shell: bash
 | 
					        shell: bash
 | 
				
			||||||
| 
						 | 
					@ -428,7 +427,7 @@ jobs:
 | 
				
			||||||
            pip install --pre --upgrade ipex-llm[xpu_2.0] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/
 | 
					            pip install --pre --upgrade ipex-llm[xpu_2.0] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/
 | 
				
			||||||
            source /home/arda/intel/oneapi/setvars.sh
 | 
					            source /home/arda/intel/oneapi/setvars.sh
 | 
				
			||||||
          fi
 | 
					          fi
 | 
				
			||||||
          pip install transformers==4.36.0
 | 
					          pip install transformers==4.36.2
 | 
				
			||||||
          pip install "pydantic>=2.0.0"
 | 
					          pip install "pydantic>=2.0.0"
 | 
				
			||||||
          bash python/llm/test/run-llm-llamaindex-tests-gpu.sh
 | 
					          bash python/llm/test/run-llm-llamaindex-tests-gpu.sh
 | 
				
			||||||
      - name: Run sentence-transformers uninstallation
 | 
					      - name: Run sentence-transformers uninstallation
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -53,7 +53,7 @@ libs_dir = os.path.join(llm_home, "ipex_llm", "libs")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 cpu_torch_version = ["torch==2.1.2+cpu;platform_system=='Linux'", "torch==2.1.2;platform_system=='Windows'"]
 CONVERT_DEP = ['numpy == 1.26.4', # lastet 2.0.0b1 will cause error
-               'transformers == 4.31.0', 'sentencepiece', 'tokenizers == 0.13.3',
+               'transformers == 4.36.2', 'sentencepiece', 'tokenizers == 0.15.2',
                # TODO: Support accelerate 0.22.0
                'accelerate == 0.21.0', 'tabulate'] + cpu_torch_version

@@ -279,11 +279,10 @@ def setup_package():

     # Add internal requires for llama-index
     llama_index_requires = copy.deepcopy(all_requires)
-    for exclude_require in ['transformers == 4.31.0', 'tokenizers == 0.13.3'] + cpu_torch_version:
+    for exclude_require in cpu_torch_version:
         llama_index_requires.remove(exclude_require)
     llama_index_requires += ["setuptools<70.0.0"]
     llama_index_requires += ["torch<2.2.0",
-                             "transformers>=4.34.0,<4.39.0",
                              "sentence-transformers~=2.6.1"]

@@ -47,7 +47,8 @@ def _save_low_bit(self, save_dir, *args, **kwargs):
     if isinstance(self, PreTrainedModel):
         # We borrowed this method to adapt to Transformer model cases
         # as much as possible, and later we may merge these two situations
-        self.save_pretrained(save_dir)
+        kwargs['safe_serialization'] = False
+        self.save_pretrained(save_dir, *args, **kwargs)
     else:
         # TODO: For the lowbit model still larger than 8GB,
         #       save it into shards.
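
For reference, a minimal sketch of how this save path is typically exercised; the model id and output directory below are placeholders, not values taken from this commit. Forcing safe_serialization to False makes save_pretrained write classic pytorch_model.bin files instead of .safetensors.

# Hypothetical usage sketch (placeholder model id and paths).
from ipex_llm.transformers import AutoModelForCausalLM

# Load and quantize a model to 4-bit on the fly.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-chat-hf",   # placeholder model id
    load_in_4bit=True,
    trust_remote_code=True,
)

# save_low_bit() is expected to route through the patched _save_low_bit above,
# which now calls self.save_pretrained(save_dir, safe_serialization=False).
model.save_low_bit("./llama2-7b-chat-low-bit")

# The low-bit checkpoint can later be reloaded without re-quantizing.
model = AutoModelForCausalLM.load_low_bit(
    "./llama2-7b-chat-low-bit",
    trust_remote_code=True,
)
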
@@ -10,13 +10,14 @@ repo_id:
   - 'databricks/dolly-v1-6b'
   - 'databricks/dolly-v2-7b'
   - 'databricks/dolly-v2-12b'
-  - 'internlm/internlm-chat-7b-8k'
+  - 'internlm/internlm-chat-7b'
   - 'Qwen/Qwen-7B-Chat'
   - 'BAAI/AquilaChat-7B'
   - 'baichuan-inc/Baichuan2-7B-Chat'
   - 'baichuan-inc/Baichuan2-13B-Chat-4bit'
   - 'bigscience/bloomz-7b1'
-  - 'fnlp/moss-moon-003-sft-4bit'
+#  - 'fnlp/moss-moon-003-sft-4bit' # moss-moon-003-sft cannot work on transformers 4.34+
+  - 'mistralai/Mistral-7B-v0.1'
 local_model_hub: '/mnt/disk1/models'
 warm_up: 1
 num_trials: 3

@@ -31,7 +32,7 @@ test_api:
   - "transformer_int4_gpu"  # on Intel GPU
 cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api)
 exclude:
-  - 'fnlp/moss-moon-003-sft-4bit:1024'
-  - 'fnlp/moss-moon-003-sft-4bit:2048'
+#  - 'fnlp/moss-moon-003-sft-4bit:1024'
+#  - 'fnlp/moss-moon-003-sft-4bit:2048'
   - 'baichuan-inc/Baichuan2-13B-Chat-4bit:2048'
   - 'bigscience/bloomz-7b1:2048'

@@ -1,16 +0,0 @@
-# For the models that require transformers 4.34.0
-repo_id:
-  - 'mistralai/Mistral-7B-v0.1'
-local_model_hub: '/mnt/disk1/models'
-warm_up: 1
-num_trials: 3
-num_beams: 1 # default to greedy search
-low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
-batch_size: 1 # default to 1
-in_out_pairs:
-  - '32-32'
-  - '1024-128'
-  - '2048-256'
-test_api:
-  - "transformer_int4_gpu"  # on Intel GPU
-cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api)

@@ -12,10 +12,11 @@ repo_id:
   - 'WisdomShell/CodeShell-7B-Chat'
   - 'tiiuae/falcon-7b-instruct-with-patch'
   - 'mosaicml/mpt-7b-chat'
-  - 'liuhaotian/llava-v1.5-7b'
+#  - 'liuhaotian/llava-v1.5-7b' # Cannot load using AutoModelForCausalLM in 4.36+
   - 'RWKV/rwkv-4-world-7b'
   - 'RWKV/rwkv-5-world-7b'
   - 'IEITYuan/Yuan2-2B-hf'
+  - 'mistralai/Mistral-7B-Instruct-v0.1'
 local_model_hub: 'path to your local model hub'
 warm_up: 1
 num_trials: 3

@@ -1,13 +0,0 @@
-repo_id:
-  - 'mistralai/Mistral-7B-Instruct-v0.1'
-local_model_hub: 'path to your local model hub'
-warm_up: 1
-num_trials: 3
-num_beams: 1 # default to greedy search
-low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
-batch_size: 1 # default to 1
-in_out_pairs:
-  - '1024-128'
-test_api:
-  - "transformer_int4_gpu_win" # on Intel GPU for Windows (catch GPU peak memory)
-cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api)

@@ -12,10 +12,11 @@ repo_id:
   - 'WisdomShell/CodeShell-7B-Chat'
   - 'tiiuae/falcon-7b-instruct-with-patch'
   - 'mosaicml/mpt-7b-chat'
-  - 'liuhaotian/llava-v1.5-7b'
+#  - 'liuhaotian/llava-v1.5-7b' # Cannot load using AutoModelForCausalLM in 4.36+
   # - 'RWKV/rwkv-4-world-7b'
   # - 'RWKV/rwkv-5-world-7b'
   - 'IEITYuan/Yuan2-2B-hf'
+  - 'mistralai/Mistral-7B-Instruct-v0.1'
 local_model_hub: 'path to your local model hub'
 warm_up: 1
 num_trials: 3

@@ -1,13 +0,0 @@
-repo_id:
-  - 'mistralai/Mistral-7B-Instruct-v0.1'
-local_model_hub: 'path to your local model hub'
-warm_up: 1
-num_trials: 3
-num_beams: 1 # default to greedy search
-low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
-batch_size: 1 # default to 1
-in_out_pairs:
-  - '1024-128'
-test_api:
-  - "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows, use fp16 for non-linear layer
-cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api)

@@ -12,10 +12,11 @@ repo_id:
   - 'WisdomShell/CodeShell-7B-Chat'
   - 'tiiuae/falcon-7b-instruct-with-patch'
   - 'mosaicml/mpt-7b-chat'
-  - 'liuhaotian/llava-v1.5-7b'
+#  - 'liuhaotian/llava-v1.5-7b' # Cannot load using AutoModelForCausalLM in 4.36+
   - 'RWKV/rwkv-4-world-7b'
   - 'RWKV/rwkv-5-world-7b'
   - 'IEITYuan/Yuan2-2B-hf'
+  - 'mistralai/Mistral-7B-Instruct-v0.1'
 local_model_hub: 'path to your local model hub'
 warm_up: 1
 num_trials: 3

@@ -1,13 +0,0 @@
-repo_id:
-  - 'mistralai/Mistral-7B-Instruct-v0.1'
-local_model_hub: 'path to your local model hub'
-warm_up: 1
-num_trials: 3
-num_beams: 1 # default to greedy search
-low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
-batch_size: 1 # default to 1
-in_out_pairs:
-  - '1024-128'
-test_api:
-  - "transformer_int4_loadlowbit_gpu_win" # on Intel GPU for Windows (catch GPU peak memory)
-cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api)

@@ -12,10 +12,11 @@ repo_id:
   - 'WisdomShell/CodeShell-7B-Chat'
   - 'tiiuae/falcon-7b-instruct-with-patch'
   - 'mosaicml/mpt-7b-chat'
-  - 'liuhaotian/llava-v1.5-7b'
+#  - 'liuhaotian/llava-v1.5-7b' # Cannot load using AutoModelForCausalLM in 4.36+
   - 'RWKV/rwkv-4-world-7b'
   - 'RWKV/rwkv-5-world-7b'
   - 'IEITYuan/Yuan2-2B-hf'
+  - 'mistralai/Mistral-7B-Instruct-v0.1'
 local_model_hub: 'path to your local model hub'
 warm_up: 1
 num_trials: 3

@@ -1,13 +0,0 @@
-repo_id:
-  - 'mistralai/Mistral-7B-Instruct-v0.1'
-local_model_hub: 'path to your local model hub'
-warm_up: 1
-num_trials: 3
-num_beams: 1 # default to greedy search
-low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
-batch_size: 1 # default to 1
-in_out_pairs:
-  - '2048-256'
-test_api:
-  - "transformer_int4_gpu_win" # on Intel GPU for Windows (catch GPU peak memory)
-cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api)

@@ -12,10 +12,11 @@ repo_id:
   - 'WisdomShell/CodeShell-7B-Chat'
   - 'tiiuae/falcon-7b-instruct-with-patch'
   - 'mosaicml/mpt-7b-chat'
-  - 'liuhaotian/llava-v1.5-7b'
+#  - 'liuhaotian/llava-v1.5-7b' # Cannot load using AutoModelForCausalLM in 4.36+
   - 'RWKV/rwkv-4-world-7b'
   - 'RWKV/rwkv-5-world-7b'
   - 'IEITYuan/Yuan2-2B-hf'
+  - 'mistralai/Mistral-7B-Instruct-v0.1'
 local_model_hub: 'path to your local model hub'
 warm_up: 3
 num_trials: 5

@@ -1,13 +0,0 @@
-repo_id:
-  - 'mistralai/Mistral-7B-Instruct-v0.1'
-local_model_hub: 'path to your local model hub'
-warm_up: 3
-num_trials: 5
-num_beams: 1 # default to greedy search
-low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
-batch_size: 1 # default to 1
-in_out_pairs:
-  - '32-32'
-test_api:
-  - "transformer_int4_gpu_win" # on Intel GPU for Windows (catch GPU peak memory)
-cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api)

@@ -6,7 +6,7 @@ repo_id:
   - 'baichuan-inc/Baichuan2-7B-Chat'
   - 'baichuan-inc/Baichuan2-13B-Chat'
   - 'Qwen/Qwen-14B-Chat'
-local_model_hub: '/models'
+local_model_hub: '/mnt/disk1/models'
 warm_up: 1
 num_trials: 3
 num_beams: 1 # default to greedy search

@@ -6,7 +6,7 @@ repo_id:
   - 'baichuan-inc/Baichuan2-7B-Chat'
   - 'baichuan-inc/Baichuan2-13B-Chat'
   - 'Qwen/Qwen-14B-Chat'
-local_model_hub: '/models'
+local_model_hub: '/mnt/disk1/models'
 warm_up: 3
 num_trials: 50
 num_beams: 1 # default to greedy search

@@ -53,7 +53,7 @@ class TestTransformersAPI(unittest.TestCase):
         self.assertTrue(res)

     def test_transformers_auto_model_for_causal_lm_int4(self):
-        model_path = os.environ.get('ORIGINAL_REPLIT_CODE_PATH')
+        model_path = os.environ.get('ORIGINAL_CODESHELL_7B_PATH')
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
         input_str = 'def hello():\n  print("hello world")\n'
         model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, load_in_4bit=True)
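
For reference, a rough sketch of the generation flow this test case exercises; everything beyond the lines visible in the hunk is an assumption about the rest of the test body, not taken from this commit.

# Assumed continuation of the test (placeholder logic): encode the prompt,
# run INT4 generation, and check that some output was produced.
import os
from transformers import AutoTokenizer
from ipex_llm.transformers import AutoModelForCausalLM

model_path = os.environ.get('ORIGINAL_CODESHELL_7B_PATH')
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True,
                                             load_in_4bit=True)

input_str = 'def hello():\n  print("hello world")\n'
input_ids = tokenizer.encode(input_str, return_tensors="pt")
output_ids = model.generate(input_ids, max_new_tokens=32)
output_str = tokenizer.decode(output_ids[0], skip_special_tokens=True)
assert len(output_str) > 0
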
@@ -116,6 +116,7 @@ class TestTransformersAPI(unittest.TestCase):
     ])
 @pytest.mark.parametrize('Model, Tokenizer, model_path',[
     (AutoModel, AutoTokenizer, os.environ.get('ORIGINAL_CHATGLM2_6B_PATH')),
+    (AutoModelForCausalLM, AutoTokenizer, os.environ.get('MISTRAL_ORIGIN_PATH')),
     ])
 def test_load_low_bit_completion(Model, Tokenizer, model_path, prompt, answer):
     tokenizer = Tokenizer.from_pretrained(model_path, trust_remote_code=True)

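For context, the save/load round trip that the newly added Mistral case goes through in test_load_low_bit_completion is roughly the following. This is an illustrative standalone sketch, not part of the patch; it assumes MISTRAL_ORIGIN_PATH points at a local Mistral checkpoint, and the prompt/answer pair mirrors the parametrized values that appear elsewhere in this diff.

import os
import tempfile

import torch
from transformers import AutoTokenizer

from ipex_llm.transformers import AutoModelForCausalLM

# Illustrative sketch of the low-bit save/load round trip exercised by the test.
model_path = os.environ.get('MISTRAL_ORIGIN_PATH')
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             load_in_4bit=True,
                                             optimize_model=True,
                                             trust_remote_code=True)

with tempfile.TemporaryDirectory() as tempdir:
    model.save_low_bit(tempdir)                       # persist the int4 weights
    loaded_model = AutoModelForCausalLM.load_low_bit(tempdir,
                                                     optimize_model=True,
                                                     trust_remote_code=True)
    with torch.inference_mode():
        input_ids = tokenizer.encode('What is the capital of France?\n\n',
                                     return_tensors="pt")
        output = loaded_model.generate(input_ids, max_new_tokens=32)
        # the test asserts that "Paris" appears in the decoded completion
        print(tokenizer.decode(output[0], skip_special_tokens=True))
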
@@ -143,7 +144,8 @@ prompt = "Once upon a time, there existed a little girl who liked to have advent
     (AutoModelForCausalLM, LlamaTokenizer, os.environ.get('LLAMA_ORIGIN_PATH'), prompt),
     (AutoModelForCausalLM, AutoTokenizer, os.environ.get('BLOOM_ORIGIN_PATH'), prompt),
     (AutoModel, AutoTokenizer, os.environ.get('ORIGINAL_CHATGLM2_6B_PATH'), prompt),
-    (AutoModelForCausalLM, AutoTokenizer, os.environ.get('ORIGINAL_REPLIT_CODE_PATH'), prompt)
+    (AutoModelForCausalLM, AutoTokenizer, os.environ.get('ORIGINAL_CODESHELL_7B_PATH'), prompt),
+    (AutoModelForCausalLM, AutoTokenizer, os.environ.get('MISTRAL_ORIGIN_PATH'), prompt)
 ])

 def test_optimize_model(Model, Tokenizer, model_path, prompt):

@@ -1,80 +0,0 @@
-#
-# Copyright 2016 The BigDL Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import os
-import pytest
-import tempfile
-import torch
-
-from ipex_llm.transformers import AutoModelForCausalLM
-from transformers import AutoTokenizer
-
-
-mistral_model_path = os.environ.get('MISTRAL_ORIGIN_PATH')
-
-prompt = "Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun"
-
-@pytest.mark.parametrize("Model, Tokenizer, model_path, prompt", [
-    (AutoModelForCausalLM, AutoTokenizer, mistral_model_path, prompt)
-])
-
-def test_optimize_model(Model, Tokenizer, model_path, prompt):
-    tokenizer = Tokenizer.from_pretrained(model_path, trust_remote_code=True)
-    input_ids = tokenizer.encode(prompt, return_tensors="pt")
-
-    model = Model.from_pretrained(model_path,
-                                load_in_4bit=True,
-                                optimize_model=False,
-                                trust_remote_code=True)
-    logits_base_model = (model(input_ids)).logits
-
-    model = Model.from_pretrained(model_path,
-                                load_in_4bit=True,
-                                optimize_model=True,
-                                trust_remote_code=True)
-    logits_optimized_model = (model(input_ids)).logits
-    diff = abs(logits_base_model - logits_optimized_model).flatten()
-
-    assert any(diff) is False
-
-@pytest.mark.parametrize('prompt, answer', [
-    ('What is the capital of France?\n\n', 'Paris')
-    ])
-@pytest.mark.parametrize('Model, Tokenizer, model_path',[
-    (AutoModelForCausalLM, AutoTokenizer, mistral_model_path),
-    ])
-def test_load_low_bit_completion(Model, Tokenizer, model_path, prompt, answer):
-    tokenizer = Tokenizer.from_pretrained(model_path, trust_remote_code=True)
-    model = Model.from_pretrained(model_path,
-                                  load_in_4bit=True,
-                                  optimize_model=True,
-                                  trust_remote_code=True)
-
-    with tempfile.TemporaryDirectory() as tempdir:
-        model.save_low_bit(tempdir)
-        loaded_model = Model.load_low_bit(tempdir,
-                                          optimize_model=True,
-                                          trust_remote_code=True)
-
-        with torch.inference_mode():
-            input_ids = tokenizer.encode(prompt, return_tensors="pt")
-            output = loaded_model.generate(input_ids, max_new_tokens=32)
-            output_str = tokenizer.decode(output[0], skip_special_tokens=True)
-
-            assert answer in output_str
-
-if __name__ == '__main__':
-    pytest.main([__file__])

@@ -104,8 +104,8 @@ class Test_Optimize_Gpu_Model:
                     if isinstance(t1, torch.Tensor) and isinstance(t2, torch.Tensor):
                         # 'attn_output' is of type torch.Tensor.
                         attn_output_diff.append(t1 - t2)
-                    else:
-                        # 'past_key_value'is of type tuple as default.
+                    elif isinstance(t1, tuple) and isinstance(t2, tuple):
+                        # if 'past_key_value'is of type tuple
                         for i, (t3, t4) in enumerate(zip(t1, t2)):
                             if model.config.architectures[0] == "ChatGLMModel" and \
                                     hasattr(model.config, 'padded_vocab_size') and \

@@ -114,6 +114,10 @@ class Test_Optimize_Gpu_Model:
                                 # We need to narrow it here.
                                 t4 = t4[:, :, 15:17, :]
                             attn_output_diff.append(t3 - t4)
+                    else:
+                        # if 'past_key_value'is of type Cache, get last layer cache pair (key, value)
+                        attn_output_diff.append(t1[-1][0] - t2[-1][0])
+                        attn_output_diff.append(t1[-1][1] - t2[-1][1])

             max_diff_tensor = [torch.max(item).item() for item in attn_output_diff]
             print(max_diff_tensor)

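Note on the new else branch above: with transformers 4.36, decoder layers may hand back past_key_value as a Cache object rather than the legacy tuple of tuples, which is why the test indexes t1[-1] to get the last layer's (key, value) pair. A minimal, hypothetical sketch of normalizing the two layouts before comparison (the helper names are invented here and are not part of this patch; DynamicCache.to_legacy_cache() is the transformers 4.36 API):

import torch
from transformers.cache_utils import DynamicCache  # introduced in transformers 4.36

def last_layer_kv(past_key_value):
    # Return the last decoder layer's (key, value) tensors for either the
    # legacy tuple-of-tuples layout or the 4.36+ Cache object.
    if isinstance(past_key_value, DynamicCache):
        past_key_value = past_key_value.to_legacy_cache()  # ((key, value), ...) per layer
    return past_key_value[-1]

def max_kv_diff(pkv_base, pkv_opt):
    # Largest absolute element-wise difference between the two caches' last layers.
    (k1, v1), (k2, v2) = last_layer_kv(pkv_base), last_layer_kv(pkv_opt)
    return max(torch.max(torch.abs(k1 - k2)).item(),
               torch.max(torch.abs(v1 - v2)).item())

The MLP hunk below adds the same branch, so the same normalization would apply there.
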
@@ -96,9 +96,14 @@ class Test_Optimize_Gpu_Model:
             for i, (t1, t2) in enumerate(zip(layer_tensor, opt_layer_tensor)):
                 if isinstance(t1, torch.Tensor) and isinstance(t2, torch.Tensor):
                     MLP_output_diff.append(t1 - t2)
-                else:
+                elif isinstance(t1, tuple) and isinstance(t2, tuple):
+                    # if 'past_key_value'is of type tuple
                     for i, (t3, t4) in enumerate(zip(t1, t2)):
                         MLP_output_diff.append(t3 - t4)
+                else:
+                    # if 'past_key_value'is of type Cache, get last layer cache pair (key, value)
+                    MLP_output_diff.append(t1[-1][0] - t2[-1][0])
+                    MLP_output_diff.append(t1[-1][1] - t2[-1][1])

             max_diff_tensor = [torch.max(item).item() for item in MLP_output_diff]
             print(max_diff_tensor)

@@ -38,7 +38,7 @@ import os
 class Test_Langchain_Transformers_API(TestCase):
     def setUp(self):
         self.auto_model_path = os.environ.get('ORIGINAL_CHATGLM2_6B_PATH')
-        self.auto_causal_model_path = os.environ.get('ORIGINAL_REPLIT_CODE_PATH')
+        self.auto_causal_model_path = os.environ.get('ORIGINAL_CODESHELL_7B_PATH')
         self.llama_model_path = os.environ.get('LLAMA_ORIGIN_PATH')
         self.bloom_model_path = os.environ.get('BLOOM_ORIGIN_PATH')
         thread_num = os.environ.get('THREAD_NUM')

@@ -1,30 +0,0 @@
-#!/bin/bash
-
-export ANALYTICS_ZOO_ROOT=${ANALYTICS_ZOO_ROOT}
-export LLM_HOME=${ANALYTICS_ZOO_ROOT}/python/llm/src
-export LLM_INFERENCE_TEST_DIR=${ANALYTICS_ZOO_ROOT}/python/llm/test/inference_gpu
-
-export USE_XETLA=OFF
-export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
-export DEVICE='xpu'
-
-set -e
-
-echo "# Start testing inference"
-start=$(date "+%s")
-
-# if [ -z "$THREAD_NUM" ]; then
-#   THREAD_NUM=2
-# fi
-# export OMP_NUM_THREADS=$THREAD_NUM
-export BIGDL_LLM_XMX_DISABLED=1
-pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api_attention.py -v -s -k "Mistral"
-pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api_mlp.py -v -s -k "Mistral"
-pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api_RMSNorm.py -v -s -k "Mistral"
-unset BIGDL_LLM_XMX_DISABLED
-
-now=$(date "+%s")
-time=$((now-start))
-
-echo "Bigdl-llm gpu inference tests for transformers 4.34.0 finished"
-echo "Time used:$time seconds"

@@ -21,9 +21,9 @@ pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api.py -v -s
 pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api_layernorm.py -v -s
 export BIGDL_LLM_XMX_DISABLED=1
 pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api_final_logits.py -v -s
-pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api_attention.py -v -s -k "not Mistral"
-pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api_mlp.py -v -s -k "not Mistral"
-pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api_RMSNorm.py -v -s -k "not Mistral"
+pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api_attention.py -v -s
+pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api_mlp.py -v -s
+pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api_RMSNorm.py -v -s
 unset BIGDL_LLM_XMX_DISABLED

 now=$(date "+%s")

@@ -18,10 +18,6 @@ export OMP_NUM_THREADS=$THREAD_NUM
 python -m pytest -s ${LLM_INFERENCE_TEST_DIR}/test_transformers_api.py -v
 python -m pytest -s ${LLM_INFERENCE_TEST_DIR}/test_optimize_model_api.py -v

-python -m pip install transformers==4.34.0
-python -m pytest -s ${LLM_INFERENCE_TEST_DIR}/test_transformesr_api_434.py -v
-python -m pip install transformers==4.31.0
-
 now=$(date "+%s")
 time=$((now-start))
