[LLM] Add perf test on xpu for bigdl-llm (#8866)
* add xpu latency job
* update install way
* remove duplicated workflow
* add perf upload
parent 95271f10e0
commit 704a896e90
9 changed files with 5323 additions and 226 deletions
.github/actions/llm/setup-llm-env/action.yml (vendored, 17 changed lines)

@@ -1,6 +1,10 @@
 name: "Setup BigDL-LLM Env"
 description: "BigDL-LLM installation"
 
+inputs:
+  extra-dependency:
+    description: "Name of extra dependencies filled in brackets"
+    required: false
+    default: "all"
 runs:
   using: "composite"
   steps:

@@ -17,6 +21,11 @@ runs:
           exit 1
         fi
         whl_name=$(ls python/llm/dist)
-        pip install -i https://pypi.python.org/simple --force-reinstall "python/llm/dist/${whl_name}[all]"
-        pip install pytest
-        bash python/llm/test/run-llm-install-tests.sh
+        if [[ ${{ inputs.extra-dependency }} == 'xpu' ]]; then
+          pip install -i https://pypi.python.org/simple --force-reinstall "python/llm/dist/${whl_name}[xpu]" -f https://developer.intel.com/ipex-whl-stable-xpu
+          pip install pytest datasets librosa soundfile
+        else
+          pip install -i https://pypi.python.org/simple --force-reinstall "python/llm/dist/${whl_name}[all]"
+          pip install pytest
+          bash python/llm/test/run-llm-install-tests.sh
+        fi
.github/workflows/llm-nightly-test-windows.yml (vendored, 126 deleted lines)

@@ -1,126 +0,0 @@
name: LLM Nightly Tests on Windows

# Cancel previous runs in the PR when you push new commits
concurrency:
  group: ${{ github.workflow }}-llm-nightly-test-win-${{ github.event.pull_request.number || github.run_id }}
  cancel-in-progress: true

# Controls when the action will run.
on:
  # Triggers the workflow on push or pull request events but only for the main branch
  push:
    branches: [main]
    paths:
      - ".github/workflows/llm-nightly-test-windows.yml"
  pull_request:
    branches: [main]
    paths:
      - ".github/workflows/llm-nightly-test-windows.yml"
  workflow_dispatch:
  workflow_call:

jobs:
  llm-cpp-build:
    uses: ./.github/workflows/llm-binary-build.yml
  llm-nightly-test-windows:
    runs-on: ["self-hosted", "Windows"]
    needs: llm-cpp-build
    steps:
      - name: Set model directories
        shell: bash
        run: |
          echo "ORIGIN_DIR=${{ github.workspace }}/../llm/origin-models" >> "$GITHUB_ENV"
          echo "INT4_CKPT_DIR=${{ github.workspace }}/../llm/converted-models" >> "$GITHUB_ENV"
      - name: Create model directories
        shell: bash
        run: |
          if [ ! -d $ORIGIN_DIR ]; then
            mkdir -p $ORIGIN_DIR
          fi
          if [ ! -d $INT4_CKPT_DIR ]; then
            mkdir -p $INT4_CKPT_DIR
          fi
      - name: Set environment variables
        shell: bash
        run: |
          echo "LLAMA_ORIGIN_PATH=${ORIGIN_DIR}/gpt4all-7b-hf" >> "$GITHUB_ENV"
          echo "GPTNEOX_ORIGIN_PATH=${ORIGIN_DIR}/gptneox-7b-redpajama-bf16" >> "$GITHUB_ENV"
          echo "BLOOM_ORIGIN_PATH=${ORIGIN_DIR}/bloomz-7b1" >> "$GITHUB_ENV"
          echo "STARCODER_ORIGIN_PATH=${ORIGIN_DIR}/gpt_bigcode-santacoder" >> "$GITHUB_ENV"

          echo "LLAMA_INT4_CKPT_PATH=${INT4_CKPT_DIR}/bigdl_llm_llama_q4_0.bin" >> "$GITHUB_ENV"
          echo "GPTNEOX_INT4_CKPT_PATH=${INT4_CKPT_DIR}/bigdl_llm_gptneox_q4_0.bin" >> "$GITHUB_ENV"
          echo "BLOOM_INT4_CKPT_PATH=${INT4_CKPT_DIR}/bigdl_llm_bloom_q4_0.bin" >> "$GITHUB_ENV"
          echo "STARCODER_INT4_CKPT_PATH=${INT4_CKPT_DIR}/bigdl_llm_starcoder_q4_0.bin" >> "$GITHUB_ENV"

      - uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: "3.9"
      - name: Install dependencies
        shell: bash
        run: |
          python -m pip install --upgrade pip
          python -m pip install --upgrade setuptools==58.0.4
          python -m pip install --upgrade wheel
      - name: Download llm binary
        uses: ./.github/actions/llm/download-llm-binary
      - name: Install BigDL-llm
        shell: bash
        run: |
          pip install requests
          bash python/llm/dev/release_default_windows.sh default false
          whl_name=$(ls python/llm/dist)
          pip install -i https://pypi.python.org/simple --force-reinstall "python/llm/dist/${whl_name}[all]"
          pip install pytest
      - name: Download origin models
        shell: bash
        run: |
          if [ ! -d $LLAMA_ORIGIN_PATH ]; then
            echo "Directory $LLAMA_ORIGIN_PATH not found. Downloading from FTP server..."
            wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/gpt4all-7b-hf -P $ORIGIN_DIR
          fi
          if [ ! -d $GPTNEOX_ORIGIN_PATH ]; then
            echo "Directory $GPTNEOX_ORIGIN_PATH not found. Downloading from FTP server..."
            wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/gptneox-7b-redpajama-bf16 -P $ORIGIN_DIR
          fi
          if [ ! -d $BLOOM_ORIGIN_PATH ]; then
            echo "Directory $BLOOM_ORIGIN_PATH not found. Downloading from FTP server..."
            wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/bloomz-7b1 -P $ORIGIN_DIR
          fi
          if [ ! -d $STARCODER_ORIGIN_PATH ]; then
            echo "Directory $STARCODER_ORIGIN_PATH not found. Downloading from FTP server..."
            wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/gpt_bigcode-santacoder -P $ORIGIN_DIR
          fi

          # if [ ! -e $LLAMA_INT4_CKPT_PATH ]; then
          #   echo "Directory $LLAMA_INT4_CKPT_PATH not found. Downloading from FTP server..."
          #   wget --no-verbose $LLM_FTP_URL/${LLAMA_INT4_CKPT_PATH:2} -P $INT4_CKPT_DIR
          # fi
          # if [ ! -e $GPTNEOX_INT4_CKPT_PATH ]; then
          #   echo "Directory $GPTNEOX_INT4_CKPT_PATH not found. Downloading from FTP server..."
          #   wget --no-verbose $LLM_FTP_URL/${GPTNEOX_INT4_CKPT_PATH:2} -P $INT4_CKPT_DIR
          # fi
          # if [ ! -e $BLOOM_INT4_CKPT_PATH ]; then
          #   echo "Directory $BLOOM_INT4_CKPT_PATH not found. Downloading from FTP server..."
          #   wget --no-verbose $LLM_FTP_URL/${BLOOM_INT4_CKPT_PATH:2} -P $INT4_CKPT_DIR
          # fi
          # if [ ! -e $STARCODER_INT4_CKPT_PATH ]; then
          #   echo "Directory $STARCODER_INT4_CKPT_PATH not found. Downloading from FTP server..."
          #   wget --no-verbose $LLM_FTP_URL/${STARCODER_INT4_CKPT_PATH:2} -P $INT4_CKPT_DIR
          # fi
      - name: Test converting models
        shell: bash
        run: |
          echo "Running the convert models tests..."
          python -m pytest -s python/llm/test/convert/test_convert_model.py
      - name: Test model inference
        shell: bash
        run: |
          echo "Running the inference models tests..."
          python -m pytest -s python/llm/test/inference/test_call_models.py
      - name: Clean up environment
        shell: bash
        run: |
          pip uninstall bigdl-llm -y
.github/workflows/llm_performance_tests.yml (vendored, 89 changed lines)

@@ -8,15 +8,16 @@ concurrency:
 # Controls when the action will run.
 on:
   schedule:
-    - cron: '00 13 * * *' # GMT time, 13:00 GMT == 21:00 China
+    - cron: "00 13 * * *" # GMT time, 13:00 GMT == 21:00 China
   pull_request:
-    branches: [ main ]
+    branches: [main]
     paths:
-      - '.github/workflows/llm_performance_tests.yml'
-      - '.github/workflows/llm-binary-build.yml'
-      - '.github/actions/llm/setup-llm-env/action.yml'
-      - '.github/actions/llm/remove-llm-env/action.yml'
-      - '.github/actions/llm/download-llm-binary/action.yml'
+      - ".github/workflows/llm_performance_tests.yml"
+      - ".github/workflows/llm-binary-build.yml"
+      - ".github/actions/llm/setup-llm-env/action.yml"
+      - ".github/actions/llm/remove-llm-env/action.yml"
+      - ".github/actions/llm/download-llm-binary/action.yml"
+      - "python/llm/test/benchmark/**"
   workflow_dispatch:
   workflow_call:
 
@@ -31,7 +32,7 @@ jobs:
       matrix:
         python-version: ["3.9"]
         instruction: ["AVX512"]
-    runs-on: [ self-hosted, llm, perf ]
+    runs-on: [self-hosted, llm, perf]
     env:
       THREAD_NUM: 24
     steps:

@@ -57,10 +58,78 @@ jobs:
       - name: Run LLM Performance test
         env:
           ANALYTICS_ZOO_ROOT: ${{ github.workspace }}
-        run:
-          bash python/llm/dev/benchmark/run-benchmark-tests.sh
+        run: bash python/llm/dev/benchmark/run-benchmark-tests.sh
 
       # - name: Clean up test environment
       #   uses: ./.github/actions/llm/remove-llm-env
       #   env:
       #     ANALYTICS_ZOO_ROOT: ${{ github.workspace }}
+  llm-performance-test-on-arc:
+    needs: llm-cpp-build
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ["3.9"]
+    runs-on: [self-hosted, llm, perf]
+    env:
+      OMP_NUM_THREADS: 16
+      THREAD_NUM: 16
+      ANALYTICS_ZOO_ROOT: ${{ github.workspace }}
+    steps:
+      - name: Set model directories
+        shell: bash
+        run: |
+          echo "ORIGIN_DIR=/mnt/disk1/models" >> "$GITHUB_ENV"
+      - name: Set environment variables
+        shell: bash
+        run: |
+          echo "LLAMA2_7B_ORIGIN_PATH=${ORIGIN_DIR}/Llama-2-7b-chat-hf" >> "$GITHUB_ENV"
+          echo "LLAMA2_13B_ORIGIN_PATH=${ORIGIN_DIR}/Llama-2-13b-chat-hf" >> "$GITHUB_ENV"
+          echo "CHATGLM2_6B_ORIGIN_PATH=${ORIGIN_DIR}/chatglm2-6b" >> "$GITHUB_ENV"
+          echo "WHISPER_MEDIUM_ORIGIN_PATH=${ORIGIN_DIR}/whisper-medium" >> "$GITHUB_ENV"
+      - uses: actions/checkout@v3
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install dependencies
+        shell: bash
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install --upgrade setuptools
+          python -m pip install --upgrade wheel
+
+      - name: Download llm binary
+        uses: ./.github/actions/llm/download-llm-binary
+
+      - name: Run LLM install (all) test
+        uses: ./.github/actions/llm/setup-llm-env
+        with:
+          extra-dependency: "xpu"
+
+      - name: Test installed xpu version
+        shell: bash
+        run: |
+          source /opt/intel/oneapi/setvars.sh
+          bash python/llm/test/run-llm-install-tests.sh
+      - name: Test on xpu
+        shell: bash
+        run: |
+          source /opt/intel/oneapi/setvars.sh
+          export USE_XETLA=OFF
+          export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
+          cd python/llm/test/benchmark/gpu
+          export http_proxy=${HTTP_PROXY}
+          export https_proxy=${HTTPS_PROXY}
+          rm -rf test-result || true
+          mkdir test-result
+          taskset -c 0-$((THREAD_NUM - 1)) python llama2.py --model-dir="${LLAMA2_7B_ORIGIN_PATH}" --input-tokens=32 --max-new-tokens=32 > test-result/llama2_7b-32-32.log
+          taskset -c 0-$((THREAD_NUM - 1)) python llama2.py --model-dir="${LLAMA2_7B_ORIGIN_PATH}" --input-tokens=1024 --max-new-tokens=1024 > test-result/llama2_7b-1024-1024.log
+          taskset -c 0-$((THREAD_NUM - 1)) python llama2.py --model-dir="${LLAMA2_13B_ORIGIN_PATH}" --input-tokens=32 --max-new-tokens=32 > test-result/llama2_13b-32-32.log
+          taskset -c 0-$((THREAD_NUM - 1)) python llama2.py --model-dir="${LLAMA2_13B_ORIGIN_PATH}" --input-tokens=1024 --max-new-tokens=1024 > test-result/llama2_13b-1024-1024.log
+          taskset -c 0-$((THREAD_NUM - 1)) python chatglm2.py --model-dir="${CHATGLM2_6B_ORIGIN_PATH}" --input-tokens=32 --max-new-tokens=32 > test-result/chatglm2_6b-32-32.log
+          taskset -c 0-$((THREAD_NUM - 1)) python chatglm2.py --model-dir="${CHATGLM2_6B_ORIGIN_PATH}" --input-tokens=1024 --max-new-tokens=1024 > test-result/chatglm2_6b-1024-1024.log
+          taskset -c 0-$((THREAD_NUM - 1)) python whisper.py --model-dir="${WHISPER_MEDIUM_ORIGIN_PATH}" > test-result/whisper_medium-default-default.log
+          python ../analyze_log_dir.py --log-dir=./test-result --output-path=./xpu_latency.csv
+          timestamp=`date '+%Y%m%d'`
+          curl -T ./xpu_latency.csv ${LLM_FTP_URL}/llm/ggml-actions/perf/xpu_lantency_$timestamp.csv
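For orientation (not part of the diff itself): the "Test on xpu" step above writes one log per model and prompt length, aggregates them with analyze_log_dir.py (added later in this commit), and uploads the resulting xpu_latency.csv. A minimal sketch of reading that CSV back, using the field names that script writes:

# Minimal sketch (not part of this commit): read the aggregated CSV that
# analyze_log_dir.py produces and the workflow uploads as xpu_latency.csv.
import csv

with open("xpu_latency.csv", newline="", encoding="utf-8") as f:
    for row in csv.DictReader(f):
        print(row["model_name"], row["prompt_length"], row["output_length"],
              row["first_token_latency"], row["rest_token_latency"])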
.github/workflows/llm_unit_tests_windows.yml (vendored, 84 deleted lines)

@@ -1,84 +0,0 @@
name: LLM Unit Tests on Windows

# Cancel previous runs in the PR when you push new commits
concurrency:
  group: ${{ github.workflow }}-llm-win-unittest-${{ github.event.pull_request.number || github.run_id }}
  cancel-in-progress: true

# Controls when the action will run.
on:
  # Triggers the workflow on push or pull request events but only for the main branch
  push:
    branches: [main]
    paths:
      - "python/llm/**"
      - ".github/workflows/llm_unit_tests_windows.yml"
  pull_request:
    branches: [main]
    paths:
      - "python/llm/**"
      - ".github/workflows/llm_unit_tests_windows.yml"
  workflow_dispatch:
  workflow_call:

# A workflow run is made up of one or more jobs that can run sequentially or in parallel
jobs:
  llm-cpp-build:
    uses: ./.github/workflows/llm-binary-build.yml
  llm-unit-test-windows:
    runs-on: ${{ matrix.os }}
    needs: llm-cpp-build
    strategy:
      fail-fast: false
      matrix:
        os: ["windows-latest"]
        python-version: ["3.9"]
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          python -m pip install --upgrade setuptools==58.0.4
          python -m pip install --upgrade wheel
      - name: Download llm binary
        uses: ./.github/actions/llm/download-llm-binary
      - name: Run LLM-init test
        shell: bash
        run: |
          pip install requests
          bash python/llm/dev/release_default_windows.sh default false
          pip install -i https://pypi.python.org/simple python/llm/dist/bigdl_llm*.whl
        env:
          ANALYTICS_ZOO_ROOT: ${{ github.workspace }}

      - name: Run LLM install (all) test
        shell: bash
        run: |
          pip install requests
          bash python/llm/dev/release_default_windows.sh default false
          whl_name=$(ls python/llm/dist)
          pip install -i https://pypi.python.org/simple "python/llm/dist/${whl_name}[all]"
          pip install pytest
          bash python/llm/test/run-llm-install-tests.sh
        env:
          ANALYTICS_ZOO_ROOT: ${{ github.workspace }}

      - name: Run LLM unittests
        shell: bash
        run: |
          pip install pytest pydantic
          pip install -U langchain==0.0.184
          pip install -U chromadb==0.3.25
          pip install -U typing_extensions==4.5.0
          bash python/llm/test/run-llm-windows-tests.sh
        env:
          ANALYTICS_ZOO_ROOT: ${{ github.workspace }}

      - name: Clean up
        shell: bash
        run: |
          rm -rf models
python/llm/test/benchmark/analyze_log_dir.py (new file, 64 lines)

@@ -0,0 +1,64 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import argparse
import csv
import os
import re
import glob

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        'Process the logs of benchmark utils and output a csv file of performance data',
        add_help=False)
    parser.add_argument('-m', '--log-dir', default="./", type=str)
    parser.add_argument('--output-path',
                        default="./model_latency.csv", type=str)
    args = parser.parse_args()
    print(args)
    result_list = []
    for filename in glob.glob(os.path.join(args.log_dir, '*')):
        try:
            basename = os.path.basename(filename)
            name, _ = os.path.splitext(basename)
            model_name, prompt_length, output_length = name.strip().split('-')
            with open(filename, 'r', encoding='utf-8') as f:
                log = f.read()
            first_token_time_list = sorted(map(float,
                re.findall(r'First token cost (.*?)s', log)))
            rest_token_time_list = sorted(map(float,
                re.findall(r'Rest tokens cost average (.*?)s', log)))
            # For fairness, remove the fastest and slowest data
            first_token_latency = sum(first_token_time_list[1:-1]
                                      )/(len(first_token_time_list)-2)
            rest_token_latency = sum(rest_token_time_list[1:-1]
                                     )/(len(rest_token_time_list)-2)
            result_list.append({
                'model_name': model_name,
                'prompt_length': prompt_length,
                'output_length': output_length,
                'first_token_latency': first_token_latency,
                'rest_token_latency': rest_token_latency,
            })
        except Exception as e:
            print(e.args)
            continue

    with open(args.output_path, 'w', encoding='utf-8', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=result_list[0].keys())
        writer.writeheader()
        writer.writerows(result_list)
    print('Log analysis finished!')
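For context (not part of the diff): the regular expressions above assume each benchmark log contains lines of the form "First token cost <seconds>s" and "Rest tokens cost average <seconds>s", presumably printed by BenchmarkWrapper. A minimal sketch of the trimmed-average calculation on such a log, with the log wording inferred from the regexes rather than taken from the suppressed wrapper code:

# Minimal sketch (assumed log wording, inferred from the regexes above).
import re

sample_log = """First token cost 0.512s
Rest tokens cost average 0.043s
First token cost 0.498s
Rest tokens cost average 0.041s
First token cost 0.505s
Rest tokens cost average 0.042s"""

first = sorted(map(float, re.findall(r'First token cost (.*?)s', sample_log)))
# Drop the fastest and slowest run, then average the rest,
# exactly as analyze_log_dir.py does above.
print(sum(first[1:-1]) / (len(first) - 2))  # 0.505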
python/llm/test/benchmark/gpu/benchmark_util.py (new file, 4917 lines)

File diff suppressed because it is too large.
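The 4,917-line benchmark_util.py is not reproduced here. Judging from how the scripts below use it (wrapping a loaded model before calling generate) and from the timing lines that analyze_log_dir.py parses, it appears to be a wrapper that times token generation. A purely illustrative sketch of that idea, not the actual implementation:

# Illustrative only: NOT the actual BenchmarkWrapper from benchmark_util.py.
# It times generate() end to end; the real wrapper presumably also splits the
# time into first-token and rest-token latencies for the log parser above.
import time


class SimpleBenchmarkWrapper:
    def __init__(self, model):
        self.model = model

    def __getattr__(self, name):
        # Delegate everything else (e.g. .dtype, .config) to the wrapped model.
        return getattr(self.model, name)

    def generate(self, *args, **kwargs):
        start = time.time()
        output = self.model.generate(*args, **kwargs)
        print(f"Generation cost {time.time() - start}s in total")
        return output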
python/llm/test/benchmark/gpu/chatglm2.py (new file, 85 lines)

@@ -0,0 +1,85 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# This is to test latest ipex version, model is chatglm2
import torch
import os
import time
from bigdl.llm.transformers import AutoModel
from transformers import AutoTokenizer
import intel_extension_for_pytorch as ipex
import numpy as np
from itertools import chain
import pathlib
import argparse
import json
from benchmark_util import BenchmarkWrapper


if __name__ == '__main__':
    parser = argparse.ArgumentParser('OPT generation script', add_help=False)
    parser.add_argument('-m', '--model-dir',
                        default="/mnt/disk1/models/chatglm2-6b", type=str)
    parser.add_argument('--input-tokens', default='32', type=str)
    parser.add_argument('--max-new-tokens', default=32,
                        type=int, help="output max new tokens")
    args = parser.parse_args()
    print(args)

    prompt_dict = {
        '32': "我总是在晚上失眠，这个症状已经持续很长时间，所以晚上睡不着到底应该怎么处理，请告诉我一些可行的建议与方法，越详细越好",
'1024': "It is done, and submitted. You can play 'Survival of the Tastiest' on Android, and on the web. Playing on the web works, but you have to simulate multiple touch for table moving and that can be a bit confusing. There is a lot I'd like to talk about. I will go through every topic, insted of making the typical what went right/wrong list. Concept Working over the theme was probably one of the hardest tasks which I had to face. Originally, I had an idea of what kind of game I wanted to develop, gameplay wise - something with a lot of enemies/actors, simple graphics, maybe set in space, controlled from a top-down view. I was confident that I could fit any theme around it. In the end, the problem with a theme like 'Evolution' in a game is that evolution is unassisted. It happens through several seemingly random mutations over time, with the most apt permutation surviving. This genetic car simulator is, in my opinion, a great example of actual evolution of a species facing a challenge. But is it a game? In a game, you need to control something to reach an objective. That control goes against what evolution is supposed to be like. If you allow the user to pick how to evolve something, it's not evolution anymore - it's the equivalent of intelligent design, the fable invented by creationists to combat the idea of evolution. Being agnostic and a Pastafarian, that's not something that rubbed me the right way. Hence, my biggest dillema when deciding what to create was not with what I wanted to create, but with what I did not. I didn't want to create an 'intelligent design' simulator and wrongly call it evolution. This is a problem, of course, every other contestant also had to face. And judging by the entries submitted, not many managed to work around it. I'd say the only real solution was through the use of artificial selection, somehow. So far, I haven't seen any entry using this at its core gameplay. Alas, this is just a fun competition and after a while I decided not to be as strict with the game idea, and allowed myself to pick whatever I thought would work out. My initial idea was to create something where humanity tried to evolve to a next level, but had some kind of foe trying to stop them from doing so. I kind of had this image of human souls flying in space towards a monolith or a space baby (all based in 2001: A Space Odyssey of course) but I couldn't think of compelling (read: serious) mechanics for that. Borgs were my next inspiration, as their whole hypothesis fit pretty well into the evolution theme. But how to make it work? Are you the borg, or fighting the Borg? The third and final idea came to me through my girlfriend, who somehow gave me the idea of making something about the evolution of Pasta. The more I thought about it the more it sounded like it would work, so I decided to go with it. Conversations with my inspiring co-worker Roushey (who also created the 'Mechanical Underdogs' signature logo for my intros) further matured the concept, as it involved into the idea of having individual pieces of pasta flying around and trying to evolve until they became all-powerful. A secondary idea here was that the game would work to explain how the Flying Spaghetti Monster came to exist - by evolving from a normal dinner table. So the idea evolved more or less into this: you are sitting a table. You have your own plate, with is your 'base'. There are 5 other guests at the table, each with their own plate. Your plate can spawn little pieces of pasta. 
You do so by 'ordering' them through a menu. Some pastas are better than others; some are faster, some are stronger. They have varying 'costs', which are debited from your credits (you start with a number of credits). Once spawned, your pastas start flying around. Their instinct is to fly to other plates, in order to conquer them (the objective of the game is having your pasta conquer all the plates on the table). But they are really autonomous, so after being spawned, you have no control over your pasta (think DotA or LoL creeps). Your pasta doesn't like other people's pasta, so if they meet, they shoot sauce at each other until one dies. You get credits for other pastas your own pasta kill. Once a pasta is in vicinity of a plate. You get credits for other pastas your own pasta kill. Once a pasta is in vicinity of a plate.You get credits for other pastas your own pasta kill. Once a pasta is in vicinity of a plate.You get credits for"
    }
    if args.input_tokens in prompt_dict:
        prompt = prompt_dict[args.input_tokens]
    else:
        prompt = args.input_tokens

    print(f"Test {args.model_dir}...")
    # load_in_4bit=True in bigdl.llm.transformers will convert
    # the relevant layers in the model into int4 format
    model = AutoModel.from_pretrained(
        args.model_dir, load_in_4bit=True, optimize_model=False, trust_remote_code=True)
    model = model.half().to('xpu')
    model = BenchmarkWrapper(model)
    print(model.dtype)
    tokenizer = AutoTokenizer.from_pretrained(
        args.model_dir, trust_remote_code=True)
    inputs = tokenizer([prompt], return_tensors="pt").to('xpu')
    print(inputs["input_ids"].shape)

    total_list = []
    e2e_time = []
    with torch.inference_mode():
        for i in range(10):
            torch.xpu.synchronize()
            st = time.time()
            inputs = tokenizer([prompt], return_tensors="pt").to('xpu')
            # print(inputs["input_ids"].shape)
            # output = model.generate(**inputs, do_sample=False, temperature=0.9, max_new_tokens=32, token_latency=True)
            output = model.generate(
                **inputs, do_sample=False, temperature=0.9, max_new_tokens=args.max_new_tokens)
            gen_ids = output[0]
            gen_text = tokenizer.batch_decode(
                gen_ids, skip_special_tokens=True)
            torch.xpu.synchronize()
            end = time.time()
            e2e_time.append(end-st)

        print('Prompt:', prompt)
        print('Output:', gen_text)
        print(f'Inference time: {end-st} s')
    print(e2e_time)
python/llm/test/benchmark/gpu/llama2.py (new file, 88 lines)

@@ -0,0 +1,88 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import argparse
import torch
import os
import time
from transformers import LlamaTokenizer
import intel_extension_for_pytorch as ipex
from benchmark_util import BenchmarkWrapper
from bigdl.llm.transformers import AutoModelForCausalLM


if __name__ == '__main__':
    parser = argparse.ArgumentParser('OPT generation script', add_help=False)
    parser.add_argument('-m', '--model-dir',
                        default="/mnt/disk1/models/Llama-2-7b-chat-hf", type=str)
    parser.add_argument('--input-tokens', default='32', type=str)
    parser.add_argument('--max-new-tokens', default=32,
                        type=int, help="output max new tokens")
    args = parser.parse_args()
    print(args)

    model_path = args.model_dir
    print(f"Test {model_path}...")
    # load_in_4bit=True in bigdl.llm.transformers will convert
    # the relevant layers in the model into int4 format
    llama_model = AutoModelForCausalLM.from_pretrained(
        model_path, optimize_model=False, load_in_4bit=True)
    # llama_model = model.AutoModelForCausalLM.from_pretrained(model_path)
    llama_model = llama_model.half().to('xpu')
    # llama_model = llama_model.to(memory_format=torch.channels_last)
    # llama_model = ipex.optimize(llama_model, dtype=torch.float16)
    # llama_model = ipex.optimize_transformers(llama_model, dtype=torch.float16)
    print(llama_model.dtype)
    llama_model = BenchmarkWrapper(llama_model)
    tokenizer = LlamaTokenizer.from_pretrained(model_path)

    prompt_dict = {
        '32': "Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun",
'1024': "It is done, and submitted. You can play 'Survival of the Tastiest' on Android, and on the web. Playing on the web works, but you have to simulate multiple touch for table moving and that can be a bit confusing. There is a lot I'd like to talk about. I will go through every topic, insted of making the typical what went right/wrong list. Concept Working over the theme was probably one of the hardest tasks which I had to face. Originally, I had an idea of what kind of game I wanted to develop, gameplay wise - something with a lot of enemies/actors, simple graphics, maybe set in space, controlled from a top-down view. I was confident that I could fit any theme around it. In the end, the problem with a theme like 'Evolution' in a game is that evolution is unassisted. It happens through several seemingly random mutations over time, with the most apt permutation surviving. This genetic car simulator is, in my opinion, a great example of actual evolution of a species facing a challenge. But is it a game? In a game, you need to control something to reach an objective. That control goes against what evolution is supposed to be like. If you allow the user to pick how to evolve something, it's not evolution anymore - it's the equivalent of intelligent design, the fable invented by creationists to combat the idea of evolution. Being agnostic and a Pastafarian, that's not something that rubbed me the right way. Hence, my biggest dillema when deciding what to create was not with what I wanted to create, but with what I did not. I didn't want to create an 'intelligent design' simulator and wrongly call it evolution. This is a problem, of course, every other contestant also had to face. And judging by the entries submitted, not many managed to work around it. I'd say the only real solution was through the use of artificial selection, somehow. So far, I haven't seen any entry using this at its core gameplay. Alas, this is just a fun competition and after a while I decided not to be as strict with the game idea, and allowed myself to pick whatever I thought would work out. My initial idea was to create something where humanity tried to evolve to a next level, but had some kind of foe trying to stop them from doing so. I kind of had this image of human souls flying in space towards a monolith or a space baby (all based in 2001: A Space Odyssey of course) but I couldn't think of compelling (read: serious) mechanics for that. Borgs were my next inspiration, as their whole hypothesis fit pretty well into the evolution theme. But how to make it work? Are you the borg, or fighting the Borg? The third and final idea came to me through my girlfriend, who somehow gave me the idea of making something about the evolution of Pasta. The more I thought about it the more it sounded like it would work, so I decided to go with it. Conversations with my inspiring co-worker Roushey (who also created the 'Mechanical Underdogs' signature logo for my intros) further matured the concept, as it involved into the idea of having individual pieces of pasta flying around and trying to evolve until they became all-powerful. A secondary idea here was that the game would work to explain how the Flying Spaghetti Monster came to exist - by evolving from a normal dinner table. So the idea evolved more or less into this: you are sitting a table. You have your own plate, with is your 'base'. There are 5 other guests at the table, each with their own plate. Your plate can spawn little pieces of pasta. 
You do so by 'ordering' them through a menu. Some pastas are better than others; some are faster, some are stronger. They have varying 'costs', which are debited from your credits (you start with a number of credits). Once spawned, your pastas start flying around. Their instinct is to fly to other plates, in order to conquer them (the objective of the game is having your pasta conquer all the plates on the table). But they are really autonomous, so after being spawned, you have no control over your pasta (think DotA or LoL creeps). Your pasta doesn't like other people's pasta, so if they meet, they shoot sauce at each other until one dies. You get credits for other pastas your own pasta kill. Once a pasta is in vicinity of a plate"
    }
    if args.input_tokens in prompt_dict:
        prompt = prompt_dict[args.input_tokens]
    else:
        prompt = args.input_tokens

    # with torch.inference_mode():
    with torch.inference_mode(), torch.autocast(device_type='xpu', enabled=True, dtype=torch.float16):
        # warmup
        torch.xpu.synchronize()
        input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu')
        print("input length is: ", len((input_ids[0])))
        output = llama_model.generate(
            input_ids, do_sample=False, max_new_tokens=args.max_new_tokens)
        output_str = tokenizer.decode(output[0], skip_special_tokens=True)
        torch.xpu.synchronize()

        e2e_time = []
        for i in range(10):
            st = time.time()
            torch.xpu.synchronize()
            input_ids = tokenizer.encode(
                prompt, return_tensors="pt").to('xpu')
            output = llama_model.generate(
                input_ids, do_sample=False, max_new_tokens=args.max_new_tokens)
            output_str = tokenizer.decode(output[0], skip_special_tokens=True)
            torch.xpu.synchronize()
            end = time.time()
            e2e_time.append(end-st)

        print('Prompt:', prompt)
        print('Output:', output_str)
        print(f'Inference time: {end-st} s')
    print(e2e_time)
python/llm/test/benchmark/gpu/whisper.py (new file, 75 lines)

@@ -0,0 +1,75 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import argparse
import intel_extension_for_pytorch as ipex
from transformers import WhisperProcessor
import torch
import time
from benchmark_util import BenchmarkWrapper
from bigdl.llm.transformers import AutoModelForSpeechSeq2Seq
from datasets import load_dataset, load_from_disk

if __name__ == '__main__':
    parser = argparse.ArgumentParser('OPT generation script', add_help=False)
    parser.add_argument('-m', '--model-dir',
                        default="/mnt/disk1/models/whisper-medium", type=str)
    args = parser.parse_args()
    print(args)

    model_path = args.model_dir
    dataset_path = "hf-internal-testing/librispeech_asr_dummy"

    # load model and processor
    ds = load_dataset(dataset_path, "clean", split="validation")
    print("pass")
    processor = WhisperProcessor.from_pretrained(model_path)
    print("model loaded")
    # load dummy dataset and read audio files
    sample = ds[0]["audio"]
    # for transformer == 4.30.2
    input_features = processor(
        sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt").input_features

    input_features = input_features.half().contiguous()
    input_features = input_features.to('xpu')
    print(input_features.shape)
    print(input_features.is_contiguous())

    # generate token ids
    whisper = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_path, load_in_4bit=True, optimize_model=False)
    whisper.config.forced_decoder_ids = None
    whisper = whisper.half().to('xpu')
    whisper = BenchmarkWrapper(whisper)

    with torch.inference_mode():
        e2e_time = []
        for i in range(10):
            torch.xpu.synchronize()
            st = time.time()
            predicted_ids = whisper.generate(input_features)
            # print(len(predicted_ids[0]))
            torch.xpu.synchronize()
            output_str = processor.batch_decode(
                predicted_ids, skip_special_tokens=True)
            end = time.time()
            e2e_time.append(end-st)

            print(f'Inference time: {end-st} s')
        print('Output:', output_str)
        print(f'Inference time: {end-st} s')
        print(e2e_time)