diff --git a/.github/workflows/llm_unit_tests.yml b/.github/workflows/llm_unit_tests.yml
index 888c7952..d74965d1 100644
--- a/.github/workflows/llm_unit_tests.yml
+++ b/.github/workflows/llm_unit_tests.yml
@@ -192,3 +192,78 @@ jobs:
           pip install -U pandas==2.0.3
           pip install -U typing_extensions==4.5.0
           bash python/llm/test/run-llm-langchain-tests.sh
+  llm-unit-test-on-arc:
+    needs: llm-cpp-build
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ["3.9"]
+    runs-on: [self-hosted, llm, perf]
+    env:
+      OMP_NUM_THREADS: 16
+      THREAD_NUM: 16
+      ANALYTICS_ZOO_ROOT: ${{ github.workspace }}
+    steps:
+      - name: Set environment variables
+        shell: bash
+        run: |
+          echo "LLAMA2_7B_ORIGIN_PATH=${ORIGIN_DIR}/Llama-2-7b-chat-hf" >> "$GITHUB_ENV"
+          echo "CHATGLM2_6B_ORIGIN_PATH=${ORIGIN_DIR}/chatglm2-6b" >> "$GITHUB_ENV"
+          echo "FALCON_7B_ORIGIN_PATH=${ORIGIN_DIR}/falcon-7b-instruct-with-patch" >> "$GITHUB_ENV"
+          echo "MPT_7B_ORIGIN_PATH=${ORIGIN_DIR}/mpt-7b-chat" >> "$GITHUB_ENV"
+
+      - name: Checkout repo
+        uses: actions/checkout@v3
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install dependencies
+        shell: bash
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install --upgrade setuptools
+          python -m pip install --upgrade wheel
+
+      - name: Download llm binary
+        uses: ./.github/actions/llm/download-llm-binary
+
+      - name: Run LLM install (all) test
+        uses: ./.github/actions/llm/setup-llm-env
+        with:
+          extra-dependency: "xpu"
+
+      - name: Test installed xpu version
+        shell: bash
+        run: |
+          source /opt/intel/oneapi/setvars.sh
+          bash python/llm/test/run-llm-install-tests.sh
+
+      - name: Download LLMs
+        shell: bash
+        run: |
+          if [ ! -d $LLAMA2_7B_ORIGIN_PATH ]; then
+            echo "Directory $LLAMA2_7B_ORIGIN_PATH not found. Downloading from FTP server..."
+            wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/Llama-2-7b-chat-hf -P $ORIGIN_DIR
+          fi
+          if [ ! -d $CHATGLM2_6B_ORIGIN_PATH ]; then
+            echo "Directory $CHATGLM2_6B_ORIGIN_PATH not found. Downloading from FTP server..."
+            wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/chatglm2-6b -P $ORIGIN_DIR
+          fi
+          if [ ! -d $FALCON_7B_ORIGIN_PATH ]; then
+            echo "Directory $FALCON_7B_ORIGIN_PATH not found. Downloading from FTP server..."
+            wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/falcon-7b-instruct-with-patch -P $ORIGIN_DIR
+          fi
+          if [ ! -d $MPT_7B_ORIGIN_PATH ]; then
+            echo "Directory $MPT_7B_ORIGIN_PATH not found. Downloading from FTP server..."
+            wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/mpt-7b-chat -P $ORIGIN_DIR
+          fi
+
+      - name: Run LLM inference test
+        shell: bash
+        run: |
+          source /opt/intel/oneapi/setvars.sh
+          python -m pip install expecttest
+          bash python/llm/test/run-llm-inference-tests-gpu.sh
diff --git a/python/llm/test/inference_gpu/test_transformers_api.py b/python/llm/test/inference_gpu/test_transformers_api.py
new file mode 100644
index 00000000..69f2578d
--- /dev/null
+++ b/python/llm/test/inference_gpu/test_transformers_api.py
@@ -0,0 +1,52 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+
+import os
+import pytest
+
+from bigdl.llm.transformers import AutoModelForCausalLM, AutoModel
+from transformers import LlamaTokenizer, AutoTokenizer
+
+device = os.environ['DEVICE']
+print(f'Running on {device}')
+if device == 'xpu':
+    import intel_extension_for_pytorch as ipex
+
+@pytest.mark.parametrize('prompt, answer', [
+    ('What is the capital of France?\n\n', 'Paris')
+    ])
+@pytest.mark.parametrize('Model, Tokenizer, model_path', [
+    (AutoModelForCausalLM, LlamaTokenizer, os.environ.get('LLAMA2_7B_ORIGIN_PATH')),
+    (AutoModel, AutoTokenizer, os.environ.get('CHATGLM2_6B_ORIGIN_PATH')),
+    (AutoModelForCausalLM, AutoTokenizer, os.environ.get('FALCON_7B_ORIGIN_PATH')),
+    ])
+def test_completion(Model, Tokenizer, model_path, prompt, answer):
+    tokenizer = Tokenizer.from_pretrained(model_path, trust_remote_code=True)
+    model = Model.from_pretrained(model_path,
+                                  load_in_4bit=True,
+                                  optimize_model=True,
+                                  trust_remote_code=True)
+    model = model.to(device)
+
+    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
+    output = model.generate(input_ids, max_new_tokens=32)
+    output_str = tokenizer.decode(output[0], skip_special_tokens=True)
+
+    assert answer in output_str
+
+if __name__ == '__main__':
+    pytest.main([__file__])
diff --git a/python/llm/test/run-llm-inference-tests-gpu.sh b/python/llm/test/run-llm-inference-tests-gpu.sh
new file mode 100644
index 00000000..3d22c1cf
--- /dev/null
+++ b/python/llm/test/run-llm-inference-tests-gpu.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+
+export ANALYTICS_ZOO_ROOT=${ANALYTICS_ZOO_ROOT}
+export LLM_HOME=${ANALYTICS_ZOO_ROOT}/python/llm/src
+export LLM_INFERENCE_TEST_DIR=${ANALYTICS_ZOO_ROOT}/python/llm/test/inference_gpu
+
+export USE_XETLA=OFF
+export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
+export DEVICE='xpu'
+
+set -e
+
+echo "# Start testing inference"
+start=$(date "+%s")
+
+if [ -z "$THREAD_NUM" ]; then
+  THREAD_NUM=2
+fi
+export OMP_NUM_THREADS=$THREAD_NUM
+pytest ${LLM_INFERENCE_TEST_DIR} -v -s
+
+now=$(date "+%s")
+time=$((now-start))
+
+echo "Bigdl-llm gpu tests finished"
+echo "Time used: $time seconds"
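
Note: for a local run of the new GPU suite outside CI, something like the sketch below should work; it simply mirrors the steps of the llm-unit-test-on-arc job. It assumes oneAPI is installed under /opt/intel/oneapi, that the repository root is the current directory, and that the three model directories used by test_transformers_api.py have already been downloaded; the /models/... paths are placeholders, not paths from this PR.

    # Minimal local invocation mirroring the CI job above (placeholder model paths).
    source /opt/intel/oneapi/setvars.sh                                  # oneAPI env for the Arc GPU
    export ANALYTICS_ZOO_ROOT=$(pwd)                                     # repo root, as the workflow sets it
    export LLAMA2_7B_ORIGIN_PATH=/models/Llama-2-7b-chat-hf              # placeholder
    export CHATGLM2_6B_ORIGIN_PATH=/models/chatglm2-6b                   # placeholder
    export FALCON_7B_ORIGIN_PATH=/models/falcon-7b-instruct-with-patch   # placeholder
    bash python/llm/test/run-llm-inference-tests-gpu.sh                  # script exports DEVICE=xpu itself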