diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/README.md b/python/llm/example/GPU/Deepspeed-AutoTP/README.md
index 94757822..346bc8fc 100644
--- a/python/llm/example/GPU/Deepspeed-AutoTP/README.md
+++ b/python/llm/example/GPU/Deepspeed-AutoTP/README.md
@@ -37,7 +37,7 @@ We provide example usages on different models and different hardwares as followi
 bash run_llama2_70b_pvc_1550_1_card.sh
 ```
 
-> **Note**: You could change `ZE_AFFINITY_MASK` and `NUM_GPUS` according to your requirements.
+> **Note**: You could change `ZE_AFFINITY_MASK` and `NUM_GPUS` according to your requirements. You could also specify other low-bit optimizations through `--low-bit`.
 
 - Run Vicuna-33B on two Intel Arc A770
 
@@ -45,7 +45,7 @@ bash run_llama2_70b_pvc_1550_1_card.sh
 bash run_vicuna_33b_arc_2_card.sh
 ```
 
-> **Note**: You could change `NUM_GPUS` to the number of GPUs you have on your machine.
+> **Note**: You could change `NUM_GPUS` to the number of GPUs you have on your machine. You could also specify other low-bit optimizations through `--low-bit`.
 
 ### 3. Sample Output
diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py b/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py
index f69c66e0..54994c0f 100644
--- a/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py
+++ b/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py
@@ -55,9 +55,12 @@ if __name__ == '__main__':
                         help='Prompt to infer')
     parser.add_argument('--n-predict', type=int, default=32,
                         help='Max tokens to predict')
+    parser.add_argument('--low-bit', type=str, default='sym_int4',
+                        help='The quantization type the model will be converted to.')
 
     args = parser.parse_args()
     model_path = args.repo_id_or_model_path
+    low_bit = args.low_bit
 
     # First use CPU as accelerator
     # Convert to deepspeed model and apply bigdl-llm optimization on CPU to decrease GPU memory usage
@@ -79,7 +82,7 @@ if __name__ == '__main__':
 
     # Use bigdl-llm `optimize_model` to convert the model into optimized low bit format
     # Convert the rest of the model into float16 to reduce allreduce traffic
-    model = optimize_model(model.module.to(f'cpu'), low_bit='sym_int4').to(torch.float16)
+    model = optimize_model(model.module.to(f'cpu'), low_bit=low_bit).to(torch.float16)
 
     # Next, use XPU as accelerator to speed up inference
     current_accel = XPU_Accelerator()
diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/run_llama2_70b_pvc_1550_1_card.sh b/python/llm/example/GPU/Deepspeed-AutoTP/run_llama2_70b_pvc_1550_1_card.sh
index 380e1a58..4e968541 100644
--- a/python/llm/example/GPU/Deepspeed-AutoTP/run_llama2_70b_pvc_1550_1_card.sh
+++ b/python/llm/example/GPU/Deepspeed-AutoTP/run_llama2_70b_pvc_1550_1_card.sh
@@ -30,4 +30,4 @@ export OMP_NUM_THREADS=$((56/$NUM_GPUS))
 export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
 export TORCH_LLM_ALLREDUCE=1
 mpirun -np $NUM_GPUS --prepend-rank \
-  python deepspeed_autotp.py --repo-id-or-model-path 'meta-llama/Llama-2-70b-chat-hf'
+  python deepspeed_autotp.py --repo-id-or-model-path 'meta-llama/Llama-2-70b-chat-hf' --low-bit 'sym_int4'
diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/run_vicuna_33b_arc_2_card.sh b/python/llm/example/GPU/Deepspeed-AutoTP/run_vicuna_33b_arc_2_card.sh
index ca0697a6..39b060ae 100644
--- a/python/llm/example/GPU/Deepspeed-AutoTP/run_vicuna_33b_arc_2_card.sh
+++ b/python/llm/example/GPU/Deepspeed-AutoTP/run_vicuna_33b_arc_2_card.sh
@@ -30,4 +30,4 @@ export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
 export TORCH_LLM_ALLREDUCE=0 # Different from PVC
 mpirun -np $NUM_GPUS --prepend-rank \
-  python deepspeed_autotp.py --repo-id-or-model-path 'lmsys/vicuna-33b-v1.3'
+  python deepspeed_autotp.py --repo-id-or-model-path 'lmsys/vicuna-33b-v1.3' --low-bit 'sym_int4'
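
A minimal usage sketch of the new flag (not part of the diff above): the run scripts now pass `--low-bit 'sym_int4'` explicitly, and any other quantization format accepted by bigdl-llm's `optimize_model` can be substituted on the command line. The `sym_int8` value below is an assumption used only for illustration; check the bigdl-llm documentation for the supported formats.

```bash
# Sketch: run the Vicuna-33B example on two Arc GPUs with a different low-bit format.
# Assumes the environment variables from run_vicuna_33b_arc_2_card.sh have already been exported,
# and that 'sym_int8' is among the low_bit values supported by optimize_model.
export NUM_GPUS=2
mpirun -np $NUM_GPUS --prepend-rank \
  python deepspeed_autotp.py --repo-id-or-model-path 'lmsys/vicuna-33b-v1.3' --low-bit 'sym_int8'
```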