diff --git a/python/llm/example/CPU/vLLM-Serving/README.md b/python/llm/example/CPU/vLLM-Serving/README.md
index 1ff3161e..8d650d8a 100644
--- a/python/llm/example/CPU/vLLM-Serving/README.md
+++ b/python/llm/example/CPU/vLLM-Serving/README.md
@@ -21,8 +21,6 @@ pip3 install numpy
 pip3 install --pre --upgrade bigdl-llm[all]
 pip3 install psutil
 pip3 install sentencepiece # Required for LLaMA tokenizer.
-pip3 install "torch==2.0.1"
-pip3 install "transformers>=4.33.1" # Required for Code Llama.
 pip3 install fastapi
 pip3 install "uvicorn[standard]"
 pip3 install "pydantic<2" # Required for OpenAI server.
@@ -44,6 +42,7 @@ To run offline inference using vLLM for a quick impression, use the following ex
 #!/bin/bash
 # Please first modify the MODEL_PATH in offline_inference.py
+# Modify load_in_low_bit to use different quantization dtype
 numactl -C 48-95 -m 1 python offline_inference.py
@@ -60,6 +59,7 @@ To fully utilize the continuous batching feature of the `vLLM`, you can send req
 numactl -C 48-95 -m 1 python -m bigdl.llm.vllm.entrypoints.openai.api_server \
         --model /MODEL_PATH/Llama-2-7b-chat-hf-bigdl/ --port 8000 \
         --load-format 'auto' --device cpu --dtype bfloat16 \
+        --load-in-low-bit sym_int4 \
         --max-num-batched-tokens 4096
 ```
diff --git a/python/llm/example/CPU/vLLM-Serving/offline_inference.py b/python/llm/example/CPU/vLLM-Serving/offline_inference.py
index 45f4aa18..84ecb5a1 100644
--- a/python/llm/example/CPU/vLLM-Serving/offline_inference.py
+++ b/python/llm/example/CPU/vLLM-Serving/offline_inference.py
@@ -46,7 +46,7 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

 # Create an LLM.
 # llm = LLM(model="facebook/opt-125m")
-llm = LLM(model="YOUR_MODEL_PATH", dtype="bfloat16")
+llm = LLM(model="YOUR_MODEL_PATH", load_in_low_bit="sym_int4")
 # Generate texts from the prompts. The output is a list of RequestOutput objects
 # that contain the prompt, generated text, and other information.
 outputs = llm.generate(prompts, sampling_params)
diff --git a/python/llm/example/GPU/vLLM-Serving/README.md b/python/llm/example/GPU/vLLM-Serving/README.md
index dabe161c..e09206b1 100644
--- a/python/llm/example/GPU/vLLM-Serving/README.md
+++ b/python/llm/example/GPU/vLLM-Serving/README.md
@@ -37,7 +37,6 @@ conda activate bigdl-vllm
 pip3 install psutil
 pip3 install sentencepiece # Required for LLaMA tokenizer.
 pip3 install numpy
-pip3 install "transformers>=4.33.1" # Required for Code Llama.
 # below command will install intel_extension_for_pytorch==2.1.10+xpu as default
 pip install --pre --upgrade "bigdl-llm[xpu]" -f https://developer.intel.com/ipex-whl-stable-xpu
 pip3 install fastapi
@@ -62,6 +61,7 @@ To run offline inference using vLLM for a quick impression, use the following ex
 #!/bin/bash
 # Please first modify the MODEL_PATH in offline_inference.py
+# Modify load_in_low_bit to use different quantization dtype
 python offline_inference.py
 ```
@@ -76,6 +76,7 @@ To fully utilize the continuous batching feature of the `vLLM`, you can send req
 python -m bigdl.llm.vllm.entrypoints.openai.api_server \
         --model /MODEL_PATH/Llama-2-7b-chat-hf/ --port 8000 \
         --load-format 'auto' --device xpu --dtype bfloat16 \
+        --load-in-low-bit sym_int4 \
         --max-num-batched-tokens 4096
 ```
diff --git a/python/llm/example/GPU/vLLM-Serving/offline_inference.py b/python/llm/example/GPU/vLLM-Serving/offline_inference.py
index 994781d6..f74dbcd0 100644
--- a/python/llm/example/GPU/vLLM-Serving/offline_inference.py
+++ b/python/llm/example/GPU/vLLM-Serving/offline_inference.py
@@ -46,7 +46,7 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

 # Create an LLM.
 # llm = LLM(model="facebook/opt-125m")
-llm = LLM(model="YOUR_MODEL_PATH", dtype="bfloat16", device="xpu")
+llm = LLM(model="YOUR_MODEL_PATH", load_in_low_bit="sym_int4", dtype="bfloat16", device="xpu")
 # Generate texts from the prompts. The output is a list of RequestOutput objects
 # that contain the prompt, generated text, and other information.
 outputs = llm.generate(prompts, sampling_params)
diff --git a/python/llm/src/bigdl/llm/vllm/config.py b/python/llm/src/bigdl/llm/vllm/config.py
index 2285a63f..11c999c8 100644
--- a/python/llm/src/bigdl/llm/vllm/config.py
+++ b/python/llm/src/bigdl/llm/vllm/config.py
@@ -78,6 +78,7 @@ class ModelConfig:
             weights. If None, we assume the model weights are not quantized.
         device: The device to be used for the model. If None, we will default
             to use CPU as the device.
+        load_in_low_bit: The low-bit quantization for the model to be loaded. Default is 'sym_int4'.
     """

     def __init__(
@@ -95,6 +96,7 @@ class ModelConfig:
         max_model_len: Optional[int] = None,
         quantization: Optional[str] = None,
         device: Optional[str] = 'cpu',
+        load_in_low_bit: str = 'sym_int4',
     ) -> None:
         self.model = model
         self.tokenizer = tokenizer
@@ -107,6 +109,7 @@ class ModelConfig:
         self.tokenizer_revision = tokenizer_revision
         self.quantization = quantization
         self.device = device
+        self.load_in_low_bit = load_in_low_bit

         self.hf_config = get_config(model, trust_remote_code, revision)
         self.dtype = _get_and_verify_dtype(self.hf_config, dtype)
diff --git a/python/llm/src/bigdl/llm/vllm/engine/arg_utils.py b/python/llm/src/bigdl/llm/vllm/engine/arg_utils.py
index c8eb5835..dc9857b1 100644
--- a/python/llm/src/bigdl/llm/vllm/engine/arg_utils.py
+++ b/python/llm/src/bigdl/llm/vllm/engine/arg_utils.py
@@ -71,6 +71,7 @@ class EngineArgs:
     # bigdl-llm change start
     # summary: add device option
     device: Optional[str] = 'cpu'
+    load_in_low_bit: str = 'sym_int4'
    # bigdl-llm change end

    def __post_init__(self):
@@ -212,6 +213,10 @@ class EngineArgs:
                             choices=['gpu', 'cpu', 'xpu', None],
                             default=None,
                             help='Device to execute LLM model')
+        parser.add_argument('--load-in-low-bit',
+                            type=str,
+                            default='sym_int4',
+                            help='Low-bit quantization to use when loading the model (e.g. sym_int4)')

         return parser

@@ -229,7 +234,7 @@ class EngineArgs:
                                    self.download_dir, self.load_format, self.dtype,
                                    self.seed, self.revision, self.tokenizer_revision,
                                    self.max_model_len,
-                                   self.quantization, self.device)
+                                   self.quantization, self.device, self.load_in_low_bit)
         scheduler_config = SchedulerConfig(self.max_num_batched_tokens,
                                            self.max_num_seqs,
                                            model_config.max_model_len)
diff --git a/python/llm/src/bigdl/llm/vllm/engine/llm_engine.py b/python/llm/src/bigdl/llm/vllm/engine/llm_engine.py
index 15a03de4..d56ed482 100644
--- a/python/llm/src/bigdl/llm/vllm/engine/llm_engine.py
+++ b/python/llm/src/bigdl/llm/vllm/engine/llm_engine.py
@@ -119,7 +119,9 @@ class LLMEngine:
             f"load_format={model_config.load_format}, "
             # f"tensor_parallel_size={parallel_config.tensor_parallel_size}, "
             f"quantization={model_config.quantization}, "
-            f"seed={model_config.seed})"
+            f"seed={model_config.seed}, "
+            f"device={model_config.device}, "
+            f"load_in_low_bit={model_config.load_in_low_bit})"
         )

         # TODO(woosuk): Print more configs in debug mode.
diff --git a/python/llm/src/bigdl/llm/vllm/entrypoints/llm.py b/python/llm/src/bigdl/llm/vllm/entrypoints/llm.py
index ba6bae85..8ed3c790 100644
--- a/python/llm/src/bigdl/llm/vllm/entrypoints/llm.py
+++ b/python/llm/src/bigdl/llm/vllm/entrypoints/llm.py
@@ -93,6 +93,7 @@ class LLM:
             Otherwise, too small values may cause out-of-memory (OOM) errors.
         device: The device to be used for the model. If None, we will default
             to use CPU as the device.
+        load_in_low_bit: The low-bit quantization for the model to be loaded. Default is 'sym_int4'.
     """

     def __init__(
@@ -112,6 +113,7 @@ class LLM:
         # bigdl-llm change start
         # summary: add device option
         device: Optional[str] = "cpu",
+        load_in_low_bit: str = "sym_int4",
         # bigdl-llm change end
         **kwargs,
     ) -> None:
@@ -134,6 +136,7 @@ class LLM:
             gpu_memory_utilization=gpu_memory_utilization,
             swap_space=swap_space,
             device=device,
+            load_in_low_bit=load_in_low_bit,
             **kwargs,
         )
         self.llm_engine = LLMEngine.from_engine_args(engine_args)
diff --git a/python/llm/src/bigdl/llm/vllm/model_executor/model_loader.py b/python/llm/src/bigdl/llm/vllm/model_executor/model_loader.py
index 5f6a5c09..6bb91bd4 100644
--- a/python/llm/src/bigdl/llm/vllm/model_executor/model_loader.py
+++ b/python/llm/src/bigdl/llm/vllm/model_executor/model_loader.py
@@ -114,8 +114,10 @@ def get_model(model_config: ModelConfig) -> nn.Module:
     if model_class in _MODEL_CLASSES_SUPPORT_QUANTIZATION:
         model = model_class(model_config.hf_config, quant_config)
     else:
+        # TODO: change for other models
        model = model_class(model_config.hf_config, device=model_config.device,
-                           max_model_len=model_config.max_model_len)
+                           max_model_len=model_config.max_model_len,
+                           load_in_low_bit=model_config.load_in_low_bit)
     # Load the weights from the cached or downloaded files.
     model.load_weights(model_config.model, model_config.download_dir,
                        model_config.load_format, model_config.revision)
diff --git a/python/llm/src/bigdl/llm/vllm/model_executor/models/bigdl_chatglm.py b/python/llm/src/bigdl/llm/vllm/model_executor/models/bigdl_chatglm.py
index d06bb361..c20e57e5 100644
--- a/python/llm/src/bigdl/llm/vllm/model_executor/models/bigdl_chatglm.py
+++ b/python/llm/src/bigdl/llm/vllm/model_executor/models/bigdl_chatglm.py
@@ -58,23 +58,28 @@ class BigDLChatGLMForCausalLM(BigDLModelForCausalLM):
         config,
         device: Optional[str] = None,
         max_model_len: Optional[int] = None,
+        load_in_low_bit: str = 'sym_int4'
     ):
         super().__init__(config, device, max_model_len)
         self.config = config
         # TODO(gc): later change this to a switch?
-        if True:
-            from bigdl.llm.transformers import AutoModelForCausalLM
-            from bigdl.llm import optimize_model
+        from bigdl.llm.transformers import AutoModelForCausalLM
+        torch_dtype = 'auto'

-        # low_bit = 'sym_int4'
+        if load_in_low_bit == 'bf16':
+            torch_dtype = torch.bfloat16
+        elif load_in_low_bit == 'fp16':
+            torch_dtype = torch.float16
         if device == 'cpu':
-            model = AutoModelForCausalLM.from_pretrained(
+            self.model = AutoModelForCausalLM.from_pretrained(
                 config._name_or_path,
+                load_in_low_bit=load_in_low_bit,
+                torch_dtype=torch_dtype,
                 low_cpu_mem_usage=True,
                 trust_remote_code=True,
                 use_cache=True,
             )
-            self.model = optimize_model(model)
+            # self.model = optimize_model(model)
             self.sampler = BigDLSampler(config.vocab_size, device)
         elif device == 'xpu':
             try:
@@ -83,10 +88,10 @@ class BigDLChatGLMForCausalLM(BigDLModelForCausalLM):
                 print("Intel Extension for PyTorch is not installed, \
                     but is required for xpu inference.")

-            low_bit = 'sym_int4'
             model = AutoModelForCausalLM.from_pretrained(
                 config._name_or_path,
-                load_in_low_bit=low_bit,
+                load_in_low_bit=load_in_low_bit,
+                torch_dtype=torch_dtype,
                 trust_remote_code=True,
                 optimize_model=True,
                 use_cache=True,
diff --git a/python/llm/src/bigdl/llm/vllm/model_executor/models/bigdl_llama.py b/python/llm/src/bigdl/llm/vllm/model_executor/models/bigdl_llama.py
index 32af5970..e5d66f37 100644
--- a/python/llm/src/bigdl/llm/vllm/model_executor/models/bigdl_llama.py
+++ b/python/llm/src/bigdl/llm/vllm/model_executor/models/bigdl_llama.py
@@ -63,20 +63,31 @@ class BigDLLlamaForCausalLM(BigDLModelForCausalLM):
         config: LlamaConfig,
         device: Optional[str] = None,
         max_model_len: Optional[int] = None,
+        load_in_low_bit: str = 'sym_int4'
     ):
         super().__init__(config, device, max_model_len)
         self.config = config
         # Always enable bigdl-llm model
         from bigdl.llm.transformers import AutoModelForCausalLM
-        from bigdl.llm import optimize_model
+        # TODO: we will need to pass this argument through a command-line argument
+        # from bigdl.llm import optimize_model
+        torch_dtype = 'auto'
+
+        if load_in_low_bit == 'bf16':
+            torch_dtype = torch.bfloat16
+        elif load_in_low_bit == 'fp16':
+            torch_dtype = torch.float16
+        # bf16 requires setting torch_dtype to torch.bfloat16
         if device == 'cpu':
-            model = AutoModelForCausalLM.from_pretrained(
+            self.model = AutoModelForCausalLM.from_pretrained(
                 config._name_or_path,
+                load_in_low_bit=load_in_low_bit,
+                torch_dtype=torch_dtype,
                 low_cpu_mem_usage=True,
                 trust_remote_code=True,
                 use_cache=True,
             )
-            self.model = optimize_model(model)
+            # self.model = optimize_model(model)
             self.sampler = BigDLSampler(config.vocab_size, device)
         elif device == 'xpu':
             try:
@@ -85,10 +96,10 @@ class BigDLLlamaForCausalLM(BigDLModelForCausalLM):
                 print("Intel Extension for PyTorch is not installed, \
                     but is required for xpu inference.")

-            low_bit = 'sym_int4'
             model = AutoModelForCausalLM.from_pretrained(
                 config._name_or_path,
-                load_in_low_bit=low_bit,
+                load_in_low_bit=load_in_low_bit,
+                torch_dtype=torch_dtype,
                 trust_remote_code=True,
                 use_cache=True,
             )
diff --git a/python/llm/src/bigdl/llm/vllm/model_executor/models/bigdl_mistral.py b/python/llm/src/bigdl/llm/vllm/model_executor/models/bigdl_mistral.py
index 4e4c4bcd..9bdf572a 100644
--- a/python/llm/src/bigdl/llm/vllm/model_executor/models/bigdl_mistral.py
+++ b/python/llm/src/bigdl/llm/vllm/model_executor/models/bigdl_mistral.py
@@ -58,23 +58,31 @@ class BigDLMistralForCausalLM(BigDLModelForCausalLM):
         config,
         device: Optional[str] = None,
         max_model_len: Optional[int] = None,
+        load_in_low_bit: str = 'sym_int4'
     ):
         super().__init__(config, device, max_model_len)
         self.config = config
         # TODO(gc): later change this to a switch?
-        if True:
-            from bigdl.llm.transformers import AutoModelForCausalLM
-            from bigdl.llm import optimize_model
+        from bigdl.llm.transformers import AutoModelForCausalLM
+        # from bigdl.llm import optimize_model
+
+        torch_dtype = 'auto'
+
+        if load_in_low_bit == 'bf16':
+            torch_dtype = torch.bfloat16
+        elif load_in_low_bit == 'fp16':
+            torch_dtype = torch.float16

-        # low_bit = 'sym_int4'
         if device == 'cpu':
-            model = AutoModelForCausalLM.from_pretrained(
+            self.model = AutoModelForCausalLM.from_pretrained(
                 config._name_or_path,
+                load_in_low_bit=load_in_low_bit,
+                torch_dtype=torch_dtype,
                 low_cpu_mem_usage=True,
                 trust_remote_code=True,
                 use_cache=True,
             )
-            self.model = optimize_model(model)
+            # self.model = optimize_model(model)
             self.sampler = BigDLSampler(config.vocab_size, device)
         elif device == 'xpu':
             try:
@@ -83,10 +91,10 @@ class BigDLMistralForCausalLM(BigDLModelForCausalLM):
                 print("Intel Extension for PyTorch is not installed, \
                     but is required for xpu inference.")

-            low_bit = 'sym_int4'
             model = AutoModelForCausalLM.from_pretrained(
                 config._name_or_path,
-                load_in_low_bit=low_bit,
+                load_in_low_bit=load_in_low_bit,
+                torch_dtype=torch_dtype,
                 trust_remote_code=True,
                 optimize_model=True,
                 use_cache=True,
diff --git a/python/llm/src/bigdl/llm/vllm/model_executor/models/bigdl_mixtral.py b/python/llm/src/bigdl/llm/vllm/model_executor/models/bigdl_mixtral.py
index a300aeb2..c5be91c3 100644
--- a/python/llm/src/bigdl/llm/vllm/model_executor/models/bigdl_mixtral.py
+++ b/python/llm/src/bigdl/llm/vllm/model_executor/models/bigdl_mixtral.py
@@ -58,23 +58,29 @@ class BigDLMixtralForCausalLM(BigDLModelForCausalLM):
         config,
         device: Optional[str] = None,
         max_model_len: Optional[int] = None,
+        load_in_low_bit: str = 'sym_int4'
     ):
         super().__init__(config, device, max_model_len)
         self.config = config
         # TODO(gc): later change this to a switch?
-        if True:
-            from bigdl.llm.transformers import AutoModelForCausalLM
-            from bigdl.llm import optimize_model
+        from bigdl.llm.transformers import AutoModelForCausalLM
+
+        torch_dtype = 'auto'
+
+        if load_in_low_bit == 'bf16':
+            torch_dtype = torch.bfloat16
+        elif load_in_low_bit == 'fp16':
+            torch_dtype = torch.float16

-        # low_bit = 'sym_int4'
         if device == 'cpu':
-            model = AutoModelForCausalLM.from_pretrained(
+            self.model = AutoModelForCausalLM.from_pretrained(
                 config._name_or_path,
+                load_in_low_bit=load_in_low_bit,
+                torch_dtype=torch_dtype,
                 low_cpu_mem_usage=True,
                 trust_remote_code=True,
                 use_cache=True,
             )
-            self.model = optimize_model(model)
             self.sampler = BigDLSampler(config.vocab_size, device)
         elif device == 'xpu':
             try:
@@ -83,10 +89,10 @@ class BigDLMixtralForCausalLM(BigDLModelForCausalLM):
                 print("Intel Extension for PyTorch is not installed, \
                     but is required for xpu inference.")

-            low_bit = 'sym_int4'
             model = AutoModelForCausalLM.from_pretrained(
                 config._name_or_path,
-                load_in_low_bit=low_bit,
+                load_in_low_bit=load_in_low_bit,
+                torch_dtype=torch_dtype,
                 trust_remote_code=True,
                 optimize_model=True,
                 use_cache=True,
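
For reference, a minimal offline-inference sketch using the new `load_in_low_bit` option, mirroring the updated `offline_inference.py` examples above. `YOUR_MODEL_PATH` and the prompt are placeholders, and the `SamplingParams` import path is assumed to mirror upstream vLLM's module layout; adjust if the port places it elsewhere.

```python
from bigdl.llm.vllm.entrypoints.llm import LLM
# Assumed location, mirroring vLLM's `vllm.sampling_params`.
from bigdl.llm.vllm.sampling_params import SamplingParams

# Placeholder path -- point this at a local checkpoint, e.g. Llama-2-7b-chat-hf.
llm = LLM(model="YOUR_MODEL_PATH",
          device="cpu",                 # or "xpu" with intel_extension_for_pytorch installed
          load_in_low_bit="sym_int4")   # 'bf16'/'fp16' additionally switch torch_dtype

sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# generate() returns a list of RequestOutput objects containing the prompt and generated text.
outputs = llm.generate(["What is AI?"], sampling_params)
for output in outputs:
    print(output.outputs[0].text)
```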