diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/README.md
index 80f4a286..9eec31d5 100644
--- a/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/README.md
+++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/README.md
@@ -61,6 +61,7 @@ In the example, several arguments can be passed to satisfy your requirements:
 - `--model`: path to GGUF model, it should be a file with name like `llama-2-7b-chat.Q4_0.gguf`
 - `--prompt PROMPT`: argument defining the prompt to be infered (with integrated prompt format for chat). It is default to be `'What is AI?'`.
 - `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It is default to be `32`.
+- `--low_bit LOW_BIT`: argument defining which low-bit precision to load the model with. It is default to be `sym_int4`.

 #### 2.4 Sample Output
 #### [llama-2-7b-chat.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/tree/main)
diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/README.md b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/README.md
index db888290..b75bb179 100644
--- a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/README.md
+++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/README.md
@@ -57,6 +57,7 @@ In the example, several arguments can be passed to satisfy your requirements:
 - `--model`: path to GGUF model, it should be a file with name like `llama-2-7b-chat.Q4_0.gguf`
 - `--prompt PROMPT`: argument defining the prompt to be infered (with integrated prompt format for chat). It is default to be `'What is AI?'`.
 - `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It is default to be `32`.
+- `--low_bit LOW_BIT`: argument defining which low-bit precision to load the model with. It is default to be `sym_int4`.

 #### 3.4 Sample Output
 #### [llama-2-7b-chat.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/tree/main)
diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/generate.py
index af88b2cf..e8ee9c36 100644
--- a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/generate.py
+++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/generate.py
@@ -37,6 +37,8 @@ if __name__ == '__main__':
                         help='Prompt to infer')
     parser.add_argument('--n-predict', type=int, default=32,
                         help='Max tokens to predict')
+    parser.add_argument('--low_bit', type=str, default="sym_int4",
+                        help='Which low-bit precision to load the GGUF model with')

     args = parser.parse_args()

diff --git a/python/llm/setup.py b/python/llm/setup.py
index 4ff4333d..94491423 100644
--- a/python/llm/setup.py
+++ b/python/llm/setup.py
@@ -311,7 +311,7 @@ def setup_package():
         packages=get_llm_packages(),
         package_dir={"": "src"},
         package_data={
-            "bigdl.llm": package_data[platform_name] + ["cli/prompts/*.txt"]},
+            "bigdl.llm": package_data[platform_name] + ["cli/prompts/*.txt"] + ["transformers/gguf/models/model_implement/*/*.json"]},
         include_package_data=True,
         entry_points={
             "console_scripts": [
diff --git a/python/llm/src/bigdl/llm/transformers/gguf/api.py b/python/llm/src/bigdl/llm/transformers/gguf/api.py
index 4f835ff2..020a91ba 100644
--- a/python/llm/src/bigdl/llm/transformers/gguf/api.py
+++ b/python/llm/src/bigdl/llm/transformers/gguf/api.py
@@ -37,7 +37,6 @@ def load_gguf_model(fpath: str, dtype: torch.dtype = torch.float, low_bit: str =
     qtype = loader.config["general.file_type"]
     invalidInputError(qtype in qtype_map,
                       f"Unsupported gguf quantize type: {qtype}")
-    low_bit = qtype_map.get(qtype, "sym_int4")

     with torch.no_grad():
         if model_family == "llama":
@@ -45,19 +44,19 @@
             if "mixtral" in general_name:
                 # mixtral, which also enjoys a general architecture of llama
                 from .models.mixtral import load_gguf_mixtral
-                model, tokenizer = load_gguf_mixtral(loader, dtype)
+                model, tokenizer = load_gguf_mixtral(loader, dtype, low_bit)
             elif "mistral" in general_name:
                 from .models.mistral import load_gguf_mistral
-                model, tokenizer = load_gguf_mistral(loader, dtype)
+                model, tokenizer = load_gguf_mistral(loader, dtype, low_bit)
             elif "yuan" in general_name:
                 from .models.yuan2 import load_gguf_yuan
                 model, tokenizer = load_gguf_yuan(loader, dtype)
             else:
                 from .models.llama import load_gguf_llama
-                model, tokenizer = load_gguf_llama(loader, dtype)
+                model, tokenizer = load_gguf_llama(loader, dtype, low_bit)
         elif model_family == "baichuan":
             from .models.baichuan import load_gguf_baichuan
-            model, tokenizer = load_gguf_baichuan(loader, dtype)
+            model, tokenizer = load_gguf_baichuan(loader, dtype, low_bit)
         elif model_family == "bloom":
             from .models.bloom import load_gguf_bloom
             model, tokenizer = load_gguf_bloom(loader, dtype)
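For context, a minimal usage sketch of the plumbed-through `low_bit` argument (not part of the patch). It assumes `load_gguf_model` is importable from `bigdl.llm.transformers.gguf.api`, matching the file path in the diff, and that it returns a `(model, tokenizer)` pair like the per-family loaders it dispatches to; the GGUF file path is a placeholder.

```python
# Hypothetical sketch, not part of this patch: the import path and return
# shape are assumptions based on the diff above.
import torch
from bigdl.llm.transformers.gguf.api import load_gguf_model

model, tokenizer = load_gguf_model(
    "llama-2-7b-chat.Q4_0.gguf",  # placeholder path, i.e. the --model argument
    dtype=torch.float,            # the function's default dtype
    low_bit="sym_int4",           # the new --low_bit argument
)
```

With the removal of `low_bit = qtype_map.get(qtype, "sym_int4")`, the caller-supplied `low_bit` is no longer overwritten by the quantize type read from the GGUF header, which is what makes the new `--low_bit` flag take effect.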