diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/README.md
index 316c1e5f..5e7658ec 100644
--- a/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/README.md
+++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/README.md
@@ -28,7 +28,7 @@
 conda create -n llm python=3.9 # recommend to use Python 3.9
 conda activate llm
 pip install --pre --upgrade bigdl-llm[all] # install the latest bigdl-llm nightly build with 'all' option
-pip install transformers==4.34.0 # upgrade transformers
+pip install transformers==4.36.0 # upgrade transformers
 ```
 ### 2. Run
 After setting up the Python environment, you could run the example by following steps.
diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/README.md b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/README.md
index adf42f5f..db888290 100644
--- a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/README.md
+++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/README.md
@@ -28,7 +28,7 @@
 conda activate llm
 # below command will install intel_extension_for_pytorch==2.1.10+xpu as default
 pip install --pre --upgrade bigdl-llm[xpu] -f https://developer.intel.com/ipex-whl-stable-xpu
-pip install transformers==4.34.0 # upgrade transformers
+pip install transformers==4.36.0 # upgrade transformers
 ```

 ### 2. Configures OneAPI environment variables
diff --git a/python/llm/src/bigdl/llm/transformers/gguf/models/mixtral.py b/python/llm/src/bigdl/llm/transformers/gguf/models/mixtral.py
index e54f7aa6..4ab221b8 100644
--- a/python/llm/src/bigdl/llm/transformers/gguf/models/mixtral.py
+++ b/python/llm/src/bigdl/llm/transformers/gguf/models/mixtral.py
@@ -17,7 +17,7 @@
 import os
 import torch
 from accelerate import init_empty_weights
-from accelerate.utils import set_module_tensor_to_device as fill_model
+from accelerate.utils import set_module_tensor_to_device
 from tempfile import NamedTemporaryFile
 from transformers import MixtralConfig, MixtralForCausalLM, LlamaTokenizer

@@ -53,21 +53,36 @@ def load_gguf_mixtral(loader: GGUFFileLoader, dtype: torch.dtype = torch.float):
     with init_empty_weights():
         model = MixtralForCausalLM(mixtral_config)

+    # define an operator function that is passed to the low-level GGUF API
     def process_mixtral(name, tensor):
+        # resolve the tensor's module name in the transformers model
         module_name = get_mixtral_module_name(name)
+        # prepare the module's weight in the layout transformers expects
         if 'ffn_gate_inp' in name:
-            # gguf weight needs to reshape for ffn_gate_inp
-            fill_model(model,
-                       module_name,
-                       "cpu",
-                       tensor.reshape(num_local_experts, hidden_size),
-                       dtype=dtype)
-        else:
-            fill_model(model,
-                       module_name,
-                       "cpu",
-                       tensor,
-                       dtype=dtype)
+            tensor = tensor.reshape(num_local_experts, hidden_size)
+        elif name.endswith("attn_q.weight"):
+            # undo llama.cpp's rope permutation of the query weight
+            head, hd_size = tensor.shape[0], tensor.shape[1:]
+            tensor = (tensor.reshape(n_head,
+                                     head // n_head // 2,
+                                     2,
+                                     *hd_size)
+                      .swapaxes(1, 2)
+                      .reshape(tensor.shape))
+        elif name.endswith("attn_k.weight"):
+            # undo llama.cpp's rope permutation of the key weight
+            head, hd_size = tensor.shape[0], tensor.shape[1:]
+            tensor = (tensor.reshape(n_head_kv,
+                                     head // n_head_kv // 2,
+                                     2,
+                                     *hd_size)
+                      .swapaxes(1, 2)
+                      .reshape(tensor.shape))
+        set_module_tensor_to_device(model,
+                                    module_name,
+                                    "cpu",
+                                    tensor,
+                                    dtype=dtype)

     tensor_loader = loader.tensor_loader
     tensor_loader.load_while_process(process_mixtral)
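
Note on the new `attn_q.weight` / `attn_k.weight` branches: llama.cpp's GGUF converter stores Q/K projection weights with each head's rows regrouped so GGML can apply rotary embeddings to interleaved pairs, while transformers expects the split-halves layout; the `reshape`/`swapaxes`/`reshape` chain undoes that export-time permutation, using `n_head` for Q and `n_head_kv` for K since Mixtral uses grouped-query attention. Below is a self-contained round-trip sketch; `permute` and `reverse_permute` are illustrative names, not functions from this patch:

```python
import torch

def permute(w: torch.Tensor, n_head: int) -> torch.Tensor:
    # llama.cpp-style export permutation: regroup each head's rows from
    # (head_dim/2, 2) to (2, head_dim/2) so rope sees interleaved pairs
    return (w.reshape(n_head, 2, w.shape[0] // n_head // 2, *w.shape[1:])
             .swapaxes(1, 2)
             .reshape(w.shape))

def reverse_permute(w: torch.Tensor, n_head: int) -> torch.Tensor:
    # inverse of the above; this is the reshape/swapaxes/reshape chain that
    # process_mixtral applies to attn_q (n_head) and attn_k (n_head_kv)
    return (w.reshape(n_head, w.shape[0] // n_head // 2, 2, *w.shape[1:])
             .swapaxes(1, 2)
             .reshape(w.shape))

n_head, head_dim, hidden = 4, 8, 32
w = torch.randn(n_head * head_dim, hidden)  # toy attn_q.weight
assert torch.equal(reverse_permute(permute(w, n_head), n_head), w)
```

Skipping the inverse permutation fails silently: every shape check passes, but attention output is wrong, which is presumably the bug this patch fixes for GGUF Mixtral.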
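For context on the loading pattern itself: `init_empty_weights()` creates the Mixtral skeleton on the meta device without allocating weight memory, and `set_module_tensor_to_device` (previously imported under the alias `fill_model`) then materializes one tensor at a time as `load_while_process` streams the GGUF file. A minimal sketch of that pattern, assuming a tiny made-up config (the real code builds `MixtralConfig` from the GGUF metadata):

```python
import torch
from accelerate import init_empty_weights
from accelerate.utils import set_module_tensor_to_device
from transformers import MixtralConfig, MixtralForCausalLM

# Tiny, made-up config so the sketch runs quickly; load_gguf_mixtral
# derives the real values from fields in the GGUF header.
config = MixtralConfig(vocab_size=256, hidden_size=64, intermediate_size=128,
                       num_hidden_layers=2, num_attention_heads=4,
                       num_key_value_heads=2)

# 1. Build the model skeleton on the meta device: no weight memory is
#    allocated, so even an 8x7B model "fits" at this point.
with init_empty_weights():
    model = MixtralForCausalLM(config)

# 2. Materialize tensors one by one as they are read from the file,
#    mirroring what process_mixtral does for each gguf tensor.
weight = torch.randn(config.vocab_size, config.hidden_size)  # stand-in tensor
set_module_tensor_to_device(model, "model.embed_tokens.weight", "cpu",
                            weight, dtype=torch.float)
```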