diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/README.md
index 316c1e5f..5e7658ec 100644
--- a/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/README.md
+++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/README.md
@@ -28,7 +28,7 @@
 conda create -n llm python=3.9 # recommend to use Python 3.9
 conda activate llm
 pip install --pre --upgrade bigdl-llm[all] # install the latest bigdl-llm nightly build with 'all' option
-pip install transformers==4.34.0 # upgrade transformers
+pip install transformers==4.36.0 # upgrade transformers
 ```
 ### 2. Run
 After setting up the Python environment, you could run the example by following steps.
diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/README.md b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/README.md
index adf42f5f..db888290 100644
--- a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/README.md
+++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/README.md
@@ -28,7 +28,7 @@
 conda activate llm
 # below command will install intel_extension_for_pytorch==2.1.10+xpu as default
 pip install --pre --upgrade bigdl-llm[xpu] -f https://developer.intel.com/ipex-whl-stable-xpu
-pip install transformers==4.34.0 # upgrade transformers
+pip install transformers==4.36.0 # upgrade transformers
 ```

 ### 2. Configures OneAPI environment variables
diff --git a/python/llm/src/bigdl/llm/transformers/gguf/models/mixtral.py b/python/llm/src/bigdl/llm/transformers/gguf/models/mixtral.py
index e54f7aa6..4ab221b8 100644
--- a/python/llm/src/bigdl/llm/transformers/gguf/models/mixtral.py
+++ b/python/llm/src/bigdl/llm/transformers/gguf/models/mixtral.py
@@ -17,7 +17,7 @@
 import os
 import torch
 from accelerate import init_empty_weights
-from accelerate.utils import set_module_tensor_to_device as fill_model
+from accelerate.utils import set_module_tensor_to_device
 from tempfile import NamedTemporaryFile
 from transformers import MixtralConfig, MixtralForCausalLM, LlamaTokenizer

@@ -53,21 +53,36 @@ def load_gguf_mixtral(loader: GGUFFileLoader, dtype: torch.dtype = torch.float):
     with init_empty_weights():
         model = MixtralForCausalLM(mixtral_config)

+    # define an operator function that is passed to the low-level GGUF API
     def process_mixtral(name, tensor):
+        # resolve the tensor's module name in the transformers model
         module_name = get_mixtral_module_name(name)
+        # prepare the module's weight in the layout transformers expects
         if 'ffn_gate_inp' in name:
-            # gguf weight needs to reshape for ffn_gate_inp
-            fill_model(model,
-                       module_name,
-                       "cpu",
-                       tensor.reshape(num_local_experts, hidden_size),
-                       dtype=dtype)
-        else:
-            fill_model(model,
-                       module_name,
-                       "cpu",
-                       tensor,
-                       dtype=dtype)
+            tensor = tensor.reshape(num_local_experts, hidden_size)
+        elif name.endswith("attn_q.weight"):
+            # undo llama.cpp's rope permutation of the query weight
+            head, hd_size = tensor.shape[0], tensor.shape[1:]
+            tensor = (tensor.reshape(n_head,
+                                     head // n_head // 2,
+                                     2,
+                                     *hd_size)
+                      .swapaxes(1, 2)
+                      .reshape(tensor.shape))
+        elif name.endswith("attn_k.weight"):
+            # undo llama.cpp's rope permutation of the key weight
+            head, hd_size = tensor.shape[0], tensor.shape[1:]
+            tensor = (tensor.reshape(n_head_kv,
+                                     head // n_head_kv // 2,
+                                     2,
+                                     *hd_size)
+                      .swapaxes(1, 2)
+                      .reshape(tensor.shape))
+        set_module_tensor_to_device(model,
+                                    module_name,
+                                    "cpu",
+                                    tensor,
+                                    dtype=dtype)

     tensor_loader = loader.tensor_loader
     tensor_loader.load_while_process(process_mixtral)
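
Note on the new `attn_q.weight` / `attn_k.weight` branches: llama.cpp's GGUF converter stores Q/K projection weights with each head's rows regrouped so GGML can apply rotary embeddings to interleaved pairs, while transformers expects the split-halves layout; the `reshape`/`swapaxes`/`reshape` chain undoes that export-time permutation, using `n_head` for Q and `n_head_kv` for K since Mixtral uses grouped-query attention. Below is a self-contained round-trip sketch; `permute` and `reverse_permute` are illustrative names, not functions from this patch:

```python
import torch

def permute(w: torch.Tensor, n_head: int) -> torch.Tensor:
    # llama.cpp-style export permutation: regroup each head's rows from
    # (head_dim/2, 2) to (2, head_dim/2) so rope sees interleaved pairs
    return (w.reshape(n_head, 2, w.shape[0] // n_head // 2, *w.shape[1:])
             .swapaxes(1, 2)
             .reshape(w.shape))

def reverse_permute(w: torch.Tensor, n_head: int) -> torch.Tensor:
    # inverse of the above; this is the reshape/swapaxes/reshape chain that
    # process_mixtral applies to attn_q (n_head) and attn_k (n_head_kv)
    return (w.reshape(n_head, w.shape[0] // n_head // 2, 2, *w.shape[1:])
             .swapaxes(1, 2)
             .reshape(w.shape))

n_head, head_dim, hidden = 4, 8, 32
w = torch.randn(n_head * head_dim, hidden)  # toy attn_q.weight
assert torch.equal(reverse_permute(permute(w, n_head), n_head), w)
```

Skipping the inverse permutation fails silently: every shape check passes, but attention output is wrong, which is presumably the bug this patch fixes for GGUF Mixtral.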
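For context on the loading pattern itself: `init_empty_weights()` creates the Mixtral skeleton on the meta device without allocating weight memory, and `set_module_tensor_to_device` (previously imported under the alias `fill_model`) then materializes one tensor at a time as `load_while_process` streams the GGUF file. A minimal sketch of that pattern, assuming a tiny made-up config (the real code builds `MixtralConfig` from the GGUF metadata):

```python
import torch
from accelerate import init_empty_weights
from accelerate.utils import set_module_tensor_to_device
from transformers import MixtralConfig, MixtralForCausalLM

# Tiny, made-up config so the sketch runs quickly; load_gguf_mixtral
# derives the real values from fields in the GGUF header.
config = MixtralConfig(vocab_size=256, hidden_size=64, intermediate_size=128,
                       num_hidden_layers=2, num_attention_heads=4,
                       num_key_value_heads=2)

# 1. Build the model skeleton on the meta device: no weight memory is
#    allocated, so even an 8x7B model "fits" at this point.
with init_empty_weights():
    model = MixtralForCausalLM(config)

# 2. Materialize tensors one by one as they are read from the file,
#    mirroring what process_mixtral does for each gguf tensor.
weight = torch.randn(config.vocab_size, config.hidden_size)  # stand-in tensor
set_module_tensor_to_device(model, "model.embed_tokens.weight", "cpu",
                            weight, dtype=torch.float)
```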