LLM: Enable GGUF float16 and Yuan2 model (#10372)

* enable float16

* add yuan files

* enable yuan

* enable setting low_bit on yuan2

* update

* update license

* update generate

* update readme

* update python style

* update
Wang, Jian4 2024-03-13 10:19:18 +08:00 committed by GitHub
parent f5d65203c0
commit 0193f29411
10 changed files with 1345 additions and 7 deletions


@@ -21,7 +21,7 @@ SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )"
PYTHON_ROOT_DIR="$SCRIPT_DIR/.."
echo $PYTHON_ROOT_DIR
PATHS_TO_CHECK="$SCRIPT_DIR/../../src"
-PATTERNS_TO_EXCLUDE="__init__.py,log4Error.py,$SCRIPT_DIR/../../src/bigdl/llm/langchain/*"
+PATTERNS_TO_EXCLUDE="__init__.py,log4Error.py,$SCRIPT_DIR/../../src/bigdl/llm/langchain/*,$SCRIPT_DIR/../../src/bigdl/llm/transformers/gguf/models/model_implement/yuan2/*"
PEP8_REPORT_PATH="$PYTHON_ROOT_DIR/test/pep8-report.txt"
PYLINT_REPORT_PATH="$PYTHON_ROOT_DIR/test/pylint-report.txt"
PYLINT_INSTALL_INFO="$PYTHON_ROOT_DIR/test/pylint-info.txt"


@@ -9,6 +9,7 @@ In this directory, you will find examples on how to load GGUF model into `bigdl-llm`
- [Bloomz-7b1-GGUF](https://huggingface.co/hzjane/bloomz-7b1-gguf)
- [falcon-7b-quantized-gguf](https://huggingface.co/xaviviro/falcon-7b-quantized-gguf/tree/main)
- [mpt-7b-chat-gguf](https://huggingface.co/maddes8cht/mosaicml-mpt-7b-chat-gguf/tree/main)
+- [Yuan2-2B-Februa-hf-GGUF](https://huggingface.co/IEITYuan/Yuan2-2B-Februa-hf-GGUF/tree/main)
## Requirements
To run these examples with BigDL-LLM, we have some recommended requirements for your machine; please refer to [here](../../../README.md#system-support) for more information.
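
A minimal end-to-end sketch of how the examples in this folder load one of the GGUF files listed above (the local file name is hypothetical, and `low_bit` is the argument added in this commit):

```python
import torch
from bigdl.llm.transformers import AutoModelForCausalLM

# Hypothetical path to a downloaded GGUF file from the list above
model_path = "./yuan2-2b-februa-hf-q4_0.gguf"

# Load the GGUF model and vocab, then convert them into a bigdl-llm model
# and a Hugging Face tokenizer; low_bit selects the target precision.
model, tokenizer = AutoModelForCausalLM.from_gguf(model_path, low_bit="sym_int4")

with torch.inference_mode():
    input_ids = tokenizer.encode("What is AI?", return_tensors="pt")
    output = model.generate(input_ids, max_new_tokens=32)
    print(tokenizer.decode(output[0], skip_special_tokens=True))
```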


@@ -36,13 +36,16 @@ if __name__ == '__main__':
help='Prompt to infer')
parser.add_argument('--n-predict', type=int, default=32,
help='Max tokens to predict')
+    parser.add_argument('--low_bit', type=str, default="sym_int4",
+                        help='Which low_bit precision to run bigdl-llm with')
args = parser.parse_args()
model_path = args.model
# Load gguf model and vocab, then convert them to bigdl-llm model and huggingface tokenizer
-    model, tokenizer = AutoModelForCausalLM.from_gguf(model_path)
+    model, tokenizer = AutoModelForCausalLM.from_gguf(model_path, low_bit=args.low_bit)
# Generate predicted tokens
with torch.inference_mode():


@@ -19,6 +19,7 @@ from bigdl.llm.utils.common import invalidInputError
qtype_map = {
1: "fp16", # float16
2: "sym_int4", # q4_0
3: "asym_int4", # q4_1
7: "sym_int8", # q8_0
@@ -27,7 +28,7 @@ qtype_map = {
}
-def load_gguf_model(fpath: str, dtype: torch.dtype = torch.float):
+def load_gguf_model(fpath: str, dtype: torch.dtype = torch.float, low_bit: str = "sym_int4"):
from .gguf import GGUFFileLoader
loader = GGUFFileLoader(fpath)
@@ -48,6 +49,9 @@ def load_gguf_model(fpath: str, dtype: torch.dtype = torch.float):
elif "mistral" in general_name:
from .models.mistral import load_gguf_mistral
model, tokenizer = load_gguf_mistral(loader, dtype)
elif "yuan" in general_name:
from .models.yuan2 import load_gguf_yuan
model, tokenizer = load_gguf_yuan(loader, dtype)
else:
from .models.llama import load_gguf_llama
model, tokenizer = load_gguf_llama(loader, dtype)
@@ -66,4 +70,4 @@ def load_gguf_model(fpath: str, dtype: torch.dtype = torch.float):
else:
invalidInputError(False, f"Unsupported model family: {model_family}")
-    return model, tokenizer, low_bit
+    return model, tokenizer
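
The new `elif` branch above hooks Yuan2 into the existing dispatch on the GGUF `general.name` metadata field; a simplified, standalone sketch of that routing (the returned strings are just labels for illustration, not the real loader functions, and the lower-casing is added here, not taken from the source):

```python
def pick_family_loader(general_name: str) -> str:
    # Mirrors the substring matching in load_gguf_model above: the GGUF
    # metadata name decides which per-family loader module gets imported.
    name = general_name.lower()
    if "mistral" in name:
        return "models.mistral.load_gguf_mistral"
    elif "yuan" in name:
        return "models.yuan2.load_gguf_yuan"
    else:
        return "models.llama.load_gguf_llama"  # llama is the fallback family

print(pick_family_loader("Yuan2-2B-Februa-hf"))  # models.yuan2.load_gguf_yuan
```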


@@ -268,7 +268,7 @@ class GGUFTensorLoader:
return tensor.view(torch.float)
def convert_f16_tensor(self, tensor: torch.Tensor, size: int, ndims: int, dims: int):
-        return tensor.view(torch.half)
+        return tensor.view(torch.half).reshape(dims)
def convert_q4_0_tensor(self, tensor: torch.Tensor, size: int, ndims: int, dims: int):
# see https://github.com/ggerganov/llama.cpp/blob
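
The `.reshape(dims)` added above matters because a GGUF tensor is read as a flat buffer and only regains its shape from the dims recorded in the file; a standalone illustration with toy sizes (not a real GGUF tensor):

```python
import torch

dims = (4, 8)                                  # toy shape, stands in for the GGUF-recorded dims
original = torch.randn(dims, dtype=torch.half)

# Reading raw bytes and viewing them as float16 only reinterprets the data
# and still yields a 1-D tensor; reshape restores the recorded dimensions.
raw_bytes = bytearray(original.numpy().tobytes())
flat = torch.frombuffer(raw_bytes, dtype=torch.uint8).view(torch.half)  # shape (32,)

restored = flat.reshape(dims)                  # shape restored to (4, 8)
assert torch.equal(restored, original)
```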


@@ -0,0 +1,15 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#


@@ -0,0 +1,63 @@
#
# Copyright 2016 The BigDL Authors.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from transformers.configuration_utils import PretrainedConfig
class YuanConfig(PretrainedConfig):
model_type = "yuan"
keys_to_ignore_at_inference = ["past_key_values"]
def __init__(
self,
vocab_size=135040,
hidden_size=2048,
intermediate_size=8192,
num_hidden_layers=24,
num_attention_heads=32,
hidden_act="silu",
model_max_length=8192,
initializer_range=0.02,
rms_norm_eps=1e-6,
use_cache=True,
pad_token_id=77185,
bos_token_id=77185,
eos_token_id=77185,
tie_word_embeddings=True,
**kwargs,
):
self.vocab_size = vocab_size
self.model_max_length = model_max_length
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.hidden_act = hidden_act
self.initializer_range = initializer_range
self.rms_norm_eps = rms_norm_eps
self.use_cache = use_cache
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)


@@ -0,0 +1,130 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
import torch
from accelerate import init_empty_weights
from accelerate.utils import set_module_tensor_to_device
from tempfile import NamedTemporaryFile
from transformers import LlamaTokenizer
from ..gguf import GGUFFileLoader
from .model_implement.yuan2.yuan_hf_model import YuanForCausalLM
from .model_implement.yuan2.configuration_yuan import YuanConfig
def load_gguf_yuan(loader: GGUFFileLoader, dtype: torch.dtype = torch.float,
low_bit='sym_int4'):
config = loader.config
yuan_config = YuanConfig(
vocab_size=len(config['tokenizer.ggml.tokens']),
hidden_size=config['llama.embedding_length'],
intermediate_size=config['llama.feed_forward_length'],
num_hidden_layers=config['llama.block_count'],
num_attention_heads=config['llama.attention.head_count'],
hidden_act="silu",
model_max_length=config['llama.context_length'],
rms_norm_eps=config['llama.attention.layer_norm_rms_epsilon'],
use_cache=True,
initializer_range=0.02,
pad_token_id=config['tokenizer.ggml.padding_token_id'],
bos_token_id=config['tokenizer.ggml.bos_token_id'],
eos_token_id=config['tokenizer.ggml.eos_token_id'],
eod_token=config['tokenizer.ggml.eos_token_id'],
eod_token_id=config['tokenizer.ggml.eos_token_id'],
sep_token=config['tokenizer.ggml.seperator_token_id'],
sep_token_id=config['tokenizer.ggml.seperator_token_id'],
mask_token_id=config['tokenizer.ggml.seperator_token_id'],
use_loss_mask=False,
dropout=0.1,
reset_attention_mask=True,
reset_position_ids=True,
max_position_embeddings=config['llama.context_length'],
causal_mask=True,
use_flash_attention=False,
pretraining_tp=1,
)
ckpt = loader.tensors(dtype)
n_head = config['llama.attention.head_count']
n_head_kv = config['llama.attention.head_count_kv']
state_dict = {}
state_dict['model.embed_tokens.weight'] = ckpt['token_embd.weight']
state_dict['model.norm.weight'] = ckpt['output_norm.weight']
state_dict['lm_head.weight'] = ckpt['output.weight']
for i in range(config['llama.block_count']):
state_dict[f'model.layers.{i}.self_attn.q_proj.weight'] = \
ckpt[f'blk.{i}.attn_q.weight']
state_dict[f'model.layers.{i}.self_attn.k_proj.weight'] = \
ckpt[f'blk.{i}.attn_k.weight']
state_dict[f'model.layers.{i}.self_attn.v_proj.weight'] = \
ckpt[f'blk.{i}.attn_v.weight']
state_dict[f'model.layers.{i}.self_attn.o_proj.weight'] = \
ckpt[f'blk.{i}.attn_output.weight']
state_dict[f'model.layers.{i}.mlp.gate_proj.weight'] = \
ckpt[f'blk.{i}.ffn_gate.weight']
state_dict[f'model.layers.{i}.mlp.up_proj.weight'] = \
ckpt[f'blk.{i}.ffn_up.weight']
state_dict[f'model.layers.{i}.mlp.down_proj.weight'] = \
ckpt[f'blk.{i}.ffn_down.weight']
state_dict[f'model.layers.{i}.input_layernorm.weight'] = \
ckpt[f'blk.{i}.attn_norm.weight']
state_dict[f'model.layers.{i}.post_attention_layernorm.weight'] = \
ckpt[f'blk.{i}.ffn_norm.weight']
state_dict[f'model.layers.{i}.self_attn.lf_gate.output_layernorm.weight'] = \
ckpt[f'blk.{i}.lf_output_norm.weight']
state_dict[f'model.layers.{i}.self_attn.lf_gate.conv1.weight'] = \
ckpt[f'blk.{i}.conv1.weight']
state_dict[f'model.layers.{i}.self_attn.lf_gate.conv2.weight'] = \
ckpt[f'blk.{i}.conv2.weight']
state_dict[f'model.layers.{i}.self_attn.lf_gate.conv1.bias'] = \
ckpt[f'blk.{i}.conv1.bias']
state_dict[f'model.layers.{i}.self_attn.lf_gate.conv2.bias'] = \
ckpt[f'blk.{i}.conv2.bias']
with init_empty_weights():
model = YuanForCausalLM(yuan_config).eval()
for name, weight in state_dict.items():
set_module_tensor_to_device(model, name, "cpu", weight, dtype=dtype)
model = model.cpu()
# see https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
from transformers.convert_slow_tokenizer import import_protobuf
spm_pb2 = import_protobuf("Failed to import protobuf")
pieces = loader.tokenizer_pieces()
trainer_spec = spm_pb2.TrainerSpec(byte_fallback=True,
model_type=spm_pb2.TrainerSpec.ModelType.BPE)
proto = spm_pb2.ModelProto(pieces=pieces, trainer_spec=trainer_spec)
proto = proto.SerializeToString()
with NamedTemporaryFile(delete=False) as f:
f.write(proto)
f.close()
tokenizer = LlamaTokenizer(f.name)
os.remove(f.name)
tokenizer.add_eos_token = False
tokenizer.add_bos_token = False
tokenizer.eos_token = '<eod>'
tokenizer.add_tokens(['<sep>', '<pad>', '<mask>', '<predict>', '<FIM_SUFFIX>', '<FIM_PREFIX>', '<FIM_MIDDLE>', '<commit_before>', # noqa
'<commit_msg>', '<commit_after>', '<jupyter_start>', '<jupyter_text>', '<jupyter_code>', '<jupyter_output>', '<empty_output>'], special_tokens=True) # noqa
return model, tokenizer
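
A quick sanity check of the tokenizer assembled above (the GGUF file path is hypothetical; '<eod>' and '<sep>' are among the tokens registered at the end of `load_gguf_yuan`):

```python
from bigdl.llm.transformers import AutoModelForCausalLM

# Hypothetical Yuan2 GGUF file path
model, tokenizer = AutoModelForCausalLM.from_gguf("./yuan2-2b-februa-hf-q4_0.gguf")

# '<eod>' is set as the EOS token and '<sep>' is an added special token,
# so both should encode to single ids rather than being split into sub-pieces.
ids = tokenizer("Hello<sep>world<eod>").input_ids
print(tokenizer.convert_ids_to_tokens(ids))
print(tokenizer.eos_token, tokenizer.eos_token_id)
```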


@@ -319,7 +319,8 @@ class _BaseAutoModelClass:
return model
@staticmethod
-    def from_gguf(fpath: str, optimize_model: bool = True, cpu_embedding: bool = False):
+    def from_gguf(fpath: str, optimize_model: bool = True,
+                  cpu_embedding: bool = False, low_bit: str = "sym_int4"):
"""
        Load gguf model and tokenizer and convert them to a bigdl-llm model and huggingface tokenizer
@@ -332,7 +333,7 @@
"""
from bigdl.llm.optimize import optimize_model as optimize_model_fn
-        model, tokenizer, low_bit = load_gguf_model(fpath, dtype=torch.half)
+        model, tokenizer = load_gguf_model(fpath, dtype=torch.half, low_bit=low_bit)
model = optimize_model_fn(model, low_bit=low_bit, optimize_llm=optimize_model,
cpu_embedding=cpu_embedding)
return model, tokenizer
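
With this signature change the caller picks the precision instead of it being inferred from the GGUF file type, which is what enables the float16 path; a hedged sketch (file paths hypothetical, and "fp16" assumed to be an accepted `low_bit` value, in line with the new entry in `qtype_map`):

```python
from bigdl.llm.transformers import AutoModelForCausalLM

# Keep a float16 GGUF checkpoint in fp16 instead of quantizing it.
model, tokenizer = AutoModelForCausalLM.from_gguf("./yuan2-2b-februa-hf-fp16.gguf",
                                                  low_bit="fp16")

# The default remains 4-bit symmetric quantization:
model_int4, _ = AutoModelForCausalLM.from_gguf("./yuan2-2b-februa-hf-fp16.gguf")
```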