LLM : Enable gguf float16 and Yuan2 model (#10372)
* enable float16
* add yuan files
* enable yuan
* enable set low_bit on yuan2
* update
* update license
* update generate
* update readme
* update python style
* update
parent f5d65203c0
commit 0193f29411

10 changed files with 1345 additions and 7 deletions
@@ -21,7 +21,7 @@ SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )"
 PYTHON_ROOT_DIR="$SCRIPT_DIR/.."
 echo $PYTHON_ROOT_DIR
 PATHS_TO_CHECK="$SCRIPT_DIR/../../src"
-PATTERNS_TO_EXCLUDE="__init__.py,log4Error.py,$SCRIPT_DIR/../../src/bigdl/llm/langchain/*"
+PATTERNS_TO_EXCLUDE="__init__.py,log4Error.py,$SCRIPT_DIR/../../src/bigdl/llm/langchain/*,$SCRIPT_DIR/../../src/bigdl/llm/transformers/gguf/models/model_implement/yuan2/*"
 PEP8_REPORT_PATH="$PYTHON_ROOT_DIR/test/pep8-report.txt"
 PYLINT_REPORT_PATH="$PYTHON_ROOT_DIR/test/pylint-report.txt"
 PYLINT_INSTALL_INFO="$PYTHON_ROOT_DIR/test/pylint-info.txt"

@@ -9,6 +9,7 @@ In this directory, you will find examples on how to load GGUF model into `bigdl-llm`
 - [Bloomz-7b1-GGUF](https://huggingface.co/hzjane/bloomz-7b1-gguf)
 - [falcon-7b-quantized-gguf](https://huggingface.co/xaviviro/falcon-7b-quantized-gguf/tree/main)
 - [mpt-7b-chat-gguf](https://huggingface.co/maddes8cht/mosaicml-mpt-7b-chat-gguf/tree/main)
+- [Yuan2-2B-Februa-hf-GGUF](https://huggingface.co/IEITYuan/Yuan2-2B-Februa-hf-GGUF/tree/main)
 
 ## Requirements
 To run these examples with BigDL-LLM, we have some recommended requirements for your machine, please refer to [here](../../../README.md#system-support) for more information.

@@ -36,13 +36,16 @@ if __name__ == '__main__':
                         help='Prompt to infer')
     parser.add_argument('--n-predict', type=int, default=32,
                         help='Max tokens to predict')
+    parser.add_argument('--low_bit', type=str, default="sym_int4",
+                        help='Which low_bit precision to run bigdl-llm with')
+
 
     args = parser.parse_args()
 
     model_path = args.model
 
     # Load gguf model and vocab, then convert them to bigdl-llm model and huggingface tokenizer
-    model, tokenizer = AutoModelForCausalLM.from_gguf(model_path)
+    model, tokenizer = AutoModelForCausalLM.from_gguf(model_path, low_bit=args.low_bit)
 
     # Generate predicted tokens
     with torch.inference_mode():

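For reference, a minimal sketch of how the updated example is meant to be used (the GGUF path and prompt below are placeholders, not part of this commit):

    import torch
    from bigdl.llm.transformers import AutoModelForCausalLM

    # Load a GGUF checkpoint (f16 or quantized), convert it to a bigdl-llm model
    # plus a Hugging Face tokenizer, and pick the target precision via low_bit.
    model, tokenizer = AutoModelForCausalLM.from_gguf(
        "/path/to/yuan2-2b-februa-hf-f16.gguf",  # placeholder path
        low_bit="sym_int4",
    )

    with torch.inference_mode():
        input_ids = tokenizer.encode("What is AI?", return_tensors="pt")
        output = model.generate(input_ids, max_new_tokens=32)
        print(tokenizer.decode(output[0], skip_special_tokens=True))
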
@@ -19,6 +19,7 @@ from bigdl.llm.utils.common import invalidInputError
 
 
 qtype_map = {
+    1: "fp16",          # float16
     2: "sym_int4",      # q4_0
     3: "asym_int4",     # q4_1
     7: "sym_int8",      # q8_0
@@ -27,7 +28,7 @@ qtype_map = {
 }
 
 
-def load_gguf_model(fpath: str, dtype: torch.dtype = torch.float):
+def load_gguf_model(fpath: str, dtype: torch.dtype = torch.float, low_bit: str = "sym_int4"):
     from .gguf import GGUFFileLoader
 
     loader = GGUFFileLoader(fpath)
@@ -48,6 +49,9 @@ def load_gguf_model(fpath: str, dtype: torch.dtype = torch.float):
             elif "mistral" in general_name:
                 from .models.mistral import load_gguf_mistral
                 model, tokenizer = load_gguf_mistral(loader, dtype)
+            elif "yuan" in general_name:
+                from .models.yuan2 import load_gguf_yuan
+                model, tokenizer = load_gguf_yuan(loader, dtype)
             else:
                 from .models.llama import load_gguf_llama
                 model, tokenizer = load_gguf_llama(loader, dtype)
@@ -66,4 +70,4 @@ def load_gguf_model(fpath: str, dtype: torch.dtype = torch.float):
         else:
             invalidInputError(False, f"Unsupported model family: {model_family}")
 
-        return model, tokenizer, low_bit
+        return model, tokenizer
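In other words, the GGUF qtype describes how the file stores its weights, while the caller-supplied low_bit now decides what precision the in-memory model is optimized to (previously load_gguf_model returned a low_bit value instead of taking one). A hypothetical illustration, not BigDL code, with invented names:

    # GGUF quantization-type ids recognised after this change.
    GGUF_QTYPE_NAMES = {
        1: "fp16",       # float16
        2: "sym_int4",   # q4_0
        3: "asym_int4",  # q4_1
        7: "sym_int8",   # q8_0
    }

    def describe_load(gguf_qtype: int, low_bit: str = "sym_int4") -> str:
        """Storage format comes from the file; target precision from the caller."""
        storage = GGUF_QTYPE_NAMES.get(gguf_qtype, "unsupported")
        return f"stored as {storage}, optimized to {low_bit}"

    print(describe_load(1))                  # stored as fp16, optimized to sym_int4
    print(describe_load(1, low_bit="fp16"))  # stored as fp16, optimized to fp16
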
@@ -268,7 +268,7 @@ class GGUFTensorLoader:
         return tensor.view(torch.float)
 
     def convert_f16_tensor(self, tensor: torch.Tensor, size: int, ndims: int, dims: int):
-        return tensor.view(torch.half)
+        return tensor.view(torch.half).reshape(dims)
 
     def convert_q4_0_tensor(self, tensor: torch.Tensor, size: int, ndims: int, dims: int):
         # see https://github.com/ggerganov/llama.cpp/blob
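The added reshape matters because viewing a raw buffer as torch.half only reinterprets the bytes and yields a flat tensor; reshaping to dims restores the tensor's logical shape. A standalone sketch in plain PyTorch (not BigDL code):

    import torch

    # Pretend this is an f16 GGUF tensor with dims (4, 2), read as a flat byte buffer.
    dims = (4, 2)
    raw_bytes = torch.ones(8, dtype=torch.half).view(torch.uint8)

    flat = raw_bytes.view(torch.half)                   # torch.Size([8])
    shaped = raw_bytes.view(torch.half).reshape(dims)   # torch.Size([4, 2])
    print(flat.shape, shaped.shape)
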
@@ -0,0 +1,15 @@ (new file)
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

@@ -0,0 +1,63 @@ (new file)
#
# Copyright 2016 The BigDL Authors.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from transformers.configuration_utils import PretrainedConfig


class YuanConfig(PretrainedConfig):
    model_type = "yuan"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=135040,
        hidden_size=2048,
        intermediate_size=8192,
        num_hidden_layers=24,
        num_attention_heads=32,
        hidden_act="silu",
        model_max_length=8192,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=77185,
        bos_token_id=77185,
        eos_token_id=77185,
        tie_word_embeddings=True,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.model_max_length = model_max_length
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
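A quick usage sketch (assuming the YuanConfig above is on the import path): the defaults appear to describe the 2B Yuan2 checkpoint, and any extra keyword arguments, such as the eod_token, dropout and attention flags supplied by the GGUF loader below, are forwarded to PretrainedConfig through **kwargs and stored on the config object.

    config = YuanConfig()
    print(config.model_type)  # "yuan"
    print(config.vocab_size, config.hidden_size, config.num_hidden_layers)  # 135040 2048 24

    config = YuanConfig(use_flash_attention=False, dropout=0.1)
    print(config.use_flash_attention, config.dropout)  # False 0.1
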
										
											
(File diff suppressed because it is too large.)

python/llm/src/bigdl/llm/transformers/gguf/models/yuan2.py (new file, 130 lines)
@@ -0,0 +1,130 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os
import torch
from accelerate import init_empty_weights
from accelerate.utils import set_module_tensor_to_device
from tempfile import NamedTemporaryFile
from transformers import LlamaTokenizer

from ..gguf import GGUFFileLoader
from .model_implement.yuan2.yuan_hf_model import YuanForCausalLM
from .model_implement.yuan2.configuration_yuan import YuanConfig


def load_gguf_yuan(loader: GGUFFileLoader, dtype: torch.dtype = torch.float,
                   low_bit='sym_int4'):
    config = loader.config
    yuan_config = YuanConfig(
        vocab_size=len(config['tokenizer.ggml.tokens']),
        hidden_size=config['llama.embedding_length'],
        intermediate_size=config['llama.feed_forward_length'],
        num_hidden_layers=config['llama.block_count'],
        num_attention_heads=config['llama.attention.head_count'],
        hidden_act="silu",
        model_max_length=config['llama.context_length'],
        rms_norm_eps=config['llama.attention.layer_norm_rms_epsilon'],
        use_cache=True,
        initializer_range=0.02,
        pad_token_id=config['tokenizer.ggml.padding_token_id'],
        bos_token_id=config['tokenizer.ggml.bos_token_id'],
        eos_token_id=config['tokenizer.ggml.eos_token_id'],
        eod_token=config['tokenizer.ggml.eos_token_id'],
        eod_token_id=config['tokenizer.ggml.eos_token_id'],
        sep_token=config['tokenizer.ggml.seperator_token_id'],
        sep_token_id=config['tokenizer.ggml.seperator_token_id'],
        mask_token_id=config['tokenizer.ggml.seperator_token_id'],
        use_loss_mask=False,
        dropout=0.1,
        reset_attention_mask=True,
        reset_position_ids=True,
        max_position_embeddings=config['llama.context_length'],
        causal_mask=True,
        use_flash_attention=False,
        pretraining_tp=1,
    )

    ckpt = loader.tensors(dtype)
    n_head = config['llama.attention.head_count']
    n_head_kv = config['llama.attention.head_count_kv']

    state_dict = {}
    state_dict['model.embed_tokens.weight'] = ckpt['token_embd.weight']
    state_dict['model.norm.weight'] = ckpt['output_norm.weight']
    state_dict['lm_head.weight'] = ckpt['output.weight']
    for i in range(config['llama.block_count']):
        state_dict[f'model.layers.{i}.self_attn.q_proj.weight'] = \
            ckpt[f'blk.{i}.attn_q.weight']
        state_dict[f'model.layers.{i}.self_attn.k_proj.weight'] = \
            ckpt[f'blk.{i}.attn_k.weight']
        state_dict[f'model.layers.{i}.self_attn.v_proj.weight'] = \
            ckpt[f'blk.{i}.attn_v.weight']
        state_dict[f'model.layers.{i}.self_attn.o_proj.weight'] = \
            ckpt[f'blk.{i}.attn_output.weight']
        state_dict[f'model.layers.{i}.mlp.gate_proj.weight'] = \
            ckpt[f'blk.{i}.ffn_gate.weight']
        state_dict[f'model.layers.{i}.mlp.up_proj.weight'] = \
            ckpt[f'blk.{i}.ffn_up.weight']
        state_dict[f'model.layers.{i}.mlp.down_proj.weight'] = \
            ckpt[f'blk.{i}.ffn_down.weight']
        state_dict[f'model.layers.{i}.input_layernorm.weight'] = \
            ckpt[f'blk.{i}.attn_norm.weight']
        state_dict[f'model.layers.{i}.post_attention_layernorm.weight'] = \
            ckpt[f'blk.{i}.ffn_norm.weight']
        state_dict[f'model.layers.{i}.self_attn.lf_gate.output_layernorm.weight'] = \
            ckpt[f'blk.{i}.lf_output_norm.weight']
        state_dict[f'model.layers.{i}.self_attn.lf_gate.conv1.weight'] = \
            ckpt[f'blk.{i}.conv1.weight']
        state_dict[f'model.layers.{i}.self_attn.lf_gate.conv2.weight'] = \
            ckpt[f'blk.{i}.conv2.weight']
        state_dict[f'model.layers.{i}.self_attn.lf_gate.conv1.bias'] = \
            ckpt[f'blk.{i}.conv1.bias']
        state_dict[f'model.layers.{i}.self_attn.lf_gate.conv2.bias'] = \
            ckpt[f'blk.{i}.conv2.bias']

    with init_empty_weights():
        model = YuanForCausalLM(yuan_config).eval()

    for name, weight in state_dict.items():
        set_module_tensor_to_device(model, name, "cpu", weight, dtype=dtype)

    model = model.cpu()

    # see https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
    from transformers.convert_slow_tokenizer import import_protobuf
    spm_pb2 = import_protobuf("Failed to import protobuf")

    pieces = loader.tokenizer_pieces()
    trainer_spec = spm_pb2.TrainerSpec(byte_fallback=True,
                                       model_type=spm_pb2.TrainerSpec.ModelType.BPE)
    proto = spm_pb2.ModelProto(pieces=pieces, trainer_spec=trainer_spec)
    proto = proto.SerializeToString()

    with NamedTemporaryFile(delete=False) as f:
        f.write(proto)
        f.close()
        tokenizer = LlamaTokenizer(f.name)
        os.remove(f.name)

    tokenizer.add_eos_token = False
    tokenizer.add_bos_token = False
    tokenizer.eos_token = '<eod>'

    tokenizer.add_tokens(['<sep>', '<pad>', '<mask>', '<predict>', '<FIM_SUFFIX>', '<FIM_PREFIX>', '<FIM_MIDDLE>', '<commit_before>',  # noqa
                         '<commit_msg>', '<commit_after>', '<jupyter_start>', '<jupyter_text>', '<jupyter_code>', '<jupyter_output>', '<empty_output>'], special_tokens=True)  # noqa

    return model, tokenizer
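A hedged sketch of exercising this loader directly; the module paths are inferred from the relative imports above and the GGUF path is a placeholder. In normal use, load_gguf_model (shown earlier) dispatches here when the model's general name contains "yuan":

    import torch
    from bigdl.llm.transformers.gguf.gguf import GGUFFileLoader
    from bigdl.llm.transformers.gguf.models.yuan2 import load_gguf_yuan

    # Read the GGUF metadata and tensors, then build the HF-style Yuan model and
    # a LlamaTokenizer reconstructed from the vocabulary embedded in the file.
    loader = GGUFFileLoader("/path/to/Yuan2-2B-Februa-hf-f16.gguf")  # placeholder path
    model, tokenizer = load_gguf_yuan(loader, dtype=torch.half)

    print(model.config.model_type)  # "yuan"
    print(tokenizer.eos_token)      # "<eod>"
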
@@ -319,7 +319,8 @@ class _BaseAutoModelClass:
         return model
 
     @staticmethod
-    def from_gguf(fpath: str, optimize_model: bool = True, cpu_embedding: bool = False):
+    def from_gguf(fpath: str, optimize_model: bool = True,
+                  cpu_embedding: bool = False, low_bit: str = "sym_int4"):
         """
         Load gguf model and tokenizer and convert it to bigdl-llm model and huggingface tokenizer
 
@@ -332,7 +333,7 @@ class _BaseAutoModelClass:
         """
         from bigdl.llm.optimize import optimize_model as optimize_model_fn
 
-        model, tokenizer, low_bit = load_gguf_model(fpath, dtype=torch.half)
+        model, tokenizer = load_gguf_model(fpath, dtype=torch.half, low_bit=low_bit)
         model = optimize_model_fn(model, low_bit=low_bit, optimize_llm=optimize_model,
                                   cpu_embedding=cpu_embedding)
         return model, tokenizer