LLM: Enable GGUF float16 and Yuan2 model (#10372)

* enable float16

* add yuan files

* enable yuan

* enable setting low_bit on yuan2

* update

* update license

* update generate

* update readme

* update python style

* update
Wang, Jian4 2024-03-13 10:19:18 +08:00 committed by GitHub
parent f5d65203c0
commit 0193f29411
10 changed files with 1345 additions and 7 deletions


@@ -21,7 +21,7 @@ SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )"
PYTHON_ROOT_DIR="$SCRIPT_DIR/.."
echo $PYTHON_ROOT_DIR
PATHS_TO_CHECK="$SCRIPT_DIR/../../src"
-PATTERNS_TO_EXCLUDE="__init__.py,log4Error.py,$SCRIPT_DIR/../../src/bigdl/llm/langchain/*"
+PATTERNS_TO_EXCLUDE="__init__.py,log4Error.py,$SCRIPT_DIR/../../src/bigdl/llm/langchain/*,$SCRIPT_DIR/../../src/bigdl/llm/transformers/gguf/models/model_implement/yuan2/*"
PEP8_REPORT_PATH="$PYTHON_ROOT_DIR/test/pep8-report.txt"
PYLINT_REPORT_PATH="$PYTHON_ROOT_DIR/test/pylint-report.txt"
PYLINT_INSTALL_INFO="$PYTHON_ROOT_DIR/test/pylint-info.txt"


@@ -9,6 +9,7 @@ In this directory, you will find examples on how to load GGUF model into `bigdl-llm`
- [Bloomz-7b1-GGUF](https://huggingface.co/hzjane/bloomz-7b1-gguf)
- [falcon-7b-quantized-gguf](https://huggingface.co/xaviviro/falcon-7b-quantized-gguf/tree/main)
- [mpt-7b-chat-gguf](https://huggingface.co/maddes8cht/mosaicml-mpt-7b-chat-gguf/tree/main)
+- [Yuan2-2B-Februa-hf-GGUF](https://huggingface.co/IEITYuan/Yuan2-2B-Februa-hf-GGUF/tree/main)
## Requirements
To run these examples with BigDL-LLM, we have some recommended requirements for your machine; please refer to [here](../../../README.md#system-support) for more information.
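
A minimal end-to-end sketch of how the examples in this folder load one of the GGUF files listed above (the local file name is hypothetical, and `low_bit` is the argument added in this commit):

```python
import torch
from bigdl.llm.transformers import AutoModelForCausalLM

# Hypothetical path to a downloaded GGUF file from the list above
model_path = "./yuan2-2b-februa-hf-q4_0.gguf"

# Load the GGUF model and vocab, then convert them into a bigdl-llm model
# and a Hugging Face tokenizer; low_bit selects the target precision.
model, tokenizer = AutoModelForCausalLM.from_gguf(model_path, low_bit="sym_int4")

with torch.inference_mode():
    input_ids = tokenizer.encode("What is AI?", return_tensors="pt")
    output = model.generate(input_ids, max_new_tokens=32)
    print(tokenizer.decode(output[0], skip_special_tokens=True))
```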


@@ -36,13 +36,16 @@ if __name__ == '__main__':
help='Prompt to infer')
parser.add_argument('--n-predict', type=int, default=32,
help='Max tokens to predict')
+    parser.add_argument('--low_bit', type=str, default="sym_int4",
+                        help='Which low_bit precision to run bigdl-llm with')
args = parser.parse_args()
model_path = args.model
# Load gguf model and vocab, then convert them to bigdl-llm model and huggingface tokenizer
-    model, tokenizer = AutoModelForCausalLM.from_gguf(model_path)
+    model, tokenizer = AutoModelForCausalLM.from_gguf(model_path, low_bit=args.low_bit)
# Generate predicted tokens
with torch.inference_mode():


@@ -19,6 +19,7 @@ from bigdl.llm.utils.common import invalidInputError
qtype_map = {
1: "fp16", # float16
2: "sym_int4", # q4_0
3: "asym_int4", # q4_1
7: "sym_int8", # q8_0
@@ -27,7 +28,7 @@ qtype_map = {
}
-def load_gguf_model(fpath: str, dtype: torch.dtype = torch.float):
+def load_gguf_model(fpath: str, dtype: torch.dtype = torch.float, low_bit: str = "sym_int4"):
from .gguf import GGUFFileLoader
loader = GGUFFileLoader(fpath)
@@ -48,6 +49,9 @@ def load_gguf_model(fpath: str, dtype: torch.dtype = torch.float):
elif "mistral" in general_name:
from .models.mistral import load_gguf_mistral
model, tokenizer = load_gguf_mistral(loader, dtype)
elif "yuan" in general_name:
from .models.yuan2 import load_gguf_yuan
model, tokenizer = load_gguf_yuan(loader, dtype)
else:
from .models.llama import load_gguf_llama
model, tokenizer = load_gguf_llama(loader, dtype)
@@ -66,4 +70,4 @@ def load_gguf_model(fpath: str, dtype: torch.dtype = torch.float):
else:
invalidInputError(False, f"Unsupported model family: {model_family}")
-    return model, tokenizer, low_bit
+    return model, tokenizer
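
The new `elif` branch above hooks Yuan2 into the existing dispatch on the GGUF `general.name` metadata field; a simplified, standalone sketch of that routing (the returned strings are just labels for illustration, not the real loader functions, and the lower-casing is added here, not taken from the source):

```python
def pick_family_loader(general_name: str) -> str:
    # Mirrors the substring matching in load_gguf_model above: the GGUF
    # metadata name decides which per-family loader module gets imported.
    name = general_name.lower()
    if "mistral" in name:
        return "models.mistral.load_gguf_mistral"
    elif "yuan" in name:
        return "models.yuan2.load_gguf_yuan"
    else:
        return "models.llama.load_gguf_llama"  # llama is the fallback family

print(pick_family_loader("Yuan2-2B-Februa-hf"))  # models.yuan2.load_gguf_yuan
```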


@@ -268,7 +268,7 @@ class GGUFTensorLoader:
return tensor.view(torch.float)
def convert_f16_tensor(self, tensor: torch.Tensor, size: int, ndims: int, dims: int):
-        return tensor.view(torch.half)
+        return tensor.view(torch.half).reshape(dims)
def convert_q4_0_tensor(self, tensor: torch.Tensor, size: int, ndims: int, dims: int):
# see https://github.com/ggerganov/llama.cpp/blob
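
The `.reshape(dims)` added above matters because a GGUF tensor is read as a flat buffer and only regains its shape from the dims recorded in the file; a standalone illustration with toy sizes (not a real GGUF tensor):

```python
import torch

dims = (4, 8)                                  # toy shape, stands in for the GGUF-recorded dims
original = torch.randn(dims, dtype=torch.half)

# Reading raw bytes and viewing them as float16 only reinterprets the data
# and still yields a 1-D tensor; reshape restores the recorded dimensions.
raw_bytes = bytearray(original.numpy().tobytes())
flat = torch.frombuffer(raw_bytes, dtype=torch.uint8).view(torch.half)  # shape (32,)

restored = flat.reshape(dims)                  # shape restored to (4, 8)
assert torch.equal(restored, original)
```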


@@ -0,0 +1,15 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#


@@ -0,0 +1,63 @@
#
# Copyright 2016 The BigDL Authors.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from transformers.configuration_utils import PretrainedConfig
class YuanConfig(PretrainedConfig):
model_type = "yuan"
keys_to_ignore_at_inference = ["past_key_values"]
def __init__(
self,
vocab_size=135040,
hidden_size=2048,
intermediate_size=8192,
num_hidden_layers=24,
num_attention_heads=32,
hidden_act="silu",
model_max_length=8192,
initializer_range=0.02,
rms_norm_eps=1e-6,
use_cache=True,
pad_token_id=77185,
bos_token_id=77185,
eos_token_id=77185,
tie_word_embeddings=True,
**kwargs,
):
self.vocab_size = vocab_size
self.model_max_length = model_max_length
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.hidden_act = hidden_act
self.initializer_range = initializer_range
self.rms_norm_eps = rms_norm_eps
self.use_cache = use_cache
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)


@@ -0,0 +1,130 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
import torch
from accelerate import init_empty_weights
from accelerate.utils import set_module_tensor_to_device
from tempfile import NamedTemporaryFile
from transformers import LlamaTokenizer
from ..gguf import GGUFFileLoader
from .model_implement.yuan2.yuan_hf_model import YuanForCausalLM
from .model_implement.yuan2.configuration_yuan import YuanConfig
def load_gguf_yuan(loader: GGUFFileLoader, dtype: torch.dtype = torch.float,
low_bit='sym_int4'):
config = loader.config
yuan_config = YuanConfig(
vocab_size=len(config['tokenizer.ggml.tokens']),
hidden_size=config['llama.embedding_length'],
intermediate_size=config['llama.feed_forward_length'],
num_hidden_layers=config['llama.block_count'],
num_attention_heads=config['llama.attention.head_count'],
hidden_act="silu",
model_max_length=config['llama.context_length'],
rms_norm_eps=config['llama.attention.layer_norm_rms_epsilon'],
use_cache=True,
initializer_range=0.02,
pad_token_id=config['tokenizer.ggml.padding_token_id'],
bos_token_id=config['tokenizer.ggml.bos_token_id'],
eos_token_id=config['tokenizer.ggml.eos_token_id'],
eod_token=config['tokenizer.ggml.eos_token_id'],
eod_token_id=config['tokenizer.ggml.eos_token_id'],
sep_token=config['tokenizer.ggml.seperator_token_id'],
sep_token_id=config['tokenizer.ggml.seperator_token_id'],
mask_token_id=config['tokenizer.ggml.seperator_token_id'],
use_loss_mask=False,
dropout=0.1,
reset_attention_mask=True,
reset_position_ids=True,
max_position_embeddings=config['llama.context_length'],
causal_mask=True,
use_flash_attention=False,
pretraining_tp=1,
)
ckpt = loader.tensors(dtype)
n_head = config['llama.attention.head_count']
n_head_kv = config['llama.attention.head_count_kv']
state_dict = {}
state_dict['model.embed_tokens.weight'] = ckpt['token_embd.weight']
state_dict['model.norm.weight'] = ckpt['output_norm.weight']
state_dict['lm_head.weight'] = ckpt['output.weight']
for i in range(config['llama.block_count']):
state_dict[f'model.layers.{i}.self_attn.q_proj.weight'] = \
ckpt[f'blk.{i}.attn_q.weight']
state_dict[f'model.layers.{i}.self_attn.k_proj.weight'] = \
ckpt[f'blk.{i}.attn_k.weight']
state_dict[f'model.layers.{i}.self_attn.v_proj.weight'] = \
ckpt[f'blk.{i}.attn_v.weight']
state_dict[f'model.layers.{i}.self_attn.o_proj.weight'] = \
ckpt[f'blk.{i}.attn_output.weight']
state_dict[f'model.layers.{i}.mlp.gate_proj.weight'] = \
ckpt[f'blk.{i}.ffn_gate.weight']
state_dict[f'model.layers.{i}.mlp.up_proj.weight'] = \
ckpt[f'blk.{i}.ffn_up.weight']
state_dict[f'model.layers.{i}.mlp.down_proj.weight'] = \
ckpt[f'blk.{i}.ffn_down.weight']
state_dict[f'model.layers.{i}.input_layernorm.weight'] = \
ckpt[f'blk.{i}.attn_norm.weight']
state_dict[f'model.layers.{i}.post_attention_layernorm.weight'] = \
ckpt[f'blk.{i}.ffn_norm.weight']
state_dict[f'model.layers.{i}.self_attn.lf_gate.output_layernorm.weight'] = \
ckpt[f'blk.{i}.lf_output_norm.weight']
state_dict[f'model.layers.{i}.self_attn.lf_gate.conv1.weight'] = \
ckpt[f'blk.{i}.conv1.weight']
state_dict[f'model.layers.{i}.self_attn.lf_gate.conv2.weight'] = \
ckpt[f'blk.{i}.conv2.weight']
state_dict[f'model.layers.{i}.self_attn.lf_gate.conv1.bias'] = \
ckpt[f'blk.{i}.conv1.bias']
state_dict[f'model.layers.{i}.self_attn.lf_gate.conv2.bias'] = \
ckpt[f'blk.{i}.conv2.bias']
with init_empty_weights():
model = YuanForCausalLM(yuan_config).eval()
for name, weight in state_dict.items():
set_module_tensor_to_device(model, name, "cpu", weight, dtype=dtype)
model = model.cpu()
# see https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
from transformers.convert_slow_tokenizer import import_protobuf
spm_pb2 = import_protobuf("Failed to import protobuf")
pieces = loader.tokenizer_pieces()
trainer_spec = spm_pb2.TrainerSpec(byte_fallback=True,
model_type=spm_pb2.TrainerSpec.ModelType.BPE)
proto = spm_pb2.ModelProto(pieces=pieces, trainer_spec=trainer_spec)
proto = proto.SerializeToString()
with NamedTemporaryFile(delete=False) as f:
f.write(proto)
f.close()
tokenizer = LlamaTokenizer(f.name)
os.remove(f.name)
tokenizer.add_eos_token = False
tokenizer.add_bos_token = False
tokenizer.eos_token = '<eod>'
tokenizer.add_tokens(['<sep>', '<pad>', '<mask>', '<predict>', '<FIM_SUFFIX>', '<FIM_PREFIX>', '<FIM_MIDDLE>', '<commit_before>', # noqa
'<commit_msg>', '<commit_after>', '<jupyter_start>', '<jupyter_text>', '<jupyter_code>', '<jupyter_output>', '<empty_output>'], special_tokens=True) # noqa
return model, tokenizer
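
A quick sanity check of the tokenizer assembled above (the GGUF file path is hypothetical; '<eod>' and '<sep>' are among the tokens registered at the end of `load_gguf_yuan`):

```python
from bigdl.llm.transformers import AutoModelForCausalLM

# Hypothetical Yuan2 GGUF file path
model, tokenizer = AutoModelForCausalLM.from_gguf("./yuan2-2b-februa-hf-q4_0.gguf")

# '<eod>' is set as the EOS token and '<sep>' is an added special token,
# so both should encode to single ids rather than being split into sub-pieces.
ids = tokenizer("Hello<sep>world<eod>").input_ids
print(tokenizer.convert_ids_to_tokens(ids))
print(tokenizer.eos_token, tokenizer.eos_token_id)
```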


@@ -319,7 +319,8 @@ class _BaseAutoModelClass:
return model
@staticmethod
-    def from_gguf(fpath: str, optimize_model: bool = True, cpu_embedding: bool = False):
+    def from_gguf(fpath: str, optimize_model: bool = True,
+                  cpu_embedding: bool = False, low_bit: str = "sym_int4"):
"""
        Load gguf model and tokenizer and convert them to a bigdl-llm model and huggingface tokenizer
@@ -332,7 +333,7 @@
"""
from bigdl.llm.optimize import optimize_model as optimize_model_fn
-        model, tokenizer, low_bit = load_gguf_model(fpath, dtype=torch.half)
+        model, tokenizer = load_gguf_model(fpath, dtype=torch.half, low_bit=low_bit)
model = optimize_model_fn(model, low_bit=low_bit, optimize_llm=optimize_model,
cpu_embedding=cpu_embedding)
return model, tokenizer
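
With this signature change the caller picks the precision instead of it being inferred from the GGUF file type, which is what enables the float16 path; a hedged sketch (file paths hypothetical, and "fp16" assumed to be an accepted `low_bit` value, in line with the new entry in `qtype_map`):

```python
from bigdl.llm.transformers import AutoModelForCausalLM

# Keep a float16 GGUF checkpoint in fp16 instead of quantizing it.
model, tokenizer = AutoModelForCausalLM.from_gguf("./yuan2-2b-februa-hf-fp16.gguf",
                                                  low_bit="fp16")

# The default remains 4-bit symmetric quantization:
model_int4, _ = AutoModelForCausalLM.from_gguf("./yuan2-2b-februa-hf-fp16.gguf")
```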