LLM: Enable gguf float16 and Yuan2 model (#10372)
* enable float16
* add yuan files
* enable yuan
* enable set low_bit on yuan2
* update
* update license
* update generate
* update readme
* update python style
* update
parent f5d65203c0
commit 0193f29411

10 changed files with 1345 additions and 7 deletions
@@ -21,7 +21,7 @@ SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )"
PYTHON_ROOT_DIR="$SCRIPT_DIR/.."
echo $PYTHON_ROOT_DIR
PATHS_TO_CHECK="$SCRIPT_DIR/../../src"
-PATTERNS_TO_EXCLUDE="__init__.py,log4Error.py,$SCRIPT_DIR/../../src/bigdl/llm/langchain/*"
+PATTERNS_TO_EXCLUDE="__init__.py,log4Error.py,$SCRIPT_DIR/../../src/bigdl/llm/langchain/*,$SCRIPT_DIR/../../src/bigdl/llm/transformers/gguf/models/model_implement/yuan2/*"
PEP8_REPORT_PATH="$PYTHON_ROOT_DIR/test/pep8-report.txt"
PYLINT_REPORT_PATH="$PYTHON_ROOT_DIR/test/pylint-report.txt"
PYLINT_INSTALL_INFO="$PYTHON_ROOT_DIR/test/pylint-info.txt"

@@ -9,6 +9,7 @@ In this directory, you will find examples on how to load GGUF model into `bigdl-
- [Bloomz-7b1-GGUF](https://huggingface.co/hzjane/bloomz-7b1-gguf)
- [falcon-7b-quantized-gguf](https://huggingface.co/xaviviro/falcon-7b-quantized-gguf/tree/main)
- [mpt-7b-chat-gguf](https://huggingface.co/maddes8cht/mosaicml-mpt-7b-chat-gguf/tree/main)
+- [Yuan2-2B-Februa-hf-GGUF](https://huggingface.co/IEITYuan/Yuan2-2B-Februa-hf-GGUF/tree/main)

## Requirements
To run these examples with BigDL-LLM, we have some recommended requirements for your machine, please refer to [here](../../../README.md#system-support) for more information.

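For orientation, a minimal usage sketch of the GGUF loading path this commit extends (the model path is a placeholder; `low_bit` defaults to `sym_int4`, as in the example script below):

import torch
from bigdl.llm.transformers import AutoModelForCausalLM

# hypothetical local path to a downloaded GGUF file, e.g. Yuan2-2B-Februa-hf-GGUF
model_path = "/path/to/yuan2-2b-februa-hf.gguf"

# load GGUF weights and vocab, converting them to a bigdl-llm model and HF tokenizer
model, tokenizer = AutoModelForCausalLM.from_gguf(model_path, low_bit="sym_int4")

input_ids = tokenizer("What is AI?", return_tensors="pt").input_ids
with torch.inference_mode():
    output = model.generate(input_ids, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))
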
@@ -36,13 +36,16 @@ if __name__ == '__main__':
                        help='Prompt to infer')
    parser.add_argument('--n-predict', type=int, default=32,
                        help='Max tokens to predict')
+    parser.add_argument('--low_bit', type=str, default="sym_int4",
+                        help='what low_bit to run bigdl-llm')

    args = parser.parse_args()

    model_path = args.model

    # Load gguf model and vocab, then convert them to bigdl-llm model and huggingface tokenizer
-    model, tokenizer = AutoModelForCausalLM.from_gguf(model_path)
+    model, tokenizer = AutoModelForCausalLM.from_gguf(model_path, low_bit = args.low_bit,)

    # Generate predicted tokens
    with torch.inference_mode():

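With the new argument wired into the example, the script can presumably be invoked as before with an optional quantization choice, e.g. `python ./generate.py --model <path to gguf model> --prompt 'What is AI?' --low_bit sym_int4` (path and prompt here are placeholders).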
@@ -19,6 +19,7 @@ from bigdl.llm.utils.common import invalidInputError


qtype_map = {
+    1: "fp16",         # float16
    2: "sym_int4",     # q4_0
    3: "asym_int4",    # q4_1
    7: "sym_int8",     # q8_0
@@ -27,7 +28,7 @@ qtype_map = {
}


-def load_gguf_model(fpath: str, dtype: torch.dtype = torch.float):
+def load_gguf_model(fpath: str, dtype: torch.dtype = torch.float, low_bit: str = "sym_int4"):
    from .gguf import GGUFFileLoader

    loader = GGUFFileLoader(fpath)
@@ -48,6 +49,9 @@ def load_gguf_model(fpath: str, dtype: torch.dtype = torch.float):
    elif "mistral" in general_name:
        from .models.mistral import load_gguf_mistral
        model, tokenizer = load_gguf_mistral(loader, dtype)
+    elif "yuan" in general_name:
+        from .models.yuan2 import load_gguf_yuan
+        model, tokenizer = load_gguf_yuan(loader, dtype)
    else:
        from .models.llama import load_gguf_llama
        model, tokenizer = load_gguf_llama(loader, dtype)
@@ -66,4 +70,4 @@ def load_gguf_model(fpath: str, dtype: torch.dtype = torch.float):
    else:
        invalidInputError(False, f"Unsupported model family: {model_family}")

-    return model, tokenizer, low_bit
+    return model, tokenizer

@@ -268,7 +268,7 @@ class GGUFTensorLoader:
        return tensor.view(torch.float)

    def convert_f16_tensor(self, tensor: torch.Tensor, size: int, ndims: int, dims: int):
-        return tensor.view(torch.half)
+        return tensor.view(torch.half).reshape(dims)

    def convert_q4_0_tensor(self, tensor: torch.Tensor, size: int, ndims: int, dims: int):
        # see https://github.com/ggerganov/llama.cpp/blob

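The reshape fix above matters because GGUF stores each tensor as a flat buffer plus a dims list; viewing the bytes as float16 alone leaves a 1-D tensor. A standalone sketch of the idea (toy data, not the library code):

import torch

dims = [2, 3]  # dimensions recorded in the GGUF metadata (toy example)
# flat byte buffer, roughly how an f16 tensor's data arrives
raw = torch.frombuffer(bytearray(torch.arange(6, dtype=torch.half).numpy().tobytes()),
                       dtype=torch.uint8)

flat = raw.view(torch.half)                  # old behaviour: 1-D, shape [6]
shaped = raw.view(torch.half).reshape(dims)  # new behaviour: shape [2, 3]
print(flat.shape, shaped.shape)
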
@@ -0,0 +1,15 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

@@ -0,0 +1,63 @@
#
# Copyright 2016 The BigDL Authors.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from transformers.configuration_utils import PretrainedConfig


class YuanConfig(PretrainedConfig):
    model_type = "yuan"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=135040,
        hidden_size=2048,
        intermediate_size=8192,
        num_hidden_layers=24,
        num_attention_heads=32,
        hidden_act="silu",
        model_max_length=8192,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=77185,
        bos_token_id=77185,
        eos_token_id=77185,
        tie_word_embeddings=True,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.model_max_length = model_max_length
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

File diff suppressed because it is too large.

python/llm/src/bigdl/llm/transformers/gguf/models/yuan2.py (new file, 130 lines)
@@ -0,0 +1,130 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os
import torch
from accelerate import init_empty_weights
from accelerate.utils import set_module_tensor_to_device
from tempfile import NamedTemporaryFile
from transformers import LlamaTokenizer

from ..gguf import GGUFFileLoader
from .model_implement.yuan2.yuan_hf_model import YuanForCausalLM
from .model_implement.yuan2.configuration_yuan import YuanConfig


def load_gguf_yuan(loader: GGUFFileLoader, dtype: torch.dtype = torch.float,
                   low_bit='sym_int4'):
    config = loader.config

    yuan_config = YuanConfig(
        vocab_size=len(config['tokenizer.ggml.tokens']),
        hidden_size=config['llama.embedding_length'],
        intermediate_size=config['llama.feed_forward_length'],
        num_hidden_layers=config['llama.block_count'],
        num_attention_heads=config['llama.attention.head_count'],
        hidden_act="silu",
        model_max_length=config['llama.context_length'],
        rms_norm_eps=config['llama.attention.layer_norm_rms_epsilon'],
        use_cache=True,
        initializer_range=0.02,
        pad_token_id=config['tokenizer.ggml.padding_token_id'],
        bos_token_id=config['tokenizer.ggml.bos_token_id'],
        eos_token_id=config['tokenizer.ggml.eos_token_id'],
        eod_token=config['tokenizer.ggml.eos_token_id'],
        eod_token_id=config['tokenizer.ggml.eos_token_id'],
        sep_token=config['tokenizer.ggml.seperator_token_id'],
        sep_token_id=config['tokenizer.ggml.seperator_token_id'],
        mask_token_id=config['tokenizer.ggml.seperator_token_id'],
        use_loss_mask=False,
        dropout=0.1,
        reset_attention_mask=True,
        reset_position_ids=True,
        max_position_embeddings=config['llama.context_length'],
        causal_mask=True,
        use_flash_attention=False,
        pretraining_tp=1,
    )

    ckpt = loader.tensors(dtype)
    n_head = config['llama.attention.head_count']
    n_head_kv = config['llama.attention.head_count_kv']

    state_dict = {}
    state_dict['model.embed_tokens.weight'] = ckpt['token_embd.weight']
    state_dict['model.norm.weight'] = ckpt['output_norm.weight']
    state_dict['lm_head.weight'] = ckpt['output.weight']
    for i in range(config['llama.block_count']):
        state_dict[f'model.layers.{i}.self_attn.q_proj.weight'] = \
            ckpt[f'blk.{i}.attn_q.weight']
        state_dict[f'model.layers.{i}.self_attn.k_proj.weight'] = \
            ckpt[f'blk.{i}.attn_k.weight']
        state_dict[f'model.layers.{i}.self_attn.v_proj.weight'] = \
            ckpt[f'blk.{i}.attn_v.weight']
        state_dict[f'model.layers.{i}.self_attn.o_proj.weight'] = \
            ckpt[f'blk.{i}.attn_output.weight']
        state_dict[f'model.layers.{i}.mlp.gate_proj.weight'] = \
            ckpt[f'blk.{i}.ffn_gate.weight']
        state_dict[f'model.layers.{i}.mlp.up_proj.weight'] = \
            ckpt[f'blk.{i}.ffn_up.weight']
        state_dict[f'model.layers.{i}.mlp.down_proj.weight'] = \
            ckpt[f'blk.{i}.ffn_down.weight']
        state_dict[f'model.layers.{i}.input_layernorm.weight'] = \
            ckpt[f'blk.{i}.attn_norm.weight']
        state_dict[f'model.layers.{i}.post_attention_layernorm.weight'] = \
            ckpt[f'blk.{i}.ffn_norm.weight']
        state_dict[f'model.layers.{i}.self_attn.lf_gate.output_layernorm.weight'] = \
            ckpt[f'blk.{i}.lf_output_norm.weight']
        state_dict[f'model.layers.{i}.self_attn.lf_gate.conv1.weight'] = \
            ckpt[f'blk.{i}.conv1.weight']
        state_dict[f'model.layers.{i}.self_attn.lf_gate.conv2.weight'] = \
            ckpt[f'blk.{i}.conv2.weight']
        state_dict[f'model.layers.{i}.self_attn.lf_gate.conv1.bias'] = \
            ckpt[f'blk.{i}.conv1.bias']
        state_dict[f'model.layers.{i}.self_attn.lf_gate.conv2.bias'] = \
            ckpt[f'blk.{i}.conv2.bias']

    with init_empty_weights():
        model = YuanForCausalLM(yuan_config).eval()

    for name, weight in state_dict.items():
        set_module_tensor_to_device(model, name, "cpu", weight, dtype=dtype)

    model = model.cpu()

    # see https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
    from transformers.convert_slow_tokenizer import import_protobuf
    spm_pb2 = import_protobuf("Failed to import protobuf")

    pieces = loader.tokenizer_pieces()
    trainer_spec = spm_pb2.TrainerSpec(byte_fallback=True,
                                       model_type=spm_pb2.TrainerSpec.ModelType.BPE)
    proto = spm_pb2.ModelProto(pieces=pieces, trainer_spec=trainer_spec)
    proto = proto.SerializeToString()

    with NamedTemporaryFile(delete=False) as f:
        f.write(proto)
        f.close()
    tokenizer = LlamaTokenizer(f.name)
    os.remove(f.name)

    tokenizer.add_eos_token = False
    tokenizer.add_bos_token = False
    tokenizer.eos_token = '<eod>'

    tokenizer.add_tokens(['<sep>', '<pad>', '<mask>', '<predict>', '<FIM_SUFFIX>', '<FIM_PREFIX>', '<FIM_MIDDLE>', '<commit_before>',  # noqa
                          '<commit_msg>', '<commit_after>', '<jupyter_start>', '<jupyter_text>', '<jupyter_code>', '<jupyter_output>', '<empty_output>'], special_tokens=True)  # noqa

    return model, tokenizer

@@ -319,7 +319,8 @@ class _BaseAutoModelClass:
        return model

    @staticmethod
-    def from_gguf(fpath: str, optimize_model: bool = True, cpu_embedding: bool = False):
+    def from_gguf(fpath: str, optimize_model: bool = True,
+                  cpu_embedding: bool = False, low_bit: str = "sym_int4"):
        """
        Load gguf model and tokenizer and convert it to bigdl-llm model and huggingface tokenzier

@@ -332,7 +333,7 @@ class _BaseAutoModelClass:
        """
        from bigdl.llm.optimize import optimize_model as optimize_model_fn

-        model, tokenizer, low_bit = load_gguf_model(fpath, dtype=torch.half)
+        model, tokenizer = load_gguf_model(fpath, dtype=torch.half, low_bit=low_bit)
        model = optimize_model_fn(model, low_bit=low_bit, optimize_llm=optimize_model,
                                  cpu_embedding=cpu_embedding)
        return model, tokenizer

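Taken together with the new fp16 entry in qtype_map, the extended signature lets callers keep float16 weights instead of quantizing on load. A minimal sketch, assuming low_bit="fp16" is among the values optimize_model accepts (the path is a placeholder):

from bigdl.llm.transformers import AutoModelForCausalLM

model, tokenizer = AutoModelForCausalLM.from_gguf(
    "/path/to/yuan2-2b-f16.gguf",   # hypothetical f16 GGUF export
    low_bit="fp16",                 # keep float16 instead of the sym_int4 default
)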