#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# ===========================================================================
#
# This file is adapted from
# https://github.com/ggerganov/llama.cpp/blob/master/convert.py
# https://github.com/togethercomputer/redpajama.cpp/blob/master/examples/redpajama/scripts/convert_gptneox_to_ggml.py
# https://github.com/NouamaneTazi/bloomz.cpp/blob/patch/convert-hf-to-ggml.py
#
# MIT License
#
# Copyright (c) 2023 Georgi Gerganov
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import concurrent.futures import copy import enum import faulthandler import functools import io import itertools import json import math import mmap import pickle import re import signal import struct import sys import zipfile from abc import ABCMeta, abstractmethod from dataclasses import dataclass from pathlib import Path from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, TypeVar, Union) import numpy as np from sentencepiece import SentencePieceProcessor from ipex_llm.utils.common import invalidInputError import os from pathlib import Path if TYPE_CHECKING: from typing_extensions import TypeAlias if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'): faulthandler.register(signal.SIGUSR1) NDArray = np.ndarray[Any, Any] __all__ = ['Params', 'OutputFile', 'load_some_model', 'do_necessary_conversions', 'pick_output_type', 'convert_to_output_type', 'load_vocab', 'default_outfile', '_convert_gptneox_hf_to_ggml', '_convert_bloom_hf_to_ggml', '_convert_starcoder_hf_to_ggml', '_convert_chatglm_hf_to_ggml'] @dataclass(frozen=True) class UnquantizedDataType: name: str DT_F16 = UnquantizedDataType('F16') DT_F32 = UnquantizedDataType('F32') DT_I32 = UnquantizedDataType('I32') DT_BF16 = UnquantizedDataType('BF16') @dataclass(frozen=True) class QuantizedDataType: groupsize: int have_addends: bool have_g_idx: bool DT_Q4_0 = QuantizedDataType(groupsize=32, have_addends=False, have_g_idx=False) DT_Q4_1 = QuantizedDataType(groupsize=32, have_addends=True, have_g_idx=False) DataType = Union[UnquantizedDataType, QuantizedDataType] DATA_TYPE_TO_FTYPE = Dict[DataType, int] DATA_TYPE_TO_FTYPE = {DT_F32: 0, DT_F16: 1, DT_Q4_0: 2, DT_Q4_1: 3} FTYPE_TO_DATA_TYPE = Dict[int, DataType] FTYPE_TO_DATA_TYPE = {ftype: dtype for (dtype, ftype) in DATA_TYPE_TO_FTYPE.items()} DATA_TYPE_TO_NUMPY = Dict[DataType, 'np.dtype[Any]'] DATA_TYPE_TO_NUMPY = {DT_BF16: np.dtype(np.uint16), DT_F16: np.dtype(np.float16), DT_F32: np.dtype(np.float32), DT_I32: np.dtype(np.int32)} NUMPY_TYPE_TO_DATA_TYPE = Dict['np.dtype[Any]', DataType] NUMPY_TYPE_TO_DATA_TYPE = {dtype: data_type for (data_type, dtype) in DATA_TYPE_TO_NUMPY.items()} class GGMLFileType(enum.Enum): AllF32 = 0 MostlyF16 = 1 # except 1d tensors MostlyQ4_0 = 2 # except 1d tensors MostlyQ4_1 = 3 # except 1d tensors PerLayerIsQ4_1 = 4 # but tok_embeddings.weight and output.weight are F16 def type_for_tensor(self, name: str, tensor: 'LazyTensor') -> DataType: if len(tensor.shape) == 1: # 1D tensors are always F32. 
return DT_F32 elif self == GGMLFileType.AllF32: return DT_F32 elif self == GGMLFileType.MostlyF16: return DT_F16 elif self == GGMLFileType.MostlyQ4_0: return DT_Q4_0 elif self == GGMLFileType.MostlyQ4_1: return DT_Q4_1 elif self == GGMLFileType.PerLayerIsQ4_1: if name in ('output.weight', 'tok_embeddings.weight'): return DT_F16 else: return DT_Q4_1 else: invalidInputError(False, 'There exists ValueError.') def make_tensors_list() -> List[str]: ret = [ 'tok_embeddings.weight', 'norm.weight', 'output.weight', ] for i in range(80): # maximum number of layer ret += [ f'layers.{i}.attention.wq.weight', f'layers.{i}.attention.wk.weight', f'layers.{i}.attention.wv.weight', f'layers.{i}.attention.wo.weight', f'layers.{i}.attention_norm.weight', f'layers.{i}.feed_forward.w1.weight', f'layers.{i}.feed_forward.w2.weight', f'layers.{i}.feed_forward.w3.weight', f'layers.{i}.atttention_norm.weight', f'layers.{i}.ffn_norm.weight', ] return ret TENSORS_LIST = make_tensors_list() TENSORS_SET = set(TENSORS_LIST) def find_n_mult(n_ff: int, n_embd: int) -> int: # hardcoded magic range for n_mult in range(8192, 1, -1): calc_ff = (((8*n_embd) // 3 + n_mult - 1) // n_mult)*n_mult if calc_ff == n_ff: return n_mult invalidInputError(False, f"Failed to find n_mult for (n_ff={n_ff}, n_embd={n_embd}).") @dataclass class Params: n_vocab: int n_embd: int n_mult: int n_head: int n_layer: int n_kv_head: Optional[int] # This parameter is only used for Llama 2 @staticmethod def guessed(model: 'LazyModel') -> 'Params': # try transformer naming first if "model.embed_tokens.weight" in model: n_vocab, n_embd = model["model.embed_tokens.weight"].shape else: n_vocab, n_embd = model["tok_embeddings.weight"].shape # try transformer naming first if "model.layers.0.self_attn.q_proj.weight" in model: n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model) elif "model.layers.0.self_attn.W_pack.weight" in model: # next: try baichuan naming n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model) else: n_layer = next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model) if n_layer < 1: invalidInputError(False, "Failed to guess 'n_layer'. 
This model is unknown or " "unsupported.\nSuggestion: provide 'config.json' of the " "model in the same directory containing model files.") n_head = n_embd // 128 # guessed return Params( n_vocab=n_vocab, n_embd=n_embd, n_mult=256, n_head=n_head, n_layer=n_layer, n_kv_head=None, ) @staticmethod def loadHFTransformerJson(model: 'LazyModel', config_path: 'Path') -> 'Params': config = json.load(open(config_path)) n_vocab = config["vocab_size"] n_embd = config["hidden_size"] n_head = config["num_attention_heads"] n_layer = config["num_hidden_layers"] n_ff = config["intermediate_size"] n_kv_head = config.get("num_key_value_heads") n_mult = find_n_mult(n_ff, n_embd) return Params( n_vocab=n_vocab, n_embd=n_embd, n_mult=n_mult, n_head=n_head, n_layer=n_layer, n_kv_head=n_kv_head, ) # LLaMA v2 70B params.json # {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, # "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1} @staticmethod def loadOriginalParamsJson(model: 'LazyModel', config_path: 'Path') -> 'Params': config = json.load(open(config_path)) n_vocab = config["vocab_size"] n_embd = config["dim"] n_head = config["n_heads"] n_layer = config["n_layers"] n_mult = config["multiple_of"] if n_vocab == -1: n_vocab = model["tok_embeddings.weight"].shape[0] return Params( n_vocab=n_vocab, n_embd=n_embd, n_mult=n_mult, n_head=n_head, n_layer=n_layer, n_kv_head=None, ) @staticmethod def load(model_plus: 'ModelPlus') -> 'Params': hf_config_path = model_plus.paths[0].parent / "config.json" orig_config_path = model_plus.paths[0].parent / "params.json" if hf_config_path.exists(): params = Params.loadHFTransformerJson(model_plus.model, hf_config_path) elif orig_config_path.exists(): params = Params.loadOriginalParamsJson(model_plus.model, orig_config_path) else: params = Params.guessed(model_plus.model) print(f'params: n_vocab:{params.n_vocab} n_embd:{params.n_embd}' f'n_mult:{params.n_mult} n_head:{params.n_head} n_layer:{params.n_layer}') return params class SentencePieceVocab: def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path], vocabtype: Optional[str]) -> None: self.vocabtype = vocabtype if self.vocabtype == "bpe": self.sentencepiece_tokenizer = json.loads(open(str(fname_tokenizer)).read()) else: self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer)) added_tokens = Dict[str, int] if fname_added_tokens is not None: added_tokens = json.load(open(fname_added_tokens)) else: added_tokens = {} if self.vocabtype == "bpe": vocab_size: int = len(self.sentencepiece_tokenizer) else: vocab_size: int = self.sentencepiece_tokenizer.vocab_size() expected_ids = list(range(vocab_size, vocab_size + len(added_tokens))) actual_ids = sorted(added_tokens.values()) invalidInputError(expected_ids == actual_ids, "Expected added token IDs to be sequential and start " f"at {len(added_tokens)}; got {actual_ids}") items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1]) self.added_tokens_list = [text for (text, idx) in items] self.vocab_size_base = vocab_size self.vocab_size = self.vocab_size_base + len(self.added_tokens_list) self.fname_tokenizer = fname_tokenizer self.fname_added_tokens = fname_added_tokens def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float]]: tokenizer = self.sentencepiece_tokenizer if self.vocabtype == "bpe": from transformers.models.gpt2 import tokenization_gpt2 byte_encoder = tokenization_gpt2.bytes_to_unicode() byte_decoder = {v: k for k, v in byte_encoder.items()} for i, item in 
enumerate(tokenizer):
                text: bytes
                text = b''.join([x.to_bytes(1, byteorder='big')
                                 for x in [byte_decoder[y] for y in item]])
                score: float = -i
                yield text, score
        else:
            for i in range(tokenizer.vocab_size()):
                text: bytes
                if tokenizer.is_unknown(i):
                    text = " \u2047 ".encode("utf-8")
                elif tokenizer.is_control(i):
                    text = b""
                elif tokenizer.is_byte(i):
                    piece = tokenizer.id_to_piece(i)
                    if len(piece) != 6:
                        invalidInputError(False, f"Invalid token: {piece}")
                    byte_value = int(piece[3:-1], 16)
                    text = struct.pack("B", byte_value)
                else:
                    text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
                score: float = tokenizer.get_score(i)
                yield text, score

    def added_tokens(self) -> Iterable[Tuple[bytes, float]]:
        for text in self.added_tokens_list:
            score = -1000.0
            yield text.encode("utf-8"), score

    def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
        yield from self.sentencepiece_tokens()
        yield from self.added_tokens()

    def __repr__(self) -> str:
        return f"<SentencePieceVocab with {self.vocab_size_base} base tokens" \
               f" and {len(self.added_tokens_list)} added tokens>"


class GGMLVocab:
    def __init__(self, tokens: List[Tuple[bytes, float]]):
        self.tokens = tokens
        self.vocab_size = len(tokens)

    def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
        return self.tokens

    def __repr__(self) -> str:
        return f"<GGMLVocab with {self.vocab_size} tokens>"


Vocab = Union[SentencePieceVocab, GGMLVocab]


def permute(weights: NDArray, n_head: int, n_kv_head: Optional[int] = None) -> NDArray:
    if n_kv_head is not None and n_head != n_kv_head:
        n_head //= n_kv_head
    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
            .swapaxes(1, 2)
            .reshape(weights.shape))


def dequantize_q4(qvalues_pack32: NDArray, scales: NDArray, addends: Optional[NDArray],
                  g_idx: Optional[NDArray]) -> NDArray:
    # First reinterpret each row from a list of int32s containing 8 values each
    # to a list of uint8s containing 2 values each.
    qvalues_pack8 = qvalues_pack32.view(np.uint8)

    # Then split out the two values per int8 (which requires an actual
    # conversion because numpy doesn't natively support int4s).
    qvalues = np.zeros([qvalues_pack8.shape[0], qvalues_pack8.shape[1] * 2], dtype=np.uint8)
    qvalues[:, 0::2] = qvalues_pack8 & 0xf
    qvalues[:, 1::2] = qvalues_pack8 >> 4

    invalidInputError(addends is None or addends.shape == scales.shape,
                      "Fail during dequantization because addends and scales mismatch.")
    invalidInputError(qvalues.shape[0] == scales.shape[0]
                      and qvalues.shape[1] % scales.shape[1] == 0,
                      "Fail during dequantization because qvalues and scales mismatch.")
    if g_idx is None:
        repeat_count = qvalues.shape[1] // scales.shape[1]
        scales = scales[:, :, np.newaxis]
        if addends is not None:
            addends = addends[:, :, np.newaxis]
        # Reshape so that the below computation broadcasts over scales and addends:
        qvalues.shape = (qvalues.shape[0], scales.shape[1], int(repeat_count))
    else:
        # In this case the scale and addend is selected for each column by g_idx:
        invalidInputError(addends is not None,
                          "The addend is selected for each column by g_idx, but got None.")
        scales = scales[:, g_idx]
        addends = addends[:, g_idx]
    if addends is None:
        # Q4_0
        qvalues = qvalues.view(np.int8)
        qvalues -= 8
    # And do the actual 'value = scale * qvalue + addend' computation.
values = scales * qvalues if addends is not None: values += addends if g_idx is None: values.shape = (values.shape[0], values.shape[1] * values.shape[2]) return values class Tensor(metaclass=ABCMeta): data_type: DataType @abstractmethod def astype(self, data_type: DataType) -> 'Tensor': pass @abstractmethod def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> 'Tensor': pass @abstractmethod def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor': pass @abstractmethod def part(self, n_part: int) -> 'UnquantizedTensor': pass @abstractmethod def to_ggml(self) -> 'GGMLCompatibleTensor': pass def bf16_to_fp32(bf16_arr: np.ndarray) -> np.ndarray: invalidInputError(bf16_arr.dtype == np.uint16, f"Input array should be of dtype uint16, but got {bf16_arr.dtype}.") fp32_arr = bf16_arr.astype(np.uint32) << 16 return fp32_arr.view(np.float32) class UnquantizedTensor(Tensor): def __init__(self, ndarray: NDArray) -> None: self.ndarray = ndarray self.data_type = NUMPY_TYPE_TO_DATA_TYPE[ndarray.dtype] def astype(self, data_type: DataType) -> Tensor: dtype = DATA_TYPE_TO_NUMPY[data_type] if self.data_type == DT_BF16: self.ndarray = bf16_to_fp32(self.ndarray) return UnquantizedTensor(self.ndarray.astype(dtype)) def to_ggml(self) -> 'UnquantizedTensor': return self def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor': r = self.ndarray.shape[0] // 3 return UnquantizedTensor(permute(self.ndarray[r * n_part: r * n_part + r, ...], n_head)) def part(self, n_part: int) -> 'UnquantizedTensor': r = self.ndarray.shape[0] // 3 return UnquantizedTensor(self.ndarray[r * n_part: r * n_part + r, ...]) def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> 'UnquantizedTensor': return UnquantizedTensor(permute(self.ndarray, n_head, n_kv_head)) def load_unquantized(lazy_tensor: 'LazyTensor', expected_dtype: Any = None, convert: bool = False) -> NDArray: tensor = lazy_tensor.load() # double-check: actual_shape = list(tensor.ndarray.shape) if expected_dtype is not None and expected_dtype != tensor.ndarray.dtype: if convert: tensor.ndarray = tensor.ndarray.astype(expected_dtype) else: invalidInputError(False, f'Expected this tensor to have dtype {expected_dtype},' f' but got {tensor.ndarray.dtype}.') return tensor.ndarray class GGMLQuantizedTensor(Tensor): data_type: QuantizedDataType def __init__(self, ndarray: NDArray, shape: List[int], data_type: DataType) -> None: rows, columns = shape invalidInputError(columns % data_type.groupsize == 0, "Initialization of GGMLQuantizedTensor failed.") words_in_block = 6 if data_type == DT_Q4_1 else 5 self.ndarray = ndarray.view(dtype=np.uint32) \ .reshape((rows, columns // data_type.groupsize, words_in_block)) self.shape = shape[:] self.data_type = data_type def astype(self, data_type: DataType) -> Tensor: if data_type == self.data_type: return self scales = self.ndarray[:, :, 0].view(np.float32) if self.data_type.have_addends: addends = self.ndarray[:, :, 1].view(np.float32) else: addends = None qweights = self.ndarray[:, :, -4:].reshape([self.shape[0], self.shape[1] // 8]) dq = dequantize_q4(qweights, scales, addends, g_idx=None) return UnquantizedTensor(dq).astype(data_type) def to_ggml(self) -> 'GGMLQuantizedTensor': return self def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> 'GGMLQuantizedTensor': return GGMLQuantizedTensor(permute(self.ndarray, n_head, n_kv_head), self.shape, self.data_type) def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor': r = self.ndarray.shape[0] // 3 return 
UnquantizedTensor(permute(self.ndarray[r * n_part: r * n_part + r, ...], n_head)) def part(self, n_part: int) -> 'UnquantizedTensor': r = self.ndarray.shape[0] // 3 return UnquantizedTensor(self.ndarray[r * n_part: r * n_part + r, ...]) GGMLCompatibleTensor = Union[UnquantizedTensor, GGMLQuantizedTensor] class DeferredPermutedTensor(Tensor): def __init__(self, base: Tensor, n_head: int, n_kv_head: Optional[int] = None) -> None: self.base = base self.n_head = n_head self.n_kv_head = n_kv_head self.data_type = self.base.data_type def astype(self, data_type: DataType) -> Tensor: return self.base.astype(data_type).permute(self.n_head, self.n_kv_head) def to_ggml(self) -> GGMLCompatibleTensor: return self.base.to_ggml().permute(self.n_head, self.n_kv_head) def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> Tensor: invalidInputError(False, "Shouldn't permute twice.") class GPTQForLLaMaQuantizedTensor(Tensor): def __init__(self, model: 'LazyModel', namebase: str) -> None: qweight = load_unquantized(model[f"{namebase}.qweight"], np.int32) scales = load_unquantized(model[f"{namebase}.scales"], np.float32, convert=True) bias = model.get(f"{namebase}.bias") if bias is not None: invalidInputError(not np.any(load_unquantized(bias)), 'Q4_1 does not support bias') if f"{namebase}.zeros" in model: zeros = load_unquantized(model[f"{namebase}.zeros"], np.float32) else: qzeros = load_unquantized(model[f"{namebase}.qzeros"], np.int32) invalidInputError(qzeros.dtype == np.int32, 'Fail to initiate GPTQForLLaMaQuantizedTensor.') zeros = dequantize_q4(qzeros, scales, scales, g_idx=None) invalidInputError(zeros.dtype == np.float32, 'Fail to initiate GPTQForLLaMaQuantizedTensor.') invalidInputError(zeros.shape == scales.shape, 'Fail to initiate GPTQForLLaMaQuantizedTensor.') # Output is transposed compared to the input, and addends have their sign flipped. # Scales and zeros similarly must be transposed but only for newer # versions of GPTQ-for-LLaMa; the older versions can be identified by # having shape (n_embd, 1). qweight = qweight.T if scales.shape[1] != 1: scales = scales.T zeros = zeros.T # Output also has signs flipped for the addends. 
self.qweight = qweight self.scales = scales self.addends = -zeros self.g_idx = Optional[NDArray] if f"{namebase}.g_idx" in model: self.g_idx = load_unquantized(model[f"{namebase}.g_idx"], np.int32) invalidInputError(self.g_idx.shape == (qweight.shape[1] * 8,), 'Fail to initiate GPTQForLLaMaQuantizedTensor.') else: self.g_idx = None self.shape = [self.qweight.shape[0], self.qweight.shape[1] * 8] self.data_type = QuantizedDataType(groupsize=self.groupsize(), have_addends=True, have_g_idx=(self.g_idx is not None)) def inspect(self, row: int, col: int) -> None: '''For debugging.''' qweight = (self.qweight[row, col // 8] >> (4 * (col & 7))) & 0xf if self.g_idx is not None: group = self.g_idx[col] else: group = int(col // self.groupsize()) scale = self.scales[row, group] addend = self.addends[row, group] with np.printoptions(precision=None, suppress=True): print(f'scale:{scale} addend:{addend} qweight:{qweight}') print('possible values:', np.arange(16) * scale + addend) print('actual value:', qweight * scale + addend) def astype(self, data_type: DataType) -> Tensor: if isinstance(data_type, QuantizedDataType): invalidInputError(self.g_idx is None and data_type.have_addends is True and data_type.have_g_idx is False, "Fail to call `GPTQForLLaMaQuantizedTensor.astype`.") return self.regroup(data_type.groupsize) dequantized = dequantize_q4(np.ascontiguousarray(self.qweight), self.scales, self.addends, self.g_idx) return UnquantizedTensor(dequantized).astype(data_type) def groupsize(self) -> int: invalidInputError(self.addends.shape == self.scales.shape and self.shape[1] % self.scales.shape[1] == 0, "Fail to call `GPTQForLLaMaQuantizedTensor.groupsize`.") return self.shape[1] // self.scales.shape[1] def regroup(self, new_groupsize: int = 32) -> 'GPTQForLLaMaQuantizedTensor': # Old versions of GPTQ-for-LLaMa shared scales and addends between all the # columns in a row. Newer versions share them between every set of N # columns in a row, where N is the `groupsize` parameter, usually 128. The # output format shares them between every set of 32 columns. To handle # this, duplicate scales and addends for every smaller group. # (In the above, 'row' and 'column' are in the sense of the output.) invalidInputError(self.g_idx is None, "Fail to call `GPTQForLLaMaQuantizedTensor.regroup`.") old_groupsize = self.groupsize() invalidInputError(old_groupsize >= new_groupsize and old_groupsize % new_groupsize == 0 and old_groupsize, "Fail to call `GPTQForLLaMaQuantizedTensor.regroup`.") ret = copy.copy(self) ret.addends = self.addends.repeat(old_groupsize // new_groupsize, axis=1) ret.scales = self.scales.repeat(old_groupsize // new_groupsize, axis=1) ret.data_type = QuantizedDataType(groupsize=new_groupsize, have_addends=True, have_g_idx=False) return ret def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> Tensor: return DeferredPermutedTensor(self, n_head, n_kv_head) def to_ggml(self) -> GGMLQuantizedTensor: # The output format looks like this: # For each row: # For each group of 32 columns: # - addend (float32, 4 bytes) # - scale (float32, 4 bytes) # - weights (int4 * 32, 16 bytes) invalidInputError(self.groupsize() == 32, "Should have been regrouped before converting to ggml.") # Since the output format is mixed between integers and floats, we have # to hackily view the floats as int32s just so numpy will let us # concatenate them. 
addends_view = self.addends.view(dtype=np.int32)[:, :, np.newaxis] scales_view = self.scales.view(dtype=np.int32)[:, :, np.newaxis] # Split into groups of 4 columns (i.e. 32 columns of quantized data): grouped = self.qweight.reshape([self.qweight.shape[0], self.qweight.shape[1] // 4, 4]) # And concatenate: grouped = np.concatenate([scales_view, addends_view, grouped], axis=2, casting='no') return GGMLQuantizedTensor(grouped, self.shape, DT_Q4_1) @dataclass class LazyTensor: _load: Callable[[], Tensor] shape: List[int] data_type: DataType description: str def load(self) -> Tensor: ret = self._load() invalidInputError(ret.data_type == self.data_type and (self.data_type, ret.data_type, self.description), "Fail to load `LazyTensor`.") return ret def astype(self, data_type: DataType) -> 'LazyTensor': self.validate_conversion_to(data_type) def load() -> Tensor: return self.load().astype(data_type) return LazyTensor(load, self.shape, data_type, f'convert({data_type}) {self.description}') def validate_conversion_to(self, data_type: DataType) -> None: if data_type == self.data_type: return if isinstance(data_type, QuantizedDataType): invalidInputError(isinstance(self.data_type, QuantizedDataType), "Can't turn an unquantized tensor into" f" a quantized type ({data_type}).") if self.data_type.have_g_idx: sys.stderr.write( "Error: Input uses the newer GPTQ-for-LLaMa format (using g_idx), " "which is not yet natively supported by GGML. For now " "you can still convert this model by passing `--outtype f16` to dequantize, " "but that will result in a much larger output file for no quality benefit.\n") sys.exit(1) invalidInputError(not data_type.have_g_idx and self.data_type.have_addends and data_type.have_addends, "Fail to convert to expected data type.") LazyModel = Dict[str, LazyTensor] @dataclass class ModelPlus: model: LazyModel paths: List[Path] # Where this was read from. format: Literal['ggml', 'torch', 'safetensors'] vocab: Optional[Vocab] # For GGML models (which have vocab built in), the vocab. def merge_sharded(models: List[LazyModel]) -> LazyModel: # Original LLaMA models have each file contain one part of each tensor. # Use a dict instead of a set to preserve order. 
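    # Illustrative sketch (not from the original source): how the per-shard
    # pieces recombine in `convert` below.  The shapes here are hypothetical
    # examples for a two-shard checkpoint.
    #
    #   # wq is split by rows: two shards of shape (2048, 4096) -> (4096, 4096)
    #   np.concatenate([wq_shard0, wq_shard1], axis=0)
    #   # wo is split by columns: two shards of shape (4096, 2048) -> (4096, 4096)
    #   np.concatenate([wo_shard0, wo_shard1], axis=1)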
names = {name: None for model in models for name in model} def convert(name: str) -> LazyTensor: lazy_tensors = [model[name] for model in models] if len(lazy_tensors) == 1: # only one file; don't go through this procedure since there might # be quantized tensors return lazy_tensors[0] if len(lazy_tensors[0].shape) == 1: # the tensor is just duplicated in every file return lazy_tensors[0] if name.startswith('tok_embeddings.') or \ name.endswith('.attention.wo.weight') or \ name.endswith('.feed_forward.w2.weight'): # split by columns axis = 1 else: # split by rows axis = 0 concatenated_shape = list(lazy_tensors[0].shape) concatenated_shape[axis] = sum(tensor.shape[axis] for tensor in lazy_tensors) def load() -> UnquantizedTensor: ndarrays = [load_unquantized(tensor) for tensor in lazy_tensors] concatenated = np.concatenate(ndarrays, axis=axis) return UnquantizedTensor(concatenated) description = 'concatenated[[' + '] | ['.join(lt.description for lt in lazy_tensors) + ']]' return LazyTensor(load, concatenated_shape, lazy_tensors[0].data_type, description) return {name: convert(name) for name in names} def merge_multifile_models(models_plus: List[ModelPlus]) -> ModelPlus: formats = set(mp.format for mp in models_plus) invalidInputError(len(formats) == 1, "The input models are different formats.") format = formats.pop() paths = [path for mp in models_plus for path in mp.paths] # Use the first non-None vocab, if any. try: vocab = next(mp.vocab for mp in models_plus if mp.vocab is not None) except StopIteration: vocab = None if any("model.embed_tokens.weight" in mp.model for mp in models_plus): # Transformers models put different tensors in different files, but # don't split indivdual tensors between files. model = LazyModel model = {} for mp in models_plus: model.update(mp.model) else: model = merge_sharded([mp.model for mp in models_plus]) return ModelPlus(model, paths, format, vocab) def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_kv_head: Optional[int] = None) -> LazyTensor: def load() -> Tensor: return lazy_tensor.load().permute(n_head, n_kv_head) return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}, {n_kv_head}) ' + lazy_tensor.description) def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int) -> LazyTensor: def load() -> Tensor: return lazy_tensor.load().permute_part(n_part, n_head) s = lazy_tensor.shape.copy() s[0] = s[0] // 3 return LazyTensor(load, s, lazy_tensor.data_type, f'permute({n_head}) ' + lazy_tensor.description) def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor: def load() -> Tensor: return lazy_tensor.load().part(n_part) s = lazy_tensor.shape.copy() s[0] = s[0] // 3 return LazyTensor(load, s, lazy_tensor.data_type, 'part ' + lazy_tensor.description) def convert_transformers_to_orig(model: LazyModel, params: Params) -> LazyModel: out = {} out["tok_embeddings.weight"] = model["model.embed_tokens.weight"] out["norm.weight"] = model["model.norm.weight"] out["output.weight"] = model["lm_head.weight"] for i in itertools.count(): if f"model.layers.{i}.self_attn.q_proj.weight" in model: out[f"layers.{i}.attention.wq.weight"] = \ permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head) out[f"layers.{i}.attention.wk.weight"] = \ permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_kv_head) out[f"layers.{i}.attention.wv.weight"] = \ model[f"model.layers.{i}.self_attn.v_proj.weight"] elif f"model.layers.{i}.self_attn.W_pack.weight" in model: 
out[f"layers.{i}.attention.wq.weight"] = \ permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head) out[f"layers.{i}.attention.wk.weight"] = \ permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head) out[f"layers.{i}.attention.wv.weight"] = \ part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 2) else: break out[f"layers.{i}.attention.wo.weight"] = model[f"model.layers.{i}.self_attn.o_proj.weight"] out[f"layers.{i}.feed_forward.w1.weight"] = model[f"model.layers.{i}.mlp.gate_proj.weight"] out[f"layers.{i}.feed_forward.w2.weight"] = model[f"model.layers.{i}.mlp.down_proj.weight"] out[f"layers.{i}.feed_forward.w3.weight"] = model[f"model.layers.{i}.mlp.up_proj.weight"] out[f"layers.{i}.attention_norm.weight"] = \ model[f"model.layers.{i}.input_layernorm.weight"] out[f"layers.{i}.ffn_norm.weight"] = \ model[f"model.layers.{i}.post_attention_layernorm.weight"] return out def handle_quantization(model: LazyModel) -> LazyModel: '''Convert a model with entries for 'foo.qweight', 'foo.scales', etc. (which resolve to UnquantizedTensors with the raw data) to one with entries for 'foo.weight' (which resolve to QuantizedTensors). ''' def convert(name: str) -> Tuple[str, LazyTensor]: if name.endswith(".qweight"): namebase = name.rsplit('.', 1)[0] orig_name = namebase + ".weight" lazy_tensor = model[name] invalidInputError(len(lazy_tensor.shape) == 2, "Fail to convert a model with entries for 'foo.qweight'.") real_shape = [lazy_tensor.shape[1], lazy_tensor.shape[0] * 8] # Calculate type. This replicates the logic in # GPTQForLLaMaQuantizedTensor (which is executed when the modelis # actually loaded). lazy_scales = model[f"{namebase}.scales"] scales_width = 1 if lazy_scales.shape[1] == 1 else lazy_scales.shape[0] invalidInputError(real_shape[1] % scales_width == 0, "Fail to convert a model with entries for 'foo.qweight'.") groupsize = real_shape[1] // scales_width have_g_idx = f"{namebase}.g_idx" in model data_type = QuantizedDataType(groupsize=groupsize, have_addends=True, have_g_idx=have_g_idx) def load() -> Tensor: return GPTQForLLaMaQuantizedTensor(model, namebase) return (orig_name, LazyTensor(load, real_shape, data_type, '[quantized]')) else: return (name, model[name]) return dict(convert(name) for name in model) # Functionality that simulates `torch.load` but where individual tensors are # only loaded into memory on demand, not all at once. # PyTorch can't do this natively as of time of writing: # - https://github.com/pytorch/pytorch/issues/64327 # This allows us to de-shard without multiplying RAM usage, and also # conveniently drops the PyTorch dependency (though we still need numpy). 
@dataclass
class LazyStorageKind:
    data_type: DataType


@dataclass
class LazyStorage:
    load: Callable[[int, int], NDArray]
    kind: LazyStorageKind
    description: str


class LazyUnpickler(pickle.Unpickler):
    def __init__(self, fp: IO[bytes], data_base_path: str, zip_file: zipfile.ZipFile):
        super().__init__(fp)
        self.data_base_path = data_base_path
        self.zip_file = zip_file

    def persistent_load(self, pid: Any) -> Any:
        invalidInputError(pid[0] == 'storage' and isinstance(pid[1], LazyStorageKind),
                          "Fail to load.")
        data_type = pid[1].data_type
        filename_stem = pid[2]
        filename = self.data_base_path + '/' + filename_stem
        info = self.zip_file.getinfo(filename)

        def load(offset: int, elm_count: int) -> NDArray:
            dtype = DATA_TYPE_TO_NUMPY.get(data_type)
            invalidInputError(dtype is not None, "Tensor stored in unsupported format.")
            fp = self.zip_file.open(info)
            fp.seek(offset * dtype.itemsize)
            size = elm_count * dtype.itemsize
            data = fp.read(size)
            invalidInputError(len(data) == size, "Fail to load.")
            return np.frombuffer(data, dtype)

        description = f'storage data_type={data_type} path-in-zip={filename}' + \
                      f' path={self.zip_file.filename}'
        return LazyStorage(load=load, kind=pid[1], description=description)

    # @staticmethod
    def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any,  # pyright: ignore[reportSelfClsParameterName]
                               requires_grad: Any, backward_hooks: Any,
                               metadata: Any = None) -> LazyTensor:
        invalidInputError(isinstance(storage, LazyStorage), "Fail to rebuild `LazyTensor`.")

        def load() -> UnquantizedTensor:
            elm_count = stride[0] * size[0]
            return UnquantizedTensor(storage.load(storage_offset, elm_count).reshape(size))

        description = f'pickled storage_offset={storage_offset} in {storage.description}'
        return LazyTensor(load, list(size), storage.kind.data_type, description)

    def rebuild_from_type_v2(func, new_type, args, state):
        return func(*args)

    CLASSES = {
        ('torch._tensor', '_rebuild_from_type_v2'): rebuild_from_type_v2,
        ('torch._utils', '_rebuild_tensor_v2'): lazy_rebuild_tensor_v2,
        ('torch', 'BFloat16Storage'): LazyStorageKind(DT_BF16),
        ('torch', 'HalfStorage'): LazyStorageKind(DT_F16),
        ('torch', 'FloatStorage'): LazyStorageKind(DT_F32),
        ('torch', 'IntStorage'): LazyStorageKind(DT_I32),
        ('torch', 'Tensor'): LazyTensor,
    }

    def find_class(self, module: str, name: str) -> Any:
        if not module.startswith('torch'):
            return super().find_class(module, name)
        return self.CLASSES[(module, name)]


def lazy_load_torch_file(outer_fp: IO[bytes], path: Path) -> ModelPlus:
    zf = zipfile.ZipFile(outer_fp)
    pickle_paths = [name for name in zf.namelist() if name.endswith('.pkl')]
    invalidInputError(len(pickle_paths) == 1 and pickle_paths is not None,
                      "Fail to load torch files.")
    pickle_fp = zf.open(pickle_paths[0], 'r')
    unpickler = LazyUnpickler(pickle_fp,
                              data_base_path=pickle_paths[0][:-4],
                              zip_file=zf)
    model = unpickler.load()
    as_dict = dict(model.items())
    return ModelPlus(model=as_dict, paths=[path], format='torch', vocab=None)


SAFETENSORS_DATA_TYPES = {
    'BF16': DT_BF16,
    'F16': DT_F16,
    'F32': DT_F32,
    'I32': DT_I32,
}


def lazy_load_safetensors_file(fp: IO[bytes], path: Path) -> ModelPlus:
    header_size, = struct.unpack('<Q', must_read(fp, 8))
    header = json.loads(must_read(fp, header_size))
    byte_buf = fp.read()

    def convert(info: Dict[str, Any]) -> LazyTensor:
        data_type = SAFETENSORS_DATA_TYPES[info['dtype']]
        numpy_dtype = DATA_TYPE_TO_NUMPY[data_type]
        shape = info['shape']
        begin, end = info['data_offsets']
        invalidInputError(0 <= begin <= end <= len(byte_buf)
                          and end - begin == math.prod(shape) * numpy_dtype.itemsize,
                          "Fail to load safetensors files.")
        buf = byte_buf[begin:end]

        def load() -> UnquantizedTensor:
            return
UnquantizedTensor(np.frombuffer(buf, dtype=numpy_dtype).reshape(shape)) description = f'safetensors begin={begin} end={end} type={data_type} path={path}' return LazyTensor(load, shape, data_type, description) model = {name: convert(info) for (name, info) in header.items()} return ModelPlus(model=model, paths=[path], format='safetensors', vocab=None) def must_read(fp: IO[bytes], length: int) -> bytes: ret = fp.read(length) invalidInputError(len(ret) >= length, "Unexpectedly reached end of file.") return ret def lazy_load_ggml_file(fp: io.BufferedReader, path: Path) -> ModelPlus: magic = must_read(fp, 4)[::-1] if magic in (b'ggmf', b'ggjt'): version, = struct.unpack("i", must_read(fp, 4)) invalidInputError(version == 1, "Fail to load ggml files.") else: invalidInputError(magic == b'ggml', "Fail to load ggml files.") version = None n_vocab, n_embd, n_mult, n_head, n_layer, rot, file_type = \ struct.unpack('<7i', must_read(fp, 28)) tokens = [] for i in range(n_vocab): if i == 32000: # HACK: GPT4All messed with the format without changing the magic # number. Specifically, they changed the vocab section to contain # `n_vocab - 1` tokens instead of `n_vocab` (i.e. omitting the # extra pad token). Try to detect if we're reading a file like # this. orig_pos = fp.tell() fp.seek(20, io.SEEK_CUR) is_gpt4all = fp.read(21) == b'tok_embeddings.weight' fp.seek(orig_pos) if is_gpt4all: break length, = struct.unpack("i", must_read(fp, 4)) text = must_read(fp, length) if magic != b'ggml': score, = struct.unpack("f", must_read(fp, 4)) tokens.append((text, score)) vocab = GGMLVocab(tokens) if magic != b'ggml' else None model = {} # Use mmap for the actual data to avoid race conditions with the file offset. off = fp.raw.tell() mapped = memoryview(mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ)) # needed on Windows fp.raw.seek(off) def read_tensor() -> None: # this is a function so that variables captured in `load` don't change shape_len, name_len, ftype = struct.unpack("iii", must_read(fp, 12)) invalidInputError(0 <= shape_len <= 3, "Fail to read tensors.") shape = list(struct.unpack(f"{shape_len}i", must_read(fp, 4 * shape_len))) shape = shape[::-1] name = must_read(fp, name_len).decode('utf-8') data_type = FTYPE_TO_DATA_TYPE[ftype] if magic == b'ggjt': fp.seek((fp.tell() + 31) & -32) if data_type == DT_Q4_1: # See GPTQForLLaMaQuantizedTensor.ggml_ndarray() size = 24 * (shape[1] // 32) * shape[0] elif data_type == DT_Q4_0: size = 20 * (shape[1] // 32) * shape[0] else: numpy_dtype = DATA_TYPE_TO_NUMPY[data_type] elm_count = math.prod(shape) size = elm_count * numpy_dtype.itemsize offset = fp.tell() buf = mapped[offset:offset+size] fp.seek(size, io.SEEK_CUR) def load() -> Tensor: if isinstance(data_type, QuantizedDataType): ndarray = np.frombuffer(buf, dtype=np.uint32) return GGMLQuantizedTensor(ndarray, shape, data_type) else: return UnquantizedTensor(np.frombuffer(buf, dtype=numpy_dtype).reshape(shape)) description = f'ggml offset={offset} type={data_type} path={path}' model[name] = LazyTensor(load, shape, data_type, description) while fp.read(1) != b'': fp.seek(-1, io.SEEK_CUR) read_tensor() return ModelPlus(model=model, paths=[path], format='ggml', vocab=vocab) @functools.lru_cache(maxsize=None) def lazy_load_file(path: Path) -> ModelPlus: fp = open(path, 'rb') first8 = fp.read(8) fp.seek(0) if first8[:2] == b'PK': # A zip file, i.e. 
PyTorch format
        return lazy_load_torch_file(fp, path)
    elif first8[2:4] == b'gg':
        # GGML format
        return lazy_load_ggml_file(fp, path)
    elif struct.unpack('<Q', first8)[0] < 16 * 1024 * 1024:
        # Probably safetensors
        return lazy_load_safetensors_file(fp, path)
    else:
        invalidInputError(False, f"unknown format: {path}")


In = TypeVar('In')
Out = TypeVar('Out')


def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In],
                         concurrency: int) -> Iterable[Out]:
    '''Parallel map, but with backpressure.  If the caller doesn't call `next`
    fast enough, this will stop calling `func` at some point rather than
    letting results pile up in memory.  Specifically, there is a max of one
    output value buffered per thread.'''
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = []
        items_rev = list(iterable)[::-1]
        for i in range(min(concurrency, len(items_rev))):
            futures.append(executor.submit(func, items_rev.pop()))
        while futures:
            result = futures.pop(0).result()
            if items_rev:
                futures.append(executor.submit(func, items_rev.pop()))
            yield result


def check_vocab_size(params: Params, vocab: Vocab) -> None:
    if params.n_vocab != vocab.vocab_size:
        # GGMLVocab comes from the same file as the model so shouldn't mismatch:
        invalidInputError(isinstance(vocab, SentencePieceVocab),
                          "Vocab and SentencePieceVocab mismatch.")
        if params.n_vocab == vocab.vocab_size_base:
            print("Ignoring added_tokens.json since model matches vocab size without it.")
            vocab.added_tokens_list = []
            vocab.vocab_size = vocab.vocab_size_base
            return
        msg = f"Vocab size mismatch (model has {params.n_vocab}, but {vocab.fname_tokenizer}"
        if vocab.fname_added_tokens is not None:
            msg += f" combined with {vocab.fname_added_tokens}"
        msg += f" has {vocab.vocab_size})."
        if vocab.vocab_size < params.n_vocab < vocab.vocab_size + 20 and \
                vocab.fname_added_tokens is None:
            msg += " Most likely you are missing added_tokens.json,"
            msg += f" which should be in {vocab.fname_tokenizer.parent}."
        invalidInputError(False, msg)


class OutputFile:
    def __init__(self, fname_out: Path) -> None:
        self.fout = open(fname_out, "wb")

    def write_file_header(self, params: Params, file_type: GGMLFileType) -> None:
        self.fout.write(b"ggjt"[::-1])  # magic
        values = [
            1,  # file version
            params.n_vocab,
            params.n_embd,
            params.n_mult,
            params.n_head,
            params.n_layer,
            params.n_embd // params.n_head,  # rot (obsolete)
            file_type.value,
        ]
        self.fout.write(struct.pack("i" * len(values), *values))

    def write_tensor_header(self, name: str, shape: Sequence[int],
                            data_type: DataType) -> None:
        sname = name.encode('utf-8')
        self.fout.write(struct.pack("iii", len(shape), len(sname),
                                    DATA_TYPE_TO_FTYPE[data_type]))
        self.fout.write(struct.pack("i" * len(shape), *shape[::-1]))
        self.fout.write(sname)
        self.fout.seek((self.fout.tell() + 31) & -32)

    def write_vocab(self, vocab: Vocab) -> None:
        for text, score in vocab.all_tokens():
            self.fout.write(struct.pack("i", len(text)))
            self.fout.write(text)
            self.fout.write(struct.pack("f", score))

    @staticmethod
    def write_vocab_only(fname_out: Path, vocab: Vocab) -> None:
        params = Params(n_vocab=vocab.vocab_size, n_embd=0, n_mult=0,
                        n_head=1, n_layer=0, n_kv_head=None)
        of = OutputFile(fname_out)
        of.write_file_header(params, file_type=GGMLFileType.AllF32)
        of.write_vocab(vocab)
        of.fout.close()

    @staticmethod
    def write_all(fname_out: Path, params: Params, file_type: GGMLFileType,
                  model: LazyModel, vocab: Vocab) -> None:
        check_vocab_size(params, vocab)
        of = OutputFile(fname_out)
        of.write_file_header(params, file_type)
        print("Writing vocab...")
        of.write_vocab(vocab)

        def do_item(item: Tuple[str, LazyTensor]) -> NDArray:
            name, lazy_tensor = item
            return lazy_tensor.load().to_ggml().ndarray

        ndarrays = bounded_parallel_map(do_item, model.items(), concurrency=8)
        for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
            size = ' x
'.join(f"{dim:6d}" for dim in lazy_tensor.shape) padi = len(str(len(model))) print(f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16}" f"| type {lazy_tensor.data_type}") of.write_tensor_header(name, lazy_tensor.shape, lazy_tensor.data_type) ndarray.tofile(of.fout) of.fout.close() def pick_output_type(model: LazyModel, output_type_str: Optional[str]) -> GGMLFileType: wq_type = model["layers.0.attention.wq.weight"].data_type if output_type_str == "f32" or (output_type_str is None and wq_type in (DT_F32, DT_BF16)): return GGMLFileType.AllF32 if output_type_str == "f16" or (output_type_str is None and wq_type == DT_F16): return GGMLFileType.MostlyF16 if output_type_str == "q4_1" or (output_type_str is None and isinstance(wq_type, QuantizedDataType) and wq_type.have_addends): if isinstance(model["output.weight"].data_type, QuantizedDataType): return GGMLFileType.MostlyQ4_1 else: return GGMLFileType.PerLayerIsQ4_1 if output_type_str == "q4_0" or \ (output_type_str is None and isinstance(wq_type, QuantizedDataType)): return GGMLFileType.MostlyQ4_0 name_to_type = {name: lazy_tensor.data_type for (name, lazy_tensor) in model.items()} invalidInputError(False, f"Unexpected combination of types: {name_to_type}.") def do_necessary_conversions(model: LazyModel, params: Params) -> LazyModel: model = handle_quantization(model) if "lm_head.weight" in model: model = convert_transformers_to_orig(model, params) model = filter_and_sort_tensors(model) return model def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel: return {name: tensor.astype(output_type.type_for_tensor(name, tensor)) for (name, tensor) in model.items()} def nth_multifile_path(path: Path, n: int) -> Optional[Path]: '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return the nth path in the model. ''' # Support the following patterns: patterns = [ # - x.00.pth, x.01.pth, etc. (r'\.[0-9]{2}\.pth$', f'.{n:02}.pth'), # - x-00001-of-00002.bin, x-00002-of-00002.bin, etc. (r'-[0-9]{5}-of-(.*)$', fr'-{n:05}-of-\1'), # x.bin, x.bin.1, etc. (r'(\.[0-9]+)?$', r'\1' if n == 0 else fr'\1.{n}') ] for regex, replacement in patterns: if re.search(regex, path.name): new_path = path.with_name(re.sub(regex, replacement, path.name)) if new_path.exists(): return new_path return None def find_multifile_paths(path: Path) -> List[Path]: '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return the whole list of paths in the model. ''' ret = [] for i in itertools.count(): nth_path = nth_multifile_path(path, i) if nth_path is None: break ret.append(nth_path) if not ret: # No matches. This should only happen if the file was named, e.g., # foo.0, and there was no file named foo. Oh well, try to process it # as a single file. return [path] return ret def load_some_model(path: Path) -> ModelPlus: '''Load a model of any supported format.''' # Be extra-friendly and accept either a file or a directory: if path.is_dir(): globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"] files = [file for glob in globs for file in path.glob(glob)] if not files: # Try GGML too, but with lower priority, since if both a non-GGML # model and a GGML model exist in the same directory, we assume the # latter was converted from the former. 
files = list(path.glob("ggml-model*.bin*")) invalidInputError(files, f"Can't find model in directory {path}.") invalidInputError(len(files) == 1, f"Found multiple models in {path}, not sure which to pick: {files}.") path = files[0] paths = find_multifile_paths(path) models_plus = [] for path in paths: print(f"Loading model file {path}") models_plus.append(lazy_load_file(path)) model_plus = merge_multifile_models(models_plus) return model_plus def filter_and_sort_tensors(model: LazyModel) -> LazyModel: return {name: model[name] for name in TENSORS_LIST if name in model} def load_vocab(path: Path, vocabtype: Optional[str]) -> SentencePieceVocab: # Be extra-friendly and accept either a file or a directory. Also, if it's # a directory, it might be the model directory, and tokenizer.model might # be in the parent of that. print(f"vocabtype: {vocabtype}") if path.is_dir(): vocab_file = "tokenizer.model" if vocabtype == 'bpe': vocab_file = "vocab.json" path2 = path / vocab_file # Use `.parent` instead of /.. to handle the symlink case better. path3 = path.parent / vocab_file if path2.exists(): path = path2 elif path3.exists(): path = path3 else: invalidInputError(False, f"Could not find tokenizer.model in {path} or its parent; " "if it's in another directory, pass the directory as --vocab-dir") added_tokens_path = path.parent / "added_tokens.json" print(f"Loading vocab file {path}") return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None, vocabtype) def default_outfile(model_paths: List[Path], file_type: GGMLFileType) -> Path: namestr = { GGMLFileType.AllF32: "f32", GGMLFileType.MostlyF16: "f16", GGMLFileType.MostlyQ4_0: "q4_0", GGMLFileType.MostlyQ4_1: "q4_1", GGMLFileType.PerLayerIsQ4_1: "q4_1", }[file_type] ret = model_paths[0] / f"ggml-model-{namestr}.bin" if ret in model_paths: sys.stderr.write( f"Error: Default output path ({ret}) would overwrite the input. " "Please explicitly specify a path using --outfile.\n") sys.exit(1) return ret # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py def bytes_to_unicode(): """ Returns list of utf-8 byte and a corresponding list of unicode strings. The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup tables between utf-8 bytes and unicode strings. And avoids mapping to whitespace/control characters the bpe code barfs on. 
""" bs = list(range(ord("!"), ord("~")+1)) + list(range(ord("¡"), ord("¬")+1)) + \ list(range(ord("®"), ord("ÿ")+1)) cs = bs[:] n = 0 for b in range(2**8): if b not in bs: bs.append(b) cs.append(2**8+n) n += 1 cs = [chr(n) for n in cs] return dict(zip(bs, cs)) def _convert_gptneox_hf_to_ggml(model_path, outfile_dir, outtype): from transformers import AutoModelForCausalLM, AutoTokenizer import torch tokenizer = AutoTokenizer.from_pretrained(model_path) model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16 if outtype == "f16" else torch.float32) model.eval() for p in model.parameters(): p.requires_grad = False hparams = model.config.to_dict() filestem = Path(model_path).stem fn_out = os.path.join(outfile_dir, f"ggml-{filestem}-{outtype}.bin") fout = open(fn_out, "wb") ggml_file_magic = 0x67676d66 # 0x67676d6c is unversioned ggml_file_version = 0x00000001 # v1 if outtype == "f16": ftype = 1 else: ftype = 0 hparams["multiple_of"] = 1 fout.write(struct.pack("i", ggml_file_magic)) # magic: ggmf in hex fout.write(struct.pack("i", ggml_file_version)) fout.write(struct.pack("i", hparams["vocab_size"])) fout.write(struct.pack("i", hparams["max_position_embeddings"])) fout.write(struct.pack("i", hparams["hidden_size"])) fout.write(struct.pack("i", hparams["num_attention_heads"])) fout.write(struct.pack("i", hparams["num_hidden_layers"])) fout.write(struct.pack("i", int((hparams["hidden_size"] / hparams["num_attention_heads"]) * hparams["rotary_pct"]))) # rotary_dim fout.write(struct.pack("i", int(hparams["use_parallel_residual"]))) fout.write(struct.pack("i", ftype)) dot_token = tokenizer.encode(".")[0] vocab = tokenizer.vocab id2token = {v: k for k, v in vocab.items()} for i in range(hparams["vocab_size"]): if i in id2token: text = id2token[i].encode('utf-8') else: text = tokenizer.decode([i]).encode('utf-8') fout.write(struct.pack("i", len(text))) fout.write(text) list_vars = model.state_dict() for name in list_vars.keys(): if name.startswith('gpt_neox.layers.'): if 'attention.masked_bias' in name or 'attention.rotary_emb.inv_freq' in name or \ 'attention.bias' in name: continue # No gradients for these list_vars[name].requires_grad = False src = name nn = name data = list_vars[src].squeeze().numpy() data = data.astype(np.float32) n_dims = len(data.shape) # default type is fp32 ftype_cur = 0 if ftype == 1 and n_dims > 1: data = data.astype(np.float16) ftype_cur = 1 else: data = data.astype(np.float32) # header str = name.encode('utf-8') fout.write(struct.pack("iii", n_dims, len(str), ftype_cur)) for i in range(n_dims): fout.write(struct.pack("i", data.shape[n_dims - 1 - i])) fout.write(str) # data data.tofile(fout) fout.close() def _convert_bloom_hf_to_ggml(model_path, outfile_dir, outtype): from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig import torch conv_map = {'word_embeddings': 'tok_embeddings', 'word_embeddings_layernorm': 'norm', 'input_layernorm': 'attention_norm', 'self_attention.query_key_value': 'attention.query_key_value', 'self_attention.dense': 'attention.wo', 'post_attention_layernorm': 'ffn_norm', 'mlp.dense_h_to_4h': 'feed_forward.w1', 'mlp.dense_4h_to_h': 'feed_forward.w2', 'ln_f': 'output_norm', 'lm_head': 'output', } tokenizer = AutoTokenizer.from_pretrained(model_path) config = AutoConfig.from_pretrained(model_path) hparams = config.to_dict() model = AutoModelForCausalLM.from_pretrained(model_path, config=config, torch_dtype=torch.float16 if outtype == "f16" else torch.float32, low_cpu_mem_usage=True) filestem = 
Path(model_path).stem fn_out = os.path.join(outfile_dir, f"ggml-{filestem}-{outtype}.bin") fout = open(fn_out, "wb") if outtype == "f16": ftype = 1 else: ftype = 0 hparams["multiple_of"] = 1 fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex fout.write(struct.pack("i", hparams["vocab_size"])) # fout.write(struct.pack("i", hparams["seq_length"])) fout.write(struct.pack("i", hparams["hidden_size"])) fout.write(struct.pack("i", hparams["multiple_of"])) fout.write(struct.pack("i", hparams["n_head"])) fout.write(struct.pack("i", hparams["n_layer"])) fout.write(struct.pack("i", ftype)) dot_token = tokenizer.encode(".")[0] for i in range(hparams["vocab_size"]): text = tokenizer.decode([i]).encode('utf-8') fout.write(struct.pack("i", len(text))) fout.write(text) list_vars = model.state_dict() for name in list_vars.keys(): src = name nn = name if name != "lm_head.weight": nn = nn.split(".")[1:] else: nn = nn.split(".") if nn[0] == "h": nn[0] = "layers" mapped = conv_map[".".join(nn[2:-1])] name = ".".join(nn[:2] + [mapped] + nn[-1:]) else: mapped = conv_map[".".join(nn[:-1])] name = ".".join([mapped] + nn[-1:]) if "query_key_value" in src: q, k, v = list_vars[src].reshape(config.n_head, 3, -1).unbind(1) list_vars[src] = torch.cat([q, k, v], dim=0).reshape_as(list_vars[src]) data = list_vars[src].squeeze().numpy() data = data.astype(np.float32) n_dims = len(data.shape) # default type is fp32 ftype_cur = 0 if ftype == 1 and n_dims > 1: data = data.astype(np.float16) ftype_cur = 1 # header str = name.encode('utf-8') fout.write(struct.pack("iii", n_dims, len(str), ftype_cur)) for i in range(n_dims): fout.write(struct.pack("i", data.shape[n_dims - 1 - i])) fout.write(str) # data data.tofile(fout) fout.close() def _convert_starcoder_hf_to_ggml(model_path, outfile_dir, outtype): from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig import torch tokenizer = AutoTokenizer.from_pretrained(model_path) config = AutoConfig.from_pretrained(model_path, trust_remote_code=True) hparams = config.to_dict() model = AutoModelForCausalLM.from_pretrained(model_path, config=config, torch_dtype=torch.float16 if outtype == "f16" else torch.float32, # low_cpu_mem_usage=True, trust_remote_code=True, offload_state_dict=True) list_vars = model.state_dict() encoder = tokenizer.vocab # Add added_tokens (special tokens) to the encoder encoder.update(tokenizer.get_added_vocab()) filestem = Path(model_path).stem fn_out = os.path.join(outfile_dir, f"ggml-{filestem}-{outtype}.bin") fout = open(fn_out, "wb") if outtype == "f16": ftype = 1 else: ftype = 0 fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex vocab_size = hparams["vocab_size"] fout.write(struct.pack("i", vocab_size)) # fout.write(struct.pack("i", len(encoder))) fout.write(struct.pack("i", hparams["n_positions"])) fout.write(struct.pack("i", hparams["n_embd"])) fout.write(struct.pack("i", hparams["n_head"])) fout.write(struct.pack("i", hparams["n_layer"])) fout.write(struct.pack("i", ftype)) byte_encoder = bytes_to_unicode() byte_decoder = {v: k for k, v in byte_encoder.items()} fout.write(struct.pack("i", vocab_size)) counter = 0 # sort by value for key in sorted(encoder, key=encoder.get): text = bytearray([byte_decoder[c] for c in key]) fout.write(struct.pack("i", len(text))) fout.write(text) counter += 1 # TODO: Repeat last token until vocab_size while counter < vocab_size: fout.write(struct.pack("i", len(text))) fout.write(text) counter += 1 for name in list_vars.keys(): data = list_vars[name].squeeze().numpy() print("Processing 
variable: " + name + " with shape: ", data.shape) # rename headers to keep compatibility if name == "transformer.ln_f.weight": name = "model/ln_f/g" elif name == "transformer.ln_f.bias": name = "model/ln_f/b" elif name == "transformer.wte.weight": name = "model/wte" elif name == "transformer.wpe.weight": name = "model/wpe" elif name == "lm_head.weight": name = "model/lm_head" elif re.match(r"transformer.h\.\d+\.ln_1\.weight", name): i = re.findall("\d+", name)[0] name = f"model/h{i}/ln_1/g" elif re.match(r"transformer.h\.\d+\.ln_1\.bias", name): i = re.findall("\d+", name)[0] name = f"model/h{i}/ln_1/b" elif re.match(r"transformer.h\.\d+\.attn\.c_attn\.weight", name): i = re.findall("\d+", name)[0] name = f"model/h{i}/attn/c_attn/w" elif re.match(r"transformer.h\.\d+\.attn\.c_attn\.bias", name): i = re.findall("\d+", name)[0] name = f"model/h{i}/attn/c_attn/b" elif re.match(r"transformer.h\.\d+\.attn\.c_proj\.weight", name): i = re.findall("\d+", name)[0] name = f"model/h{i}/attn/c_proj/w" elif re.match(r"transformer.h.\d+.attn.c_proj.bias", name): i = re.findall("\d+", name)[0] name = f"model/h{i}/attn/c_proj/b" elif re.match(r"transformer.h.\d+.ln_2.weight", name): i = re.findall("\d+", name)[0] name = f"model/h{i}/ln_2/g" elif re.match(r"transformer.h.\d+.ln_2.bias", name): i = re.findall("\d+", name)[0] name = f"model/h{i}/ln_2/b" elif re.match(r"transformer.h.\d+.mlp.c_fc.weight", name): i = re.findall("\d+", name)[0] name = f"model/h{i}/mlp/c_fc/w" elif re.match(r"transformer.h.\d+.mlp.c_fc.bias", name): i = re.findall("\d+", name)[0] name = f"model/h{i}/mlp/c_fc/b" elif re.match(r"transformer.h.\d+.mlp.c_proj.weight", name): i = re.findall("\d+", name)[0] name = f"model/h{i}/mlp/c_proj/w" elif re.match(r"transformer.h.\d+.mlp.c_proj.bias", name): i = re.findall("\d+", name)[0] name = f"model/h{i}/mlp/c_proj/b" else: print("Unrecognized variable name. 
%s", name) # we don't need these if name.endswith("attn.masked_bias") or name.endswith(".attn.bias"): print(" Skipping variable: " + name) continue n_dims = len(data.shape) ftype_cur = 0 if ftype == 1: if (name == "model/wte" or name == "model/lm_head" or name[-2:] == "/g" or name[-2:] == "/w") and n_dims == 2: print(" Converting to float16") data = data.astype(np.float16) ftype_cur = 1 else: print(" Converting to float32") data = data.astype(np.float32) ftype_cur = 0 "model/h.*/attn/c_attn/w" "model/h.*/attn/c_proj/w" "model/h.*/mlp/c_fc/w" "model/h.*/mlp/c_proj/w" if name[-14:] == "/attn/c_attn/w" or name[-14:] == "/attn/c_attn/b": print(" Duplicate K,V heads to use MHA instead of MQA") embed_dim = hparams["n_embd"] head_dim = embed_dim // hparams["n_head"] # ((n_heads + 2) * head_dim, hidden_dim) -> (3 * n_heads * head_dim, hidden_dim) q, k, v = np.split(data, (hparams["n_head"] * head_dim, (hparams["n_head"] + 1) * head_dim), axis=0) # duplicate k, v along the first axis (head_dim, hidden_dim) -> # (n_heads * head_dim, hidden_dim) if len(k.shape) == 2: k = np.tile(k, (hparams["n_head"], 1)) v = np.tile(v, (hparams["n_head"], 1)) elif len(k.shape) == 1: k = np.tile(k, (hparams["n_head"])) v = np.tile(v, (hparams["n_head"])) # concat q, k, v along the first axis (n_heads * head_dim, hidden_dim) -> # (3 * n_heads * head_dim, hidden_dim) data = np.concatenate((q, k, v), axis=0) # header str = name.encode('utf-8') fout.write(struct.pack("iii", n_dims, len(str), ftype_cur)) for i in range(n_dims): fout.write(struct.pack("i", data.shape[n_dims - 1 - i])) fout.write(str) # data data.tofile(fout) fout.close() def _convert_chatglm_hf_to_ggml(model_path, outfile_dir, outtype): filestem = Path(model_path).stem outfile = os.path.join(outfile_dir, f"bigdl_llm_chatglm_{outtype}.bin") invalidInputError(outtype in ["q4_0", "q4_1"], "For now we only support quantization type 'q4_0' and 'q4_1' " "in chatglm family.") from ipex_llm.utils.convert_chatglm import _convert_chatglm_hf_to_ggml_ return _convert_chatglm_hf_to_ggml_(model_path, outfile, outtype)