ipex-llm/python/llm/src/ipex_llm/transformers/utils.py
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Some parts of this file are adapted from
# https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_utils.py
# which is licensed under the MIT license:
#
# MIT License
#
# Copyright (c) Facebook, Inc. and its affiliates.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import os
from transformers.modeling_utils import _add_variant
from ipex_llm.ggml.quantize import ggml_tensor_qtype, gguf_mixed_qtype
from ..utils.common import invalidInputError
from typing import Union, Optional
import torch
from torch import nn
import logging
import numpy as np
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)
WEIGHTS_NAME = "pytorch_model.bin"
WEIGHTS_INDEX_NAME = "pytorch_model.bin.index.json"
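# Locate a local PyTorch checkpoint and return (archive_file, is_sharded):
# either pytorch_model.bin itself, or its .index.json for sharded checkpoints.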
def extract_local_archive_file(pretrained_model_name_or_path, subfolder, variant=None):
pretrained_model_name_or_path = str(pretrained_model_name_or_path)
if os.path.isfile(
os.path.join(pretrained_model_name_or_path, subfolder, _add_variant(WEIGHTS_NAME, variant))
):
# Load from a PyTorch checkpoint
archive_file = os.path.join(
pretrained_model_name_or_path, subfolder, _add_variant(WEIGHTS_NAME, variant)
)
return archive_file, False
elif os.path.isfile(
os.path.join(pretrained_model_name_or_path,
subfolder,
_add_variant(WEIGHTS_INDEX_NAME, variant))
):
# Load from a sharded PyTorch checkpoint
archive_file = os.path.join(
pretrained_model_name_or_path, subfolder, _add_variant(WEIGHTS_INDEX_NAME, variant)
)
is_sharded = True
return archive_file, is_sharded
else:
invalidInputError(False,
f"Error no file named {_add_variant(WEIGHTS_NAME, variant)}"
" found in directory"
f" {pretrained_model_name_or_path}.")
def load_state_dict(checkpoint_file: Union[str, os.PathLike]):
try:
return torch.load(checkpoint_file, map_location="cpu")
except Exception as e:
        invalidInputError(False,
                          f"Unable to load weights from pytorch checkpoint file"
                          f" at '{checkpoint_file}': {e}.")
# PyTorch's `_load_from_state_dict` does not copy parameters in a module's descendants
# so we need to apply the function recursively.
def load(module: nn.Module, state_dict, prefix=""):
args = (state_dict, prefix, {}, True, [], [], [])
# Parameters of module and children will start with prefix.
# We can exit early if there are none in this state_dict
if len([key for key in state_dict if key.startswith(prefix)]) > 0:
module._load_from_state_dict(*args)
for name, child in module._modules.items():
if child is not None:
load(child, state_dict, prefix + name + ".")
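# Resolve a sharded checkpoint: read the .index.json and return the local shard
# file paths together with the sharded metadata (including the full weight map).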
def get_local_shard_files(pretrained_model_name_or_path, index_filename, subfolder=""):
import json
invalidInputError(os.path.isfile(index_filename),
"Can't find a checkpoint index"
f" ({index_filename}) in {pretrained_model_name_or_path}.")
with open(index_filename, "r") as f:
index = json.loads(f.read())
shard_filenames = sorted(set(index["weight_map"].values()))
sharded_metadata = index["metadata"]
sharded_metadata["all_checkpoint_keys"] = list(index["weight_map"].keys())
sharded_metadata["weight_map"] = index["weight_map"].copy()
shard_filenames = [os.path.join(pretrained_model_name_or_path, subfolder, f)
for f in shard_filenames]
return shard_filenames, sharded_metadata
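# Rename legacy parameter names found in some older checkpoints
# (gamma/beta) to their PyTorch equivalents (weight/bias).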
def fix_key(key):
if "beta" in key:
return key.replace("beta", "bias")
if "gamma" in key:
return key.replace("gamma", "weight")
return key
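# Return the autocast dtype currently active for the tensor's device
# (XPU or CPU), or None if autocast is not enabled on that device.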
def get_autocast_dtype(x):
if x.device.type == "xpu":
if torch.xpu.is_autocast_xpu_enabled():
return torch.xpu.get_autocast_xpu_dtype()
else:
return None
elif x.device.type == "cpu":
if torch.is_autocast_cpu_enabled():
return torch.get_autocast_cpu_dtype()
else:
return None
else:
invalidInputError(False,
f"Device {x.device} is not supported.")
_ipex_version = None
def get_ipex_version():
global _ipex_version
if _ipex_version is not None:
return _ipex_version
import intel_extension_for_pytorch as ipex
_ipex_version = ipex.__version__
return _ipex_version
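# Map an XPU device name to a short platform tag ("arc", "lnl", "mtl", "flex",
# "pvc", "uhd" or "others") used to select device-specific optimizations;
# Arc names containing 'V' (e.g. Arc 140V on Lunar Lake) map to "lnl",
# other non-A-series Arc names to "mtl".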
def get_xpu_device_type(x):
if x.device.type != "xpu":
return x.device.type
name = torch.xpu.get_device_name(x.device.index)
if name.startswith("Intel(R) Arc(TM) A"):
return "arc"
elif name.startswith("Intel(R) Arc(TM)"):
if 'V' in name:
return "lnl"
else:
return "mtl"
elif name.startswith("Intel(R) Data Center GPU Flex"):
return "flex"
elif name.startswith("Intel(R) Data Center GPU Max"):
return "pvc"
elif name.startswith("Intel(R) UHD"):
return "uhd"
else:
return "others"
def load_imatrix_data(imatrix_file):
# this function is adapted from https://github.com/ggerganov/llama.cpp/blob/
# c82d18e863fcde91b4b1109b1d0c73ea4470c405/examples/quantize/quantize.cpp#L102
imatrix = open(imatrix_file, 'rb')
n_entries = imatrix.read(4)
n_entries = int.from_bytes(n_entries, 'little')
    invalidInputError(n_entries >= 1,
                      f"no importance matrix entries found in {imatrix_file}")
imatrix_data = {}
for i in range(n_entries):
cur_len = imatrix.read(4)
cur_len = int.from_bytes(cur_len, 'little')
cur_name = str(imatrix.read(cur_len), encoding='utf-8')
        # cur_name looks like blk.14.attn_output.weight for llama / mistral,
        # or blk.0.ffn_down.3.weight / blk.17.ffn_gate_inp.weight for mixtral
name_list = cur_name.split('.')
layer = name_list[1]
module_name = name_list[2]
exp_id = None
if 'ffn' in module_name and len(name_list) == 4:
module_name = module_name[4:] # from ffn_gate to gate
elif 'ffn' in module_name and len(name_list) == 5:
# mixtral's mlp layer
module_name = module_name[4:]
exp_id = name_list[3]
elif 'attn' in module_name:
module_name = module_name[5] # from attn_k to k, attn_output to o
module_name = layer + '_' + module_name
if exp_id is not None:
module_name += '_' + exp_id
ncall = imatrix.read(4)
ncall = int.from_bytes(ncall, 'little')
nval = imatrix.read(4)
nval = int.from_bytes(nval, 'little')
invalidInputError(nval >= 1,
f"failed reading number of values for entry {i}")
byte_data = imatrix.read(4 * nval)
idata = np.frombuffer(byte_data, dtype=np.float32)
if ncall > 0:
idata = idata / ncall
imatrix_data[module_name] = torch.from_numpy(idata).float()
print(f"loaded {len(imatrix_data)} importance matrix entries from {imatrix_file}.")
return imatrix_data
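# Convert a Hugging Face module path into the key format used by the imatrix
# data above, e.g. "model.layers.31.self_attn.o_proj" -> "31_o" and
# "model.layers.0.block_sparse_moe.experts.0.w1" -> "0_gate_0".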
def module_name_process(full_module_name):
    # full name may be model.layers.31.self_attn.o_proj for llama/mistral,
    # or model.layers.0.block_sparse_moe.gate or
    # model.layers.0.block_sparse_moe.experts.0.w1 for mixtral
module_name_list = full_module_name.split('.')
if len(module_name_list) >= 5:
super_module_name = module_name_list[3]
else:
super_module_name = None
exp_id = None
if super_module_name == 'block_sparse_moe':
# handle mixtral moe here
moe_mapping = {"w1": "gate", "w2": "down", "w3": "up"}
layer = module_name_list[2]
if len(module_name_list) == 5 and module_name_list[-1] == 'gate':
cur_module = 'gate_inp' # mapping with imatrix
elif len(module_name_list) == 7:
exp_id = module_name_list[-2]
cur_module = module_name_list[-1]
cur_module = moe_mapping[cur_module]
new_module_name = '_'.join([layer, cur_module])
if exp_id is not None:
new_module_name += '_' + exp_id
else:
if len(module_name_list) == 5:
layer = module_name_list[2]
cur_module = module_name_list[-1][:-5]
new_module_name = '_'.join([layer, cur_module])
elif len(module_name_list) == 1:
new_module_name = module_name_list[0]
layer = None
cur_module = None
return new_module_name, layer, cur_module
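# Decide the per-module quantization type (and matching imatrix slice, if any)
# for low-bit / mixed-precision loading, roughly following llama.cpp's
# heuristics: sensitive modules (attention v, early down_proj layers, lm_head)
# are kept at a higher precision than the requested base qtype.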
def get_cur_qtype_and_imatrix(qtype, full_module_name, imatrix_data, model_config=None):
cur_qtype = qtype
cur_imatrix = None
if model_config is not None:
model_type = getattr(model_config, "model_type", None)
else:
        model_type = None
if qtype in [ggml_tensor_qtype["gguf_iq2_xxs"], ggml_tensor_qtype["gguf_iq2_xs"],
ggml_tensor_qtype["gguf_iq1_s"]]:
# For quantization which needs importance matrix
new_module_name, layer, cur_module = module_name_process(full_module_name)
# custom mixed quantization strategy
if model_type == "mixtral":
if cur_module == 'v':
                # llama.cpp uses q4_K here
cur_qtype = ggml_tensor_qtype['sym_int4']
elif cur_module == 'down' and int(layer) in [0, 1, 2, 3]:
cur_qtype = ggml_tensor_qtype['q2_k']
else:
num_hidden_layers = getattr(model_config, "num_hidden_layers", None)
hidden_size = getattr(model_config, "hidden_size", None)
if model_type == "llama" and hidden_size == 8192:
# for llama2-70b
if cur_module == 'v':
cur_qtype = ggml_tensor_qtype['sym_int4'] # llama.cpp use q4k here
if cur_module == 'down' and int(layer) < int(num_hidden_layers/8):
cur_qtype = ggml_tensor_qtype['q2_k']
elif cur_module == 'v' or (cur_module == 'down' and int(layer) in [0, 1, 10, 11]):
cur_qtype = ggml_tensor_qtype['q2_k']
if qtype == ggml_tensor_qtype["gguf_iq1_s"] and cur_module == 'o':
cur_qtype = ggml_tensor_qtype['gguf_iq2_xxs']
if imatrix_data is not None and new_module_name in imatrix_data:
cur_imatrix = imatrix_data[new_module_name]
else:
# if no imatrix is available, use sym_int8 for lm_head
cur_imatrix = None
if new_module_name == 'lm_head':
cur_qtype = ggml_tensor_qtype['sym_int8']
elif qtype == ggml_tensor_qtype["q2_k"]:
new_module_name, layer, cur_module = module_name_process(full_module_name)
if cur_module == 'v' or (cur_module == 'down' and int(layer) in [0, 1, 10, 11]):
            # TODO: q2_k needs other k-quant types here
cur_qtype = ggml_tensor_qtype['q2_k']
if imatrix_data is not None and new_module_name in imatrix_data:
cur_imatrix = imatrix_data[new_module_name]
else:
# if no imatrix is available, use sym_int8 for lm_head
cur_imatrix = None
if new_module_name == 'lm_head':
cur_qtype = ggml_tensor_qtype['sym_int8']
elif qtype > 100:
# gguf mixed precision
new_module_name, layer, cur_module = module_name_process(full_module_name)
num_hidden_layers = getattr(model_config, "num_hidden_layers", None)
if qtype in [gguf_mixed_qtype["gguf_q4k_s"], gguf_mixed_qtype["gguf_q4k_m"]] and \
new_module_name == 'lm_head':
cur_qtype = ggml_tensor_qtype['q6_k']
elif qtype == gguf_mixed_qtype["gguf_q4k_m"]:
if int(layer) < int(num_hidden_layers/2) and cur_module in ['v', 'down']:
cur_qtype = ggml_tensor_qtype['q6_k']
else:
cur_qtype = ggml_tensor_qtype['q4_k']
elif qtype == gguf_mixed_qtype["gguf_q4k_s"]:
if int(layer) < int(num_hidden_layers/8) and cur_module in ['v', 'down']:
cur_qtype = ggml_tensor_qtype['q5_k']
else:
cur_qtype = ggml_tensor_qtype['q4_k']
else:
pass
return cur_qtype, cur_imatrix
def get_modelscope_hf_config(model_id_or_path: str,
revision: Optional[str] = None):
# Read hf config dictionary from modelscope hub or local path
from modelscope.utils.constant import ModelFile
from modelscope.hub.file_download import model_file_download
from modelscope.utils.config import Config
if not os.path.exists(model_id_or_path):
local_path = model_file_download(
model_id_or_path, ModelFile.CONFIG, revision=revision)
elif os.path.isdir(model_id_or_path):
local_path = os.path.join(model_id_or_path, ModelFile.CONFIG)
elif os.path.isfile(model_id_or_path):
local_path = model_id_or_path
return Config._file2dict(local_path)
def is_torch_bf16_gpu_available():
# always true for XPU and CPU
return True
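# ggml k-quants (q4_k / q5_k / q6_k, and fp6_k here) pack weights into
# 256-element super-blocks, so fall back to a non-k format of comparable
# precision when hidden_size is not a multiple of 256.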
def check_hidden_size(qtype, hidden_size):
if hidden_size % 256 != 0:
if qtype == ggml_tensor_qtype["q4_k"]:
logger.info(f"hidden size {hidden_size} is not divisible by 256, "
"required for q4_k - using fallback quantization asym_int4.")
return ggml_tensor_qtype["asym_int4"]
elif qtype == ggml_tensor_qtype["q5_k"]:
logger.info(f"hidden size {hidden_size} is not divisible by 256, "
"required for q5_k - using fallback quantization asym_int5.")
return ggml_tensor_qtype["asym_int5"]
elif qtype == ggml_tensor_qtype["q6_k"]:
logger.info(f"hidden size {hidden_size} is not divisible by 256, "
"required for q6_k - using fallback quantization sym_int8.")
return ggml_tensor_qtype["sym_int8"]
elif qtype == ggml_tensor_qtype["fp6_k"]:
logger.info(f"hidden size {hidden_size} is not divisible by 256, "
"required for fq6_k - using fallback quantization fp6.")
return ggml_tensor_qtype["fp6"]
return qtype
# The Arc platform does not support FP64, so disable FP64 in
# DeepSpeedZeroOptimizer_Stage3's _constant_buffered_norm2 method
# https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/runtime/zero/stage3.py#L1365
def _constant_buffered_norm2(self, input, buffer_size=250000000):
norm = None
for part in input.view(-1).split(buffer_size):
if norm is None:
norm = part.data.norm(2)**2.0
else:
norm += part.data.norm(2)**2.0
return norm**0.5