This PR adds a temporary qtype `woq_int4` to avoid affecting other qtypes and models. Co-authored-by: leonardozcm <leonardo1997zcm@gmail.com>
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os
import subprocess
from ipex_llm.utils.common import invalidInputError
import platform
from pathlib import Path


dirname, _ = os.path.split(os.path.abspath(__file__))
libs_dirname = os.path.dirname(dirname)

# ggml quantized tensor types; these differ from the per-file quantize types below (_quantize_type)
ggml_tensor_qtype = {"sym_int4": 2,        # q4_0 in ggml
                     "asym_int4": 3,       # q4_1 in ggml
                     "sym_int5": 6,        # q5_0 in ggml
                     "asym_int5": 7,       # q5_1 in ggml
                     "sym_int8": 8,        # q8_0 in ggml
                     "nf4": 10,
                     "nf3": 11,
                     "fp16": 12,
                     "fp8_e4m3": 15,       # fp8 in e4m3 format
                     "fp4": 16,
                     "mixed_fp4": 17,      # Mixture of Formats Quantization 4 bits
                     "mixed_fp8": 18,      # Mixture of Formats Quantization 8 bits
                     "fp8_e5m2": 19,       # fp8 in e5m2 format
                     "fp8": 19,            # fp8 in e5m2 format
                     "bf16": 20,
                     "gguf_iq2_xxs": 21,
                     "gguf_iq2_xs": 22,
                     "q2_k": 23,
                     "gguf_iq1_s": 24,
                     "gguf_iq1_m": 25,
                     "q6_k": 26,
                     "q4_k": 27,
                     "q5_k": 28,
                     "fp6": 29,
                     "fp6_k": 30,
                     "sym_int4_rtn": 31,
                     "sym_int8_rtn": 32,
                     "asym_int4_rtn": 33,
                     "woq_int4": 34,
                     }
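
# Illustrative lookup (a sketch added for clarity, not part of the original file): callers can
# translate a user-facing low-bit string into the raw ggml tensor qtype id, e.g.:
#
#     ggml_tensor_qtype["woq_int4"]   # -> 34, the temporary qtype introduced by this PR
#     ggml_tensor_qtype["sym_int4"]   # -> 2, the same id as ggml's q4_0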

# mixed precision from llama.cpp
gguf_mixed_qtype = {"gguf_q4k_s": 101,
                    "gguf_q4k_m": 102}

_llama_quantize_type = {"q4_0": 2,
                        "q4_1": 3,
                        "q5_0": 8,
                        "q5_1": 9,
                        "q8_0": 7}
_bloom_quantize_type = {"q4_0": 2,
                        "q4_1": 3}
_gptneox_quantize_type = {"q4_0": 2,
                          "q4_1": 3,
                          "q5_0": 8,
                          "q5_1": 9,
                          "q8_0": 7}
_starcoder_quantize_type = {"q4_0": 2,
                            "q4_1": 3,
                            "q5_0": 8,
                            "q5_1": 9,
                            "q8_0": 7}

_quantize_type = {"llama": _llama_quantize_type,
                  "bloom": _bloom_quantize_type,
                  "gptneox": _gptneox_quantize_type,
                  "starcoder": _starcoder_quantize_type}


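# Illustrative cross-check (a sketch added for clarity, not part of the original file): the ids
# in _quantize_type are the per-family file quantize ids passed to the quantize binaries and
# intentionally differ from the ggml tensor qtype ids above, e.g.:
#
#     _quantize_type["llama"]["q8_0"]   # -> 7 (file quantize id)
#     ggml_tensor_qtype["sym_int8"]     # -> 8 (ggml tensor qtype id, q8_0 in ggml)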
def quantize(input_path: str, output_path: str,
             model_family: str, dtype: str = 'q4_0'):
    """
    Quantize a ggml file to lower precision.

    :param input_path: Path of the input ggml file, for example `./ggml-model-f16.bin`.
    :param output_path: Save path of the output quantized model. You must pass a directory to
                        save all related output. The filename of the quantized model will be
                        like `bigdl_llm_llama_q4_0.bin`.
    :param model_family: Which model family your input model belongs to.
                         Now only `llama`/`bloom`/`gptneox`/`starcoder` are supported.
    :param dtype: Quantization method, which differs in the resulting model disk size and
                  inference speed. Defaults to `q4_0`. Different model families may support
                  different types; the currently supported list is:
                  llama : "q4_0", "q4_1", "q5_0", "q5_1", "q8_0"
                  bloom : "q4_0", "q4_1"
                  gptneox : "q4_0", "q4_1", "q5_0", "q5_1", "q8_0"
                  starcoder : "q4_0", "q4_1", "q5_0", "q5_1", "q8_0"

    :return: the path str to the converted ggml binary checkpoint
    """
    invalidInputError(model_family in ['llama', 'bloom', 'gptneox', 'starcoder'],
                      "Now we only support quantization of model "
                      "family ('llama', 'bloom', 'gptneox', 'starcoder')",
                      "{} is not in the list.".format(model_family))
    invalidInputError(os.path.isfile(input_path),
                      "The file {} is not found".format(input_path))
    invalidInputError(os.path.isdir(output_path),
                      "The output_path {} is not a directory".format(output_path))
    # convert the quantize type str into its corresponding int value
    quantize_type_map = _quantize_type[model_family]
    output_filename = "bigdl_llm_{}_{}.bin".format(model_family,
                                                   dtype.lower())
    output_path = os.path.join(output_path, output_filename)
    invalidInputError(dtype.lower() in quantize_type_map,
                      "{0} models only accept {1} now, "
                      "but you passed in {2}.".format(model_family,
                                                      list(quantize_type_map.keys()),
                                                      dtype))
    quantize_type = quantize_type_map[dtype.lower()]
    if platform.platform().startswith('Windows'):
        suffix = '.exe'
    else:
        suffix = ''
    quantize_args = "{0}/libs/quantize-{1}{2} {3} {4} {5}".format(libs_dirname,
                                                                  model_family,
                                                                  suffix,
                                                                  input_path,
                                                                  output_path,
                                                                  str(quantize_type))
    p = subprocess.run(quantize_args.split(), capture_output=True)
    error_message = p.stderr
    invalidInputError(not p.returncode,
                      "Failed to quantize {}, error message is {}.".format(str(input_path),
                                                                           error_message))
    return str(output_path)
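

# Example usage (a sketch, not part of the original file): the module import path and the
# input/output paths below are assumptions for illustration, and the prebuilt
# quantize-<model_family> binary must exist under <libs_dirname>/libs:
#
#     from ipex_llm.ggml.quantize import quantize   # import path assumed
#
#     ckpt = quantize(input_path="./ggml-model-f16.bin",   # hypothetical f16 ggml checkpoint
#                     output_path="./quantized",           # must be an existing directory
#                     model_family="llama",
#                     dtype="q4_0")
#     print(ckpt)   # e.g. ./quantized/bigdl_llm_llama_q4_0.bin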