[llm] Add convert_model api (#8244)
* add convert_model api
* change the model_path to input_path
* map int4 to q4_0
* fix blank line
* change bloomz to bloom
* remove default model_family
* change dtype to lower first
parent e290660b20
commit 2bc0e7abbb

3 changed files with 50 additions and 7 deletions
			
python/llm/src/bigdl/llm/ggml/__init__.py
@@ -21,3 +21,4 @@
 from .quantize import quantize
 from .convert import _convert_to_ggml
+from .convert_model import convert_model
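With this re-export, the new API sits alongside quantize at the package level, so `from bigdl.llm.ggml import convert_model` works directly (a usage sketch follows the new file below).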

python/llm/src/bigdl/llm/ggml/convert_model.py (new file, 41 additions)
@@ -0,0 +1,41 @@
+from bigdl.llm.ggml.convert import _convert_to_ggml
+from bigdl.llm.ggml.quantize import quantize
+from pathlib import Path
+import time
+
+
+def convert_model(input_path: str,
+                  output_path: str,
+                  model_family: str,
+                  dtype: str = 'int4'):
+    """
+    Convert a Hugging Face llama-like / gpt-neox-like / bloom-like model to lower precision.
+
+    :param input_path: str, path of the model, for example `./llama-7b-hf`.
+    :param output_path: Save path of the output quantized model. Defaults to `None`.
+            If you don't specify this parameter, the quantized model will be saved
+            in the same directory as the input, with the precision tag replaced by
+            the quantize type, like `./ggml-model-q4_0.bin`.
+    :param model_family: Which model family your input model belongs to.
+            Now only `llama`/`bloom`/`gptneox` are supported.
+    :param dtype: Which quantized precision the model will be converted to.
+            Now only `int4` is supported.
+    """
+
+    dtype = dtype.lower()
+    if dtype == 'int4':
+        dtype = 'q4_0'
+
+    model_name = Path(input_path).stem
+    tmp_ggml_file_path = f'/tmp/{model_name}_{int(time.time())}'
+    _convert_to_ggml(model_path=input_path,
+                     outfile_dir=tmp_ggml_file_path,
+                     model_family=model_family,
+                     outtype="fp16")
+
+    tmp_ggml_file_path = next(Path(tmp_ggml_file_path).iterdir())
+
+    quantize(input_path=tmp_ggml_file_path,
+             output_path=output_path,
+             model_family=model_family,
+             dtype=dtype)
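Taken together, convert_model is a one-call pipeline: it converts the Hugging Face checkpoint to a temporary fp16 ggml file, then quantizes that file. A minimal usage sketch (the paths below are hypothetical placeholders, mirroring the docstring example):

    from bigdl.llm.ggml import convert_model

    # Hypothetical paths: input_path is a downloaded HF checkpoint directory,
    # output_path is where the quantized .bin should land.
    convert_model(input_path='./llama-7b-hf',
                  output_path='./llama-7b-q4_0.bin',
                  model_family='llama',
                  dtype='int4')   # lower-cased first, then 'int4' maps to 'q4_0'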

python/llm/src/bigdl/llm/ggml/quantize.py
@@ -18,6 +18,7 @@ import os
 import subprocess
 from bigdl.llm.utils.common import invalidInputError
 import platform
+from pathlib import Path


 dirname, _ = os.path.split(os.path.abspath(__file__))
@@ -28,7 +29,7 @@ _llama_quantize_type = {"q4_0": 2,
                         "q5_0": 8,
                         "q5_1": 9,
                         "q8_0": 7}
-_bloomz_quantize_type = {"q4_0": 2,
+_bloom_quantize_type = {"q4_0": 2,
                          "q4_1": 3}
 _gptneox_quantize_type = {"q4_0": 2,
                           "q4_1": 3,
@@ -38,7 +39,7 @@ _gptneox_quantize_type = {"q4_0": 2,
                           "q8_0": 7}

 _quantize_type = {"llama": _llama_quantize_type,
-                  "bloomz": _bloomz_quantize_type,
+                  "bloom": _bloom_quantize_type,
                   "gptneox": _gptneox_quantize_type}

 _valid_types = set(list(_llama_quantize_type.keys()) + list(_bloomz_quantize_type.keys()) +
@@ -56,23 +57,23 @@ def quantize(input_path: str, output_path: str=None,
             the same directory as the input, with the precision tag replaced by the
             quantize type, like `./ggml-model-q4_0.bin`.
     :param model_family: Which model family your input model belongs to. Defaults to `llama`.
-            Now only `llama`/`bloomz`/`gptneox` are supported.
+            Now only `llama`/`bloom`/`gptneox` are supported.
     :param dtype: Quantization method which differs in the resulting model disk size and
             inference speed. Defaults to `q4_0`. Different model families may support different types,
             now the supported list is:
             llama : "q4_0", "q4_1", "q4_2"
-            bloomz : "q4_0", "q4_1"
+            bloom : "q4_0", "q4_1"
             gptneox : "q4_0", "q4_1", "q4_2", "q5_0", "q5_1", "q8_0"
     """
-    invalidInputError(model_family in ['llama', 'bloomz', 'gptneox'],
+    invalidInputError(model_family in ['llama', 'bloom', 'gptneox'],
                       "Now we only support quantization of model \
-                       family('llama', 'bloomz', 'gptneox')",
+                       family('llama', 'bloom', 'gptneox')",
                       "{} is not in the list.".format(model_family))
     invalidInputError(os.path.isfile(input_path),
                       "The file {} was not found".format(input_path))
     # TODO : multi input model path
     if output_path is None:
-        output_path = input_path.replace("f16", dtype)
+        output_path = Path(str(input_path).replace('f16', dtype))
     # convert quantize type str into corresponding int value
     quantize_type_map = _quantize_type[model_family]
     invalidInputError(dtype in quantize_type_map, "{0} model just accepts {1} now, \
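A note on the last change above: convert_model now passes quantize a pathlib.Path (the result of next(Path(...).iterdir())), and Path.replace is a filesystem rename with a different signature than str.replace, so the default-output derivation goes through str() first and hands back a Path. A minimal sketch of the new behavior, using a hypothetical f16 file name:

    from pathlib import Path

    dtype = 'q4_0'
    input_path = Path('./ggml-model-f16.bin')   # hypothetical fp16 ggml file

    # str() makes the substitution safe for both str and Path inputs
    output_path = Path(str(input_path).replace('f16', dtype))
    print(output_path)   # ggml-model-q4_0.bin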