LLM: Support using low_bit with GGUF models and fix missing JSON files (#10408)
* Support low_bit for the other GGUF model loaders
* Update the READMEs
* Update setup.py to package the *.json files
This commit is contained in:
parent cda38f85a9
commit fe8976a00f

5 changed files with 9 additions and 6 deletions
@@ -61,6 +61,7 @@ In the example, several arguments can be passed to satisfy your requirements:
- `--model`: path to the GGUF model; it should be a file with a name like `llama-2-7b-chat.Q4_0.gguf`
- `--prompt PROMPT`: argument defining the prompt to be inferred (with the integrated prompt format for chat). It defaults to `'What is AI?'`.
- `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It defaults to `32`.
+- `--low_bit`: which low-bit precision to run with. It defaults to `sym_int4`.

#### 2.4 Sample Output
#### [llama-2-7b-chat.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/tree/main)
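For context, a minimal sketch of how the arguments above are presumably consumed by the example script. It assumes bigdl-llm's `AutoModelForCausalLM.from_gguf` accepts a `low_bit` keyword after this change; the exact call in the repo's `generate.py` may differ.

```python
# Hedged sketch, not the repository's generate.py: assumes from_gguf takes a
# low_bit keyword and returns (model, tokenizer) for a GGUF file.
from bigdl.llm.transformers import AutoModelForCausalLM

model, tokenizer = AutoModelForCausalLM.from_gguf(
    "llama-2-7b-chat.Q4_0.gguf",  # value supplied through --model
    low_bit="sym_int4",           # value supplied through --low_bit
)

prompt = "What is AI?"            # --prompt default
inputs = tokenizer(prompt, return_tensors="pt")
output = model.generate(**inputs, max_new_tokens=32)  # --n-predict default
print(tokenizer.decode(output[0], skip_special_tokens=True))
```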
				
			
@@ -57,6 +57,7 @@ In the example, several arguments can be passed to satisfy your requirements:
- `--model`: path to the GGUF model; it should be a file with a name like `llama-2-7b-chat.Q4_0.gguf`
- `--prompt PROMPT`: argument defining the prompt to be inferred (with the integrated prompt format for chat). It defaults to `'What is AI?'`.
- `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It defaults to `32`.
+- `--low_bit`: which low-bit precision to run with. It defaults to `sym_int4`.

#### 3.4 Sample Output
#### [llama-2-7b-chat.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/tree/main)
@@ -37,6 +37,8 @@ if __name__ == '__main__':
                        help='Prompt to infer')
    parser.add_argument('--n-predict', type=int, default=32,
                        help='Max tokens to predict')
+    parser.add_argument('--low_bit', type=str, default="sym_int4",
+                        help='what low_bit to run bigdl-llm')

    args = parser.parse_args()

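Taken together, the parser in the example script looks roughly like the following self-contained sketch. The `--model` definition is added here only to make the sketch runnable and is an assumption about the surrounding code; the `--low_bit` lines are the ones added by this commit.

```python
import argparse

# Standalone sketch of the example's CLI, not the repo's exact generate.py.
parser = argparse.ArgumentParser(description='Run a GGUF model with bigdl-llm')
parser.add_argument('--model', type=str, required=True,
                    help='Path to a GGUF file, e.g. llama-2-7b-chat.Q4_0.gguf')
parser.add_argument('--prompt', type=str, default='What is AI?',
                    help='Prompt to infer')
parser.add_argument('--n-predict', type=int, default=32,
                    help='Max tokens to predict')
parser.add_argument('--low_bit', type=str, default="sym_int4",
                    help='what low_bit to run bigdl-llm')

args = parser.parse_args()
# argparse exposes --n-predict as args.n_predict
print(args.model, args.prompt, args.n_predict, args.low_bit)
```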
				
			
@@ -311,7 +311,7 @@ def setup_package():
        packages=get_llm_packages(),
        package_dir={"": "src"},
        package_data={
-            "bigdl.llm": package_data[platform_name] + ["cli/prompts/*.txt"]},
+            "bigdl.llm": package_data[platform_name] + ["cli/prompts/*.txt"] + ["transformers/gguf/models/model_implement/*/*.json"]},
        include_package_data=True,
        entry_points={
            "console_scripts": [
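The JSON fix relies on setuptools' `package_data` globs, which copy matching non-Python files into the built wheel. A generic, minimal sketch of that mechanism follows; the package name and layout are placeholders, not the project's actual setup.py.

```python
from setuptools import setup, find_packages

setup(
    name="example-pkg",                      # placeholder name
    packages=find_packages(where="src"),
    package_dir={"": "src"},
    package_data={
        # each glob is resolved relative to the named package's directory
        "example_pkg": ["cli/prompts/*.txt",
                        "transformers/gguf/models/model_implement/*/*.json"],
    },
    include_package_data=True,
)
```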
				
			
@@ -37,7 +37,6 @@ def load_gguf_model(fpath: str, dtype: torch.dtype = torch.float, low_bit: str =
    qtype = loader.config["general.file_type"]

    invalidInputError(qtype in qtype_map, f"Unsupported gguf quantize type: {qtype}")
-    low_bit = qtype_map.get(qtype, "sym_int4")

    with torch.no_grad():
        if model_family == "llama":
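With `low_bit` now coming in from the caller, the `general.file_type` check above only validates that the GGUF quantization is supported. A small illustrative sketch of that pattern follows; the type ids are assumptions, and a plain `ValueError` stands in for the library's `invalidInputError`.

```python
# Illustrative only: the ids below are assumptions, not bigdl-llm's qtype_map.
SUPPORTED_GGUF_FILE_TYPES = {0, 1, 2}  # assumed: e.g. F32, F16, Q4_0

def check_gguf_qtype(qtype: int, low_bit: str = "sym_int4") -> str:
    """Reject unsupported GGUF quantization ids; keep the caller's low_bit."""
    if qtype not in SUPPORTED_GGUF_FILE_TYPES:
        raise ValueError(f"Unsupported gguf quantize type: {qtype}")
    return low_bit
```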
				
			
@@ -45,19 +44,19 @@ def load_gguf_model(fpath: str, dtype: torch.dtype = torch.float, low_bit: str =
            if "mixtral" in general_name:
                # mixtral, which also enjoys a general architecture of llama
                from .models.mixtral import load_gguf_mixtral
-                model, tokenizer = load_gguf_mixtral(loader, dtype)
+                model, tokenizer = load_gguf_mixtral(loader, dtype, low_bit)
            elif "mistral" in general_name:
                from .models.mistral import load_gguf_mistral
-                model, tokenizer = load_gguf_mistral(loader, dtype)
+                model, tokenizer = load_gguf_mistral(loader, dtype, low_bit)
            elif "yuan" in general_name:
                from .models.yuan2 import load_gguf_yuan
                model, tokenizer = load_gguf_yuan(loader, dtype)
            else:
                from .models.llama import load_gguf_llama
-                model, tokenizer = load_gguf_llama(loader, dtype)
+                model, tokenizer = load_gguf_llama(loader, dtype, low_bit)
        elif model_family == "baichuan":
            from .models.baichuan import load_gguf_baichuan
-            model, tokenizer = load_gguf_baichuan(loader, dtype)
+            model, tokenizer = load_gguf_baichuan(loader, dtype, low_bit)
        elif model_family == "bloom":
            from .models.bloom import load_gguf_bloom
            model, tokenizer = load_gguf_bloom(loader, dtype)
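The branch-by-branch dispatch above now forwards `low_bit` to each family loader. As a hedged illustration, here is the same pattern written as a lookup table with stand-in loader functions; these are not bigdl-llm's internals, which live under `bigdl.llm.transformers.gguf.models.*`.

```python
from typing import Any, Callable, Dict, Tuple

# Stand-in loaders: after this change every family loader accepts low_bit.
def load_gguf_llama(loader: Any, dtype: Any, low_bit: str) -> Tuple[Any, Any]:
    return ("llama-model", "llama-tokenizer")

def load_gguf_baichuan(loader: Any, dtype: Any, low_bit: str) -> Tuple[Any, Any]:
    return ("baichuan-model", "baichuan-tokenizer")

LOADERS: Dict[str, Callable[[Any, Any, str], Tuple[Any, Any]]] = {
    "llama": load_gguf_llama,
    "baichuan": load_gguf_baichuan,
}

def load_by_family(model_family: str, loader: Any, dtype: Any,
                   low_bit: str = "sym_int4") -> Tuple[Any, Any]:
    if model_family not in LOADERS:
        raise ValueError(f"Unsupported model family: {model_family}")
    # every loader receives the caller-chosen low_bit, mirroring the diff
    return LOADERS[model_family](loader, dtype, low_bit)
```

A table keeps the forwarding of `low_bit` uniform across families, which is the behavior the diff adds branch by branch.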