diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/README.md
index 80f4a286..9eec31d5 100644
--- a/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/README.md
+++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/README.md
@@ -61,6 +61,7 @@ In the example, several arguments can be passed to satisfy your requirements:
 - `--model`: path to GGUF model, it should be a file with name like `llama-2-7b-chat.Q4_0.gguf`
 - `--prompt PROMPT`: argument defining the prompt to be infered (with integrated prompt format for chat). It is default to be `'What is AI?'`.
 - `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It is default to be `32`.
+- `--low_bit LOW_BIT`: argument defining which low-bit precision to load the model with. It is default to be `sym_int4`.

 #### 2.4 Sample Output
 #### [llama-2-7b-chat.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/tree/main)
diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/README.md b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/README.md
index db888290..b75bb179 100644
--- a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/README.md
+++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/README.md
@@ -57,6 +57,7 @@ In the example, several arguments can be passed to satisfy your requirements:
 - `--model`: path to GGUF model, it should be a file with name like `llama-2-7b-chat.Q4_0.gguf`
 - `--prompt PROMPT`: argument defining the prompt to be infered (with integrated prompt format for chat). It is default to be `'What is AI?'`.
 - `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It is default to be `32`.
+- `--low_bit LOW_BIT`: argument defining which low-bit precision to load the model with. It is default to be `sym_int4`.

 #### 3.4 Sample Output
 #### [llama-2-7b-chat.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/tree/main)
diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/generate.py
index af88b2cf..e8ee9c36 100644
--- a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/generate.py
+++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/generate.py
@@ -37,6 +37,8 @@ if __name__ == '__main__':
                         help='Prompt to infer')
     parser.add_argument('--n-predict', type=int, default=32,
                         help='Max tokens to predict')
+    parser.add_argument('--low_bit', type=str, default="sym_int4",
+                        help='Which low-bit precision to load the GGUF model with')

     args = parser.parse_args()

diff --git a/python/llm/setup.py b/python/llm/setup.py
index 4ff4333d..94491423 100644
--- a/python/llm/setup.py
+++ b/python/llm/setup.py
@@ -311,7 +311,7 @@ def setup_package():
         packages=get_llm_packages(),
         package_dir={"": "src"},
         package_data={
-            "bigdl.llm": package_data[platform_name] + ["cli/prompts/*.txt"]},
+            "bigdl.llm": package_data[platform_name] + ["cli/prompts/*.txt"] + ["transformers/gguf/models/model_implement/*/*.json"]},
         include_package_data=True,
         entry_points={
             "console_scripts": [
diff --git a/python/llm/src/bigdl/llm/transformers/gguf/api.py b/python/llm/src/bigdl/llm/transformers/gguf/api.py
index 4f835ff2..020a91ba 100644
--- a/python/llm/src/bigdl/llm/transformers/gguf/api.py
+++ b/python/llm/src/bigdl/llm/transformers/gguf/api.py
@@ -37,7 +37,6 @@ def load_gguf_model(fpath: str, dtype: torch.dtype = torch.float, low_bit: str =
     qtype = loader.config["general.file_type"]
     invalidInputError(qtype in qtype_map,
                       f"Unsupported gguf quantize type: {qtype}")
-    low_bit = qtype_map.get(qtype, "sym_int4")

     with torch.no_grad():
         if model_family == "llama":
@@ -45,19 +44,19 @@
             if "mixtral" in general_name:
                 # mixtral, which also enjoys a general architecture of llama
                 from .models.mixtral import load_gguf_mixtral
-                model, tokenizer = load_gguf_mixtral(loader, dtype)
+                model, tokenizer = load_gguf_mixtral(loader, dtype, low_bit)
             elif "mistral" in general_name:
                 from .models.mistral import load_gguf_mistral
-                model, tokenizer = load_gguf_mistral(loader, dtype)
+                model, tokenizer = load_gguf_mistral(loader, dtype, low_bit)
             elif "yuan" in general_name:
                 from .models.yuan2 import load_gguf_yuan
                 model, tokenizer = load_gguf_yuan(loader, dtype)
             else:
                 from .models.llama import load_gguf_llama
-                model, tokenizer = load_gguf_llama(loader, dtype)
+                model, tokenizer = load_gguf_llama(loader, dtype, low_bit)
         elif model_family == "baichuan":
             from .models.baichuan import load_gguf_baichuan
-            model, tokenizer = load_gguf_baichuan(loader, dtype)
+            model, tokenizer = load_gguf_baichuan(loader, dtype, low_bit)
         elif model_family == "bloom":
             from .models.bloom import load_gguf_bloom
             model, tokenizer = load_gguf_bloom(loader, dtype)
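For context, a minimal usage sketch of the plumbed-through `low_bit` argument (not part of the patch). It assumes `load_gguf_model` is importable from `bigdl.llm.transformers.gguf.api`, matching the file path in the diff, and that it returns a `(model, tokenizer)` pair like the per-family loaders it dispatches to; the GGUF file path is a placeholder.

```python
# Hypothetical sketch, not part of this patch: the import path and return
# shape are assumptions based on the diff above.
import torch
from bigdl.llm.transformers.gguf.api import load_gguf_model

model, tokenizer = load_gguf_model(
    "llama-2-7b-chat.Q4_0.gguf",  # placeholder path, i.e. the --model argument
    dtype=torch.float,            # the function's default dtype
    low_bit="sym_int4",           # the new --low_bit argument
)
```

With the removal of `low_bit = qtype_map.get(qtype, "sym_int4")`, the caller-supplied `low_bit` is no longer overwritten by the quantize type read from the GGUF header, which is what makes the new `--low_bit` flag take effect.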