LLM: Support using low_bit with GGUF models and fix missing JSON files (#10408)
* Support low_bit for the other GGUF model loaders
* Update the READMEs
* Update setup.py to package the *.json files
This commit is contained in:
parent cda38f85a9
commit fe8976a00f

5 changed files with 9 additions and 6 deletions
@@ -61,6 +61,7 @@ In the example, several arguments can be passed to satisfy your requirements:
- `--model`: path to the GGUF model; it should be a file with a name like `llama-2-7b-chat.Q4_0.gguf`
- `--prompt PROMPT`: argument defining the prompt to be inferred (with the integrated prompt format for chat). It defaults to `'What is AI?'`.
- `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It defaults to `32`.
+- `--low_bit`: which low-bit precision to run with. It defaults to `sym_int4`.

#### 2.4 Sample Output
#### [llama-2-7b-chat.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/tree/main)
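For context, a minimal sketch of how the arguments above are presumably consumed by the example script. It assumes bigdl-llm's `AutoModelForCausalLM.from_gguf` accepts a `low_bit` keyword after this change; the exact call in the repo's `generate.py` may differ.

```python
# Hedged sketch, not the repository's generate.py: assumes from_gguf takes a
# low_bit keyword and returns (model, tokenizer) for a GGUF file.
from bigdl.llm.transformers import AutoModelForCausalLM

model, tokenizer = AutoModelForCausalLM.from_gguf(
    "llama-2-7b-chat.Q4_0.gguf",  # value supplied through --model
    low_bit="sym_int4",           # value supplied through --low_bit
)

prompt = "What is AI?"            # --prompt default
inputs = tokenizer(prompt, return_tensors="pt")
output = model.generate(**inputs, max_new_tokens=32)  # --n-predict default
print(tokenizer.decode(output[0], skip_special_tokens=True))
```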
				
			
@@ -57,6 +57,7 @@ In the example, several arguments can be passed to satisfy your requirements:
- `--model`: path to the GGUF model; it should be a file with a name like `llama-2-7b-chat.Q4_0.gguf`
- `--prompt PROMPT`: argument defining the prompt to be inferred (with the integrated prompt format for chat). It defaults to `'What is AI?'`.
- `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It defaults to `32`.
+- `--low_bit`: which low-bit precision to run with. It defaults to `sym_int4`.

#### 3.4 Sample Output
#### [llama-2-7b-chat.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/tree/main)
@@ -37,6 +37,8 @@ if __name__ == '__main__':
                        help='Prompt to infer')
    parser.add_argument('--n-predict', type=int, default=32,
                        help='Max tokens to predict')
+    parser.add_argument('--low_bit', type=str, default="sym_int4",
+                        help='what low_bit to run bigdl-llm')

    args = parser.parse_args()

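Taken together, the parser in the example script looks roughly like the following self-contained sketch. The `--model` definition is added here only to make the sketch runnable and is an assumption about the surrounding code; the `--low_bit` lines are the ones added by this commit.

```python
import argparse

# Standalone sketch of the example's CLI, not the repo's exact generate.py.
parser = argparse.ArgumentParser(description='Run a GGUF model with bigdl-llm')
parser.add_argument('--model', type=str, required=True,
                    help='Path to a GGUF file, e.g. llama-2-7b-chat.Q4_0.gguf')
parser.add_argument('--prompt', type=str, default='What is AI?',
                    help='Prompt to infer')
parser.add_argument('--n-predict', type=int, default=32,
                    help='Max tokens to predict')
parser.add_argument('--low_bit', type=str, default="sym_int4",
                    help='what low_bit to run bigdl-llm')

args = parser.parse_args()
# argparse exposes --n-predict as args.n_predict
print(args.model, args.prompt, args.n_predict, args.low_bit)
```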
				
			
@@ -311,7 +311,7 @@ def setup_package():
        packages=get_llm_packages(),
        package_dir={"": "src"},
        package_data={
-            "bigdl.llm": package_data[platform_name] + ["cli/prompts/*.txt"]},
+            "bigdl.llm": package_data[platform_name] + ["cli/prompts/*.txt"] + ["transformers/gguf/models/model_implement/*/*.json"]},
        include_package_data=True,
        entry_points={
            "console_scripts": [
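The JSON fix relies on setuptools' `package_data` globs, which copy matching non-Python files into the built wheel. A generic, minimal sketch of that mechanism follows; the package name and layout are placeholders, not the project's actual setup.py.

```python
from setuptools import setup, find_packages

setup(
    name="example-pkg",                      # placeholder name
    packages=find_packages(where="src"),
    package_dir={"": "src"},
    package_data={
        # each glob is resolved relative to the named package's directory
        "example_pkg": ["cli/prompts/*.txt",
                        "transformers/gguf/models/model_implement/*/*.json"],
    },
    include_package_data=True,
)
```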
				
			
@@ -37,7 +37,6 @@ def load_gguf_model(fpath: str, dtype: torch.dtype = torch.float, low_bit: str =
    qtype = loader.config["general.file_type"]

    invalidInputError(qtype in qtype_map, f"Unsupported gguf quantize type: {qtype}")
-    low_bit = qtype_map.get(qtype, "sym_int4")

    with torch.no_grad():
        if model_family == "llama":
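With `low_bit` now coming in from the caller, the `general.file_type` check above only validates that the GGUF quantization is supported. A small illustrative sketch of that pattern follows; the type ids are assumptions, and a plain `ValueError` stands in for the library's `invalidInputError`.

```python
# Illustrative only: the ids below are assumptions, not bigdl-llm's qtype_map.
SUPPORTED_GGUF_FILE_TYPES = {0, 1, 2}  # assumed: e.g. F32, F16, Q4_0

def check_gguf_qtype(qtype: int, low_bit: str = "sym_int4") -> str:
    """Reject unsupported GGUF quantization ids; keep the caller's low_bit."""
    if qtype not in SUPPORTED_GGUF_FILE_TYPES:
        raise ValueError(f"Unsupported gguf quantize type: {qtype}")
    return low_bit
```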
				
			
@@ -45,19 +44,19 @@ def load_gguf_model(fpath: str, dtype: torch.dtype = torch.float, low_bit: str =
            if "mixtral" in general_name:
                # mixtral, which also enjoys a general architecture of llama
                from .models.mixtral import load_gguf_mixtral
-                model, tokenizer = load_gguf_mixtral(loader, dtype)
+                model, tokenizer = load_gguf_mixtral(loader, dtype, low_bit)
            elif "mistral" in general_name:
                from .models.mistral import load_gguf_mistral
-                model, tokenizer = load_gguf_mistral(loader, dtype)
+                model, tokenizer = load_gguf_mistral(loader, dtype, low_bit)
            elif "yuan" in general_name:
                from .models.yuan2 import load_gguf_yuan
                model, tokenizer = load_gguf_yuan(loader, dtype)
            else:
                from .models.llama import load_gguf_llama
-                model, tokenizer = load_gguf_llama(loader, dtype)
+                model, tokenizer = load_gguf_llama(loader, dtype, low_bit)
        elif model_family == "baichuan":
            from .models.baichuan import load_gguf_baichuan
-            model, tokenizer = load_gguf_baichuan(loader, dtype)
+            model, tokenizer = load_gguf_baichuan(loader, dtype, low_bit)
        elif model_family == "bloom":
            from .models.bloom import load_gguf_bloom
            model, tokenizer = load_gguf_bloom(loader, dtype)
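The branch-by-branch dispatch above now forwards `low_bit` to each family loader. As a hedged illustration, here is the same pattern written as a lookup table with stand-in loader functions; these are not bigdl-llm's internals, which live under `bigdl.llm.transformers.gguf.models.*`.

```python
from typing import Any, Callable, Dict, Tuple

# Stand-in loaders: after this change every family loader accepts low_bit.
def load_gguf_llama(loader: Any, dtype: Any, low_bit: str) -> Tuple[Any, Any]:
    return ("llama-model", "llama-tokenizer")

def load_gguf_baichuan(loader: Any, dtype: Any, low_bit: str) -> Tuple[Any, Any]:
    return ("baichuan-model", "baichuan-tokenizer")

LOADERS: Dict[str, Callable[[Any, Any, str], Tuple[Any, Any]]] = {
    "llama": load_gguf_llama,
    "baichuan": load_gguf_baichuan,
}

def load_by_family(model_family: str, loader: Any, dtype: Any,
                   low_bit: str = "sym_int4") -> Tuple[Any, Any]:
    if model_family not in LOADERS:
        raise ValueError(f"Unsupported model family: {model_family}")
    # every loader receives the caller-chosen low_bit, mirroring the diff
    return LOADERS[model_family](loader, dtype, low_bit)
```

A table keeps the forwarding of `low_bit` uniform across families, which is the behavior the diff adds branch by branch.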