Support qwen2.5 3B for NPU & update related examples (#12438)
* update qwen2.5-3B
* update convert
* small fix
* replace load_in_low_bit with low_bit
* small fix
parent b633fbf26c
commit b9abb8a285

8 changed files with 48 additions and 19 deletions
@@ -6,7 +6,7 @@ In this directory, you will find a C++ example on how to run LLM models on Intel
 | Model      | Model Link                                                    |
 |------------|----------------------------------------------------------------|
 | Qwen2 | [Qwen/Qwen2-7B-Instruct](https://huggingface.co/Qwen/Qwen2-7B-Instruct), [Qwen/Qwen2-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2-1.5B-Instruct) |
-| Qwen2.5 | [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) |
+| Qwen2.5 | [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct), [Qwen/Qwen2.5-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct) |
 | Llama2 | [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) |
 | Llama3 | [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) |
 | MiniCPM | [openbmb/MiniCPM-1B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-1B-sft-bf16), [openbmb/MiniCPM-2B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-2B-sft-bf16) |
@@ -35,9 +35,26 @@ pip install transformers==4.45.0 accelerate==0.33.0
 We provide a [convert script](convert.py) under current directory, by running it, you can obtain the whole weights and configuration files which are required to run C++ example.
 
 ```cmd
-:: to convert Qwen2.5-7b-Instruct
+:: to convert Qwen2.5-7B-Instruct
 python convert.py --repo-id-or-model-path Qwen/Qwen2.5-7B-Instruct --save-directory <converted_model_path>
 
+:: to convert Qwen2-1.5B-Instruct
+python convert.py --repo-id-or-model-path Qwen/Qwen2-1.5B-Instruct --save-directory <converted_model_path>
+
+:: to convert Qwen2.5-3B-Instruct
+python convert.py --repo-id-or-model-path Qwen/Qwen2.5-3B-Instruct --save-directory <converted_model_path> --low_bit "sym_int8"
+
+:: to convert Llama-2-7b-chat-hf
+python convert.py --repo-id-or-model-path meta-llama/Llama-2-7b-chat-hf --save-directory <converted_model_path>
+
+:: to convert Meta-Llama-3-8B-Instruct
+python convert.py --repo-id-or-model-path meta-llama/Meta-Llama-3-8B-Instruct --save-directory <converted_model_path>
+
+:: to convert MiniCPM-1B-sft-bf16
+python convert.py --repo-id-or-model-path openbmb/MiniCPM-1B-sft-bf16 --save-directory <converted_model_path>
+
+:: to convert MiniCPM-2B-sft-bf16
+python convert.py --repo-id-or-model-path openbmb/MiniCPM-2B-sft-bf16 --save-directory <converted_model_path>
 ```
 
 Arguments info:
@@ -45,6 +62,7 @@ Arguments info:
 - `--save-directory SAVE_DIRECTORY`: argument defining the path to save converted model. If it is a non-existing path, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded, and the converted model will be saved into `SAVE_DIRECTORY`.
 - `--max-context-len MAX_CONTEXT_LEN`: Defines the maximum sequence length for both input and output tokens. It is default to be `1024`.
 - `--max-prompt-len MAX_PROMPT_LEN`: Defines the maximum number of tokens that the input prompt can contain. It is default to be `960`.
+- `--low_bit LOW_BIT`: Defines the low bit precision to quantize the model. It is default to be `sym_int4`.
 - `--disable-transpose-value-cache`: Disable the optimization of transposing value cache.
 
 ## 3. Build C++ Example `llm-npu-cli`
@@ -43,8 +43,8 @@ if __name__ == "__main__":
     parser.add_argument("--max-context-len", type=int, default=1024)
     parser.add_argument("--max-prompt-len", type=int, default=960)
     parser.add_argument("--quantization_group_size", type=int, default=0)
-    parser.add_argument('--load_in_low_bit', type=str, default="sym_int4",
-                        help='Load in low bit to use')
+    parser.add_argument('--low_bit', type=str, default="sym_int4",
+                        help='Low bit precision to quantize the model')
     parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
 
     args = parser.parse_args()
@@ -54,7 +54,7 @@ if __name__ == "__main__":
     model = AutoModelForCausalLM.from_pretrained(model_path,
                                                  optimize_model=True,
                                                  pipeline=True,
-                                                 load_in_low_bit=args.load_in_low_bit,
+                                                 load_in_low_bit=args.low_bit,
                                                  max_context_len=args.max_context_len,
                                                  max_prompt_len=args.max_prompt_len,
                                                  quantization_group_size=args.quantization_group_size,
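
For reference, the sketch below consolidates the two convert.py hunks above into one runnable shape: the renamed `--low_bit` flag is parsed and passed straight through as `load_in_low_bit`. The `ipex_llm.transformers.npu_model` import path and the final `save_low_bit` call follow the ipex-llm NPU examples and should be read as assumptions, not as part of this diff.

```python
# Minimal sketch of the convert flow after the --load_in_low_bit -> --low_bit rename.
import argparse

from ipex_llm.transformers.npu_model import AutoModelForCausalLM  # import path assumed

parser = argparse.ArgumentParser()
parser.add_argument("--repo-id-or-model-path", type=str, default="Qwen/Qwen2.5-3B-Instruct")
parser.add_argument("--save-directory", type=str, required=True)
parser.add_argument("--max-context-len", type=int, default=1024)
parser.add_argument("--max-prompt-len", type=int, default=960)
parser.add_argument("--quantization_group_size", type=int, default=0)
parser.add_argument("--low_bit", type=str, default="sym_int4",
                    help="Low bit precision to quantize the model")
args = parser.parse_args()

# e.g. python convert.py --repo-id-or-model-path Qwen/Qwen2.5-3B-Instruct \
#          --save-directory ./qwen2.5-3b-npu --low_bit sym_int8
model = AutoModelForCausalLM.from_pretrained(
    args.repo_id_or_model_path,
    optimize_model=True,
    pipeline=True,
    load_in_low_bit=args.low_bit,          # formerly args.load_in_low_bit
    max_context_len=args.max_context_len,
    max_prompt_len=args.max_prompt_len,
    quantization_group_size=args.quantization_group_size,
)

# Persist the converted weights; save_low_bit mirrors the --lowbit-path handling in the
# NPU Python examples (assumption: the real convert.py may persist the files differently).
model.save_low_bit(args.save_directory)
```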
				
			
@@ -10,7 +10,7 @@ In this directory, you will find examples on how to directly run HuggingFace `tr
 | Llama3 | [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) |
 | Llama3.2 | [meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct), [meta-llama/Llama-3.2-3B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct) |
 | Qwen2 | [Qwen/Qwen2-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2-1.5B-Instruct) |
-| Qwen2.5 | [Qwen/Qwen2.5-7b-Instruct](https://huggingface.co/Qwen/Qwen2.5-7b-Instruct) |
+| Qwen2.5 | [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct), [Qwen/Qwen2.5-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct) |
 | Baichuan2 | [baichuan-inc/Baichuan2-7B-Chat](https://huggingface.co/baichuan-inc/Baichuan-7B-Chat) |
 | MiniCPM | [openbmb/MiniCPM-1B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-1B-sft-bf16), [openbmb/MiniCPM-2B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-2B-sft-bf16) |
 
@@ -58,11 +58,14 @@ python llama3.py --repo-id-or-model-path "meta-llama/Llama-3.2-1B-Instruct"
 :: to run Llama-3.2-3B-Instruct
 python llama3.py --repo-id-or-model-path "meta-llama/Llama-3.2-3B-Instruct"
 
-:: to run Qwen2.5-7b-Instruct
+:: to run Qwen2.5-7B-Instruct
 python qwen.py
 
-:: to run Qwen2-1.5b-Instruct
-python qwen.py --repo-id-or-model-path "Qwen/Qwen2-1.5B-Instruct" --load_in_low_bit "sym_int8"
+:: to run Qwen2-1.5B-Instruct
+python qwen.py --repo-id-or-model-path "Qwen/Qwen2-1.5B-Instruct" --low_bit "sym_int8"
+
+:: to run Qwen2.5-3B-Instruct
+python qwen.py --repo-id-or-model-path "Qwen/Qwen2.5-3B-Instruct" --low_bit "sym_int8"
 
 :: to run Baichuan2-7B-Chat
 python baichuan2.py
@@ -48,8 +48,8 @@ if __name__ == "__main__":
     parser.add_argument("--max-context-len", type=int, default=1024)
     parser.add_argument("--max-prompt-len", type=int, default=960)
     parser.add_argument("--quantization_group_size", type=int, default=0)
-    parser.add_argument('--load_in_low_bit', type=str, default="sym_int4",
-                        help='Load in low bit to use')
+    parser.add_argument('--low_bit', type=str, default="sym_int4",
+                        help='Low bit precision to quantize the model')
     parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
     parser.add_argument("--disable-streaming", action="store_true", default=False)
 
@@ -60,7 +60,7 @@ if __name__ == "__main__":
         model = AutoModelForCausalLM.from_pretrained(model_path,
                                                      optimize_model=True,
                                                      pipeline=True,
-                                                     load_in_low_bit=args.load_in_low_bit,
+                                                     load_in_low_bit=args.low_bit,
                                                      max_context_len=args.max_context_len,
                                                      max_prompt_len=args.max_prompt_len,
                                                      quantization_group_size=args.quantization_group_size,
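
To put the renamed flag in context, here is a minimal end-to-end sketch of how an example script like the one above might load a Qwen2.5-3B checkpoint on the NPU and generate a reply. The tokenizer and `generate` calls are standard `transformers` usage; the ipex-llm import path and keyword set are taken from the hunks in this commit, so treat this as a sketch rather than the exact script.

```python
# Hedged sketch: load Qwen2.5-3B in sym_int8 on the NPU and run a short generation.
import torch
from transformers import AutoTokenizer

from ipex_llm.transformers.npu_model import AutoModelForCausalLM  # import path assumed

model_path = "Qwen/Qwen2.5-3B-Instruct"
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             optimize_model=True,
                                             pipeline=True,
                                             load_in_low_bit="sym_int8",  # precision the README suggests for the 3B model
                                             max_context_len=1024,
                                             max_prompt_len=960,
                                             torch_dtype=torch.float16,
                                             attn_implementation="eager",
                                             trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# Build a chat-formatted prompt and decode the model's answer.
messages = [{"role": "user", "content": "What is AI?"}]
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer([text], return_tensors="pt")
output = model.generate(inputs.input_ids, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```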
				
			
@@ -70,7 +70,7 @@ Arguments info:
 - `--lowbit-path LOWBIT_MODEL_PATH`: argument defining the path to save/load lowbit version of the model. If it is an empty string, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded. If it is an existing path, the lowbit model in `LOWBIT_MODEL_PATH` will be loaded. If it is a non-existing path, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded, and the converted lowbit version will be saved into `LOWBIT_MODEL_PATH`. It is default to be `''`, i.e. an empty string.
 - `--prompt PROMPT`: argument defining the prompt to be infered. It is default to be `'Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun'`.
 - `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It is default to be `32`.
-- `--load_in_low_bit`: argument defining the `load_in_low_bit` format used. It is default to be `sym_int8`, `sym_int4` can also be used.
+- `--low_bit`: argument defining the `low_bit` format used. It is default to be `sym_int8`, `sym_int4` can also be used.
 
 ### Sample Output
 #### [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)
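
The `--lowbit-path` description above implies a convert-once, reload-later flow; a minimal sketch of that logic follows. `load_low_bit` appears in the hunks of this commit, while `save_low_bit` and the exact keyword set are assumptions based on the ipex-llm NPU examples.

```python
# Hedged sketch of the --lowbit-path convention: reuse a previously converted
# low-bit model when the path exists, otherwise convert once and save it there.
import os

from ipex_llm.transformers.npu_model import AutoModelForCausalLM  # import path assumed


def load_npu_model(model_path: str, lowbit_path: str = "", low_bit: str = "sym_int8"):
    if lowbit_path and os.path.isdir(lowbit_path):
        # Reload the already-converted low-bit checkpoint.
        return AutoModelForCausalLM.load_low_bit(lowbit_path)

    model = AutoModelForCausalLM.from_pretrained(model_path,
                                                 load_in_low_bit=low_bit,
                                                 optimize_model=True,
                                                 attn_implementation="eager",
                                                 trust_remote_code=True)
    if lowbit_path:
        model.save_low_bit(lowbit_path)  # assumed helper, mirroring the ipex-llm API
    return model
```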
				
			
@@ -90,6 +90,7 @@ The examples below show how to run the **_optimized HuggingFace model implementa
 - [Llama3.2-1B](./llama.py)
 - [Llama3.2-3B](./llama.py)
 - [Qwen2-1.5B](./qwen.py)
+- [Qwen2.5-3B](./qwen.py)
 - [Qwen2.5-7B](./qwen.py)
 - [MiniCPM-1B](./minicpm.py)
 - [MiniCPM-2B](./minicpm.py)
@@ -122,6 +123,9 @@ python llama.py --repo-id-or-model-path meta-llama/Llama-3.2-3B-Instruct
 :: to run Qwen2-1.5B-Instruct (LNL driver version: 32.0.101.2715)
 python qwen.py
 
+:: to run Qwen2.5-3B-Instruct (LNL driver version: 32.0.101.2715)
+python qwen.py --repo-id-or-model-path Qwen/Qwen2.5-3B-Instruct --low_bit sym_int8
+
 :: to run Qwen2.5-7B-Instruct (LNL driver version: 32.0.101.2715)
 python qwen.py --repo-id-or-model-path Qwen/Qwen2.5-7B-Instruct
 
@@ -47,7 +47,10 @@ if __name__ == "__main__":
                         help='Prompt to infer')
     parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
     parser.add_argument("--max-context-len", type=int, default=1024)
-    parser.add_argument("--max-prompt-len", type=int, default=512)
+    parser.add_argument("--max-prompt-len", type=int, default=960)
+    parser.add_argument("--quantization_group_size", type=int, default=0)
+    parser.add_argument('--low_bit', type=str, default="sym_int4",
+                        help='Load in low bit to use')
     parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
     parser.add_argument("--intra-pp", type=int, default=None)
     parser.add_argument("--inter-pp", type=int, default=None)
@@ -62,14 +65,15 @@ if __name__ == "__main__":
             torch_dtype=torch.float16,
             trust_remote_code=True,
             attn_implementation="eager",
-            load_in_low_bit="sym_int4",
+            load_in_low_bit=args.low_bit,
             optimize_model=True,
             max_context_len=args.max_context_len,
             max_prompt_len=args.max_prompt_len,
             intra_pp=args.intra_pp,
             inter_pp=args.inter_pp,
             transpose_value_cache=not args.disable_transpose_value_cache,
-            mixed_precision=args.mixed_precision
+            mixed_precision=args.mixed_precision,
+            quantization_group_size=args.quantization_group_size,
         )
     else:
         model = AutoModelForCausalLM.load_low_bit(
@@ -64,7 +64,7 @@ def replace_with_QuantizedLinear(layer, qtype, device, modules_to_not_convert,
     iqtype = ggml_tensor_qtype[qtype]
     if isinstance(layer, torch.nn.Linear) and not hasattr(layer, "qtype"):
         if qtype == "sym_int4_rtn":
-            # workaround for qwen2 & int4
+            # workaround for qwen2-7B & int4
             if (layer.in_features == 3584 and layer.out_features == 152064) or \
                (layer.in_features == 18944 and layer.out_features == 3584):
                 qtype = "sym_int8_rtn"
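
The comment change above narrows the stated scope of the workaround to Qwen2-7B: only linear layers with those exact shapes are bumped from `sym_int4_rtn` to `sym_int8_rtn`. A simplified, self-contained illustration of that shape check (not the library code) is given below.

```python
# Simplified illustration of the shape-based precision override shown above.
import torch


def pick_rtn_qtype(layer: torch.nn.Linear, qtype: str) -> str:
    """Return the qtype to use for one linear layer when converting with RTN."""
    if qtype == "sym_int4_rtn":
        # Shape pairs copied from the hunk above; they correspond to specific
        # Qwen2-7B projection layers that the library forces to int8.
        qwen2_7b_shapes = {(3584, 152064), (18944, 3584)}
        if (layer.in_features, layer.out_features) in qwen2_7b_shapes:
            return "sym_int8_rtn"
    return qtype


# Example: a layer with Qwen2-7B's (18944 -> 3584) shape is promoted to sym_int8_rtn.
# The meta device avoids allocating real weights for this large layer.
layer = torch.nn.Linear(18944, 3584, bias=False, device="meta")
print(pick_rtn_qtype(layer, "sym_int4_rtn"))  # -> sym_int8_rtn
```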
				
			
@@ -428,8 +428,8 @@ def optimize_llm(
                       intra_pp=intra_pp,
                       decoder=True,
                       transpose_value_cache=transpose_value_cache)
-    elif model.config.model_type == "qwen2" and model.config.num_hidden_layers == 28:
-        # for qwen2-1.5B and qwen2-7B
+    elif model.config.model_type == "qwen2":
+        # for qwen2-1.5B, qwen2-7B, qwen2.5-3B
         if intra_pp is None:
             intra_pp = 2
         if inter_pp is None:
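
This is the change that actually unlocks Qwen2.5-3B on the NPU path: the dispatch now keys on `model_type` alone instead of also requiring `num_hidden_layers == 28`, so Qwen2-family checkpoints with other depths take the same optimized branch. A hedged sketch of the relaxed check, written against a plain HuggingFace config, is shown below.

```python
# Minimal sketch (not the library code): decide whether a checkpoint takes the
# qwen2 NPU optimization branch based on its HuggingFace config.
from transformers import AutoConfig


def uses_qwen2_npu_branch(repo_id_or_path: str) -> bool:
    config = AutoConfig.from_pretrained(repo_id_or_path)
    # The old guard also required config.num_hidden_layers == 28, which matched
    # Qwen2-1.5B/7B but excluded other Qwen2-family depths such as Qwen2.5-3B.
    return config.model_type == "qwen2"


# Example (downloads only the config file):
# print(uses_qwen2_npu_branch("Qwen/Qwen2.5-3B-Instruct"))  # expected: True
```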
			
		|||