[NPU pipeline] Support save & load and update examples (#12293)

* Support save & load, and update Llama examples
* Update Baichuan2 example
* Update README
Parent: 5a15098835
Commit: 2b2cb9c693

8 changed files with 147 additions and 56 deletions
```diff
@@ -51,9 +51,12 @@ python baichuan2.py
 
 Arguments info:
 - `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the model (e.g. `meta-llama/Llama-2-7b-chat-hf`) to be downloaded, or the path to the huggingface checkpoint folder.
+- `--lowbit-path LOWBIT_MODEL_PATH`: argument defining the path to save/load lowbit version of the model. If it is an empty string, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded. If it is an existing path, the lowbit model in `LOWBIT_MODEL_PATH` will be loaded. If it is a non-existing path, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded, and the converted lowbit version will be saved into `LOWBIT_MODEL_PATH`. It is default to be `''`, i.e. an empty string.
 - `--prompt PROMPT`: argument defining the prompt to be infered. It is default to be `What is AI?`.
 - `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It is default to be `32`.
-- `--max-output-len MAX_OUTPUT_LEN`: Defines the maximum sequence length for both input and output tokens. It is default to be `1024`.
+- `--max-context-len MAX_CONTEXT_LEN`: Defines the maximum sequence length for both input and output tokens. It is default to be `1024`.
+- `--max-prompt-len MAX_PROMPT_LEN`: Defines the maximum number of tokens that the input prompt can contain. It is default to be `512`.
+- `--disable-transpose-value-cache`: Disable the optimization of transposing value cache.
 
 ### Sample Output
 #### [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)
```
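The save/load behaviour that the new `--lowbit-path` option documents comes down to a simple branch in the updated example scripts. A condensed sketch follows; the import path mirrors the example scripts (it is not shown in this hunk), the paths and argument values are illustrative, and some arguments such as `trust_remote_code` and `transpose_value_cache` are omitted:

```python
import os
import torch
# Import path as used by the NPU example scripts (assumption; not shown in this hunk).
from ipex_llm.transformers.npu_model import AutoModelForCausalLM

model_path = "meta-llama/Llama-2-7b-chat-hf"   # --repo-id-or-model-path
lowbit_path = "./npu-lowbit"                   # --lowbit-path (illustrative value)

if not lowbit_path or not os.path.exists(lowbit_path):
    # Empty or non-existing path: convert the original checkpoint as before.
    model = AutoModelForCausalLM.from_pretrained(model_path,
                                                 optimize_model=True,
                                                 pipeline=True,
                                                 max_context_len=1024,
                                                 max_prompt_len=512,
                                                 torch_dtype=torch.float16,
                                                 attn_implementation="eager")
else:
    # Existing path: load the previously saved low-bit model directly.
    model = AutoModelForCausalLM.load_low_bit(lowbit_path,
                                              pipeline=True,
                                              max_context_len=1024,
                                              max_prompt_len=512,
                                              torch_dtype=torch.float16,
                                              attn_implementation="eager")

# Non-empty but previously non-existing path: save the converted model for next time.
if lowbit_path and not os.path.exists(lowbit_path):
    model.save_low_bit(lowbit_path)
```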
```diff
@@ -15,6 +15,7 @@
 #
 
 
+import os
 import torch
 import time
 import argparse
```
```diff
@@ -48,28 +49,49 @@ if __name__ == "__main__":
         help="The huggingface repo id for the Baichuan2 model to be downloaded"
         ", or the path to the huggingface checkpoint folder",
     )
+    parser.add_argument("--lowbit-path", type=str,
+        default="",
+        help="The path to the lowbit model folder, leave blank if you do not want to save. \
+            If path not exists, lowbit model will be saved there. \
+            Else, lowbit model will be loaded.",
+    )
     parser.add_argument('--prompt', type=str, default="What is AI?",
                         help='Prompt to infer')
     parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
     parser.add_argument("--max-context-len", type=int, default=1024)
-    parser.add_argument("--max-prompt-len", type=int, default=960)
+    parser.add_argument("--max-prompt-len", type=int, default=512)
     parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
 
     args = parser.parse_args()
     model_path = args.repo_id_or_model_path
 
-    model = AutoModelForCausalLM.from_pretrained(model_path,
-                                                 optimize_model=True,
-                                                 pipeline=True,
-                                                 max_context_len=args.max_context_len,
-                                                 max_prompt_len=args.max_prompt_len,
-                                                 torch_dtype=torch.float16,
-                                                 attn_implementation="eager",
-                                                 transpose_value_cache=not args.disable_transpose_value_cache,
-                                                 trust_remote_code=True)
+    if not args.lowbit_path or not os.path.exists(args.lowbit_path):
+        model = AutoModelForCausalLM.from_pretrained(model_path,
+                                                     optimize_model=True,
+                                                     pipeline=True,
+                                                     max_context_len=args.max_context_len,
+                                                     max_prompt_len=args.max_prompt_len,
+                                                     torch_dtype=torch.float16,
+                                                     attn_implementation="eager",
+                                                     transpose_value_cache=not args.disable_transpose_value_cache,
+                                                     trust_remote_code=True)
+    else:
+        model = AutoModelForCausalLM.load_low_bit(
+            args.lowbit_path,
+            attn_implementation="eager",
+            torch_dtype=torch.float16,
+            max_context_len=args.max_context_len,
+            max_prompt_len=args.max_prompt_len,
+            pipeline=True,
+            transpose_value_cache=not args.disable_transpose_value_cache,
+            trust_remote_code=True
+        )
 
     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
+    if args.lowbit_path and not os.path.exists(args.lowbit_path):
+        model.save_low_bit(args.lowbit_path)
+
     DEFAULT_SYSTEM_PROMPT = """\
     """
 
```
```diff
@@ -15,6 +15,7 @@
 #
 
 
+import os
 import torch
 import time
 import argparse
```
```diff
@@ -48,29 +49,49 @@ if __name__ == "__main__":
         help="The huggingface repo id for the Llama2 model to be downloaded"
         ", or the path to the huggingface checkpoint folder",
     )
+    parser.add_argument("--lowbit-path", type=str,
+        default="",
+        help="The path to the lowbit model folder, leave blank if you do not want to save. \
+            If path not exists, lowbit model will be saved there. \
+            Else, lowbit model will be loaded.",
+    )
     parser.add_argument('--prompt', type=str, default="What is AI?",
                         help='Prompt to infer')
     parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
     parser.add_argument("--max-context-len", type=int, default=1024)
+    parser.add_argument("--max-prompt-len", type=int, default=512)
     parser.add_argument("--quantization_group_size", type=int, default=0)
-    parser.add_argument("--max-prompt-len", type=int, default=960)
     parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
 
     args = parser.parse_args()
     model_path = args.repo_id_or_model_path
 
-    model = AutoModelForCausalLM.from_pretrained(model_path,
-                                                 optimize_model=True,
-                                                 pipeline=True,
-                                                 max_context_len=args.max_context_len,
-                                                 max_prompt_len=args.max_prompt_len,
-                                                 quantization_group_size=args.quantization_group_size,
-                                                 torch_dtype=torch.float16,
-                                                 attn_implementation="eager",
-                                                 transpose_value_cache=not args.disable_transpose_value_cache)
+    if not args.lowbit_path or not os.path.exists(args.lowbit_path):
+        model = AutoModelForCausalLM.from_pretrained(model_path,
+                                                     optimize_model=True,
+                                                     pipeline=True,
+                                                     max_context_len=args.max_context_len,
+                                                     max_prompt_len=args.max_prompt_len,
+                                                     quantization_group_size=args.quantization_group_size,
+                                                     torch_dtype=torch.float16,
+                                                     attn_implementation="eager",
+                                                     transpose_value_cache=not args.disable_transpose_value_cache)
+    else:
+        model = AutoModelForCausalLM.load_low_bit(
+            args.lowbit_path,
+            attn_implementation="eager",
+            torch_dtype=torch.float16,
+            max_context_len=args.max_context_len,
+            max_prompt_len=args.max_prompt_len,
+            pipeline=True,
+            transpose_value_cache=not args.disable_transpose_value_cache,
+        )
 
     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
+    if args.lowbit_path and not os.path.exists(args.lowbit_path):
+        model.save_low_bit(args.lowbit_path)
+
     DEFAULT_SYSTEM_PROMPT = """\
     """
 
```
```diff
@@ -15,6 +15,7 @@
 #
 
 
+import os
 import torch
 import time
 import argparse
```
```diff
@@ -54,29 +55,49 @@ if __name__ == "__main__":
         help="The huggingface repo id for the Llama3 model to be downloaded"
         ", or the path to the huggingface checkpoint folder",
     )
+    parser.add_argument("--lowbit-path", type=str,
+        default="",
+        help="The path to the lowbit model folder, leave blank if you do not want to save. \
+            If path not exists, lowbit model will be saved there. \
+            Else, lowbit model will be loaded.",
+    )
     parser.add_argument('--prompt', type=str, default="What is AI?",
                         help='Prompt to infer')
     parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
     parser.add_argument("--max-context-len", type=int, default=1024)
-    parser.add_argument("--max-prompt-len", type=int, default=960)
+    parser.add_argument("--max-prompt-len", type=int, default=512)
     parser.add_argument("--quantization_group_size", type=int, default=0)
     parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
 
     args = parser.parse_args()
     model_path = args.repo_id_or_model_path
 
-    model = AutoModelForCausalLM.from_pretrained(model_path,
-                                                 torch_dtype=torch.float16,
-                                                 optimize_model=True,
-                                                 pipeline=True,
-                                                 max_context_len=args.max_context_len,
-                                                 max_prompt_len=args.max_prompt_len,
-                                                 quantization_group_size=args.quantization_group_size,
-                                                 attn_implementation="eager",
-                                                 transpose_value_cache=not args.disable_transpose_value_cache)
+    if not args.lowbit_path or not os.path.exists(args.lowbit_path):
+        model = AutoModelForCausalLM.from_pretrained(model_path,
+                                                    torch_dtype=torch.float16,
+                                                    optimize_model=True,
+                                                    pipeline=True,
+                                                    max_context_len=args.max_context_len,
+                                                    max_prompt_len=args.max_prompt_len,
+                                                    quantization_group_size=args.quantization_group_size,
+                                                    attn_implementation="eager",
+                                                    transpose_value_cache=not args.disable_transpose_value_cache)
+    else:
+        model = AutoModelForCausalLM.load_low_bit(
+            args.lowbit_path,
+            attn_implementation="eager",
+            torch_dtype=torch.float16,
+            max_context_len=args.max_context_len,
+            max_prompt_len=args.max_prompt_len,
+            pipeline=True,
+            transpose_value_cache=not args.disable_transpose_value_cache,
+        )
 
     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
+    if args.lowbit_path and not os.path.exists(args.lowbit_path):
+        model.save_low_bit(args.lowbit_path)
+
     print("-" * 80)
     print("done")
     with torch.inference_mode():
```
```diff
@@ -127,7 +127,7 @@ Arguments info:
 - `--lowbit-path LOWBIT_MODEL_PATH`: argument defining the path to save/load lowbit version of the model. If it is an empty string, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded. If it is an existing path, the lowbit model in `LOWBIT_MODEL_PATH` will be loaded. If it is a non-existing path, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded, and the converted lowbit version will be saved into `LOWBIT_MODEL_PATH`. It is default to be `''`, i.e. an empty string.
 - `--prompt PROMPT`: argument defining the prompt to be infered (with integrated prompt format for chat). It is default to be `What is AI?`.
 - `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It is default to be `32`.
-- `--max-output-len MAX_OUTPUT_LEN`: Defines the maximum sequence length for both input and output tokens. It is default to be `1024`.
+- `--max-context-len MAX_CONTEXT_LEN`: Defines the maximum sequence length for both input and output tokens. It is default to be `1024`.
 - `--max-prompt-len MAX_PROMPT_LEN`: Defines the maximum number of tokens that the input prompt can contain. It is default to be `512`.
 - `--disable-transpose-value-cache`: Disable the optimization of transposing value cache.
 
```
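One relationship between these arguments is worth keeping in mind, and is enforced by the library change further down in this commit: the prompt budget must be strictly smaller than the overall context budget. A tiny illustration using the documented defaults:

```python
max_context_len = 1024   # --max-context-len: budget for input plus output tokens
max_prompt_len = 512     # --max-prompt-len: budget for the input prompt alone
n_predict = 32           # --n-predict: new tokens to generate

# Mirrors the invalidInputError check in the loading code further down.
assert max_prompt_len < max_context_len, \
    f"max_prompt_len ({max_prompt_len}) should be less than max_context_len ({max_context_len})"
```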
```diff
@@ -166,6 +166,8 @@ class _BaseAutoModelClass:
 
         logger.info(f"Converting model, it may takes up to several minutes ...")
 
+        model.config.update({"optimize_model": optimize_model})
+
         if mock_device == "cpu":
             with torch.no_grad():
                 # Only mock quantization_group_size=0 for now
```
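Recording `optimize_model` on `model.config` means the flag travels with the saved low-bit checkpoint (it ends up in the serialized config), which is how the `load_low_bit` hunk further down can recover it via `config_dict.pop("optimize_model", False)`. A minimal, self-contained illustration of that round trip; the real code goes through `save_low_bit`/`load_low_bit` rather than a bare `PretrainedConfig`:

```python
from transformers import PretrainedConfig

config = PretrainedConfig()
config.update({"optimize_model": True})   # what the new line stores at save time

config_dict = config.to_dict()            # roughly what gets written to the saved config
optimize_model = config_dict.pop("optimize_model", False)  # what load_low_bit recovers
print(optimize_model)                     # True
```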
```diff
@@ -262,7 +264,6 @@ class _BaseAutoModelClass:
                 transpose_value_cache=transpose_value_cache,
                 group_size=quantization_group_size
             )
-            model.save_low_bit = types.MethodType(save_low_bit, model)
         else:
             from ipex_llm.transformers.npu_pipeline_model.convert_pipeline \
                 import convert_llm
```
```diff
@@ -271,7 +272,7 @@ class _BaseAutoModelClass:
                         max_prompt_len=max_prompt_len,
                         transpose_value_cache=transpose_value_cache,
                         group_size=quantization_group_size)
+        model.save_low_bit = types.MethodType(save_low_bit, model)
         return model
 
     @classmethod
```
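Moving the `types.MethodType` binding out of the first branch means `save_low_bit` is attached to the returned model in both the non-pipeline and the pipeline paths. For reference, a self-contained illustration of what that binding does (the class and path here are made up):

```python
import types

def save_low_bit(self, model_dir, *args, **kwargs):
    # Stand-in for the real save_low_bit helper.
    print(f"saving {type(self).__name__} to {model_dir}")

class DummyModel:
    pass

model = DummyModel()
model.save_low_bit = types.MethodType(save_low_bit, model)  # bind to this instance
model.save_low_bit("./lowbit-checkpoint")  # -> "saving DummyModel to ./lowbit-checkpoint"
```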
```diff
@@ -304,8 +305,10 @@ class _BaseAutoModelClass:
         ignore_argument(kwargs, "pipeline_parallel_stages")
         ignore_argument(kwargs, "mixed_precision")
         ignore_argument(kwargs, "quantization_group_size")
-        optimize_model = kwargs.pop("optimize_model", False)
-        max_output_len = kwargs.pop("max_output_len", 1024)
+        ignore_argument(kwargs, "optimize_model")
+        pipeline = kwargs.pop("pipeline", False)
+        max_context_len = kwargs.pop("max_context_len", 1024)
+        max_context_len = max_context_len - 1
         max_prompt_len = kwargs.pop("max_prompt_len", 512)
         inter_pp = kwargs.pop("inter_pp", None)
         intra_pp = kwargs.pop("intra_pp", None)
```
```diff
@@ -355,6 +358,7 @@ class _BaseAutoModelClass:
         bigdl_lcmu_enabled = config_dict.pop("bigdl_lcmu_enabled", True)
         mixed_precision = config_dict.pop("mixed_precision", False)
         quantization_group_size = config_dict.pop("group_size", 0)
+        optimize_model = config_dict.pop("optimize_model", False)
 
         invalidInputError(
             qtype,
```
```diff
@@ -450,13 +454,12 @@ class _BaseAutoModelClass:
         quant_device = "meta" if bigdl_lcmu_enabled else "cpu"
         logger.info(f"Converting model, it may takes up to several minutes ...")
         from intel_npu_acceleration_library.compiler import create_npu_kernels
-
         if optimize_model:
             invalidInputError(
-                max_prompt_len < max_output_len,
+                max_prompt_len < max_context_len,
                 (
                     f"max_prompt_len ({max_prompt_len}) should be less"
-                    " than max_output_len ({max_output_len})"
+                    " than max_context_len ({max_context_len})"
                 ),
             )
             from ipex_llm.transformers.npu_models.convert_mp import optimize_llm_pre
```
```diff
@@ -468,7 +471,8 @@ class _BaseAutoModelClass:
 
             with torch.no_grad():
                 optimize_llm_pre(model, qtype, mixed_precision,
-                                 quantization_group_size=quantization_group_size)
+                                 quantization_group_size=quantization_group_size,
+                                 load=bigdl_lcmu_enabled)
                 cls.load_convert(qtype, model, quant_device, modules_to_not_convert,
                                  quantization_group_size, *model_args, **kwargs)
                 create_npu_kernels(llm)
```
```diff
@@ -541,17 +545,25 @@ class _BaseAutoModelClass:
         for param in model.parameters():
             param.requires_grad_(False)
 
-        if optimize_model:
+        if optimize_model and not pipeline:
             from ipex_llm.transformers.npu_models.convert_mp import optimize_llm
             optimize_llm(
                 llm,
-                max_output_len=max_output_len,
+                max_output_len=max_context_len,
                 max_prompt_len=max_prompt_len,
                 inter_pp=inter_pp,
                 intra_pp=intra_pp,
                 transpose_value_cache=transpose_value_cache,
                 group_size=quantization_group_size
             )
+        elif optimize_model and pipeline:
+            from ipex_llm.transformers.npu_pipeline_model.convert_pipeline \
+                import convert_llm
+            convert_llm(llm,
+                        kv_len=max_context_len,
+                        max_prompt_len=max_prompt_len,
+                        transpose_value_cache=transpose_value_cache,
+                        group_size=quantization_group_size)
 
         return model
 
```
```diff
@@ -43,7 +43,7 @@ def reshape_lm_head_input(x):
     return x
 
 
-def split_linear(module, module_name, n_splits=2):
+def split_linear(module, module_name, n_splits=2, load=False):
    in_features = module.in_features
     invalidInputError(in_features % n_splits == 0,
                       f"in_features of the linear layer {module_name} must be divisible by"
```
```diff
@@ -51,17 +51,27 @@ def split_linear(module, module_name, n_splits=2):
     weight_split = torch.tensor_split(module.weight, n_splits, dim=1)
     linear_list = torch.nn.ModuleList()
     bias = module.bias
-    for idx, weight in enumerate(weight_split):
-        new_linear = torch.nn.Linear(weight.size(1),
-                                     weight.size(0),
-                                     bias=False if bias is None else True)
-        new_linear.bias = bias
-        new_linear.weight = torch.nn.Parameter(weight.contiguous(), requires_grad=False)
-        linear_list.add_module(f"{module_name}_dq_{idx}", new_linear)
+    from transformers.utils.generic import ContextManagers
+    init_contexts = []
+    if load:
+        from transformers.modeling_utils import no_init_weights
+        from accelerate.big_modeling import init_empty_weights
+        init_contexts.append(no_init_weights(_enable=load))
+        init_contexts.append(init_empty_weights())
+
+    with ContextManagers(init_contexts):
+        for idx, weight in enumerate(weight_split):
+            new_linear = torch.nn.Linear(weight.size(1),
+                                         weight.size(0),
+                                         bias=False if bias is None else True)
+            new_linear.bias = bias
+            new_linear.weight = torch.nn.Parameter(weight.contiguous(), requires_grad=False)
+            linear_list.add_module(f"{module_name}_dq_{idx}", new_linear)
     return linear_list
 
 
-def split_linears(module: torch.nn.Module, n_splits_hidden_size=2, n_splits_down_proj=2):
+def split_linears(module: torch.nn.Module, n_splits_hidden_size=2, n_splits_down_proj=2,
+                  load=False):
     from transformers.models.qwen2.modeling_qwen2 import Qwen2MLP, Qwen2Attention
     from transformers.models.llama.modeling_llama import LlamaMLP, LlamaAttention
     attn_module_names = ["q_proj", "k_proj", "v_proj", "o_proj"]
```
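The new `load` flag makes `split_linear` build the per-split `torch.nn.Linear` shards under `no_init_weights` and `init_empty_weights`, so that when an existing low-bit checkpoint is about to be loaded the shards are neither allocated nor randomly initialized. A small illustration of what `init_empty_weights` does (the layer size is arbitrary):

```python
import torch
from accelerate.big_modeling import init_empty_weights

# Outside the context: the layer's weights are allocated and randomly
# initialized on the CPU.
eager = torch.nn.Linear(4096, 4096)
print(eager.weight.device)   # cpu

# Inside the context: parameters are created on the "meta" device, so no
# memory is allocated and no initialization runs -- appropriate when the
# real (low-bit) values will be filled in afterwards.
with init_empty_weights():
    lazy = torch.nn.Linear(4096, 4096)
print(lazy.weight.device)    # meta
```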
```diff
@@ -69,7 +79,8 @@ def split_linears(module: torch.nn.Module, n_splits_hidden_size=2, n_splits_down
     if isinstance(module, (Qwen2Attention, LlamaAttention)):
         for name in attn_module_names:
             setattr(module, f"{name}_dq_list", split_linear(getattr(module, name), name,
-                                                            n_splits=n_splits_hidden_size))
+                                                            n_splits=n_splits_hidden_size,
+                                                            load=load))
             delattr(module, name)
     elif isinstance(module, (Qwen2MLP, LlamaMLP)):
         for name in mlp_module_names:
```
```diff
@@ -77,5 +88,6 @@ def split_linears(module: torch.nn.Module, n_splits_hidden_size=2, n_splits_down
             if name == 'down_proj':
                 n_splits_mlp = n_splits_down_proj
             setattr(module, f"{name}_dq_list", split_linear(getattr(module, name), name,
-                                                            n_splits=n_splits_mlp))
+                                                            n_splits=n_splits_mlp,
+                                                            load=load))
             delattr(module, name)
```
```diff
@@ -31,7 +31,7 @@ def convert_forward(m, target_m, new_forward):
 
 
 def optimize_llm_pre(model: torch.nn.Module, qtype, mixed_precision,
-                     quantization_group_size=0):
+                     quantization_group_size=0, load=False):
     if model.config.model_type == "baichuan":
         # process NormHead module in Baichuan2 7B
         if hasattr(model, 'lm_head') and model.lm_head is not None:
```
```diff
@@ -104,9 +104,9 @@ def optimize_llm_pre(model: torch.nn.Module, qtype, mixed_precision,
             )
             n_splits_linear = model.config.hidden_size // quantization_group_size
             n_splits_down_proj = model.config.intermediate_size // quantization_group_size
-
         model.apply(lambda m: split_linears(m, n_splits_hidden_size=n_splits_linear,
-                                            n_splits_down_proj=n_splits_down_proj))
+                                            n_splits_down_proj=n_splits_down_proj,
+                                            load=load))
 
         if quantization_group_size != 0:
             split_num = model.config.hidden_size // quantization_group_size
```