diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/CPP_Examples/convert.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/CPP_Examples/convert.py
index 7a22d567..adfb611f 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/CPP_Examples/convert.py
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/CPP_Examples/convert.py
@@ -51,6 +51,8 @@ if __name__ == "__main__":
     parser.add_argument("--quantization-group-size", type=int, default=0)
     parser.add_argument('--low-bit', type=str, default="sym_int4",
                         help='Low bit optimizations that will be applied to the model.')
+    parser.add_argument("--keep-ir", action="store_true")
+    parser.add_argument("--disable-compile-blob", action="store_true")
 
     args = parser.parse_args()
     model_path = args.repo_id_or_model_path
@@ -66,7 +68,9 @@ if __name__ == "__main__":
                                                  torch_dtype=torch.float16,
                                                  attn_implementation="eager",
                                                  trust_remote_code=True,
-                                                 save_directory=save_dir)
+                                                 save_directory=save_dir,
+                                                 keep_ir=args.keep_ir,
+                                                 compile_blob=not args.disable_compile_blob)
     t1 = time.perf_counter()
 
     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
diff --git a/python/llm/src/ipex_llm/transformers/npu_model.py b/python/llm/src/ipex_llm/transformers/npu_model.py
index 0781f635..ae6d7a73 100644
--- a/python/llm/src/ipex_llm/transformers/npu_model.py
+++ b/python/llm/src/ipex_llm/transformers/npu_model.py
@@ -139,8 +139,10 @@ class _BaseAutoModelClass:
         mock_device = kwargs.pop('device', None)  # For mock on CPU
         convert_model = kwargs.pop('convert_model', False)
         save_directory = kwargs.pop('save_directory', None)
-        fuse_layers = kwargs.pop('fuse_layers', None)
-        imatrix_file = kwargs.pop('imatrix_file', None)
+        fuse_layers = kwargs.pop("fuse_layers", None)
+        imatrix_file = kwargs.pop("imatrix_file", None)
+        keep_ir = kwargs.pop("keep_ir", False)
+        compile_blob = kwargs.pop("compile_blob", True)
 
         if imatrix_file is not None:
             imatrix_data = load_imatrix_data(imatrix_file)
@@ -236,6 +238,8 @@ class _BaseAutoModelClass:
                 "fuse_layers": fuse_layers,
                 "imatrix_data": imatrix_data,
                 "skip_npu_logic": mock_device == "dummy",
+                "keep_ir": keep_ir,
+                "compile_blob": compile_blob,
             }
             # Dummy will skip npu related logic and save the quantized model
             if mock_device == "dummy":
@@ -280,9 +284,14 @@ class _BaseAutoModelClass:
         fuse_layers = kwargs.pop('fuse_layers', None)
         imatrix_data = kwargs.pop('imatrix_data', None)
         skip_npu_logic = kwargs.pop("skip_npu_logic", False)
+        keep_ir = kwargs.pop("keep_ir", False)
+        compile_blob = kwargs.pop("compile_blob", True)
+
         invalidInputError(save_directory is not None,
                           "Please provide the path to save converted model "
                           "through `save_directory`.")
+        invalidInputError(keep_ir or compile_blob,
+                          "Please make sure at least one of `keep_ir` and `compile_blob` is True.")
 
         if hasattr(model, "llm"):
             llm = model.llm
@@ -323,7 +332,9 @@ class _BaseAutoModelClass:
                     qtype=qtype,
                     save_directory=save_directory,
                     fuse_layers=fuse_layers,
-                    has_llm=hasattr(model, "llm")
+                    has_llm=hasattr(model, "llm"),
+                    keep_ir=keep_ir,
+                    compile_blob=compile_blob
                 )
             else:
                 optimize_llm(
@@ -346,7 +357,9 @@ class _BaseAutoModelClass:
                         qtype=qtype,
                         convert_model=convert_model,
                         save_directory=save_directory,
-                        fuse_layers=fuse_layers)
+                        fuse_layers=fuse_layers,
+                        keep_ir=keep_ir,
+                        compile_blob=compile_blob)
             model.save_low_bit = types.MethodType(save_low_bit, model)
             model.save_low_bit(save_directory)
             logger.info(f"Converted model has already saved to {save_directory}.")
diff --git a/python/llm/src/ipex_llm/transformers/npu_models/convert.py b/python/llm/src/ipex_llm/transformers/npu_models/convert.py
index 3dece12f..f3f4f49c 100644
--- a/python/llm/src/ipex_llm/transformers/npu_models/convert.py
+++ b/python/llm/src/ipex_llm/transformers/npu_models/convert.py
@@ -450,7 +450,9 @@ def optimize_llm_single_process(
     qtype: str,
     save_directory: str,
     fuse_layers: int=None,
-    has_llm: bool=False
+    has_llm: bool=False,
+    keep_ir: bool=False,
+    compile_blob: bool=True
 ):
     from ipex_llm.transformers.npu_pipeline_model.convert_pipeline import convert_llm
     from .npu_llm_cpp import load_model_from_file
@@ -463,7 +465,9 @@ def optimize_llm_single_process(
                 qtype=qtype,
                 convert_model=True,
                 save_directory=save_directory,
-                fuse_layers=fuse_layers)
+                fuse_layers=fuse_layers,
+                keep_ir=keep_ir,
+                compile_blob=compile_blob)
     try:
         model_ptr = load_model_from_file(save_directory)
         model.kv_len = kv_len
diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py
index 16f8a724..24bbca21 100644
--- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py
+++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py
@@ -196,7 +196,9 @@ def convert_llm(model: torch.nn.Module,
                 qtype: str,
                 convert_model: bool=False,
                 save_directory: str=None,
-                fuse_layers: int=None):
+                fuse_layers: int=None,
+                keep_ir: bool=False,
+                compile_blob: bool=True):
     # whether to set layernorm weight as const
     layernorm_const = os.environ.get("IPEX_LLM_NPU_LAYERNORM_CONST", "1") == "1"
     if group_size == 0:
@@ -220,7 +222,9 @@ def convert_llm(model: torch.nn.Module,
                               n_splits_down_proj,
                               group_size,
                               save_directory,
-                              fuse_layers=fuse_layers)
+                              fuse_layers=fuse_layers,
+                              keep_ir=keep_ir,
+                              compile_blob=compile_blob)
         return 0
     if model.config.model_type == "llama":
         with tempfile.TemporaryDirectory() as temp_dir:
@@ -428,7 +432,9 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                            n_splits_down_proj: int,
                            group_size: int,
                            save_directory: str=None,
-                           fuse_layers: int=None):
+                           fuse_layers: int=None,
+                           keep_ir: bool=False,
+                           compile_blob: bool=True):
     if not os.path.exists(save_directory):
         os.mkdir(save_directory)
     weight_dir = os.path.join(save_directory, "model_weights")
@@ -479,14 +485,17 @@ def convert_llm_for_deploy(model: torch.nn.Module,
         # save fused_layers blobs of fused decoder layers
         convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
                                  save_directory, weight_dir, transpose_value_cache, kv_len,
-                                 group_size, layernorm_const, "decode")
+                                 group_size, layernorm_const, "decode",
+                                 keep_ir=keep_ir, compile_blob=compile_blob)
         # save blob of single prefill layer
         convert_qwen_layer(model, 0, n_splits_linear, n_splits_down_proj,
                            save_directory, weight_dir, transpose_value_cache, max_prompt_len,
-                           group_size, layernorm_const, "prefill")
+                           group_size, layernorm_const, "prefill",
+                           keep_ir=keep_ir, compile_blob=compile_blob)
         # save blob of lmhead and bin of embedding
         convert_lm_head_and_embedding(model, save_directory, weight_dir,
-                                      convert_model=True, group_size=group_size)
+                                      convert_model=True, group_size=group_size,
+                                      keep_ir=keep_ir, compile_blob=compile_blob)
     elif model.config.model_type == "llama":
         embedding_post = False
         cos_sin_input = False
@@ -540,15 +549,18 @@ def convert_llm_for_deploy(model: torch.nn.Module,
             convert_lm_head_and_embedding(model, n_splits_linear,
                                           save_directory, weight_dir,
                                           convert_model=True,
-                                          max_prompt_len=max_prompt_len)
+                                          max_prompt_len=max_prompt_len,
+                                          keep_ir=keep_ir, compile_blob=compile_blob)
         # save fused_layers blobs of fused decoder layers
         convert_fused_llama_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
                                   save_directory, weight_dir, transpose_value_cache, kv_len,
-                                  group_size, layernorm_const, "decode")
+                                  group_size, layernorm_const, "decode",
+                                  keep_ir=keep_ir, compile_blob=compile_blob)
         # save blob of single prefill layer
         convert_llama_layer(model, 0, n_splits_linear, n_splits_down_proj,
                             save_directory, weight_dir, transpose_value_cache, max_prompt_len,
-                            group_size, layernorm_const, "prefill")
+                            group_size, layernorm_const, "prefill",
+                            keep_ir=keep_ir, compile_blob=compile_blob)
     elif model.config.model_type == "minicpm":
         if group_size == 0:
             fused_layers = 4 if fuse_layers is None else fuse_layers
@@ -577,16 +589,19 @@ def convert_llm_for_deploy(model: torch.nn.Module,
         # save fused_layers blobs of fused decoder layers
         convert_fused_minicpm_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
                                     save_directory, weight_dir, transpose_value_cache, kv_len,
-                                    group_size, layernorm_const, "decode")
+                                    group_size, layernorm_const, "decode",
+                                    keep_ir=keep_ir, compile_blob=compile_blob)
         # save blob of single prefill layer
         convert_minicpm_layer(model, 0, n_splits_linear, n_splits_down_proj,
                               save_directory, weight_dir, transpose_value_cache, max_prompt_len,
-                              group_size, layernorm_const, "prefill")
+                              group_size, layernorm_const, "prefill",
+                              keep_ir=keep_ir, compile_blob=compile_blob)
         # save blob of lmhead and bin of embedding and embedding_post
         convert_lm_head_and_embedding(model, n_splits_linear,
                                       save_directory, weight_dir,
                                       convert_model=True,
-                                      max_prompt_len=max_prompt_len)
+                                      max_prompt_len=max_prompt_len,
+                                      keep_ir=keep_ir, compile_blob=compile_blob)
 
     model.config.update(update_dict)
     model.config.save_pretrained(save_directory)
diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/llama.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/llama.py
index aebff3d6..dea8c0f3 100644
--- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/llama.py
+++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/llama.py
@@ -123,7 +123,8 @@ class Llama32PostEmbedding(NNFactory):
 
 
 def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
-                                  convert_model=False, max_prompt_len=1):
+                                  convert_model=False, max_prompt_len=1,
+                                  keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
     head_dim = model.model.layers[0].self_attn.head_dim
@@ -175,7 +176,8 @@ def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
         asym=asym
     )
     last_blob_path = update_names_of_IR_and_export_blob(new_lm_head, "lm_head", temp_dir,
-                                                        True, False)
+                                                        keep_ir=keep_ir, compile_blob=compile_blob)
+    os.remove(os.path.join(temp_dir, "lm_head.bin"))
 
     # save weights bins files
     if n_splits_linear == 1:
@@ -211,7 +213,9 @@ def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
             first_blob_path = None
         else:
             first_blob_path = update_names_of_IR_and_export_blob(new_embedding, "embedding",
-                                                                 temp_dir, True, False)
+                                                                 temp_dir, keep_ir=keep_ir,
+                                                                 compile_blob=compile_blob)
+            os.remove(os.path.join(temp_dir, "embedding.bin"))
     else:
         # llama-3.2-3B & llama-3.2-1B
         embedding_layer = model.model.embed_tokens
@@ -235,22 +239,28 @@ def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
                                                   attention_scaling=attention_scaling,
                                                   input_len=1)
             update_names_of_IR_and_export_blob(embedding_post, "embedding_post",
-                                               temp_dir, True, False)
+                                               temp_dir, keep_ir=keep_ir, compile_blob=compile_blob)
             embedding_post_prefill = Llama32PostEmbedding(inv_freq=inv_freq,
                                                           attention_scaling=attention_scaling,
                                                           input_len=max_prompt_len)
             update_names_of_IR_and_export_blob(embedding_post_prefill,
                                                "embedding_post_prefill",
-                                               temp_dir, True, False)
+                                               temp_dir, keep_ir=keep_ir, compile_blob=compile_blob)
+            os.remove(os.path.join(temp_dir, "embedding_post.bin"))
+            os.remove(os.path.join(temp_dir, "embedding_post_prefill.bin"))
         else:
             first_blob_path = update_names_of_IR_and_export_blob(new_embedding, "embedding",
-                                                                 temp_dir)
+                                                                 temp_dir, keep_ir=keep_ir,
+                                                                 compile_blob=compile_blob)
+            os.remove(os.path.join(temp_dir, "embedding.bin"))
+
     return first_blob_path, last_blob_path
 
 
 def convert_llama_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
                         temp_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-                        layernorm_const, mode="decode"):
+                        layernorm_const, mode="decode",
+                        keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
     head_dim = model.model.layers[0].self_attn.head_dim
@@ -317,8 +327,9 @@ def convert_llama_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
     rest_blob_path = update_names_of_IR_and_export_blob(single_decoder,
                                                         decoder_name,
                                                         temp_dir,
-                                                        True, False,
+                                                        keep_ir=keep_ir, compile_blob=compile_blob,
                                                         npu_dpu_groups=npu_dpu_groups)
+    os.remove(os.path.join(temp_dir, decoder_name + ".bin"))
 
     if mode == "decode":
         if hasattr(curr_layer.self_attn.rotary_emb, "cos_cached"):
@@ -364,7 +375,8 @@ def convert_llama_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
 
 def convert_fused_llama_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
                               save_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-                              layernorm_const, mode="decode"):
+                              layernorm_const, mode="decode",
+                              keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
     head_dim = model.model.layers[0].self_attn.head_dim
@@ -457,6 +469,7 @@ def convert_fused_llama_layer(model, fused_layers, n_splits_linear, n_splits_dow
         update_names_of_IR_and_export_blob(fused_decoder,
                                            f"decoder_layer_{i}",
                                            save_dir,
-                                           compile_blob=True,
-                                           keep_ir=False)
+                                           keep_ir=keep_ir,
+                                           compile_blob=compile_blob)
+        os.remove(os.path.join(save_dir, f"decoder_layer_{i}" + ".bin"))
     return 0
diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/minicpm.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/minicpm.py
index 9eddce77..7b41554c 100644
--- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/minicpm.py
+++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/minicpm.py
@@ -162,7 +162,8 @@ class MiniCPMLMHead(LLMBaseNNFactory):
 
 
 def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
-                                  convert_model=False, max_prompt_len=1):
+                                  convert_model=False, max_prompt_len=1,
+                                  keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
     head_dim = model.model.layers[0].self_attn.head_dim
@@ -230,7 +231,8 @@ def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
         asym=asym
     )
     last_blob_path = update_names_of_IR_and_export_blob(new_lm_head, "lm_head", temp_dir,
-                                                        True, True)
+                                                        keep_ir=keep_ir, compile_blob=compile_blob)
+    os.remove(os.path.join(temp_dir, "lm_head.bin"))
 
     # save weights bins files
     if n_splits_linear == 1:
@@ -280,22 +282,27 @@ def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
                                               dtype=np.float16,
                                               scale_emb=model.config.scale_emb)
         update_names_of_IR_and_export_blob(embedding_post, "embedding_post",
-                                           temp_dir, True, False)
+                                           temp_dir, keep_ir=keep_ir, compile_blob=compile_blob)
         embedding_post_prefill = MiniCPMPostEmbedding(max_prompt_len, model.config.hidden_size,
                                                       dtype=np.float16,
                                                       scale_emb=model.config.scale_emb)
         update_names_of_IR_and_export_blob(embedding_post_prefill, "embedding_post_prefill",
-                                           temp_dir, True, False)
+                                           temp_dir, keep_ir=keep_ir, compile_blob=compile_blob)
+        os.remove(os.path.join(temp_dir, "embedding_post.bin"))
+        os.remove(os.path.join(temp_dir, "embedding_post_prefill.bin"))
     else:
         first_blob_path = update_names_of_IR_and_export_blob(new_embedding, "embedding",
-                                                             temp_dir, True, False)
+                                                             temp_dir, keep_ir=keep_ir,
+                                                             compile_blob=compile_blob)
+        os.remove(os.path.join(temp_dir, "embedding.bin"))
 
     return first_blob_path, last_blob_path
 
 
 def convert_minicpm_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
                           temp_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-                          layernorm_const, mode="decode"):
+                          layernorm_const, mode="decode",
+                          keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
     head_dim = model.model.layers[0].self_attn.head_dim
@@ -353,7 +360,8 @@ def convert_minicpm_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
     rest_blob_path = update_names_of_IR_and_export_blob(single_decoder,
                                                         decoder_name,
                                                         temp_dir,
-                                                        True, True)
+                                                        keep_ir=keep_ir, compile_blob=compile_blob)
+    os.remove(os.path.join(temp_dir, decoder_name + ".bin"))
 
     if mode == "decode":
         if layernorm_const:
@@ -386,7 +394,8 @@ def convert_minicpm_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
 
 def convert_fused_minicpm_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
                                 save_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-                                layernorm_const, mode="decode"):
+                                layernorm_const, mode="decode",
+                                keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
     head_dim = model.model.layers[0].self_attn.head_dim
@@ -477,6 +486,6 @@ def convert_fused_minicpm_layer(model, fused_layers, n_splits_linear, n_splits_d
         update_names_of_IR_and_export_blob(fused_decoder,
                                            f"decoder_layer_{i}",
                                            save_dir,
-                                           compile_blob=True,
-                                           keep_ir=False)
+                                           keep_ir=keep_ir, compile_blob=compile_blob)
+        os.remove(os.path.join(save_dir, f"decoder_layer_{i}" + ".bin"))
     return 0
diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py
index 5137fd4a..183b71b9 100644
--- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py
+++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py
@@ -24,7 +24,8 @@ from ipex_llm.transformers.npu_models.lm_head import SlicedLMHead
 
 
 def convert_lm_head_and_embedding(model, temp_dir, weight_dir,
-                                  convert_model=False, group_size=0):
+                                  convert_model=False, group_size=0,
+                                  keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     head_dim = model.model.layers[0].self_attn.head_dim
     rms_norm_eps = model.config.rms_norm_eps
@@ -84,7 +85,9 @@ def convert_lm_head_and_embedding(model, temp_dir, weight_dir,
     )
 
     last_blob_path = update_names_of_IR_and_export_blob(new_lm_head, f"lm_head",
-                                                        temp_dir, True, False)
+                                                        temp_dir, keep_ir=keep_ir,
+                                                        compile_blob=compile_blob)
+    os.remove(os.path.join(temp_dir, "lm_head.bin"))
 
     # save weights bins files
     if not isinstance(lm_head, SlicedLMHead):
@@ -119,13 +122,16 @@ def convert_lm_head_and_embedding(model, temp_dir, weight_dir,
         first_blob_path = True
     else:
         first_blob_path = update_names_of_IR_and_export_blob(new_embedding, f"embedding",
-                                                             temp_dir, True, keep_ir=True)
+                                                             temp_dir, keep_ir=keep_ir,
+                                                             compile_blob=compile_blob)
+        os.remove(os.path.join(temp_dir, "embedding.bin"))
     return first_blob_path, last_blob_path
 
 
 def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
                        temp_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-                       layernorm_const, mode="decode"):
+                       layernorm_const, mode="decode",
+                       keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
     head_dim = model.model.layers[0].self_attn.head_dim
@@ -183,8 +189,10 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
     )
     rest_blob_path = update_names_of_IR_and_export_blob(single_decoder,
                                                         decoder_name,
-                                                        temp_dir, True, False,
+                                                        temp_dir, keep_ir=keep_ir,
+                                                        compile_blob=compile_blob,
                                                         npu_dpu_groups=npu_dpu_groups)
+    os.remove(os.path.join(temp_dir, decoder_name + ".bin"))
 
     # 0, 1, 2 are input_embed/attention_mask/position_id
     if mode == "decode":
@@ -226,7 +234,8 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
 
 def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
                              save_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-                             layernorm_const, mode="decode"):
+                             layernorm_const, mode="decode",
+                             keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
     head_dim = model.model.layers[0].self_attn.head_dim
@@ -330,6 +339,6 @@ def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down
         update_names_of_IR_and_export_blob(fused_decoder,
                                            f"decoder_layer_{i}",
                                            save_dir,
-                                           compile_blob=True,
-                                           keep_ir=False)
+                                           keep_ir=keep_ir, compile_blob=compile_blob)
+        os.remove(os.path.join(save_dir, f"decoder_layer_{i}" + ".bin"))
     return 0
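
For reference, a minimal usage sketch of the options added above. The model path and save directory are placeholders, and only the keyword arguments touched by this patch are shown; the remaining arguments of the example's from_pretrained call are unchanged. Per the new invalidInputError check, at least one of keep_ir / compile_blob must be True.

    # CLI form, via the updated CPP_Examples/convert.py (placeholders in angle brackets):
    #   python convert.py --repo-id-or-model-path <model_path> --save-directory <save_dir> --keep-ir
    import torch
    from ipex_llm.transformers.npu_model import AutoModelForCausalLM

    model = AutoModelForCausalLM.from_pretrained(
        "<model_path>",                 # placeholder model path
        torch_dtype=torch.float16,
        attn_implementation="eager",
        trust_remote_code=True,
        save_directory="<save_dir>",    # placeholder output directory
        keep_ir=True,                   # keep the intermediate IR files alongside the converted model
        compile_blob=True,              # compile NPU blobs; at least one of keep_ir/compile_blob must be True
    )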