[NPU] Expose parameter to control blob / IR save logic (#12767)
* update api
* fix convert.py
* fix style
* remove unnecessary bin file
* fix style
parent 9c0daf6396
commit 094a25b740

7 changed files with 115 additions and 48 deletions
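The commit threads two new save options through the NPU conversion path: keep_ir (keep the intermediate IR produced during export) and compile_blob (produce the compiled NPU blob, on by default). In the example script they map to the --keep-ir and --disable-compile-blob flags. A minimal usage sketch follows; it only uses the keyword arguments visible in the hunks below, and the AutoModelForCausalLM import path plus the model/output paths are assumptions for illustration, not part of this diff.

    from ipex_llm.transformers.npu_model import AutoModelForCausalLM
    from transformers import AutoTokenizer
    import torch

    model_path = "path/to/model"            # placeholder
    save_dir = "path/to/converted_model"    # placeholder

    # keep_ir=True corresponds to passing --keep-ir on the command line;
    # compile_blob defaults to True and is disabled by --disable-compile-blob.
    model = AutoModelForCausalLM.from_pretrained(model_path,
                                                 torch_dtype=torch.float16,
                                                 attn_implementation="eager",
                                                 trust_remote_code=True,
                                                 save_directory=save_dir,
                                                 keep_ir=True,
                                                 compile_blob=True)
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)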
@@ -51,6 +51,8 @@ if __name__ == "__main__":
     parser.add_argument("--quantization-group-size", type=int, default=0)
     parser.add_argument('--low-bit', type=str, default="sym_int4",
                         help='Low bit optimizations that will be applied to the model.')
+    parser.add_argument("--keep-ir", action="store_true")
+    parser.add_argument("--disable-compile-blob", action="store_true")
 
     args = parser.parse_args()
     model_path = args.repo_id_or_model_path
@@ -66,7 +68,9 @@ if __name__ == "__main__":
                                                  torch_dtype=torch.float16,
                                                  attn_implementation="eager",
                                                  trust_remote_code=True,
-                                                 save_directory=save_dir)
+                                                 save_directory=save_dir,
+                                                 keep_ir=args.keep_ir,
+                                                 compile_blob=not args.disable_compile_blob)
     t1 = time.perf_counter()
 
     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
@@ -139,8 +139,10 @@ class _BaseAutoModelClass:
         mock_device = kwargs.pop('device', None)  # For mock on CPU
         convert_model = kwargs.pop('convert_model', False)
         save_directory = kwargs.pop('save_directory', None)
-        fuse_layers = kwargs.pop('fuse_layers', None)
-        imatrix_file = kwargs.pop('imatrix_file', None)
+        fuse_layers = kwargs.pop("fuse_layers", None)
+        imatrix_file = kwargs.pop("imatrix_file", None)
+        keep_ir = kwargs.pop("keep_ir", False)
+        compile_blob = kwargs.pop("compile_blob", True)
 
         if imatrix_file is not None:
             imatrix_data = load_imatrix_data(imatrix_file)
@@ -236,6 +238,8 @@ class _BaseAutoModelClass:
                     "fuse_layers": fuse_layers,
                     "imatrix_data": imatrix_data,
                     "skip_npu_logic": mock_device == "dummy",
+                    "keep_ir": keep_ir,
+                    "compile_blob": compile_blob,
                 }
                 # Dummy will skip npu related logic and save the quantized model
                 if mock_device == "dummy":
@@ -280,9 +284,14 @@ class _BaseAutoModelClass:
         fuse_layers = kwargs.pop('fuse_layers', None)
         imatrix_data = kwargs.pop('imatrix_data', None)
         skip_npu_logic = kwargs.pop("skip_npu_logic", False)
+        keep_ir = kwargs.pop("keep_ir", False)
+        compile_blob = kwargs.pop("compile_blob", True)
 
         invalidInputError(save_directory is not None,
                           "Please provide the path to save converted model "
                           "through `save_directory`.")
+        invalidInputError(keep_ir or compile_blob,
+                          "Please save the compiled blob or the IR, or both.")
+
         if hasattr(model, "llm"):
             llm = model.llm
@@ -323,7 +332,9 @@ class _BaseAutoModelClass:
                         qtype=qtype,
                         save_directory=save_directory,
                         fuse_layers=fuse_layers,
-                        has_llm=hasattr(model, "llm")
+                        has_llm=hasattr(model, "llm"),
+                        keep_ir=keep_ir,
+                        compile_blob=compile_blob
                     )
                 else:
                     optimize_llm(
@@ -346,7 +357,9 @@ class _BaseAutoModelClass:
                             qtype=qtype,
                             convert_model=convert_model,
                             save_directory=save_directory,
-                            fuse_layers=fuse_layers)
+                            fuse_layers=fuse_layers,
+                            keep_ir=keep_ir,
+                            compile_blob=compile_blob)
             model.save_low_bit = types.MethodType(save_low_bit, model)
             model.save_low_bit(save_directory)
             logger.info(f"Converted model has already saved to {save_directory}.")
@@ -450,7 +450,9 @@ def optimize_llm_single_process(
     qtype: str,
     save_directory: str,
     fuse_layers: int=None,
-    has_llm: bool=False
+    has_llm: bool=False,
+    keep_ir: bool=False,
+    compile_blob: bool=True
 ):
     from ipex_llm.transformers.npu_pipeline_model.convert_pipeline import convert_llm
     from .npu_llm_cpp import load_model_from_file
@@ -463,7 +465,9 @@ def optimize_llm_single_process(
                 qtype=qtype,
                 convert_model=True,
                 save_directory=save_directory,
-                fuse_layers=fuse_layers)
+                fuse_layers=fuse_layers,
+                keep_ir=keep_ir,
+                compile_blob=compile_blob)
     try:
         model_ptr = load_model_from_file(save_directory)
         model.kv_len = kv_len
@@ -196,7 +196,9 @@ def convert_llm(model: torch.nn.Module,
                 qtype: str,
                 convert_model: bool=False,
                 save_directory: str=None,
-                fuse_layers: int=None):
+                fuse_layers: int=None,
+                keep_ir: bool=False,
+                compile_blob: bool=True):
     # whether to set layernorm weight as const
     layernorm_const = os.environ.get("IPEX_LLM_NPU_LAYERNORM_CONST", "1") == "1"
     if group_size == 0:
@@ -220,7 +222,9 @@ def convert_llm(model: torch.nn.Module,
                                n_splits_down_proj,
                                group_size,
                                save_directory,
-                               fuse_layers=fuse_layers)
+                               fuse_layers=fuse_layers,
+                               keep_ir=keep_ir,
+                               compile_blob=compile_blob)
         return 0
     if model.config.model_type == "llama":
         with tempfile.TemporaryDirectory() as temp_dir:
@@ -428,7 +432,9 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                            n_splits_down_proj: int,
                            group_size: int,
                            save_directory: str=None,
-                           fuse_layers: int=None):
+                           fuse_layers: int=None,
+                           keep_ir: bool=False,
+                           compile_blob: bool=True):
     if not os.path.exists(save_directory):
         os.mkdir(save_directory)
     weight_dir = os.path.join(save_directory, "model_weights")
@@ -479,14 +485,17 @@ def convert_llm_for_deploy(model: torch.nn.Module,
         # save fused_layers blobs of fused decoder layers
         convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
                                  save_directory, weight_dir, transpose_value_cache, kv_len,
-                                 group_size, layernorm_const, "decode")
+                                 group_size, layernorm_const, "decode",
+                                 keep_ir=keep_ir, compile_blob=compile_blob)
         # save blob of single prefill layer
         convert_qwen_layer(model, 0, n_splits_linear, n_splits_down_proj,
                            save_directory, weight_dir, transpose_value_cache, max_prompt_len,
-                           group_size, layernorm_const, "prefill")
+                           group_size, layernorm_const, "prefill",
+                           keep_ir=keep_ir, compile_blob=compile_blob)
         # save blob of lmhead and bin of embedding
         convert_lm_head_and_embedding(model, save_directory, weight_dir,
-                                      convert_model=True, group_size=group_size)
+                                      convert_model=True, group_size=group_size,
+                                      keep_ir=keep_ir, compile_blob=compile_blob)
     elif model.config.model_type == "llama":
         embedding_post = False
         cos_sin_input = False
@@ -540,15 +549,18 @@ def convert_llm_for_deploy(model: torch.nn.Module,
         convert_lm_head_and_embedding(model, n_splits_linear,
                                       save_directory, weight_dir,
                                       convert_model=True,
-                                      max_prompt_len=max_prompt_len)
+                                      max_prompt_len=max_prompt_len,
+                                      keep_ir=keep_ir, compile_blob=compile_blob)
         # save fused_layers blobs of fused decoder layers
         convert_fused_llama_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
                                   save_directory, weight_dir, transpose_value_cache, kv_len,
-                                  group_size, layernorm_const, "decode")
+                                  group_size, layernorm_const, "decode",
+                                  keep_ir=keep_ir, compile_blob=compile_blob)
         # save blob of single prefill layer
         convert_llama_layer(model, 0, n_splits_linear, n_splits_down_proj,
                             save_directory, weight_dir, transpose_value_cache, max_prompt_len,
-                            group_size, layernorm_const, "prefill")
+                            group_size, layernorm_const, "prefill",
+                            keep_ir=keep_ir, compile_blob=compile_blob)
     elif model.config.model_type == "minicpm":
         if group_size == 0:
             fused_layers = 4 if fuse_layers is None else fuse_layers
@@ -577,16 +589,19 @@ def convert_llm_for_deploy(model: torch.nn.Module,
         # save fused_layers blobs of fused decoder layers
         convert_fused_minicpm_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
                                     save_directory, weight_dir, transpose_value_cache, kv_len,
-                                    group_size, layernorm_const, "decode")
+                                    group_size, layernorm_const, "decode",
+                                    keep_ir=keep_ir, compile_blob=compile_blob)
         # save blob of single prefill layer
         convert_minicpm_layer(model, 0, n_splits_linear, n_splits_down_proj,
                               save_directory, weight_dir, transpose_value_cache, max_prompt_len,
-                              group_size, layernorm_const, "prefill")
+                              group_size, layernorm_const, "prefill",
+                              keep_ir=keep_ir, compile_blob=compile_blob)
         # save blob of lmhead and bin of embedding and embedding_post
         convert_lm_head_and_embedding(model, n_splits_linear,
                                       save_directory, weight_dir,
                                       convert_model=True,
-                                      max_prompt_len=max_prompt_len)
+                                      max_prompt_len=max_prompt_len,
+                                      keep_ir=keep_ir, compile_blob=compile_blob)
 
     model.config.update(update_dict)
     model.config.save_pretrained(save_directory)
@@ -123,7 +123,8 @@ class Llama32PostEmbedding(NNFactory):
 
 
 def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
-                                  convert_model=False, max_prompt_len=1):
+                                  convert_model=False, max_prompt_len=1,
+                                  keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
     head_dim = model.model.layers[0].self_attn.head_dim
@@ -175,7 +176,8 @@ def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
         asym=asym
     )
     last_blob_path = update_names_of_IR_and_export_blob(new_lm_head, "lm_head", temp_dir,
-                                                        True, False)
+                                                        keep_ir=keep_ir, compile_blob=compile_blob)
+    os.remove(os.path.join(temp_dir, "lm_head.bin"))
 
     # save weights bins files
     if n_splits_linear == 1:
@@ -211,7 +213,9 @@ def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
             first_blob_path = None
         else:
             first_blob_path = update_names_of_IR_and_export_blob(new_embedding, "embedding",
-                                                                 temp_dir, True, False)
+                                                                 temp_dir, keep_ir=keep_ir,
+                                                                 compile_blob=compile_blob)
+            os.remove(os.path.join(temp_dir, "embedding.bin"))
     else:
         # llama-3.2-3B & llama-3.2-1B
         embedding_layer = model.model.embed_tokens
@@ -235,22 +239,28 @@ def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
                                                   attention_scaling=attention_scaling,
                                                   input_len=1)
             update_names_of_IR_and_export_blob(embedding_post, "embedding_post",
-                                               temp_dir, True, False)
+                                               temp_dir, keep_ir=keep_ir, compile_blob=compile_blob)
             embedding_post_prefill = Llama32PostEmbedding(inv_freq=inv_freq,
                                                           attention_scaling=attention_scaling,
                                                           input_len=max_prompt_len)
             update_names_of_IR_and_export_blob(embedding_post_prefill,
                                                "embedding_post_prefill",
-                                               temp_dir, True, False)
+                                               temp_dir, keep_ir=keep_ir, compile_blob=compile_blob)
+            os.remove(os.path.join(temp_dir, "embedding_post.bin"))
+            os.remove(os.path.join(temp_dir, "embedding_post_prefill.bin"))
         else:
             first_blob_path = update_names_of_IR_and_export_blob(new_embedding, "embedding",
-                                                                 temp_dir)
+                                                                 temp_dir, keep_ir=keep_ir,
+                                                                 compile_blob=compile_blob)
+            os.remove(os.path.join(temp_dir, "embedding.bin"))
 
     return first_blob_path, last_blob_path
 
 
 def convert_llama_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
                         temp_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-                        layernorm_const, mode="decode"):
+                        layernorm_const, mode="decode",
+                        keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
     head_dim = model.model.layers[0].self_attn.head_dim
@@ -317,8 +327,9 @@ def convert_llama_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
     rest_blob_path = update_names_of_IR_and_export_blob(single_decoder,
                                                         decoder_name,
                                                         temp_dir,
-                                                        True, False,
+                                                        keep_ir=keep_ir, compile_blob=compile_blob,
                                                         npu_dpu_groups=npu_dpu_groups)
+    os.remove(os.path.join(temp_dir, decoder_name + ".bin"))
 
     if mode == "decode":
         if hasattr(curr_layer.self_attn.rotary_emb, "cos_cached"):
@@ -364,7 +375,8 @@ def convert_llama_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
 
 def convert_fused_llama_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
                               save_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-                              layernorm_const, mode="decode"):
+                              layernorm_const, mode="decode",
+                              keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
     head_dim = model.model.layers[0].self_attn.head_dim
@@ -457,6 +469,7 @@ def convert_fused_llama_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
         update_names_of_IR_and_export_blob(fused_decoder,
                                            f"decoder_layer_{i}",
                                            save_dir,
-                                           compile_blob=True,
-                                           keep_ir=False)
+                                           keep_ir=keep_ir,
+                                           compile_blob=compile_blob)
+        os.remove(os.path.join(save_dir, f"decoder_layer_{i}" + ".bin"))
     return 0
@@ -162,7 +162,8 @@ class MiniCPMLMHead(LLMBaseNNFactory):
 
 
 def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
-                                  convert_model=False, max_prompt_len=1):
+                                  convert_model=False, max_prompt_len=1,
+                                  keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
     head_dim = model.model.layers[0].self_attn.head_dim
@@ -230,7 +231,8 @@ def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
         asym=asym
     )
     last_blob_path = update_names_of_IR_and_export_blob(new_lm_head, "lm_head", temp_dir,
-                                                        True, True)
+                                                        keep_ir=keep_ir, compile_blob=compile_blob)
+    os.remove(os.path.join(temp_dir, "lm_head.bin"))
 
     # save weights bins files
     if n_splits_linear == 1:
@@ -280,22 +282,27 @@ def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
                                               dtype=np.float16,
                                               scale_emb=model.config.scale_emb)
         update_names_of_IR_and_export_blob(embedding_post, "embedding_post",
-                                           temp_dir, True, False)
+                                           temp_dir, keep_ir=keep_ir, compile_blob=compile_blob)
         embedding_post_prefill = MiniCPMPostEmbedding(max_prompt_len, model.config.hidden_size,
                                                       dtype=np.float16,
                                                       scale_emb=model.config.scale_emb)
         update_names_of_IR_and_export_blob(embedding_post_prefill,
                                            "embedding_post_prefill",
-                                           temp_dir, True, False)
+                                           temp_dir, keep_ir=keep_ir, compile_blob=compile_blob)
+        os.remove(os.path.join(temp_dir, "embedding_post.bin"))
+        os.remove(os.path.join(temp_dir, "embedding_post_prefill.bin"))
     else:
         first_blob_path = update_names_of_IR_and_export_blob(new_embedding, "embedding",
-                                                             temp_dir, True, False)
+                                                             temp_dir, keep_ir=keep_ir,
+                                                             compile_blob=compile_blob)
+        os.remove(os.path.join(temp_dir, "embedding.bin"))
     return first_blob_path, last_blob_path
 
 
 def convert_minicpm_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
                           temp_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-                          layernorm_const, mode="decode"):
+                          layernorm_const, mode="decode",
+                          keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
     head_dim = model.model.layers[0].self_attn.head_dim
@@ -353,7 +360,8 @@ def convert_minicpm_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
     rest_blob_path = update_names_of_IR_and_export_blob(single_decoder,
                                                         decoder_name,
                                                         temp_dir,
-                                                        True, True)
+                                                        keep_ir=keep_ir, compile_blob=compile_blob)
+    os.remove(os.path.join(temp_dir, decoder_name + ".bin"))
 
     if mode == "decode":
         if layernorm_const:
@@ -386,7 +394,8 @@ def convert_minicpm_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
 
 def convert_fused_minicpm_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
                                 save_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-                                layernorm_const, mode="decode"):
+                                layernorm_const, mode="decode",
+                                keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
     head_dim = model.model.layers[0].self_attn.head_dim
@@ -477,6 +486,6 @@ def convert_fused_minicpm_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
         update_names_of_IR_and_export_blob(fused_decoder,
                                            f"decoder_layer_{i}",
                                            save_dir,
-                                           compile_blob=True,
-                                           keep_ir=False)
+                                           keep_ir=keep_ir, compile_blob=compile_blob)
+        os.remove(os.path.join(save_dir, f"decoder_layer_{i}" + ".bin"))
     return 0
@@ -24,7 +24,8 @@ from ipex_llm.transformers.npu_models.lm_head import SlicedLMHead
 
 
 def convert_lm_head_and_embedding(model, temp_dir, weight_dir,
-                                  convert_model=False, group_size=0):
+                                  convert_model=False, group_size=0,
+                                  keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     head_dim = model.model.layers[0].self_attn.head_dim
     rms_norm_eps = model.config.rms_norm_eps
@@ -84,7 +85,9 @@ def convert_lm_head_and_embedding(model, temp_dir, weight_dir,
     )
 
     last_blob_path = update_names_of_IR_and_export_blob(new_lm_head, f"lm_head",
-                                                        temp_dir, True, False)
+                                                        temp_dir, keep_ir=keep_ir,
+                                                        compile_blob=compile_blob)
+    os.remove(os.path.join(temp_dir, "lm_head.bin"))
 
     # save weights bins files
     if not isinstance(lm_head, SlicedLMHead):
@@ -119,13 +122,16 @@ def convert_lm_head_and_embedding(model, temp_dir, weight_dir,
         first_blob_path = True
     else:
         first_blob_path = update_names_of_IR_and_export_blob(new_embedding, f"embedding",
-                                                             temp_dir, True, keep_ir=True)
+                                                             temp_dir, keep_ir=keep_ir,
+                                                             compile_blob=compile_blob)
+        os.remove(os.path.join(temp_dir, "embedding.bin"))
     return first_blob_path, last_blob_path
 
 
 def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
                        temp_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-                       layernorm_const, mode="decode"):
+                       layernorm_const, mode="decode",
+                       keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
     head_dim = model.model.layers[0].self_attn.head_dim
@@ -183,8 +189,10 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
     )
     rest_blob_path = update_names_of_IR_and_export_blob(single_decoder,
                                                         decoder_name,
-                                                        temp_dir, True, False,
+                                                        temp_dir, keep_ir=keep_ir,
+                                                        compile_blob=compile_blob,
                                                         npu_dpu_groups=npu_dpu_groups)
+    os.remove(os.path.join(temp_dir, decoder_name + ".bin"))
 
     # 0, 1, 2 are input_embed/attention_mask/position_id
     if mode == "decode":
@@ -226,7 +234,8 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
 
 def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
                              save_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-                             layernorm_const, mode="decode"):
+                             layernorm_const, mode="decode",
+                             keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
     head_dim = model.model.layers[0].self_attn.head_dim
@@ -330,6 +339,6 @@ def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
         update_names_of_IR_and_export_blob(fused_decoder,
                                            f"decoder_layer_{i}",
                                            save_dir,
-                                           compile_blob=True,
-                                           keep_ir=False)
+                                           keep_ir=keep_ir, compile_blob=compile_blob)
+        os.remove(os.path.join(save_dir, f"decoder_layer_{i}" + ".bin"))
     return 0
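Across the converter files above, every exporter now forwards keep_ir and compile_blob as keyword arguments to update_names_of_IR_and_export_blob and then deletes the intermediate .bin file written during export. A minimal standalone sketch of that pattern follows; export_and_cleanup and export_fn are hypothetical stand-ins, not functions from this repository.

    import os

    def export_and_cleanup(export_fn, name, temp_dir, keep_ir=False, compile_blob=True):
        # export_fn stands in for update_names_of_IR_and_export_blob: it writes the IR
        # into temp_dir (kept only when keep_ir is True) and, when compile_blob is True,
        # also produces the compiled blob, returning the blob path.
        blob_path = export_fn(name, temp_dir, keep_ir=keep_ir, compile_blob=compile_blob)
        # Drop the intermediate <name>.bin weights file, mirroring the os.remove calls
        # added throughout this commit.
        bin_file = os.path.join(temp_dir, name + ".bin")
        if os.path.exists(bin_file):
            os.remove(bin_file)
        return blob_path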