New convert support for C++ NPU (#12430)
* initial commit
* fix
* fix style
* fix style
* fix
* fix
This commit is contained in:
parent c089b6c10d
commit 4ffa6c752c

4 changed files with 180 additions and 19 deletions
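In short: this PR renames the NPU flag compile_full_model to convert_model across the
example script, _BaseAutoModelClass, and the convert_llm pipeline, and adds a new
convert_llm_for_deploy path plus convert_fused_qwen_layer that write decoder, lm_head,
and embedding artifacts to disk for a C++ NPU runtime. A minimal usage sketch assembled
from the example-script hunk below; the import path and the model/save paths are
assumptions, not part of this diff:

    from ipex_llm.transformers.npu_model import AutoModelForCausalLM  # assumed import
    from transformers import AutoTokenizer

    model_path = "Qwen/Qwen2-1.5B-Instruct"   # placeholder; the example uses argparse
    save_dir = "./qwen2-npu-deploy"           # placeholder; must not exist yet (os.mkdir)

    # convert_model=True (formerly compile_full_model=True) makes from_pretrained run
    # the new deploy conversion and write blobs/weight bins into save_directory.
    model = AutoModelForCausalLM.from_pretrained(model_path,
                                                 transpose_value_cache=True,
                                                 mixed_precision=True,
                                                 trust_remote_code=True,
                                                 convert_model=True,
                                                 save_directory=save_dir)
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)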
@@ -63,7 +63,7 @@ if __name__ == "__main__":
                                                  transpose_value_cache=not args.disable_transpose_value_cache,
                                                  mixed_precision=True,
                                                  trust_remote_code=True,
-                                                 compile_full_model=True,
+                                                 convert_model=True,
                                                  save_directory=save_dir)
 
     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
@@ -134,7 +134,7 @@ class _BaseAutoModelClass:
         mixed_precision = kwargs.pop('mixed_precision', False)
         quantization_group_size = kwargs.pop("quantization_group_size", 0)
         mock_device = kwargs.pop('device', None)  # For mock on CPU
-        compile_full_model = kwargs.pop('compile_full_model', False)
+        convert_model = kwargs.pop('convert_model', False)
         save_directory = kwargs.pop('save_directory', None)
 
         invalidInputError(
@@ -202,7 +202,7 @@ class _BaseAutoModelClass:
                     "inter_pp": inter_pp,
                     "intra_pp": intra_pp,
                     "transpose_value_cache": transpose_value_cache,
-                    "compile_full_model": compile_full_model,
+                    "convert_model": convert_model,
                     "save_directory": save_directory,
                 }
                 model = cls.optimize_npu_model(*args, **optimize_kwargs)
@@ -241,7 +241,7 @@ class _BaseAutoModelClass:
         inter_pp = kwargs.pop("inter_pp", None)
         intra_pp = kwargs.pop("intra_pp", None)
         transpose_value_cache = kwargs.pop("transpose_value_cache", True)
-        compile_full_model = kwargs.pop('compile_full_model', False)
+        convert_model = kwargs.pop('convert_model', False)
         save_directory = kwargs.pop('save_directory', None)
 
         if hasattr(model, "llm"):
@@ -280,7 +280,7 @@ class _BaseAutoModelClass:
                         max_prompt_len=max_prompt_len,
                         transpose_value_cache=transpose_value_cache,
                         group_size=quantization_group_size,
-                        compile_full_model=compile_full_model,
+                        convert_model=convert_model,
                         save_directory=save_directory)
         model.save_low_bit = types.MethodType(save_low_bit, model)
         return model
@@ -193,7 +193,7 @@ def convert_llm(model: torch.nn.Module,
                 max_prompt_len: int,
                 transpose_value_cache: bool,
                 group_size: int,
-                compile_full_model: bool=False,
+                convert_model: bool=False,
                 save_directory: str=None):
     # whether to set layernorm weight as const
     layernorm_const = os.environ.get("IPEX_LLM_LAYERNORM_CONST", "1") == "1"
@@ -203,6 +203,16 @@ def convert_llm(model: torch.nn.Module,
     else:
         n_splits_linear = model.config.hidden_size // group_size
         n_splits_down_proj = model.config.intermediate_size // group_size
+    if convert_model:
+        convert_llm_for_deploy(model,
+                               kv_len,
+                               max_prompt_len,
+                               transpose_value_cache,
+                               n_splits_linear,
+                               n_splits_down_proj,
+                               group_size,
+                               save_directory)
+        return 0
     if model.config.model_type == "llama":
         with tempfile.TemporaryDirectory() as temp_dir:
             weight_dir = os.path.join(temp_dir, "model_weights")
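Note the contract change in the hunk above: with convert_model=True, convert_llm hands
off to convert_llm_for_deploy and returns 0 instead of a patched in-process model, so
the llama/qwen runners and the generate() binding below are skipped entirely in deploy
mode. Condensed restatement of that dispatch:

    if convert_model:
        # offline path: serialize blobs + weight bins for the C++ runtime
        convert_llm_for_deploy(model, kv_len, max_prompt_len,
                               transpose_value_cache, n_splits_linear,
                               n_splits_down_proj, group_size, save_directory)
        return 0  # sentinel: no runnable Python model in this mode
    # otherwise fall through to the existing in-process conversion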
@@ -340,7 +350,7 @@ def convert_llm(model: torch.nn.Module,
             from .qwen import convert_qwen_layer, convert_lm_head_and_embedding
             first_blob_path, last_blob_path = convert_lm_head_and_embedding(model, n_splits_linear,
                                                                             temp_dir, weight_dir,
-                                                                            compile_full_model)
+                                                                            convert_model)
 
             param_list = []
             for layer_idx in range(0, layer_num):
@@ -350,11 +360,6 @@ def convert_llm(model: torch.nn.Module,
             with Pool() as pool:
                 result = pool.starmap(convert_qwen_layer, param_list)
 
-            if compile_full_model:
-                convert_qwen_layer(model, 0, n_splits_linear, n_splits_down_proj,
-                                   temp_dir, weight_dir, transpose_value_cache, max_prompt_len,
-                                   group_size, layernorm_const, "prefill")
-
             # Prefill Runner
             from ipex_llm.transformers.npu_models.convert_mp import convert_qwen
             convert_qwen(model,
@@ -403,3 +408,48 @@ def convert_llm(model: torch.nn.Module,
     import types
     model.generate = types.MethodType(generate, model)
     return model
+
+
+def convert_llm_for_deploy(model: torch.nn.Module,
+                           kv_len: int,
+                           max_prompt_len: int,
+                           transpose_value_cache: bool,
+                           n_splits_linear: int,
+                           n_splits_down_proj: int,
+                           group_size: int,
+                           save_directory: str=None):
+    os.mkdir(save_directory)
+    weight_dir = os.path.join(save_directory, "model_weights")
+    os.mkdir(weight_dir)
+
+    if model.config.model_type == "qwen2":
+        layernorm_const = True
+        if model.config.hidden_size == 1536:
+            # Qwen2-1.5B-Instruct
+            fused_layers = 1
+        else:
+            fused_layers = 2
+        update_dict = {"kv_len": kv_len,
+                       "num_head": model.model.layers[0].self_attn.num_heads,
+                       "head_dim": model.model.layers[0].self_attn.head_dim,
+                       "transpose_value_cache": transpose_value_cache,
+                       "max_prompt_len": max_prompt_len,
+                       "layernorm_const": layernorm_const,
+                       "group_size": group_size,
+                       "fused_layers": fused_layers}
+        model.config.update(update_dict)
+        model.config.save_pretrained(save_directory)
+
+        from .qwen import convert_qwen_layer, convert_fused_qwen_layer
+        from .qwen import convert_lm_head_and_embedding
+        # save fused_layers blobs of fused decoder layers
+        convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
+                                 save_directory, weight_dir, transpose_value_cache, kv_len,
+                                 group_size, layernorm_const, "decode")
+        # save blob of single prefill layer
+        convert_qwen_layer(model, 0, n_splits_linear, n_splits_down_proj,
+                           save_directory, weight_dir, transpose_value_cache, max_prompt_len,
+                           group_size, layernorm_const, "prefill")
+        # save blob of lmhead and bin of embedding
+        convert_lm_head_and_embedding(model, n_splits_linear,
+                                      save_directory, weight_dir, True)
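For a qwen2 model, the deploy conversion above leaves save_directory roughly as sketched
below. Blob filenames come from update_names_of_IR_and_export_blob, so the exact names
are inferred from the name arguments visible in this diff, not confirmed:

    # save_directory/
    #     config.json                  # model.config with the update_dict merged in
    #     decoder_layer_0.blob         # fused "decode" layers, one blob per fused group
    #     decoder_layer_prefill.blob   # single "prefill" layer
    #     lm_head.blob
    #     model_weights/
    #         model_embedding_input_0.bin
    #         model_{idx}_input_{n}.bin   # layernorms, q/k/v biases, split weights/scales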
@@ -23,7 +23,7 @@ from ipex_llm.transformers.npu_models.lm_head import SlicedLMHead
 
 
 def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
-                                  compile_full_model=False):
+                                  convert_model=False):
     num_heads = model.model.layers[0].self_attn.num_heads
     head_dim = model.model.layers[0].self_attn.head_dim
     rms_norm_eps = model.config.rms_norm_eps
@@ -60,7 +60,7 @@ def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
     )
 
     last_blob_path = update_names_of_IR_and_export_blob(new_lm_head, f"lm_head",
-                                                        temp_dir, True, True)
+                                                        temp_dir, True, False)
 
     # save weights bins files
     if not isinstance(lm_head, SlicedLMHead):
@@ -83,11 +83,13 @@ def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
         dtype=np.float16,
         input_length=1,
     )
-    first_blob_path = update_names_of_IR_and_export_blob(new_embedding, f"embedding",
-                                                         temp_dir, True, keep_ir=True)
-    if compile_full_model:
+    if convert_model:
         bin_file = os.path.join(weight_dir, f"model_embedding_input_0.bin")
         embedding_layer.weight.to(torch.float16).detach().numpy().tofile(bin_file)
+        first_blob_path = True
+    else:
+        first_blob_path = update_names_of_IR_and_export_blob(new_embedding, f"embedding",
+                                                             temp_dir, True, keep_ir=True)
     return first_blob_path, last_blob_path
 
 
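The embedding handling now forks: in convert_model mode the embedding weight is dumped
as a raw float16 bin (with first_blob_path set to the sentinel True) instead of being
exported as an IR blob, leaving the table lookup to whatever consumes the files. A
self-contained sketch of the dump and the read-back a consumer must assume; the shapes
are illustrative, not from this diff:

    import numpy as np
    import torch

    vocab_size, hidden_size = 151936, 1536   # Qwen2-1.5B-like shapes, illustrative
    weight = torch.randn(vocab_size, hidden_size)

    # what the diff does: flat row-major float16 dump
    weight.to(torch.float16).detach().numpy().tofile("model_embedding_input_0.bin")

    # what a reader (e.g. the C++ runtime) must assume to reconstruct the table
    emb = np.fromfile("model_embedding_input_0.bin", dtype=np.float16)
    emb = emb.reshape(vocab_size, hidden_size)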
@@ -138,8 +140,8 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
     else:
         input_len = kv_len
         decoder_name = "decoder_layer_prefill"
-        compile = False
-        keep_ir = True
+        compile = True
+        keep_ir = False
     single_decoder = LowBitQwenMultiDecoderlayer(
         [1, input_len, num_heads * head_dim],
         input_layernorm_weights=None,
@@ -190,3 +192,112 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
             scale.numpy().tofile(bin_file)
 
     del single_decoder
+
+
+def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
+                             save_dir, weight_dir, transpose_value_cache, kv_len, group_size,
+                             layernorm_const, mode="decode"):
+    num_heads = model.model.layers[0].self_attn.num_heads
+    num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
+    head_dim = model.model.layers[0].self_attn.head_dim
+    intermediate_size = model.config.intermediate_size
+    rms_norm_eps = model.config.rms_norm_eps
+    layer_num = len(model.model.layers)
+    fused_layer_num = layer_num // fused_layers
+
+    from ipex_llm.transformers.npu_models.qwen2_mp import LowBitQwenMultiDecoderlayer
+    for i in range(fused_layers):
+        layer_start = i * fused_layer_num
+        layer_end = min((i + 1) * fused_layer_num, layer_num)
+        layer_weights = []
+        input_layer_norm_weights = []
+        post_attn_layernorm_weights = []
+        q_biases = []
+        k_biases = []
+        v_biases = []
+        layer_indexs = range(layer_start, layer_end)
+        n_splits_linear = len(model.model.layers[0].mlp.gate_proj_dq_list)
+        n_splits_down_proj = len(model.model.layers[0].mlp.down_proj_dq_list)
+        for layer_idx in layer_indexs:
+            curr_layer = model.model.layers[layer_idx]
+            attn_layer = curr_layer.self_attn
+            mlp_layer = curr_layer.mlp
+
+            weights = []
+            for layer_list in [attn_layer.q_proj_dq_list, attn_layer.k_proj_dq_list,
+                               attn_layer.v_proj_dq_list, attn_layer.o_proj_dq_list,
+                               mlp_layer.gate_proj_dq_list, mlp_layer.up_proj_dq_list,
+                               mlp_layer.down_proj_dq_list]:
+                l_weights = []
+                scales = []
+                for l in layer_list:
+                    l_weights.append(l.weight)
+                    scales.append(l.scale)
+                weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0)))
+
+            cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16)
+            cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16)
+            layer_norm_0 = curr_layer.input_layernorm.weight.to(torch.float16)
+            layer_norm_1 = curr_layer.post_attention_layernorm.weight.to(torch.float16)
+
+            layer_weights.extend(weights)
+            input_layer_norm_weights.append(layer_norm_0)
+            post_attn_layernorm_weights.append(layer_norm_1)
+            q_biases.append(attn_layer.q_proj_dq_list.q_proj_dq_0.bias.to(torch.float16))
+            k_biases.append(attn_layer.k_proj_dq_list.k_proj_dq_0.bias.to(torch.float16))
+            v_biases.append(attn_layer.v_proj_dq_list.v_proj_dq_0.bias.to(torch.float16))
+
+            # save weight
+            input_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_3.bin")
+            post_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_4.bin")
+            layer_norm_0.data.numpy().tofile(input_lm_bin_file)
+            layer_norm_1.data.numpy().tofile(post_lm_bin_file)
+            st_idx = 5
+            # 5 / 6 / 7 are bias
+            q_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx}.bin")
+            k_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+1}.bin")
+            v_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+2}.bin")
+            q_biases[-1].data.numpy().tofile(q_bias_bin_file)
+            k_biases[-1].data.numpy().tofile(k_bias_bin_file)
+            v_biases[-1].data.numpy().tofile(v_bias_bin_file)
+            # 6, 7 are past k/v
+            for idx, (weight, scale) in enumerate(weights):
+                bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+3+idx*2}.bin")
+                weight.numpy().tofile(bin_file)
+                bin_file = os.path.join(weight_dir,
+                                        f"model_{layer_idx}_input_{st_idx+3+idx*2+1}.bin")
+                scale.numpy().tofile(bin_file)
+
+        if isinstance(weights[0], tuple):
+            np_dtype = np.int8 if weights[0][0].dtype == torch.int8 else np.uint8
+        else:  # FP16 Linear
+            np_dtype = np.float16
+
+        fused_decoder = LowBitQwenMultiDecoderlayer(
+            [1, 1, num_heads * head_dim],
+            input_layernorm_weights=input_layer_norm_weights,
+            post_attn_layernorm_weights=post_attn_layernorm_weights,
+            q_biases=q_biases,
+            k_biases=k_biases,
+            v_biases=v_biases,
+            cached_cos=cached_cos,
+            cached_sin=cached_sin,
+            num_heads=num_heads,
+            num_key_value_heads=num_key_value_heads,
+            num_layers=fused_layer_num,
+            max_seq_len=kv_len,
+            rms_norm_eps=rms_norm_eps,
+            intermediate_size=intermediate_size,
+            mode=mode,
+            transpose_value=transpose_value_cache,
+            dtype=np_dtype,
+            n_splits_linear=n_splits_linear,
+            n_splits_down_proj=n_splits_down_proj,
+            group_size=group_size
+        )
+        update_names_of_IR_and_export_blob(fused_decoder,
+                                           f"decoder_layer_{i}",
+                                           save_dir,
+                                           compile_blob=True,
+                                           keep_ir=False)
+    return 0
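On the fusion arithmetic in convert_fused_qwen_layer: fused_layers is the number of
decoder blobs, and each blob packs layer_num // fused_layers consecutive layers. A
worked sketch of the grouping (28 layers is illustrative):

    layer_num = 28        # illustrative decoder depth
    fused_layers = 2      # the default above when hidden_size != 1536
    fused_layer_num = layer_num // fused_layers                 # 14 layers per blob
    for i in range(fused_layers):
        layer_start = i * fused_layer_num                       # 0, then 14
        layer_end = min((i + 1) * fused_layer_num, layer_num)   # 14, then 28
        print(f"decoder_layer_{i}.blob packs layers {layer_start}..{layer_end - 1}")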