[NPU] Add L0 support for NPU C++ (#12454)
* add L0 models support
* meet review
* fix style
parent ce6fcaa9ba
commit 281c9b0bb9

1 changed file with 26 additions and 12 deletions
@@ -197,7 +197,7 @@ def convert_llm(model: torch.nn.Module,
                 convert_model: bool=False,
                 save_directory: str=None):
     # whether to set layernorm weight as const
-    layernorm_const = os.environ.get("IPEX_LLM_LAYERNORM_CONST", "1") == "1"
+    layernorm_const = os.environ.get("IPEX_LLM_NPU_LAYERNORM_CONST", "1") == "1"
     if group_size == 0:
         n_splits_linear = 1
         if qtype == "sym_int8_rtn":
@@ -344,7 +344,7 @@ def convert_llm(model: torch.nn.Module,
                 invalidInputError(False,
                                   "False to InitLLMPipeline.")
     elif model.config.model_type == "qwen2":
-        layernorm_const = os.environ.get("IPEX_LLM_LAYERNORM_CONST", "0") == "1"
+        layernorm_const = os.environ.get("IPEX_LLM_NPU_LAYERNORM_CONST", "0") == "1"
         with tempfile.TemporaryDirectory() as temp_dir:
             if save_directory is not None:
                 temp_dir = save_directory
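
Both call sites now read IPEX_LLM_NPU_LAYERNORM_CONST; the old IPEX_LLM_LAYERNORM_CONST name is no longer consulted. A minimal sketch of the flag-parsing pattern used above (the helper name below is only illustrative, not part of the change):

import os

def read_npu_flag(name: str, default: str) -> bool:
    # Illustrative helper: the flag counts as enabled only when the value is exactly "1".
    return os.environ.get(name, default) == "1"

# convert_llm defaults the flag to on ("1") at the top and to off ("0") in its qwen2 branch.
layernorm_const = read_npu_flag("IPEX_LLM_NPU_LAYERNORM_CONST", "1")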
				
			
@@ -426,9 +426,11 @@ def convert_llm_for_deploy(model: torch.nn.Module,
     os.mkdir(save_directory)
     weight_dir = os.path.join(save_directory, "model_weights")
     os.mkdir(weight_dir)
+    use_level_zero = os.environ.get("IPEX_LLM_NPU_USE_LEVEL0", "0") == "1"
+    layernorm_const = os.environ.get("IPEX_LLM_NPU_LAYERNORM_CONST", "1") == "1"
 
     if model.config.model_type == "qwen2":
-        layernorm_const = True
+        layernorm_const = os.environ.get("IPEX_LLM_NPU_LAYERNORM_CONST", "0") == "1"
         if model.config.hidden_size == 1536:
             # Qwen2-1.5B-Instruct
             fused_layers = 1
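
convert_llm_for_deploy now reads an opt-in IPEX_LLM_NPU_USE_LEVEL0 flag alongside the layernorm flag. A usage sketch, assuming the conversion is driven from a Python script (the driver script itself is illustrative; only the environment variable names come from this change):

import os

# Opt in to the new Level Zero (L0) per-layer export; "0" (the default)
# keeps the existing fused-decoder-layer export path.
os.environ["IPEX_LLM_NPU_USE_LEVEL0"] = "1"
# Keep layernorm weights as constants (the converter's top-level default is already "1").
os.environ["IPEX_LLM_NPU_LAYERNORM_CONST"] = "1"

# convert_llm_for_deploy then evaluates the flags exactly as in the hunk above.
use_level_zero = os.environ.get("IPEX_LLM_NPU_USE_LEVEL0", "0") == "1"
layernorm_const = os.environ.get("IPEX_LLM_NPU_LAYERNORM_CONST", "1") == "1"
assert use_level_zero and layernorm_const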
				
			
@@ -447,16 +449,28 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                        "weight_num": 7,
                        "weight_idx": 8,
                        "n_splits_linear": n_splits_linear,
-                       "n_splits_down_proj": n_splits_down_proj}
+                       "n_splits_down_proj": n_splits_down_proj,
+                       "use_level_zero": use_level_zero}
         model.config.update(update_dict)
         model.config.save_pretrained(save_directory)
 
         from .qwen import convert_qwen_layer, convert_fused_qwen_layer
         from .qwen import convert_lm_head_and_embedding
-        # save fused_layers blobs of fused decoder layers
-        convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
-                                 save_directory, weight_dir, transpose_value_cache, kv_len,
-                                 group_size, layernorm_const, "decode")
+        if not use_level_zero:
+            # save fused_layers blobs of fused decoder layers
+            convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
+                                     save_directory, weight_dir, transpose_value_cache, kv_len,
+                                     group_size, layernorm_const, "decode")
+        else:
+            # save layer_num blobs of each decoder layer
+            layer_num = len(model.model.layers)
+            param_list = []
+            for layer_idx in range(0, layer_num):
+                param_list.append((model, layer_idx, n_splits_linear, n_splits_down_proj,
+                                  save_directory, weight_dir, transpose_value_cache, kv_len,
+                                  group_size, layernorm_const))
+            with Pool() as pool:
+                result = pool.starmap(convert_qwen_layer, param_list)
         # save blob of single prefill layer
         convert_qwen_layer(model, 0, n_splits_linear, n_splits_down_proj,
                            save_directory, weight_dir, transpose_value_cache, max_prompt_len,
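
With use_level_zero enabled, the qwen2 branch exports one blob per decoder layer instead of fused_layers fused blobs, and it fans the per-layer convert_qwen_layer calls out across processes with multiprocessing.Pool.starmap. A self-contained sketch of that fan-out pattern (export_layer below is only a stand-in for convert_qwen_layer, and the layer count and output path are made up):

from multiprocessing import Pool

def export_layer(layer_idx: int, save_directory: str) -> str:
    # Stand-in worker: the real code passes the model, split counts, KV length,
    # group size and the layernorm flag to convert_qwen_layer for this index.
    return f"{save_directory}/decoder_layer_{layer_idx}.blob"

if __name__ == "__main__":
    layer_num = 28                   # e.g. number of decoder layers
    save_directory = "./npu_model"   # illustrative output directory
    param_list = [(idx, save_directory) for idx in range(layer_num)]
    with Pool() as pool:
        # starmap unpacks each tuple into the worker's arguments, mirroring
        # pool.starmap(convert_qwen_layer, param_list) in the hunk above.
        blobs = pool.starmap(export_layer, param_list)
    print(f"{len(blobs)} layer blobs written")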
				
			
@@ -466,7 +480,6 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                                       save_directory, weight_dir,
                                       convert_model=True)
     elif model.config.model_type == "llama":
-        layernorm_const = True
         embedding_post = False
         cos_sin_input = False
         use_prefill_sdp = False
@@ -499,7 +512,8 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                        "embedding_post": embedding_post,
                        "cos_sin_input": cos_sin_input,
                        "n_splits_linear": n_splits_linear,
-                       "n_splits_down_proj": n_splits_down_proj}
+                       "n_splits_down_proj": n_splits_down_proj,
+                       "use_level_zero": use_level_zero}
         model.config.update(update_dict)
         model.config.save_pretrained(save_directory)
 
@@ -519,7 +533,6 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                             save_directory, weight_dir, transpose_value_cache, max_prompt_len,
                             group_size, layernorm_const, "prefill")
     elif model.config.model_type == "minicpm":
-        layernorm_const = True
         fused_layers = 4
         update_dict = {"kv_len": kv_len,
                        "num_head": model.model.layers[0].self_attn.num_heads,
@@ -536,7 +549,8 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                        "model_type": "minicpm",
                        "embedding_post": True,
                        "n_splits_linear": n_splits_linear,
-                       "n_splits_down_proj": n_splits_down_proj}
+                       "n_splits_down_proj": n_splits_down_proj,
+                       "use_level_zero": use_level_zero}
         model.config.update(update_dict)
         model.config.save_pretrained(save_directory)
 
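
Each model branch also records the new use_level_zero flag in the exported config through model.config.update(...) followed by model.config.save_pretrained(save_directory), presumably so the NPU C++ pipeline can select the matching load path. A quick way to inspect the persisted value (the directory name is illustrative):

import json
import os

save_directory = "./npu_model"  # wherever convert_llm_for_deploy wrote the model
with open(os.path.join(save_directory, "config.json")) as f:
    cfg = json.load(f)

# The flag sits alongside the other pipeline metadata added via update_dict.
print(cfg.get("use_level_zero"), cfg.get("n_splits_linear"), cfg.get("n_splits_down_proj"))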