[NPU] update fused layers for GW (#12459)
* update fused layers for GW
* fix
* fix llama condition for glm model
* update
parent 1b533a105c
commit 490bb0ca53

2 changed files with 42 additions and 18 deletions
@@ -136,6 +136,7 @@ class _BaseAutoModelClass:
         mock_device = kwargs.pop('device', None)  # For mock on CPU
         convert_model = kwargs.pop('convert_model', False)
         save_directory = kwargs.pop('save_directory', None)
+        fuse_layers = kwargs.pop('fuse_layers', None)
 
         invalidInputError(
             quantization_group_size in [0, 32, 64, 128],
@@ -204,6 +205,7 @@ class _BaseAutoModelClass:
                     "transpose_value_cache": transpose_value_cache,
                     "convert_model": convert_model,
                     "save_directory": save_directory,
+                    "fuse_layers": fuse_layers
                 }
                 model = cls.optimize_npu_model(*args, **optimize_kwargs)
             else:
@@ -243,6 +245,7 @@ class _BaseAutoModelClass:
         transpose_value_cache = kwargs.pop("transpose_value_cache", True)
         convert_model = kwargs.pop('convert_model', False)
         save_directory = kwargs.pop('save_directory', None)
+        fuse_layers = kwargs.pop('fuse_layers', None)
 
         if hasattr(model, "llm"):
             llm = model.llm
@@ -282,7 +285,8 @@ class _BaseAutoModelClass:
                         group_size=quantization_group_size,
                         qtype=qtype,
                         convert_model=convert_model,
-                        save_directory=save_directory)
+                        save_directory=save_directory,
+                        fuse_layers=fuse_layers)
         model.save_low_bit = types.MethodType(save_low_bit, model)
         return model
 
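The hunks above read the new fuse_layers option from the from_pretrained kwargs of _BaseAutoModelClass and forward it to optimize_npu_model; the hunks below thread the same option through convert_llm and convert_llm_for_deploy in the second changed file. As a usage illustration only (not part of the patch), a minimal sketch assuming the ipex-llm NPU AutoModelForCausalLM entry point; the model id, output path and quantization settings are illustrative assumptions:

# Sketch: overriding the fused-layer default when loading a model for NPU.
# Model id, save path and quantization settings are assumptions, not from the patch.
from ipex_llm.transformers.npu_model import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B-Instruct",  # hypothetical model id
    optimize_model=True,
    load_in_low_bit="sym_int4",
    quantization_group_size=64,             # group-wise ("GW") quantization
    save_directory="./llama3-8b-npu",       # hypothetical output path
    fuse_layers=4,                          # new: override the fused-layer default
)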
@@ -195,7 +195,8 @@ def convert_llm(model: torch.nn.Module,
                 group_size: int,
                 qtype: str,
                 convert_model: bool=False,
-                save_directory: str=None):
+                save_directory: str=None,
+                fuse_layers: int=None):
     # whether to set layernorm weight as const
     layernorm_const = os.environ.get("IPEX_LLM_NPU_LAYERNORM_CONST", "1") == "1"
     if group_size == 0:
@@ -216,7 +217,8 @@ def convert_llm(model: torch.nn.Module,
                                n_splits_linear,
                                n_splits_down_proj,
                                group_size,
-                               save_directory)
+                               save_directory,
+                               fuse_layers=fuse_layers)
         return 0
     if model.config.model_type == "llama":
         with tempfile.TemporaryDirectory() as temp_dir:
@@ -422,18 +424,22 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                            n_splits_linear: int,
                            n_splits_down_proj: int,
                            group_size: int,
-                           save_directory: str=None):
+                           save_directory: str=None,
+                           fuse_layers: int=None):
     os.mkdir(save_directory)
     weight_dir = os.path.join(save_directory, "model_weights")
     os.mkdir(weight_dir)
     layernorm_const = os.environ.get("IPEX_LLM_NPU_LAYERNORM_CONST", "1") == "1"
 
     if model.config.model_type == "qwen2":
-        if model.config.hidden_size == 1536:
-            # Qwen2-1.5B-Instruct
-            fused_layers = 1
+        if group_size == 0:
+            if model.config.hidden_size == 1536:
+                # Qwen2-1.5B-Instruct
+                fused_layers = 1 if fuse_layers is None else fuse_layers
+            else:
+                fused_layers = 2 if fuse_layers is None else fuse_layers
         else:
-            fused_layers = 2
+            fused_layers = len(model.model.layers) if fuse_layers is None else fuse_layers
         update_dict = {"kv_len": kv_len,
                        "num_head": model.model.layers[0].self_attn.num_heads,
                        "head_dim": model.model.layers[0].self_attn.head_dim,
@@ -469,20 +475,31 @@ def convert_llm_for_deploy(model: torch.nn.Module,
         embedding_post = False
         cos_sin_input = False
         use_prefill_sdp = False
-        if model.config.vocab_size == 32000:
-            # for Llama2-7B
-            fused_layers = 4
-            use_prefill_sdp = True
-        else:
-            if model.config.intermediate_size == 8192:
+        if group_size == 0:
+            if model.config.intermediate_size == 11008:
+                # for Llama2-7B
+                fused_layers = 4 if fuse_layers is None else fuse_layers
+                use_prefill_sdp = True
+            elif model.config.intermediate_size == 14336:
+                # for Llama3-8B
+                fused_layers = 2 if fuse_layers is None else fuse_layers
+                use_prefill_sdp = True
+            elif not hasattr(model.model.layers[0].self_attn.rotary_emb, "cos_cached"):
                 # llama3.2 1B & # llama3.2 3B
                 embedding_post = True
                 cos_sin_input = True
-                fused_layers = 2
+                fused_layers = 2 if fuse_layers is None else fuse_layers
             else:
-                # for Llama3-8B
-                fused_layers = 2
+                fused_layers = 2 if fuse_layers is None else fuse_layers
+        else:
+            if model.config.intermediate_size in [11008, 14336]:
+                # for Llama2-7B & Llama3-8B
                 use_prefill_sdp = True
+            elif not hasattr(model.model.layers[0].self_attn.rotary_emb, "cos_cached"):
+                # llama3.2 1B & # llama3.2 3B
+                embedding_post = True
+                cos_sin_input = True
+            fused_layers = len(model.model.layers) if fuse_layers is None else fuse_layers
         update_dict = {"kv_len": kv_len,
                        "num_head": model.model.layers[0].self_attn.num_heads,
                        "head_dim": model.model.layers[0].self_attn.head_dim,
@@ -518,7 +535,10 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                             save_directory, weight_dir, transpose_value_cache, max_prompt_len,
                             group_size, layernorm_const, "prefill")
     elif model.config.model_type == "minicpm":
-        fused_layers = 4
+        if group_size == 0:
+            fused_layers = 4 if fuse_layers is None else fuse_layers
+        else:
+            fused_layers = len(model.model.layers) if fuse_layers is None else fuse_layers
         update_dict = {"kv_len": kv_len,
                        "num_head": model.model.layers[0].self_attn.num_heads,
                        "head_dim": model.model.layers[0].self_attn.head_dim,
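Taken together, the changes above switch the fused-layer defaults on the quantization scheme: with channel-wise quantization (group_size == 0) the existing per-model defaults (1, 2 or 4 fused decoder layers) are kept, while with group-wise quantization all decoder layers are fused by default, and an explicit fuse_layers argument always takes precedence. A distilled sketch of that selection, using a hypothetical helper that does not exist in the patch (the patch inlines this logic per model type):

# Hypothetical helper; not part of the patch, shown only to summarize the new defaults.
def resolve_fused_layers(num_layers, model_default, group_size, fuse_layers=None):
    """Return how many decoder layers to fuse into one NPU graph."""
    if fuse_layers is not None:
        return fuse_layers        # explicit override always wins
    if group_size == 0:
        return model_default      # channel-wise: keep the per-model default
    return num_layers             # group-wise ("GW"): fuse all layers by default

# e.g. a 32-layer Llama2-7B:
#   resolve_fused_layers(32, 4, group_size=0)  -> 4
#   resolve_fused_layers(32, 4, group_size=64) -> 32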