update mlp of llama (#11897)

* update mlp of llama
* relax threshold of mlp test
* revert code

parent 420ce7d164
commit 4a61f7d20d

2 changed files with 11 additions and 1 deletion
@@ -305,6 +305,16 @@ def llama_mlp_forward(
         )
         hidden_states = attn_output.view(x.shape)
         return hidden_states
+    elif x.device.type == "xpu" and not self.training:
+        import xe_addons
+        gate = self.gate_proj(x)
+        up = self.up_proj(x)
+        xe_addons.mlp_silu_mul_inplaced(gate, up)
+        out = self.down_proj(gate)
+        if residual is not None:
+            return out + residual
+        else:
+            return out
     else:
         a = self.act_fn(self.gate_proj(x))
         b = self.up_proj(x)
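For context on what the new branch computes: `xe_addons.mlp_silu_mul_inplaced(gate, up)` overwrites `gate` with `silu(gate) * up`, so the XPU path is the same SwiGLU MLP as the eager `else` branch, just without the extra intermediate tensor. A minimal sketch of that equivalence in plain PyTorch (the helper names below are illustrative, not part of the diff):

import torch
import torch.nn.functional as F

def mlp_eager(x, gate_proj, up_proj, down_proj):
    # Eager SwiGLU path, as in the diff's else branch:
    # down_proj(act_fn(gate_proj(x)) * up_proj(x)) with act_fn = SiLU
    return down_proj(F.silu(gate_proj(x)) * up_proj(x))

def mlp_fused_reference(x, gate_proj, up_proj, down_proj):
    # Reference for the fused XPU path: xe_addons.mlp_silu_mul_inplaced(gate, up)
    # computes silu(gate) * up in place into gate before the down projection.
    gate = gate_proj(x)
    up = up_proj(x)
    gate = F.silu(gate) * up  # done as one in-place kernel on XPU
    return down_proj(gate)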
@@ -134,7 +134,7 @@ class Test_Optimize_Gpu_Model:
         # currently only compare the output of the last mlp layer.
         layer_before_MLP = "model.layers.31.post_attention_layernorm"
         MLP_layer = "model.layers.31.mlp"
-        lower_bound = 0
+        lower_bound = 1e-3
         self.run_optimize_gpu_model(Name, Model, Tokenizer, model_path, MLP_layer, layer_before_MLP, lower_bound)

     def Llama2_7B_gpu_model(self, Name, Model, Tokenizer, model_path):
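On the test change: relaxing `lower_bound` from 0 to 1e-3 gives the fused kernel room for small floating-point deviations from the eager path when the last MLP layer's output is compared. A hedged sketch of such a tolerance check, assuming `lower_bound` acts as an error bound (the helper below is hypothetical, not the repo's actual `run_optimize_gpu_model`):

import torch

def compare_last_mlp_output(ref_out, opt_out, lower_bound=1e-3):
    # Hypothetical comparison: with lower_bound = 0 the optimized model would
    # have to match the reference bit-for-bit; 1e-3 tolerates fusion rounding.
    max_err = (ref_out.float() - opt_out.float()).abs().max().item()
    assert max_err <= lower_bound, f"max abs error {max_err:.2e} exceeds {lower_bound}"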