update mlp of llama (#11897)

* update mlp of llama

* relax threshold of mlp test

* revert code
This commit is contained in:
Ruonan Wang 2024-08-22 05:34:53 -07:00 committed by GitHub
parent 420ce7d164
commit 4a61f7d20d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 11 additions and 1 deletions

View file

@@ -305,6 +305,16 @@ def llama_mlp_forward(
)
hidden_states = attn_output.view(x.shape)
return hidden_states
elif x.device.type == "xpu" and not self.training:
import xe_addons
gate = self.gate_proj(x)
up = self.up_proj(x)
xe_addons.mlp_silu_mul_inplaced(gate, up)
out = self.down_proj(gate)
if residual is not None:
return out + residual
else:
return out
else:
    a = self.act_fn(self.gate_proj(x))
    b = self.up_proj(x)

View file

@@ -134,7 +134,7 @@ class Test_Optimize_Gpu_Model:
# currently only compare the output of the last mlp layer.
layer_before_MLP = "model.layers.31.post_attention_layernorm"
MLP_layer = "model.layers.31.mlp"
lower_bound = 1e-3
self.run_optimize_gpu_model(Name, Model, Tokenizer, model_path, MLP_layer, layer_before_MLP, lower_bound)
def Llama2_7B_gpu_model(self, Name, Model, Tokenizer, model_path):