From 170e0072afe580b8b65c5f916c1e49174c4c02c3 Mon Sep 17 00:00:00 2001
From: Xin Qiu
Date: Wed, 15 Nov 2023 15:44:56 +0800
Subject: [PATCH] chatglm2 correctness test (#9450)

* chatglm2 ut

* some update

* chatglm2 path

* fix

* add print
---
 .../test/inference_gpu/test_optimize_model.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/python/llm/test/inference_gpu/test_optimize_model.py b/python/llm/test/inference_gpu/test_optimize_model.py
index a63b5c5a..581948de 100644
--- a/python/llm/test/inference_gpu/test_optimize_model.py
+++ b/python/llm/test/inference_gpu/test_optimize_model.py
@@ -127,9 +127,16 @@ class Test_Optimize_Gpu_Model:
                 else:
                     # 'past_key_value'is of type tuple as default.
                     for i, (t3, t4) in enumerate(zip(t1, t2)):
+                        if model.config.architectures[0] == "ChatGLMModel" and \
+                                hasattr(model.config, 'padded_vocab_size') and \
+                                model.config.padded_vocab_size == 65024:
+                            # chatglm2's past_key_value is expanded 16x for some speedup.
+                            # We need to narrow it here.
+                            t4 = t4[:, :, 15:17, :]
                         attn_output_diff.append(t3 - t4)
 
         max_diff_tensor = [torch.max(item).item() for item in attn_output_diff]
+        print(max_diff_tensor)
         assert all(max_diff <= lower_bound for max_diff in max_diff_tensor)
 
 
@@ -158,6 +165,18 @@ class Test_Optimize_Gpu_Model:
 
         self.run_optimize_gpu_model(Model, Tokenizer, model_path, self_attn, layer_norm, lower_bound)
 
+    def test_chatglm2_gpu_model(self):
+
+        Model = AutoModel
+        Tokenizer = AutoTokenizer
+        model_path = os.environ.get('CHATGLM2_6B_ORIGIN_PATH')
+        # currently only need to compare the output of one self-attention layer.
+        layer_norm = "transformer.encoder.layers.27.input_layernorm"
+        self_attn = "transformer.encoder.layers.27.self_attention"
+        lower_bound = 5e-5
+
+        self.run_optimize_gpu_model(Model, Tokenizer, model_path, self_attn, layer_norm, lower_bound)
+
 
 if __name__ == '__main__':
     pytest.main([__file__])