From 170e0072afe580b8b65c5f916c1e49174c4c02c3 Mon Sep 17 00:00:00 2001
From: Xin Qiu <qiuxin2012@users.noreply.github.com>
Date: Wed, 15 Nov 2023 15:44:56 +0800
Subject: [PATCH] chatglm2 correctness test (#9450)

* chatglm2 ut

* some update

* chatglm2 path

* fix

* add print
---
 .../test/inference_gpu/test_optimize_model.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/python/llm/test/inference_gpu/test_optimize_model.py b/python/llm/test/inference_gpu/test_optimize_model.py
index a63b5c5a..581948de 100644
--- a/python/llm/test/inference_gpu/test_optimize_model.py
+++ b/python/llm/test/inference_gpu/test_optimize_model.py
@@ -127,9 +127,16 @@ class Test_Optimize_Gpu_Model:
                 else:
                     # 'past_key_value'is of type tuple as default.
                     for i, (t3, t4) in enumerate(zip(t1, t2)):
+                        if model.config.architectures[0] == "ChatGLMModel" and \
+                                hasattr(model.config, 'padded_vocab_size') and \
+                                model.config.padded_vocab_size == 65024:
+                            # The optimized chatglm2 past_key_value is expanded 16x for speedup, so
+                            # slice t4 along dim 2 before subtracting it from t3 below.
+                            t4 = t4[:, :, 15:17, :]
                         attn_output_diff.append(t3 - t4)
 
         max_diff_tensor = [torch.max(item).item() for item in attn_output_diff]
+        print(max_diff_tensor)
         assert all(max_diff <= lower_bound for max_diff in max_diff_tensor)
 
 
@@ -158,6 +165,18 @@ class Test_Optimize_Gpu_Model:
 
         self.run_optimize_gpu_model(Model, Tokenizer, model_path, self_attn, layer_norm, lower_bound)
 
+    def test_chatglm2_gpu_model(self):
+        # Runs run_optimize_gpu_model on the model found at $CHATGLM2_6B_ORIGIN_PATH.
+        Model = AutoModel
+        Tokenizer = AutoTokenizer
+        model_path = os.environ.get('CHATGLM2_6B_ORIGIN_PATH')  # None when the variable is unset
+        # Only one layer (index 27) is compared: its input layernorm and self-attention modules.
+        layer_norm = "transformer.encoder.layers.27.input_layernorm"
+        self_attn = "transformer.encoder.layers.27.self_attention"
+        lower_bound = 5e-5  # NOTE(review): presumably the max allowed diff despite the name - confirm
+
+        self.run_optimize_gpu_model(Model, Tokenizer, model_path, self_attn, layer_norm, lower_bound)
+
 
 if __name__ == '__main__':
     pytest.main([__file__])