chatglm2 correctness test (#9450)
* chatglm2 ut
* some update
* chatglm2 path
* fix
* add print
This commit is contained in:
parent
24146d108f
commit
170e0072af
1 changed file with 19 additions and 0 deletions
@@ -127,9 +127,16 @@ class Test_Optimize_Gpu_Model:
         else:
             # 'past_key_value' is of type tuple as default.
             for i, (t3, t4) in enumerate(zip(t1, t2)):
+                if model.config.architectures[0] == "ChatGLMModel" and \
+                        hasattr(model.config, 'padded_vocab_size') and \
+                        model.config.padded_vocab_size == 65024:
+                    # chatglm2's past_key_value is expanded 16x for some speedup.
+                    # We need to narrow it here.
+                    t4 = t4[:, :, 15:17, :]
                 attn_output_diff.append(t3 - t4)
 
         max_diff_tensor = [torch.max(item).item() for item in attn_output_diff]
+        print(max_diff_tensor)
         assert all(max_diff <= lower_bound for max_diff in max_diff_tensor)
 
 
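For readers skimming the hunk above: the new branch narrows the optimized model's expanded past_key_value before diffing it against the reference output. A minimal sketch of that slicing, with dummy shapes that are assumptions rather than chatglm2's actual cache layout:

import torch

# Illustrative only: these shapes are assumptions, not taken from the test.
# chatglm2's optimized path keeps a past_key_value cache that is wider along
# dim 2 than the reference model's, so the optimized tensor t4 must be
# narrowed to the matching slots before it can be compared with t3.
t3 = torch.randn(32, 2, 2, 128)   # reference cache entry (assumed shape)
t4 = torch.randn(32, 2, 32, 128)  # expanded cache entry (assumed shape)

t4 = t4[:, :, 15:17, :]           # the same slice the test applies
assert t4.shape == t3.shape
max_diff = torch.max(t3 - t4).item()

The slice indices 15:17 come straight from the diff; only the surrounding shapes here are made up.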
@@ -158,6 +165,18 @@ class Test_Optimize_Gpu_Model:
 
         self.run_optimize_gpu_model(Model, Tokenizer, model_path, self_attn, layer_norm, lower_bound)
 
+    def test_chatglm2_gpu_model(self):
+
+        Model = AutoModel
+        Tokenizer = AutoTokenizer
+        model_path = os.environ.get('CHATGLM2_6B_ORIGIN_PATH')
+        # currently we only need to compare the output of one self-attention layer.
+        layer_norm = "transformer.encoder.layers.27.input_layernorm"
+        self_attn = "transformer.encoder.layers.27.self_attention"
+        lower_bound = 5e-5
+
+        self.run_optimize_gpu_model(Model, Tokenizer, model_path, self_attn, layer_norm, lower_bound)
+
 
 if __name__ == '__main__':
     pytest.main([__file__])
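One usage note: model_path is read from os.environ.get('CHATGLM2_6B_ORIGIN_PATH'), so that variable must point at a local chatglm2-6b checkpoint before the test can load anything. Since layer_norm and self_attn are dotted module paths inside the model, here is a hedged sketch of how a test like this could capture one named layer's output for comparison (capture_layer_output is an illustrative helper, an assumption rather than the repo's actual run_optimize_gpu_model):

import torch

# Assumed helper: grab one named submodule's output via a forward hook so
# the original and optimized models can be diffed layer by layer.
def capture_layer_output(model, layer_name, inputs):
    captured = {}
    layer = model.get_submodule(layer_name)
    handle = layer.register_forward_hook(
        lambda module, args, output: captured.setdefault('out', output))
    with torch.no_grad():
        model(**inputs)
    handle.remove()
    return captured['out']

Running this capture once on the original model and once on the optimized one, then subtracting the two results, reproduces the attn_output_diff comparison shown in the first hunk.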