chatglm2 correctness test (#9450)
* chatglm2 ut
* some update
* chatglm2 path
* fix
* add print
This commit is contained in:
parent 24146d108f
commit 170e0072af

1 changed file with 19 additions and 0 deletions
@@ -127,9 +127,16 @@ class Test_Optimize_Gpu_Model:
                 else:
                     # 'past_key_value'is of type tuple as default.
                     for i, (t3, t4) in enumerate(zip(t1, t2)):
+                        if model.config.architectures[0] == "ChatGLMModel" and \
+                                hasattr(model.config, 'padded_vocab_size') and \
+                                model.config.padded_vocab_size == 65024:
+                            # chatglm2's past_key_value is expanded 16x for some speedup.
+                            # We need to narrow it here.
+                            t4 = t4[:, :, 15:17, :]
                         attn_output_diff.append(t3 - t4)

         max_diff_tensor = [torch.max(item).item() for item in attn_output_diff]
+        print(max_diff_tensor)
         assert all(max_diff <= lower_bound for max_diff in max_diff_tensor)
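As a reading aid for the narrowing step above, here is a minimal sketch of the t4[:, :, 15:17, :] slice; the tensor shapes are assumptions chosen for illustration and are not taken from the test.

import torch

# Illustrative shapes only (assumed, not taken from the test): the reference
# past_key_value entry t3 covers two positions, while the optimized ChatGLM2
# entry t4 is assumed to be pre-expanded 16x along dimension 2.
t3 = torch.randn(2, 1, 2, 128)
t4 = torch.randn(2, 1, 32, 128)

# Narrow the expanded cache to the window the test compares against the reference.
t4 = t4[:, :, 15:17, :]
assert t4.shape == t3.shape

# Same comparison style as the test: max of the element-wise difference.
max_diff = torch.max(t3 - t4).item()
print(max_diff)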
@@ -158,6 +165,18 @@ class Test_Optimize_Gpu_Model:
         self.run_optimize_gpu_model(Model, Tokenizer, model_path, self_attn, layer_norm, lower_bound)

+    def test_chatglm2_gpu_model(self):
+        Model = AutoModel
+        Tokenizer = AutoTokenizer
+        model_path = os.environ.get('CHATGLM2_6B_ORIGIN_PATH')
+        # currently only need to compare the output of one self-attention layer.
+        layer_norm = "transformer.encoder.layers.27.input_layernorm"
+        self_attn = "transformer.encoder.layers.27.self_attention"
+        lower_bound = 5e-5
+
+        self.run_optimize_gpu_model(Model, Tokenizer, model_path, self_attn, layer_norm, lower_bound)
+
+
 if __name__ == '__main__':
     pytest.main([__file__])
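To run only the new case locally, something along these lines should work, assuming a local ChatGLM2-6B checkpoint; both paths below are placeholders, not values from this repository.

import os
import pytest

# Point the test at a local ChatGLM2-6B checkpoint first (placeholder path, adjust as needed).
os.environ.setdefault('CHATGLM2_6B_ORIGIN_PATH', '/path/to/chatglm2-6b')

# Run only the new chatglm2 case from this test module (placeholder filename);
# -s keeps the printed max_diff_tensor visible in the output.
pytest.main(['path/to/this_test_file.py', '-k', 'chatglm2_gpu_model', '-s'])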