[LLM] Add model correctness test on ARC for llama and falcon (#9347)

* add correctness test on arc for llama model

* modify layer name

* add falcon ut

* refactor and add ut for falcon model

* modify lambda positions and update docs

* replace loading pre input with last decodelayer output

* switch lower bound to single model instead of using the common one

* make the code implementation simple

* fix gpu action allocation memory issue
SONG Ge 2023-11-10 13:48:57 +08:00 committed by GitHub
parent 3d107f6d25
commit dfb00e37e9
2 changed files with 106 additions and 4 deletions
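
For context on the approach described in the commit message: the new test registers PyTorch forward hooks on named submodules (the last decoder layer's input layernorm and self-attention) to capture intermediate outputs from the unoptimized and the optimized model, then compares them against a per-model bound. Below is a minimal, self-contained sketch of the hook mechanism only; the toy module and the layer name "0" are illustrative placeholders, not part of the commit.

import torch
import torch.nn as nn

# Toy stand-in for a decoder layer; the real test hooks names such as
# "model.layers.31.self_attn" (Llama) or "transformer.h.31.self_attention" (Falcon).
toy_model = nn.Sequential(nn.Linear(4, 4), nn.ReLU())

captured = []

def forward_hook(module, inputs, output):
    # Runs after the hooked module's forward(); keep the output for later comparison.
    captured.append(output)

for name, module in toy_model.named_modules():
    if name == "0":  # name of the first Linear layer in this toy nn.Sequential
        module.register_forward_hook(forward_hook)

_ = toy_model(torch.randn(1, 4))
assert len(captured) == 1  # one forward pass produced one captured output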


@@ -14,11 +14,13 @@
 # limitations under the License.
 #
-import pytest
 import os
-from bigdl.llm.transformers import AutoModelForCausalLM, AutoModel
+import pytest
+import torch
 from transformers import LlamaTokenizer, AutoTokenizer
+from bigdl.llm.transformers import AutoModelForCausalLM, AutoModel
 device = os.environ['DEVICE']
 print(f'Running on {device}')
@@ -29,7 +31,7 @@ prompt = "Once upon a time, there existed a little girl who liked to have advent
 @pytest.mark.parametrize('Model, Tokenizer, model_path',[
     (AutoModelForCausalLM, AutoTokenizer, os.environ.get('MPT_7B_ORIGIN_PATH')),
-    (AutoModelForCausalLM, AutoTokenizer, os.environ.get('FALCON_7B_ORIGIN_PATH')),
+    (AutoModelForCausalLM, AutoTokenizer, os.environ.get('FALCON_7B_ORIGIN_PATH'))
     ])
 def test_optimize_model(Model, Tokenizer, model_path):
     tokenizer = Tokenizer.from_pretrained(model_path, trust_remote_code=True)
@@ -55,6 +57,107 @@ def test_optimize_model(Model, Tokenizer, model_path):
     assert any(diff) is False

+
+class Test_Optimize_Gpu_Model:
+    def setup(self):
+        self.layer_outputs = []
+        self.pre_layer_outputs = []
+
+    def run_optimize_gpu_model(self, Model, Tokenizer, model_path, self_attn, layer_norm, lower_bound):
+        def forward_hook(module, input, output, layer_name):
+            self.layer_outputs.append(output)
+
+        def pre_forward_hook(module, input, output, layer_name):
+            self.pre_layer_outputs.append(output)
+
+        tokenizer = Tokenizer.from_pretrained(model_path, trust_remote_code=True)
+        input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
+
+        model = Model.from_pretrained(model_path,
+                                      load_in_4bit=True,
+                                      optimize_model=False,
+                                      trust_remote_code=True)
+        model = model.to(device)
+        for layer_name, layer_module in model.named_modules():
+            if layer_name == layer_norm:
+                layer_module.register_forward_hook(
+                    lambda module, input, output, layer_name=layer_name: pre_forward_hook(module, input,
+                                                                                          output, layer_name))
+            if layer_name == self_attn:
+                layer_module.register_forward_hook(
+                    lambda module, input, output, layer_name=layer_name: forward_hook(module, input,
+                                                                                      output, layer_name))
+        logits_base_model = (model(input_ids)).logits
+        # the list `layer_outputs` has only one element.
+        layer_tensor = self.layer_outputs.pop()
+        model.to('cpu')
+
+        opt_model = Model.from_pretrained(model_path,
+                                          load_in_4bit=True,
+                                          optimize_model=True,
+                                          trust_remote_code=True)
+        opt_model = opt_model.to(device)
+
+        def replace_forward_hook(module, input, output, layer_name):
+            output = self.pre_layer_outputs[0]
+            return output
+
+        for layer_name, layer_module in opt_model.named_modules():
+            if layer_name == layer_norm:
+                layer_module.register_forward_hook(
+                    lambda module, input, output, layer_name=layer_name: replace_forward_hook(module, input,
+                                                                                              output, layer_name))
+            if layer_name == self_attn:
+                layer_module.register_forward_hook(
+                    lambda module, input, output, layer_name=layer_name: forward_hook(module, input,
+                                                                                      output, layer_name))
+        logits_optimized_model = (opt_model(input_ids)).logits
+        # the list `layer_outputs` has only one element.
+        opt_layer_tensor = self.layer_outputs[0]
+        opt_model.to('cpu')
+
+        attn_output_diff = []
+        for i, (t1, t2) in enumerate(zip(layer_tensor, opt_layer_tensor)):
+            if t1 is not None and t2 is not None:
+                if isinstance(t1, torch.Tensor) and isinstance(t2, torch.Tensor):
+                    # 'attn_output' is of type torch.Tensor.
+                    attn_output_diff.append(t1 - t2)
+                else:
+                    # 'past_key_value' is a tuple by default.
+                    for i, (t3, t4) in enumerate(zip(t1, t2)):
+                        attn_output_diff.append(t3 - t4)
+
+        max_diff_tensor = [torch.max(item).item() for item in attn_output_diff]
+        assert all(max_diff <= lower_bound for max_diff in max_diff_tensor)
+
+    def test_falcon_gpu_model(self):
+        Model = AutoModelForCausalLM
+        Tokenizer = AutoTokenizer
+        model_path = os.environ.get('FALCON_7B_ORIGIN_PATH')
+        # currently only compare the output of the last self-attention layer.
+        layer_norm = "transformer.h.31.input_layernorm"
+        self_attn = "transformer.h.31.self_attention"
+        lower_bound = 0
+        self.run_optimize_gpu_model(Model, Tokenizer, model_path, self_attn, layer_norm, lower_bound)
+
+    def test_llama_gpu_model(self):
+        Model = AutoModelForCausalLM
+        Tokenizer = AutoTokenizer
+        model_path = os.environ.get('LLAMA2_7B_ORIGIN_PATH')
+        # currently only compare the output of the last self-attention layer.
+        layer_norm = "model.layers.31.input_layernorm"
+        self_attn = "model.layers.31.self_attn"
+        lower_bound = 5e-2
+        self.run_optimize_gpu_model(Model, Tokenizer, model_path, self_attn, layer_norm, lower_bound)
+
+
 if __name__ == '__main__':
     pytest.main([__file__])
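
The correctness criterion in the test above reduces to an element-wise tolerance check: the captured attention outputs (and, when present, the past_key_value tuples) from the unoptimized and optimized models are subtracted, and the maximum of each difference tensor must not exceed the per-model bound (0 for Falcon, 5e-2 for Llama). A minimal sketch of that check with made-up tensors; the shapes and values here are illustrative only.

import torch

lower_bound = 5e-2                      # Llama tolerance used above; Falcon uses 0
base_output = torch.randn(1, 32, 4096)  # stands in for the unoptimized model's attn_output
opt_output = base_output + 1e-3         # stands in for the optimized model's attn_output

diff = base_output - opt_output
# Mirrors the test: the signed maximum of the difference tensor is compared to the bound.
max_diff = torch.max(diff).item()
assert max_diff <= lower_bound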


@@ -5,7 +5,6 @@ export LLM_HOME=${ANALYTICS_ZOO_ROOT}/python/llm/src
 export LLM_INFERENCE_TEST_DIR=${ANALYTICS_ZOO_ROOT}/python/llm/test/inference_gpu
 export USE_XETLA=OFF
-export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
 export DEVICE='xpu'

 set -e