diff --git a/lm_eval/models/__init__.py b/lm_eval/models/__init__.py
index 8ca27fac..6b581487 100644
--- a/lm_eval/models/__init__.py
+++ b/lm_eval/models/__init__.py
@@ -4,6 +4,7 @@ from . import anthropic_llms
 from . import huggingface
 from . import textsynth
 from . import dummy
+from . import bigdl_llm
 
 MODEL_REGISTRY = {
     "hf": gpt2.HFLM,
@@ -15,6 +16,7 @@ MODEL_REGISTRY = {
     "anthropic": anthropic_llms.AnthropicLM,
     "textsynth": textsynth.TextSynthLM,
     "dummy": dummy.DummyLM,
+    "bigdl-llm": bigdl_llm.BigDLLM,
 }
 
 
diff --git a/lm_eval/models/bigdl_llm.py b/lm_eval/models/bigdl_llm.py
new file mode 100644
index 00000000..74010da3
--- /dev/null
+++ b/lm_eval/models/bigdl_llm.py
@@ -0,0 +1,134 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+
+# This code is adapted from the llama2 example test, with a performance test added.
+from typing import Optional, Union
+
+import torch
+from transformers import AutoTokenizer
+
+from bigdl.llm.transformers import AutoModelForCausalLM
+
+from lm_eval.base import BaseLM
+
+
+def _get_dtype(
+    dtype: Union[str, torch.dtype]
+) -> torch.dtype:
+    """Converts `dtype` from `str` to torch.dtype when possible. Does not use an instantiated HF AutoConfig."""
+    if isinstance(dtype, str) and dtype != "auto":
+        # Convert `str` args to a torch dtype: `float16` -> `torch.float16`
+        _torch_dtype = getattr(torch, dtype)
+    else:
+        _torch_dtype = dtype
+    return _torch_dtype
+
+
+class BigDLLM(BaseLM):
+    def __init__(
+        self,
+        device="xpu",
+        pretrained="gpt2",
+        revision="main",
+        low_cpu_mem_usage=None,
+        subfolder=None,
+        tokenizer=None,
+        batch_size=1,
+        load_in_8bit: Optional[bool] = False,
+        trust_remote_code: Optional[bool] = False,
+        load_in_low_bit=None,
+        dtype: Optional[Union[str, torch.dtype]] = "auto",
+    ):
+        super().__init__()
+
+        assert isinstance(pretrained, str)
+        assert isinstance(batch_size, (int, str))
+
+        if "xpu" in device:
+            # Importing IPEX registers the XPU backend with PyTorch.
+            import intel_extension_for_pytorch as ipex  # noqa: F401
+
+        model = AutoModelForCausalLM.from_pretrained(
+            pretrained,
+            load_in_low_bit=load_in_low_bit,
+            optimize_model=True,
+            trust_remote_code=True,
+            use_cache=True,
+            torch_dtype=_get_dtype(dtype),
+        )
+        print(model)  # print the model to check the precision of each layer
+        self._device = device
+        self.model = model.to(device)
+
+        self.tokenizer = AutoTokenizer.from_pretrained(pretrained, trust_remote_code=True)
+
+        # setup for automatic batch size detection
+        if batch_size == "auto":
+            self.batch_size_per_gpu = batch_size
+        else:
+            self.batch_size_per_gpu = int(batch_size)
+
+    @property
+    def eot_token_id(self):
+        # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
+        return self.tokenizer.eos_token_id
+
+    @property
+    def max_length(self):
+        return 2048  # TODO: how to get this from config
+
+    @property
+    def max_gen_toks(self):
+        return 256
+
+    @property
+    def batch_size(self):
+        # TODO: fix multi-gpu
+        return self.batch_size_per_gpu  # * gpus
+
+    @property
+    def device(self):
+        # TODO: fix multi-gpu
+        return torch.device(self._device)
+
+    def tok_encode(self, string: str):
+        return self.tokenizer.encode(string)
+
+    def tok_decode(self, tokens):
+        return self.tokenizer.decode(tokens, skip_special_tokens=True)
+
+    def _model_call(self, inps):
+        """
+        inps: a torch tensor of shape [batch, sequence]
+        the size of sequence may vary from call to call
+
+        returns: a torch tensor of shape [batch, sequence, vocab] with the
+        logits returned from the model
+        """
+        with torch.inference_mode():
+            inps = inps.to(self.device)
+            res = self.model(inps)[0]
+            return res
+
+    def _model_generate(self, context, max_length, eos_token_id):
+        with torch.inference_mode():
+            return self.model.generate(
+                context,
+                max_length=max_length,
+                eos_token_id=eos_token_id,
+                do_sample=False,
+            )
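A minimal sketch of how the new backend could be exercised end to end through the harness's Python entry point; the model id, low-bit format, and task below are illustrative placeholders and are not part of this patch.

# Usage sketch (not part of the patch). Assumes lm_eval's
# evaluator.simple_evaluate entry point; pretrained model id,
# load_in_low_bit value, and task name are placeholders.
from lm_eval import evaluator

results = evaluator.simple_evaluate(
    model="bigdl-llm",
    model_args="pretrained=meta-llama/Llama-2-7b-chat-hf,device=xpu,load_in_low_bit=sym_int4",
    tasks=["hellaswag"],
)
print(results["results"])

The keyword arguments in model_args are forwarded to BigDLLM.__init__ by the registry lookup, which is why the new "bigdl-llm" entry in MODEL_REGISTRY is all that is needed to plug the backend into the existing CLI and evaluator flow.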