gsm8k OOM workaround (#9597)
* update bigdl_llm.py
* update the installation of harness
* fix partial function
* import ipex
* force seq len in decrease order
* put func outside class
* move comments
* default 'trust_remote_code' as True
* Update llm-harness-evaluation.yml
parent 1ff4bc43a6
commit 972cdb9992

2 changed files with 35 additions and 111 deletions
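The core of the fix is the "force seq len in decrease order" step: lm-evaluation-harness sorts grouped requests with a (length, tokens) collate key before batching, and this commit wraps lm_eval.utils.Reorderer so that key is negated. Requests then arrive longest-first, the first batch allocates the largest buffers, and later batches reuse memory that has already been grown instead of triggering the repeated reallocation that was driving gsm8k out of memory. A standalone toy sketch of the sorting trick (illustrative code, not part of the diff below):

    # Toy stand-in for the harness collate function: the sort key is
    # (length, tokens); negating the length makes an ascending sort
    # yield the longest requests first.
    def collate(req):
        return len(req), req

    def decreasing(fn):
        def _collate(req):
            length, tokens = fn(req)
            return -abs(length), tokens
        return _collate

    requests = ["bb", "aaaa", "c"]
    print(sorted(requests, key=decreasing(collate)))  # -> ['aaaa', 'bb', 'c']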
.github/workflows/llm-harness-evaluation.yml (8 changes)
@@ -10,7 +10,6 @@ on:
   schedule:
     - cron: "00 13 * * 5" # GMT time, 13:00 GMT == 21:00 China
   pull_request:
-    types: ready_for_review
     branches: [main]
     paths:
       - ".github/workflows/llm-harness-evaluation.yml"
@@ -139,10 +138,7 @@ jobs:
         working-directory: ${{ github.workspace }}/python/llm/dev/benchmark/harness/
         shell: bash
         run: |
-          git clone https://github.com/EleutherAI/lm-evaluation-harness.git
-          cd lm-evaluation-harness
-          git checkout e81d3cc
-          pip install -e .
+          pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@e81d3cc

       - name: Download models and datasets
         shell: bash
@@ -226,4 +222,4 @@ jobs:
         shell: bash
         run: |
           ls results
-          python ${{ github.workspace }}/python/llm/dev/benchmark/harness/make_table_results.py results
+          python ${{ github.workspace }}/python/llm/dev/benchmark/harness/make_table_results.py results

python/llm/dev/benchmark/harness/bigdl_llm.py (138 changes)
@@ -13,116 +13,44 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
 import os
 import multiprocessing

-from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
+from bigdl.llm.transformers import AutoModelForCausalLM

-import torch
-from typing import Optional, Union
-from lm_eval.base import BaseLM
+import inspect
+from lm_eval.models.huggingface import AutoCausalLM
+from lm_eval import utils
+from functools import partial

-from transformers import AutoTokenizer, LlamaTokenizer

-def _get_dtype(
-    dtype: Union[str, torch.dtype]
-) -> torch.dtype:
-    """Converts `dtype` from `str` to torch.dtype when possible. Does not use an instantiated HF AutoConfig"""
-    if isinstance(dtype, str) and dtype != "auto":
-        # Convert `str` args torch dtype: `float16` -> `torch.float16`
-        _torch_dtype = getattr(torch, dtype)
-    else:
-        _torch_dtype = dtype
-    return _torch_dtype
+# wrap and force the Reorderer to be in a decrease order
+# This is a workaround to avoid frequent memory allocation which may cause OOM
+def force_decrease_order(Reorderer):
+    def DecreaseReorderer(arr, fn):
+        def _collate(x):
+            len, tokens = fn(x)
+            len = - abs(len)
+            return len, tokens
+        return Reorderer(arr, _collate)
+    return DecreaseReorderer
+utils.Reorderer = force_decrease_order(utils.Reorderer)

-class BigDLLM(BaseLM):
-    def __init__(
-        self,
-        device="xpu",
-        pretrained="gpt2",
-        revision="main",
-        low_cpu_mem_usage=None,
-        subfolder=None,
-        tokenizer=None,
-        batch_size=1,
-        load_in_8bit: Optional[bool] = False,
-        trust_remote_code: Optional[bool] = True,
-        load_in_low_bit=None,
-        dtype: Optional[Union[str, torch.dtype]] = "auto",
-        **kwargs
-    ):
-        super().__init__()
-
-        assert isinstance(pretrained, str)
-        assert isinstance(batch_size, (int, str))
-        if 'xpu' in device:
-            import intel_extension_for_pytorch as ipex
-        model = AutoModelForCausalLM.from_pretrained(pretrained,
-                                                     load_in_low_bit=load_in_low_bit,
-                                                     optimize_model=kwargs.get('optimize_model', True),
-                                                     trust_remote_code=trust_remote_code,
-                                                     use_cache=True,
-                                                     torch_dtype=_get_dtype(dtype))
-        print(model)  # print model to check precision
-        self._device = device
-        self.model = model.to(device)
-
-        self.tokenizer = AutoTokenizer.from_pretrained(pretrained, trust_remote_code=True)
-
-        # setup for automatic batch size detection
-        if batch_size == 'auto':
-            self.batch_size_per_gpu = batch_size
-        else:
-            self.batch_size_per_gpu = int(batch_size)
+class BigDLLM(AutoCausalLM):
+    AUTO_MODEL_CLASS = AutoModelForCausalLM
+    AutoCausalLM_ARGS = inspect.getfullargspec(AutoCausalLM.__init__).args
+    def __init__(self, *args, **kwargs):
+        if 'device' in kwargs and 'xpu' in kwargs['device']:
+            import intel_extension_for_pytorch
+        self.bigdl_llm_kwargs = {}
+        keys = list(kwargs.keys())
+        for k in keys:
+            if k not in self.AutoCausalLM_ARGS:
+                self.bigdl_llm_kwargs[k] = kwargs[k]
+                kwargs.pop(k)
+        AutoModelForCausalLM.from_pretrained = partial(AutoModelForCausalLM.from_pretrained, **self.bigdl_llm_kwargs)
+        kwargs['trust_remote_code'] = kwargs.get('trust_remote_code', True)
+        super().__init__(*args, **kwargs)

-    @property
-    def eot_token_id(self):
-        # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
-        return self.tokenizer.eos_token_id
-
-    @property
-    def max_length(self):
-        return 2048  # TODO: how to get this from config
-
-    @property
-    def max_gen_toks(self):
-        return 256
-
-    @property
-    def batch_size(self):
-        # TODO: fix multi-gpu
-        return self.batch_size_per_gpu  # * gpus
-
-    @property
-    def device(self):
-        # TODO: fix multi-gpu
-        return torch.device(self._device)
-
-    def tok_encode(self, string: str):
-        input_ids = self.tokenizer.encode(string)
-        return input_ids
-
-    def tok_decode(self, tokens):
-        return self.tokenizer.decode(tokens, skip_special_tokens=True)
-
-    def _model_call(self, inps):
-        """
-        inps: a torch tensor of shape [batch, sequence]
-        the size of sequence may vary from call to call
-
-        returns: a torch tensor of shape [batch, sequence, vocab] with the
-        logits returned from the model
-        """
-        with torch.inference_mode():
-            inps = inps.to(self.device)
-            res = self.model(inps)[0]
-            return res
-
-    def _model_generate(self, context, max_length, eos_token_id):
-        generation_kwargs = {"do_sample": False, "max_length": max_length}
-        if eos_token_id is not None:
-            generation_kwargs["eos_token_id"] = eos_token_id
-            generation_kwargs["pad_token_id"] = eos_token_id  # setting eos_token_id as pad token
-        return self.model.generate(context, **generation_kwargs)
+    def add_special_tokens(self) -> bool:
+        return False
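The new BigDLLM wrapper above leans on two generic patterns: inspect.getfullargspec splits incoming kwargs into the ones AutoCausalLM.__init__ accepts and the bigdl-llm extras, and functools.partial pre-binds those extras onto AutoModelForCausalLM.from_pretrained so the parent class passes them through when it loads the model. A self-contained sketch of the same routing idea (Base, split_kwargs, and the loader lambda are illustrative names, not from the repo):

    import inspect
    from functools import partial

    class Base:
        def __init__(self, pretrained="gpt2", batch_size=1):
            print("Base gets:", pretrained, batch_size)

    BASE_ARGS = inspect.getfullargspec(Base.__init__).args

    def split_kwargs(kwargs):
        # Anything Base.__init__ does not accept is routed into a side dict.
        extra = {k: kwargs.pop(k) for k in list(kwargs) if k not in BASE_ARGS}
        return kwargs, extra

    base_kwargs, extra = split_kwargs({"pretrained": "x", "load_in_low_bit": "sym_int4"})
    loader = partial(lambda name, **kw: print("loader gets:", name, kw), **extra)
    loader("x")          # loader gets: x {'load_in_low_bit': 'sym_int4'}
    Base(**base_kwargs)  # Base gets: x 1

Note that the commit rebinds from_pretrained on the class itself, so the extras leak into every later call; that is fine for a one-shot benchmark process but would be surprising in long-lived library code.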
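For completeness, a hedged sketch of how the wrapper is consumed: the harness builds the model from parsed --model_args, so any option AutoCausalLM does not recognize travels through the routing above into from_pretrained (the model id and low-bit format below are placeholders, not values from the commit):

    from bigdl_llm import BigDLLM  # the module this diff modifies

    lm = BigDLLM(
        pretrained="meta-llama/Llama-2-7b-chat-hf",  # placeholder model id
        device="xpu",                 # triggers the intel_extension_for_pytorch import
        load_in_low_bit="sym_int4",   # not an AutoCausalLM arg -> forwarded to from_pretrained
        batch_size=1,
    )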