gsm8k OOM workaround (#9597)

* update bigdl_llm.py

* update the installation of harness

* fix partial function

* import ipex

* force seq len in decreasing order (see the sketch below)

* put func outside class

* move comments

* default 'trust_remote_code' to True

* Update llm-harness-evaluation.yml
Chen, Zhentao 2023-12-08 18:47:25 +08:00 committed by GitHub
parent 1ff4bc43a6
commit 972cdb9992
2 changed files with 35 additions and 111 deletions
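
The reordering change is the heart of the OOM workaround: the patch wraps lm_eval.utils.Reorderer so that whatever length key a task's collate function returns is replaced by -abs(length), which sorts requests by decreasing length. As the in-code comment notes, this avoids the frequent re-allocation that can run out of memory on long gsm8k prompts, because the largest request is handled first and peak memory is reached up front. Below is a minimal, self-contained sketch of that effect; toy_reorder is only a stand-in for the real Reorderer, which is a class rather than a plain sort.

# Illustrative only: toy_reorder stands in for lm_eval.utils.Reorderer.
# The wrapper mirrors the patch: keep the secondary key, but force the
# length key to -abs(length) so sorting is always longest-first.

def force_decrease_order(reorder):
    def decrease_reorder(arr, fn):
        def _collate(x):
            length, tokens = fn(x)
            return -abs(length), tokens
        return reorder(arr, _collate)
    return decrease_reorder

def toy_reorder(arr, fn):
    # stand-in: just sort by the collate key
    return sorted(arr, key=fn)

requests = ["a" * 5, "a" * 50, "a" * 20]
collate = lambda x: (len(x), x)  # ascending by length, as a generation task might sort

print([len(r) for r in toy_reorder(requests, collate)])                         # [5, 20, 50]
print([len(r) for r in force_decrease_order(toy_reorder)(requests, collate)])   # [50, 20, 5]

The real patch applies the same wrapper to lm_eval.utils.Reorderer once at import time, so every task picks up the decreasing order without further changes.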

.github/workflows/llm-harness-evaluation.yml

@@ -10,7 +10,6 @@ on:
   schedule:
     - cron: "00 13 * * 5" # GMT time, 13:00 GMT == 21:00 China
   pull_request:
-    types: ready_for_review
     branches: [main]
     paths:
       - ".github/workflows/llm-harness-evaluation.yml"
@@ -139,10 +138,7 @@ jobs:
         working-directory: ${{ github.workspace }}/python/llm/dev/benchmark/harness/
         shell: bash
         run: |
-          git clone https://github.com/EleutherAI/lm-evaluation-harness.git
-          cd lm-evaluation-harness
-          git checkout e81d3cc
-          pip install -e .
+          pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@e81d3cc
 
       - name: Download models and datasets
         shell: bash
@@ -226,4 +222,4 @@ jobs:
         shell: bash
         run: |
           ls results
-          python ${{ github.workspace }}/python/llm/dev/benchmark/harness/make_table_results.py results
+          python ${{ github.workspace }}/python/llm/dev/benchmark/harness/make_table_results.py results

python/llm/dev/benchmark/harness/bigdl_llm.py

@@ -13,116 +13,44 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-import os
-import multiprocessing
-from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
-import torch
-from typing import Optional, Union
-from lm_eval.base import BaseLM
-from transformers import AutoTokenizer, LlamaTokenizer
+from bigdl.llm.transformers import AutoModelForCausalLM
+import inspect
+from lm_eval.models.huggingface import AutoCausalLM
+from lm_eval import utils
+from functools import partial
 
 
-def _get_dtype(
-    dtype: Union[str, torch.dtype]
-) -> torch.dtype:
-    """Converts `dtype` from `str` to torch.dtype when possible. Does not use an instantiated HF AutoConfig"""
-    if isinstance(dtype, str) and dtype != "auto":
-        # Convert `str` args torch dtype: `float16` -> `torch.float16`
-        _torch_dtype = getattr(torch, dtype)
-    else:
-        _torch_dtype = dtype
-    return _torch_dtype
-
-
-class BigDLLM(BaseLM):
-    def __init__(
-        self,
-        device="xpu",
-        pretrained="gpt2",
-        revision="main",
-        low_cpu_mem_usage=None,
-        subfolder=None,
-        tokenizer=None,
-        batch_size=1,
-        load_in_8bit: Optional[bool] = False,
-        trust_remote_code: Optional[bool] = True,
-        load_in_low_bit=None,
-        dtype: Optional[Union[str, torch.dtype]] = "auto",
-        **kwargs
-    ):
-        super().__init__()
-        assert isinstance(pretrained, str)
-        assert isinstance(batch_size, (int,str))
-        if 'xpu' in device:
-            import intel_extension_for_pytorch as ipex
-        model = AutoModelForCausalLM.from_pretrained(pretrained,
-                                                     load_in_low_bit=load_in_low_bit,
-                                                     optimize_model=kwargs.get('optimize_model', True),
-                                                     trust_remote_code=trust_remote_code,
-                                                     use_cache=True,
-                                                     torch_dtype=_get_dtype(dtype))
-        print(model) # print model to check precision
-        self._device = device
-        self.model = model.to(device)
-
-        self.tokenizer = AutoTokenizer.from_pretrained(pretrained, trust_remote_code=True)
-
-        # setup for automatic batch size detection
-        if batch_size == 'auto':
-            self.batch_size_per_gpu = batch_size
-        else:
-            self.batch_size_per_gpu = int(batch_size)
+# wrap and force the Reorderer to be in a decrease order
+# This is a workaround to avoid frequent memory allocation which may cause OOM
+def force_decrease_order(Reorderer):
+    def DecreaseReorderer(arr, fn):
+        def _collate(x):
+            len, tokens = fn(x)
+            len = - abs(len)
+            return len, tokens
+        return Reorderer(arr, _collate)
+    return DecreaseReorderer
+utils.Reorderer = force_decrease_order(utils.Reorderer)
+
+
+class BigDLLM(AutoCausalLM):
+    AUTO_MODEL_CLASS = AutoModelForCausalLM
+    AutoCausalLM_ARGS = inspect.getfullargspec(AutoCausalLM.__init__).args
+    def __init__(self, *args, **kwargs):
+        if 'device' in kwargs and 'xpu' in kwargs['device']:
+            import intel_extension_for_pytorch
+        self.bigdl_llm_kwargs = {}
+        keys = list(kwargs.keys())
+        for k in keys:
+            if k not in self.AutoCausalLM_ARGS:
+                self.bigdl_llm_kwargs[k] = kwargs[k]
+                kwargs.pop(k)
+        AutoModelForCausalLM.from_pretrained = partial(AutoModelForCausalLM.from_pretrained, **self.bigdl_llm_kwargs)
+        kwargs['trust_remote_code'] = kwargs.get('trust_remote_code', True)
+        super().__init__(*args, **kwargs)
 
     @property
-    def eot_token_id(self):
-        # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
-        return self.tokenizer.eos_token_id
+    def add_special_tokens(self) -> bool:
+        return False
-
-    @property
-    def max_length(self):
-        return 2048 # TODO: how to get this from config
-
-    @property
-    def max_gen_toks(self):
-        return 256
-
-    @property
-    def batch_size(self):
-        # TODO: fix multi-gpu
-        return self.batch_size_per_gpu # * gpus
-
-    @property
-    def device(self):
-        # TODO: fix multi-gpu
-        return torch.device(self._device)
-
-    def tok_encode(self, string: str):
-        input_ids = self.tokenizer.encode(string)
-        return input_ids
-
-    def tok_decode(self, tokens):
-        return self.tokenizer.decode(tokens, skip_special_tokens=True)
-
-    def _model_call(self, inps):
-        """
-        inps: a torch tensor of shape [batch, sequence]
-        the size of sequence may vary from call to call
-        returns: a torch tensor of shape [batch, sequence, vocab] with the
-        logits returned from the model
-        """
-        with torch.inference_mode():
-            inps = inps.to(self.device)
-            res = self.model(inps)[0]
-            return res
-
-    def _model_generate(self, context, max_length, eos_token_id):
-        generation_kwargs = {"do_sample": False, "max_length": max_length}
-        if eos_token_id is not None:
-            generation_kwargs["eos_token_id"] = eos_token_id
-            generation_kwargs[
-                "pad_token_id"
-            ] = eos_token_id # setting eos_token_id as pad token
-        return self.model.generate(context, **generation_kwargs)
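
A note on how the slimmed-down wrapper is intended to work: BigDLLM now inherits everything from lm-eval's AutoCausalLM, and its __init__ only splits out any keyword that the parent constructor does not accept (BigDL options such as load_in_low_bit from the old signature), binds those extras onto AutoModelForCausalLM.from_pretrained with functools.partial so the parent loads the model through BigDL, and defaults trust_remote_code to True. The sketch below reproduces that splitting pattern with stand-in classes; FakeAutoModel and FakeAutoCausalLM are hypothetical, not the real BigDL or lm-eval APIs.

import inspect
from functools import partial


class FakeAutoModel:
    """Stand-in for bigdl.llm.transformers.AutoModelForCausalLM (hypothetical)."""
    @classmethod
    def from_pretrained(cls, pretrained, **kwargs):
        print(f"from_pretrained({pretrained!r}, {kwargs})")
        return cls()


class FakeAutoCausalLM:
    """Stand-in for lm_eval.models.huggingface.AutoCausalLM (hypothetical)."""
    def __init__(self, pretrained, trust_remote_code=False, batch_size=1):
        # the real parent class builds its model through AUTO_MODEL_CLASS.from_pretrained
        self.model = FakeAutoModel.from_pretrained(
            pretrained, trust_remote_code=trust_remote_code)


class FakeBigDLLM(FakeAutoCausalLM):
    # keyword names the parent constructor understands
    PARENT_ARGS = inspect.getfullargspec(FakeAutoCausalLM.__init__).args

    def __init__(self, *args, **kwargs):
        # split off anything the parent does not accept (e.g. load_in_low_bit)
        extra = {k: kwargs.pop(k) for k in list(kwargs) if k not in self.PARENT_ARGS}
        # bind the extras onto from_pretrained so they reach the model loader
        FakeAutoModel.from_pretrained = partial(FakeAutoModel.from_pretrained, **extra)
        kwargs.setdefault("trust_remote_code", True)  # mirror the new default
        super().__init__(*args, **kwargs)


FakeBigDLLM("my-model", load_in_low_bit="sym_int4", batch_size=1)
# prints: from_pretrained('my-model', {'load_in_low_bit': 'sym_int4', 'trust_remote_code': True})

Rebinding from_pretrained with partial is a process-wide monkey-patch; that is fine for a one-shot evaluation run, but worth keeping in mind if the wrapper is imported alongside other BigDL code.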