From 8bd2992a8d1d8f7e010a7e363e8c560724d074d4 Mon Sep 17 00:00:00 2001
From: Ruonan Wang <105281011+rnwang04@users.noreply.github.com>
Date: Mon, 5 Jun 2023 15:36:00 +0800
Subject: [PATCH] LLM: accelerate sample of gptneox and update quantize (#8262)

* update quantize & accelerate sample

* fix style check

* fix style error
---
 .../llm/src/bigdl/llm/ggml/convert_model.py | 20 +++++++++-
 .../src/bigdl/llm/ggml/model/bloom/bloom.py |  4 +-
 .../bigdl/llm/ggml/model/bloom/bloom_cpp.py | 11 +++--
 .../bigdl/llm/ggml/model/gptneox/gptneox.py | 40 +++++++++++--------
 .../llm/ggml/model/gptneox/gptneox_cpp.py   | 18 +++++++++
 python/llm/src/bigdl/llm/ggml/quantize.py   |  6 +--
 6 files changed, 70 insertions(+), 29 deletions(-)

diff --git a/python/llm/src/bigdl/llm/ggml/convert_model.py b/python/llm/src/bigdl/llm/ggml/convert_model.py
index 8547e1b5..637fbfea 100644
--- a/python/llm/src/bigdl/llm/ggml/convert_model.py
+++ b/python/llm/src/bigdl/llm/ggml/convert_model.py
@@ -1,3 +1,19 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
 from bigdl.llm.ggml.convert import _convert_to_ggml
 from bigdl.llm.ggml.quantize import quantize
 from pathlib import Path
@@ -32,10 +48,10 @@ def convert_model(input_path: str,
                      outfile_dir=tmp_ggml_file_path,
                      model_family=model_family,
                      outtype="fp16")
-    
+
     tmp_ggml_file_path = next(Path(tmp_ggml_file_path).iterdir())
     quantize(input_path=tmp_ggml_file_path,
              output_path=output_path,
              model_family=model_family,
-             dtype=dtype)
\ No newline at end of file
+             dtype=dtype)
diff --git a/python/llm/src/bigdl/llm/ggml/model/bloom/bloom.py b/python/llm/src/bigdl/llm/ggml/model/bloom/bloom.py
index 52c883ee..ba903c1e 100644
--- a/python/llm/src/bigdl/llm/ggml/model/bloom/bloom.py
+++ b/python/llm/src/bigdl/llm/ggml/model/bloom/bloom.py
@@ -46,6 +46,7 @@
 # only search the first bigdl package and end up finding only one sub-package.
 
 from .bloom_cpp import bloom_load, bloom_free, bloom_run
+from bigdl.llm.utils.common import invalidInputError
 
 
 class Bloom:
@@ -81,8 +82,7 @@ class Bloom:
             A Bloom instance.
         """
         self.ctx = bloom_load(bytes(model_path, encoding='utf-8'), n_ctx, n_threads)
-        if not self.ctx:
-            raise RuntimeError(f"Failed to load model from {model_path}")
+        invalidInputError(self.ctx is not None, f"Failed to load model from {model_path}")
         self.n_ctx = n_ctx
         self.seed = seed
         self.logits_all = logits_all
diff --git a/python/llm/src/bigdl/llm/ggml/model/bloom/bloom_cpp.py b/python/llm/src/bigdl/llm/ggml/model/bloom/bloom_cpp.py
index f04a8529..9286ea8d 100644
--- a/python/llm/src/bigdl/llm/ggml/model/bloom/bloom_cpp.py
+++ b/python/llm/src/bigdl/llm/ggml/model/bloom/bloom_cpp.py
@@ -62,6 +62,8 @@ from ctypes import (
 )
 import pathlib
 from bigdl.llm.utils import get_avx_flags
+from bigdl.llm.utils.common import invalidInputError
+
 
 # Load the library
 def _load_shared_library(lib_base_name: str):
@@ -71,8 +73,8 @@ def _load_shared_library(lib_base_name: str):
     elif sys.platform == "win32":
         lib_ext = ".dll"
     else:
-        raise RuntimeError("Unsupported platform")
-    
+        invalidInputError(False, "Unsupported platform")
+
     avx = get_avx_flags()
 
     # Construct the paths to the possible shared library names (python/llm/src/bigdl/llm/libs)
@@ -101,9 +103,10 @@ def _load_shared_library(lib_base_name: str):
         try:
             return ctypes.CDLL(str(_lib_path))
         except Exception as e:
-            raise RuntimeError(f"Failed to load shared library '{_lib_path}': {e}")
+            invalidInputError(False,
+                              f"Failed to load shared library '{_lib_path}': {e}")
 
-    raise FileNotFoundError(f"Shared library with base name '{lib_base_name}' not found")
+    invalidInputError(False, f"Shared library with base name '{lib_base_name}' not found")
 
 
 # Specify the base name of the shared library to load
diff --git a/python/llm/src/bigdl/llm/ggml/model/gptneox/gptneox.py b/python/llm/src/bigdl/llm/ggml/model/gptneox/gptneox.py
index c4e7b0e9..39db380d 100644
--- a/python/llm/src/bigdl/llm/ggml/model/gptneox/gptneox.py
+++ b/python/llm/src/bigdl/llm/ggml/model/gptneox/gptneox.py
@@ -51,6 +51,7 @@ import uuid
 import time
 import math
 import multiprocessing
+import ctypes
 from typing import List, Optional, Union, Generator, Sequence, Iterator, Deque, Tuple
 from collections import deque, OrderedDict
 from bigdl.llm.utils.common import invalidInputError
@@ -342,22 +343,29 @@ class Gptneox:
                          "The attribute `eval_logits` of `Gptneox` object is None.")
         n_vocab = int(gptneox_cpp.gptneox_n_vocab(self.ctx))
         logits = self.eval_logits[-1]
-        data = (gptneox_cpp.gptneox_token_data * n_vocab)(
-            *[
-                gptneox_cpp.gptneox_token_data(
-                    id=gptneox_cpp.gptneox_token(i),
-                    logit=logits[i],
-                    p=gptneox_cpp.c_float(0.0),
-                )
-                for i in range(n_vocab)
-            ]
-        )
-        size = gptneox_cpp.c_size_t(n_vocab)
-        sorted = False
-        candidates = gptneox_cpp.gptneox_token_data_array(
-            data=data,
-            size=size,
-            sorted=sorted,
+        # accelerate below code by moving to cpp
+        # data = (gptneox_cpp.gptneox_token_data * n_vocab)(
+        #     *[
+        #         gptneox_cpp.gptneox_token_data(
+        #             id=gptneox_cpp.gptneox_token(i),
+        #             logit=logits[i],
+        #             p=gptneox_cpp.c_float(0.0),
+        #         )
+        #         for i in range(n_vocab)
+        #     ]
+        # )
+        # size = gptneox_cpp.c_size_t(n_vocab)
+        # sorted = False
+        # candidates = gptneox_cpp.gptneox_token_data_array(
+        #     data=data,
+        #     size=size,
+        #     sorted=sorted,
+        # )
+        logits = (ctypes.c_float * n_vocab)(*logits)
+        candidates = gptneox_cpp.gptneox_get_candidates(
+            ctx=self.ctx,
+            n_vocab=n_vocab,
+            logits=logits
         )
         gptneox_cpp.gptneox_sample_repetition_penalty(
             ctx=self.ctx,
diff --git a/python/llm/src/bigdl/llm/ggml/model/gptneox/gptneox_cpp.py b/python/llm/src/bigdl/llm/ggml/model/gptneox/gptneox_cpp.py
index b6ab13d8..08919db0 100644
--- a/python/llm/src/bigdl/llm/ggml/model/gptneox/gptneox_cpp.py
+++ b/python/llm/src/bigdl/llm/ggml/model/gptneox/gptneox_cpp.py
@@ -529,6 +529,24 @@ _lib.gptneox_token_eos.restype = gptneox_token
 
 # Sampling functions
 
+def gptneox_get_candidates(
+    ctx: gptneox_context_p,
+    n_vocab: c_int,
+    logits: c_float_p,
+):
+    return _lib.gptneox_get_candidates(
+        ctx, n_vocab, logits
+    )
+
+
+_lib.gptneox_get_candidates.argtypes = [
+    gptneox_context_p,
+    c_int,
+    c_float_p
+]
+_lib.gptneox_get_candidates.restype = gptneox_token_data_array
+
+
 # @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858,
 # with negative logit fix.
 def gptneox_sample_repetition_penalty(
diff --git a/python/llm/src/bigdl/llm/ggml/quantize.py b/python/llm/src/bigdl/llm/ggml/quantize.py
index baee0ce4..45a8b065 100644
--- a/python/llm/src/bigdl/llm/ggml/quantize.py
+++ b/python/llm/src/bigdl/llm/ggml/quantize.py
@@ -30,10 +30,9 @@ _llama_quantize_type = {"q4_0": 2,
                         "q5_1": 9,
                         "q8_0": 7}
 _bloom_quantize_type = {"q4_0": 2,
-                       "q4_1": 3}
+                        "q4_1": 3}
 _gptneox_quantize_type = {"q4_0": 2,
                           "q4_1": 3,
-                          "q4_2": 5,
                           "q5_0": 8,
                           "q5_1": 9,
                           "q8_0": 7}
@@ -42,9 +41,6 @@ _quantize_type = {"llama": _llama_quantize_type,
                   "bloom": _bloom_quantize_type,
                   "gptneox": _gptneox_quantize_type}
 
-_valid_types = set(list(_llama_quantize_type.keys()) + list(_bloomz_quantize_type.keys()) +
-                   list(_gptneox_quantize_type.keys()))
-
 
 def quantize(input_path: str, output_path: str=None,
              model_family: str = 'llama', dtype: str='q4_0'):
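
For context, a minimal usage sketch of the code paths this patch touches, assuming a bigdl-llm build that includes these changes. The file paths below are placeholders, and the Gptneox constructor and call arguments follow the llama-cpp-python style API these bindings mirror, so treat the exact names as assumptions rather than a confirmed interface.

from bigdl.llm.ggml.convert_model import convert_model
from bigdl.llm.ggml.model.gptneox.gptneox import Gptneox

# Convert a GPT-NeoX checkpoint to a quantized ggml file. Note that "q4_2" is
# no longer accepted for gptneox after this patch; q4_0/q4_1/q5_0/q5_1/q8_0 remain.
convert_model(input_path="/path/to/gptneox-hf-checkpoint",    # placeholder path
              output_path="/path/to/gptneox-ggml-q4_0.bin",   # placeholder path
              model_family="gptneox",
              dtype="q4_0")

# Generation now builds the token-candidate array in C++ through
# gptneox_get_candidates instead of a per-token Python list comprehension.
model = Gptneox(model_path="/path/to/gptneox-ggml-q4_0.bin", n_threads=8)
print(model("Q: What is the capital of France? A:", max_tokens=32))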