[LLM] chatglm pybinding support (#8672)

This commit is contained in:
Yishuo Wang 2023-08-04 14:27:29 +08:00 committed by GitHub
parent 5837cc424a
commit ef08250c21
4 changed files with 470 additions and 2 deletions
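A minimal usage sketch of the new pybinding; the import path and model file below are assumptions inferred from the package layout in this diff, not confirmed by it:

# Hypothetical example -- the import path and model path are assumptions.
from bigdl.llm.ggml.model.chatglm.chatglm import ChatGLM

llm = ChatGLM(model_path="./chatglm2-ggml.bin", n_threads=4)

# One-shot completion: returns an OpenAI-style completion dict.
result = llm("What is AI?", max_tokens=64)
print(result["choices"][0]["text"])

# Streaming completion: yields one chunk per generated token.
for chunk in llm("What is AI?", max_tokens=64, stream=True):
    print(chunk["choices"][0]["text"], end="", flush=True)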


@@ -80,7 +80,7 @@ windows_binarys = [
"quantize-gptneox_vnni.exe",
"quantize-bloom_vnni.exe",
"quantize-starcoder_vnni.exe",
"main-chatglm_vnni.exe",
]
linux_binarys = [
@@ -112,7 +112,7 @@ linux_binarys = [
"main-gptneox",
"main-bloom",
"main-starcoder",
"main-chatglm_vnni",
]
@@ -220,6 +220,7 @@ def setup_package():
print(f"Deleting existing libs_dir {libs_dir} ....")
shutil.rmtree(libs_dir)
os.makedirs(libs_dir, exist_ok=True)
open(os.path.join(libs_dir, "__init__.py"), 'w').close()
# copy built files for github workflow
for built_file in glob.glob(os.path.join(github_artifact_dir, '*')):


@@ -0,0 +1,20 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This makes sure Python is aware there is more than one sub-package within bigdl,
# physically located elsewhere.
# Otherwise there would be a module-not-found error in a non-pip setting, as Python would
# only search the first bigdl package and end up finding only one sub-package.


@@ -0,0 +1,373 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# ===========================================================================
#
# This file is adapted from
# https://github.com/abetlen/llama-cpp-python/blob/main/llama_cpp/llama.py
#
# MIT License
#
# Copyright (c) 2023 Andrei Betlen
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# This makes sure Python is aware there is more than one sub-package within bigdl,
# physically located elsewhere.
# Otherwise there would be a module-not-found error in a non-pip setting, as Python would
# only search the first bigdl package and end up finding only one sub-package.
from .chatglm_cpp import chatglm_load, chatglm_tokenize, chatglm_detokenize, chatglm_eval, \
chatglm_eos_token
from bigdl.llm.utils.common import invalidInputError
from bigdl.llm.ggml.model.generation import GenerationMixin
from typing import List, Optional, Generator, Sequence, Union
import time
import uuid
import warnings
class ChatGLM:
"""High-level Python wrapper for a chatglm.cpp model."""
def __init__(
self,
model_path: str,
n_ctx: int = 512,
n_parts: int = -1,
n_gpu_layers: int = 0,
seed: int = -1,
f16_kv: bool = True,
logits_all: bool = False,
vocab_only: bool = False,
use_mmap: bool = False,
use_mlock: bool = False,
embedding: bool = False,
n_threads: Optional[int] = 2,
n_batch: int = 512,
last_n_tokens_size: int = 64,
lora_base: Optional[str] = None,
lora_path: Optional[str] = None,
verbose: bool = True,
):
"""Load a chatglm.cpp model from `model_path`.
Args:
model_path: Path to the model.
n_ctx: Maximum context size.
n_parts: Number of parts to split the model into. If -1, the number of parts
is automatically determined.
            seed: Random seed. If -1, the current timestamp is used as the seed.
f16_kv: Use half-precision for key/value cache.
logits_all: Return logits for all tokens, not just the last token.
            vocab_only: Only load the vocabulary, not the weights.
use_mmap: Use mmap if possible.
use_mlock: Force the system to keep the model in RAM.
embedding: Embedding mode only.
            n_threads: Number of threads to use. Defaults to 2.
n_batch: Maximum number of prompt tokens to batch together when calling chatglm_eval.
last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
lora_base: Optional path to base model, useful if using a quantized base model and
you want to apply LoRA to an f16 model.
lora_path: Path to a LoRA file to apply to the model.
verbose: Print verbose output to stderr.
Raises:
ValueError: If the model path does not exist.
Returns:
A ChatGLM instance.
"""
self.model_path = model_path
self.ctx = chatglm_load(model_path, use_mmap=use_mmap, n_ctx=n_ctx, n_threads=n_threads)
self.n_ctx = n_ctx
self.n_parts = n_parts
self.n_gpu_layers = n_gpu_layers
self.f16_kv = f16_kv
self.seed = seed
self.logits_all = logits_all
self.vocab_only = vocab_only
self.use_mmap = use_mmap
self.use_mlock = use_mlock
self.embedding = embedding
self.n_threads = n_threads
self.n_batch = n_batch
self.last_n_tokens_size = last_n_tokens_size
self.lora_base = lora_base
self.lora_path = lora_path
self.verbose = verbose
# TODO: Some parameters are temporarily not supported
unsupported_arg = {'n_parts': -1, 'n_gpu_layers': 0, 'f16_kv': True, 'logits_all': False,
'vocab_only': False, 'use_mlock': False, 'embedding': False,
'n_batch': 512, 'last_n_tokens_size': 64, 'lora_base': None,
'lora_path': None, 'verbose': True}
for arg in unsupported_arg.keys():
if getattr(self, arg) != unsupported_arg[arg]:
warnings.warn(f"The parameter {arg} is temporarily unsupported, "
"please use the default value.")
def __call__(
self,
prompt: str,
suffix: Optional[str] = None,
max_tokens: int = 128,
temperature: float = 0.95,
top_p: float = 0.7,
logprobs: Optional[int] = None,
echo: bool = False,
        stop: Optional[Union[str, List[str]]] = [],
frequency_penalty: float = 0.0,
presence_penalty: float = 0.0,
repeat_penalty: float = 1.1,
top_k: int = 0,
stream: bool = False,
tfs_z: float = 1.0,
mirostat_mode: int = 0,
mirostat_tau: float = 5.0,
mirostat_eta: float = 0.1,
model: Optional[str] = None,
):
# TODO: Some parameters are temporarily not supported
# Unsupported parameters are checked in `_supported_call`
return self._supported_call(prompt, max_tokens, stream, temperature, top_p, top_k,
stop, model, suffix, logprobs, echo, frequency_penalty,
presence_penalty, repeat_penalty, tfs_z, mirostat_mode,
mirostat_tau, mirostat_eta)
def _supported_call(self, prompt: str, max_tokens: int, stream: bool,
temperature: float, top_p: float, top_k: int,
stop: Optional[List[str]] = [], model: Optional[str] = None, *args):
        # Check unsupported parameters
unsupported_arg = ['suffix', 'logprobs', 'echo',
'frequency_penalty', 'presence_penalty', 'repeat_penalty',
'tfs_z', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', 'model']
        default_value = {'suffix': None, 'logprobs': None, 'echo': False,
                         'frequency_penalty': 0.0, 'presence_penalty': 0.0,
                         'repeat_penalty': 1.1, 'tfs_z': 1.0, 'mirostat_mode': 0,
                         'mirostat_tau': 5.0, 'mirostat_eta': 0.1}
        for index in range(len(args)):
            if args[index] != default_value[unsupported_arg[index]]:
warnings.warn(f"The parameter {unsupported_arg[index]} is temporarily "
"unsupported, please use the default value.")
if stream:
return self.stream(prompt, max_tokens, temperature, top_p, top_k, stop, model)
else:
return self._eval(prompt, max_tokens, temperature, top_p, top_k, stop, model)
def _eval(self, prompt: str, max_tokens: int, temperature: float, top_p: float, top_k: int,
stop: Optional[List[str]] = [], model: Optional[str] = None):
completion_id: str = f"cmpl-{str(uuid.uuid4())}"
created: int = int(time.time())
if model is None:
model_name = self.model_path
else:
model_name = model
input_tokens = self._tokenize(prompt)
prompt_len = len(input_tokens)
if max_tokens < 1:
return {
"id": completion_id,
"object": "text_completion",
"created": created,
"model": model_name,
"choices": [
{
"text": prompt,
"index": 0,
"logprobs": None,
"finish_reason": "length",
}
],
"usage":
{
"prompt_tokens": prompt_len,
"completion_tokens": 0,
"total_tokens": prompt_len,
}
}
n_past = 0
output_tokens = []
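        # Incremental decoding: the first call evaluates the whole prompt, then each
        # later call feeds only the previously sampled token while n_past tracks how
        # many tokens the model has already consumed.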
for i in range(max_tokens):
token = self.forward(input_ids=input_tokens,
n_past=n_past,
top_k=top_k,
top_p=top_p,
temperature=temperature)
output_tokens.append(token)
n_past += len(input_tokens)
input_tokens = [token]
if token == self.eos_token():
break
text = self.detokenize(output_tokens)
split_text = text
if stop != []:
for stop_word in stop:
split_text = split_text.split(stop_word)[0]
if split_text != text:
finish_reason = "stop"
else:
finish_reason = None
        completion_len = len(output_tokens)
return {
"id": completion_id,
"object": "text_completion",
"created": created,
"model": model_name,
"choices": [
{
"text": prompt + split_text,
"index": 0,
"logprobs": None,
"finish_reason": finish_reason,
}
],
"usage": {
"prompt_tokens": prompt_len,
"completion_tokens": completion_len,
"total_tokens": prompt_len + completion_len,
}
}
def stream(self, prompt: str, max_tokens: int, temperature: float, top_p: float, top_k: int,
stop: Optional[List[str]] = [], model: Optional[str] = None):
completion_id: str = f"cmpl-{str(uuid.uuid4())}"
created: int = int(time.time())
if model is None:
model_name = self.model_path
else:
model_name = model
input_tokens = self._tokenize(prompt)
prompt_len = len(input_tokens)
if max_tokens < 1:
yield {
"id": completion_id,
"object": "text_completion",
"created": created,
"model": model_name,
"choices": [
{
"text": prompt,
"index": 0,
"logprobs": None,
"finish_reason": "length",
}
],
"usage":
{
"prompt_tokens": prompt_len
}
}
else:
n_past = 0
output_tokens = []
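            # Same incremental decoding loop as _eval, but each sampled token is
            # detokenized and yielded immediately as a streaming chunk.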
for i in range(max_tokens):
token = self.forward(input_ids=input_tokens,
n_past=n_past,
top_k=top_k,
top_p=top_p,
temperature=temperature)
output_tokens.append(token)
n_past += len(input_tokens)
input_tokens = [token]
if token == self.eos_token():
print('\n')
break
text = self.detokenize(token)
yield {
"id": completion_id,
"object": "text_completion",
"created": created,
"model": model_name,
"choices": [
{
"text": text,
"index": 0,
"logprobs": None,
"finish_reason": None,
}
],
"usage":
{
"prompt_tokens": prompt_len
}
}
    def _tokenize(self, text: str) -> List[int]:
        """Tokenize a string.
        Args:
            text: The string to tokenize.
Raises:
RuntimeError: If the tokenization failed.
Returns:
A list of tokens.
"""
return chatglm_tokenize(self.ctx, text)
    def detokenize(self, tokens: List[int]) -> str:
"""Detokenize a list of tokens.
Args:
tokens: The list of tokens to detokenize.
Returns:
The detokenized string.
"""
if isinstance(tokens, int):
tokens = [tokens]
return chatglm_detokenize(self.ctx, tokens)
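    # Thin wrapper around chatglm_eval: evaluates `input_ids` given `n_past` cached
    # tokens and returns the sampled next token id.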
def forward(self,
input_ids: List[int],
n_past: int,
do_sample: bool = True,
top_k: int = 0,
top_p: float = 0.7,
temperature: float = 0.95,) -> int:
return chatglm_eval(ctx=self.ctx,
input_ids=input_ids,
n_past=n_past,
do_sample=do_sample,
top_k=top_k,
top_p=top_p,
temperature=temperature)
def eos_token(self) -> int:
return chatglm_eos_token(self.ctx)


@@ -0,0 +1,74 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This makes sure Python is aware there is more than one sub-package within bigdl,
# physically located elsewhere.
# Otherwise there would be a module-not-found error in a non-pip setting, as Python would
# only search the first bigdl package and end up finding only one sub-package.
from typing import List
from pathlib import Path
from bigdl.llm.libs.chatglm_C import Pipeline, GenerationConfig
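# ChatGLMContext bundles the chatglm.cpp Pipeline (model + tokenizer) with a reusable
# GenerationConfig, so the ChatGLM wrapper can pass a single handle to every call.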
class ChatGLMContext:
def __init__(self, pipeline: Pipeline, config: GenerationConfig):
self.pipeline = pipeline
self.config = config
def chatglm_load(path: str,
n_ctx: int,
n_threads: int,
use_mmap: bool = False,
) -> ChatGLMContext:
path = str(Path(path))
pipeline = Pipeline(path, use_mmap)
config = GenerationConfig(
max_context_length=n_ctx,
num_threads=n_threads,
)
return ChatGLMContext(pipeline, config)
def chatglm_tokenize(ctx: ChatGLMContext, prompt: str) -> List[int]:
return ctx.pipeline.tokenizer.encode(prompt)
def chatglm_detokenize(ctx: ChatGLMContext, input_ids: List[int]) -> str:
return ctx.pipeline.tokenizer.decode(input_ids)
def chatglm_eval(ctx: ChatGLMContext,
input_ids: List[int],
n_past: int,
do_sample: bool = True,
top_k: int = 0,
top_p: float = 0.7,
temperature: float = 0.95,
) -> int:
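    # Copy the per-call sampling parameters into the shared GenerationConfig
    # before asking the model for the next token id.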
ctx.config.do_sample = do_sample
ctx.config.top_k = top_k
ctx.config.top_p = top_p
    ctx.config.temperature = temperature
return ctx.pipeline.model.generate_next_token(input_ids, ctx.config, n_past,
ctx.config.max_context_length)
def chatglm_eos_token(ctx: ChatGLMContext) -> int:
return ctx.pipeline.model.config.eos_token_id
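
For reference, a sketch of the raw generation loop built only on the helpers above (the ChatGLM class mirrors this flow; the model path is a placeholder):

# Low-level sketch using only the functions defined in this module.
ctx = chatglm_load("./chatglm2-ggml.bin", n_ctx=512, n_threads=4)
input_ids = chatglm_tokenize(ctx, "What is AI?")
n_past, output_ids = 0, []
for _ in range(64):
    token = chatglm_eval(ctx, input_ids, n_past)
    output_ids.append(token)
    n_past += len(input_ids)
    input_ids = [token]
    if token == chatglm_eos_token(ctx):
        break
print(chatglm_detokenize(ctx, output_ids))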