From ef08250c2125e99b4dd65ac69db884aaa61317a3 Mon Sep 17 00:00:00 2001
From: Yishuo Wang
Date: Fri, 4 Aug 2023 14:27:29 +0800
Subject: [PATCH] [LLM] chatglm pybinding support (#8672)

---
 python/llm/setup.py                           |   5 +-
 .../bigdl/llm/ggml/model/chatglm/__init__.py  |  20 +
 .../bigdl/llm/ggml/model/chatglm/chatglm.py   | 373 ++++++++++++++++++
 .../llm/ggml/model/chatglm/chatglm_cpp.py     |  74 ++++
 4 files changed, 470 insertions(+), 2 deletions(-)
 create mode 100644 python/llm/src/bigdl/llm/ggml/model/chatglm/__init__.py
 create mode 100644 python/llm/src/bigdl/llm/ggml/model/chatglm/chatglm.py
 create mode 100644 python/llm/src/bigdl/llm/ggml/model/chatglm/chatglm_cpp.py

diff --git a/python/llm/setup.py b/python/llm/setup.py
index 4591d22e..16187aff 100644
--- a/python/llm/setup.py
+++ b/python/llm/setup.py
@@ -80,7 +80,7 @@ windows_binarys = [
     "quantize-gptneox_vnni.exe",
     "quantize-bloom_vnni.exe",
     "quantize-starcoder_vnni.exe",
-
+    "main-chatglm_vnni.exe",
 ]
 linux_binarys = [
@@ -112,7 +112,7 @@ linux_binarys = [
     "main-gptneox",
     "main-bloom",
     "main-starcoder",
-
+    "main-chatglm_vnni",
 ]
 
@@ -220,6 +220,7 @@ def setup_package():
         print(f"Deleting existing libs_dir {libs_dir} ....")
         shutil.rmtree(libs_dir)
     os.makedirs(libs_dir, exist_ok=True)
+    open(os.path.join(libs_dir, "__init__.py"), 'w').close()
 
     # copy built files for github workflow
     for built_file in glob.glob(os.path.join(github_artifact_dir, '*')):
diff --git a/python/llm/src/bigdl/llm/ggml/model/chatglm/__init__.py b/python/llm/src/bigdl/llm/ggml/model/chatglm/__init__.py
new file mode 100644
index 00000000..dbdafd2a
--- /dev/null
+++ b/python/llm/src/bigdl/llm/ggml/model/chatglm/__init__.py
@@ -0,0 +1,20 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# This makes sure Python is aware there is more than one sub-package within bigdl,
+# physically located elsewhere.
+# Otherwise there would be a module-not-found error in a non-pip setting, as Python would
+# only search the first bigdl package and end up finding only one sub-package.
diff --git a/python/llm/src/bigdl/llm/ggml/model/chatglm/chatglm.py b/python/llm/src/bigdl/llm/ggml/model/chatglm/chatglm.py
new file mode 100644
index 00000000..66be18cd
--- /dev/null
+++ b/python/llm/src/bigdl/llm/ggml/model/chatglm/chatglm.py
@@ -0,0 +1,373 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# ===========================================================================
+#
+# This file is adapted from
+# https://github.com/abetlen/llama-cpp-python/blob/main/llama_cpp/llama.py
+#
+# MIT License
+#
+# Copyright (c) 2023 Andrei Betlen
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# This makes sure Python is aware there is more than one sub-package within bigdl,
+# physically located elsewhere.
+# Otherwise there would be a module-not-found error in a non-pip setting, as Python would
+# only search the first bigdl package and end up finding only one sub-package.
+
+
+from .chatglm_cpp import chatglm_load, chatglm_tokenize, chatglm_detokenize, chatglm_eval, \
+    chatglm_eos_token
+from bigdl.llm.utils.common import invalidInputError
+from bigdl.llm.ggml.model.generation import GenerationMixin
+from typing import List, Optional, Generator, Sequence, Union
+import time
+import uuid
+import warnings
+
+
+class ChatGLM:
+    """High-level Python wrapper for a chatglm.cpp model."""
+
+    def __init__(
+        self,
+        model_path: str,
+        n_ctx: int = 512,
+        n_parts: int = -1,
+        n_gpu_layers: int = 0,
+        seed: int = -1,
+        f16_kv: bool = True,
+        logits_all: bool = False,
+        vocab_only: bool = False,
+        use_mmap: bool = False,
+        use_mlock: bool = False,
+        embedding: bool = False,
+        n_threads: Optional[int] = 2,
+        n_batch: int = 512,
+        last_n_tokens_size: int = 64,
+        lora_base: Optional[str] = None,
+        lora_path: Optional[str] = None,
+        verbose: bool = True,
+    ):
+        """Load a chatglm.cpp model from `model_path`.
+
+        Args:
+            model_path: Path to the model.
+            n_ctx: Maximum context size.
+            n_parts: Number of parts to split the model into. If -1, the number of parts
+                is automatically determined.
+            seed: Random seed. If set to the default value -1, the current timestamp is
+                used as the seed.
+            f16_kv: Use half-precision for key/value cache.
+            logits_all: Return logits for all tokens, not just the last token.
+            vocab_only: Only load the vocabulary, not the weights.
+            use_mmap: Use mmap if possible.
+            use_mlock: Force the system to keep the model in RAM.
+            embedding: Embedding mode only.
+            n_threads: Number of threads to use. Defaults to 2.
+            n_batch: Maximum number of prompt tokens to batch together when calling chatglm_eval.
+            last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
+            lora_base: Optional path to base model, useful if using a quantized base model and
+                you want to apply LoRA to an f16 model.
+            lora_path: Path to a LoRA file to apply to the model.
+            verbose: Print verbose output to stderr.
+
+        Raises:
+            ValueError: If the model path does not exist.
+
+        Returns:
+            A ChatGLM instance.
+        """
+
+        self.model_path = model_path
+        self.ctx = chatglm_load(model_path, use_mmap=use_mmap, n_ctx=n_ctx, n_threads=n_threads)
+        self.n_ctx = n_ctx
+        self.n_parts = n_parts
+        self.n_gpu_layers = n_gpu_layers
+        self.f16_kv = f16_kv
+        self.seed = seed
+        self.logits_all = logits_all
+        self.vocab_only = vocab_only
+        self.use_mmap = use_mmap
+        self.use_mlock = use_mlock
+        self.embedding = embedding
+        self.n_threads = n_threads
+        self.n_batch = n_batch
+        self.last_n_tokens_size = last_n_tokens_size
+        self.lora_base = lora_base
+        self.lora_path = lora_path
+        self.verbose = verbose
+        # TODO: Some parameters are temporarily not supported
+        unsupported_arg = {'n_parts': -1, 'n_gpu_layers': 0, 'f16_kv': True, 'logits_all': False,
+                           'vocab_only': False, 'use_mlock': False, 'embedding': False,
+                           'n_batch': 512, 'last_n_tokens_size': 64, 'lora_base': None,
+                           'lora_path': None, 'verbose': True}
+        for arg in unsupported_arg.keys():
+            if getattr(self, arg) != unsupported_arg[arg]:
+                warnings.warn(f"The parameter {arg} is temporarily unsupported, "
+                              "please use the default value.")
+
+    def __call__(
+        self,
+        prompt: str,
+        suffix: Optional[str] = None,
+        max_tokens: int = 128,
+        temperature: float = 0.95,
+        top_p: float = 0.7,
+        logprobs: Optional[int] = None,
+        echo: bool = False,
+        stop: Optional[Union[str, List[str]]] = [],
+        frequency_penalty: float = 0.0,
+        presence_penalty: float = 0.0,
+        repeat_penalty: float = 1.1,
+        top_k: int = 0,
+        stream: bool = False,
+        tfs_z: float = 1.0,
+        mirostat_mode: int = 0,
+        mirostat_tau: float = 5.0,
+        mirostat_eta: float = 0.1,
+        model: Optional[str] = None,
+    ):
+        # TODO: Some parameters are temporarily not supported
+        # Unsupported parameters are checked in `_supported_call`
+        return self._supported_call(prompt, max_tokens, stream, temperature, top_p, top_k,
+                                    stop, model, suffix, logprobs, echo, frequency_penalty,
+                                    presence_penalty, repeat_penalty, tfs_z, mirostat_mode,
+                                    mirostat_tau, mirostat_eta)
+
+    def _supported_call(self, prompt: str, max_tokens: int, stream: bool,
+                        temperature: float, top_p: float, top_k: int,
+                        stop: Optional[List[str]] = [], model: Optional[str] = None, *args):
+        # Check unsupported parameters
+        unsupported_arg = ['suffix', 'logprobs', 'echo',
+                           'frequency_penalty', 'presence_penalty', 'repeat_penalty',
+                           'tfs_z', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', 'model']
+        default_value = {'suffix': None, 'logprobs': None, 'echo': False,
+                         'frequency_penalty': 0.0, 'presence_penalty': 0.0,
+                         'repeat_penalty': 1.1, 'tfs_z': 1.0, 'mirostat_mode': 0,
+                         'mirostat_tau': 5.0, 'mirostat_eta': 0.1}
+        for index in range(len(args)):
+            if args[index] != default_value[unsupported_arg[index]]:
+                warnings.warn(f"The parameter {unsupported_arg[index]} is temporarily "
+                              "unsupported, please use the default value.")
+
+        if stream:
+            return self.stream(prompt, max_tokens, temperature, top_p, top_k, stop, model)
+        else:
+            return self._eval(prompt, max_tokens, temperature, top_p, top_k, stop, model)
+
+    def _eval(self, prompt: str, max_tokens: int, temperature: float, top_p: float, top_k: int,
+              stop: Optional[List[str]] = [], model: Optional[str] = None):
+
+        completion_id: str = f"cmpl-{str(uuid.uuid4())}"
+        created: int = int(time.time())
+        if model is None:
+            model_name = self.model_path
+        else:
+            model_name = model
+
+        input_tokens = self._tokenize(prompt)
+        prompt_len = len(input_tokens)
+        if max_tokens < 1:
+            return {
+                "id": completion_id,
+                "object": "text_completion",
+                "created": created,
+                "model": model_name,
+                "choices": [
+                    {
+                        "text": prompt,
+                        "index": 0,
+                        "logprobs": None,
+                        "finish_reason": "length",
+                    }
+                ],
+                "usage":
+                {
+                    "prompt_tokens": prompt_len,
+                    "completion_tokens": 0,
+                    "total_tokens": prompt_len,
+                }
+            }
+
+        n_past = 0
+        output_tokens = []
+        for i in range(max_tokens):
+            token = self.forward(input_ids=input_tokens,
+                                 n_past=n_past,
+                                 top_k=top_k,
+                                 top_p=top_p,
+                                 temperature=temperature)
+            output_tokens.append(token)
+            n_past += len(input_tokens)
+            input_tokens = [token]
+            if token == self.eos_token():
+                break
+
+        text = self.detokenize(output_tokens)
+        split_text = text
+        if stop != []:
+            for stop_word in stop:
+                split_text = split_text.split(stop_word)[0]
+        if split_text != text:
+            finish_reason = "stop"
+        else:
+            finish_reason = None
+        completion_len = n_past - prompt_len
+        return {
+            "id": completion_id,
+            "object": "text_completion",
+            "created": created,
+            "model": model_name,
+            "choices": [
+                {
+                    "text": prompt + split_text,
+                    "index": 0,
+                    "logprobs": None,
+                    "finish_reason": finish_reason,
+                }
+            ],
+            "usage": {
+                "prompt_tokens": prompt_len,
+                "completion_tokens": completion_len,
+                "total_tokens": prompt_len + completion_len,
+            }
+        }
+
+    def stream(self, prompt: str, max_tokens: int, temperature: float, top_p: float, top_k: int,
+               stop: Optional[List[str]] = [], model: Optional[str] = None):
+        completion_id: str = f"cmpl-{str(uuid.uuid4())}"
+        created: int = int(time.time())
+        if model is None:
+            model_name = self.model_path
+        else:
+            model_name = model
+        input_tokens = self._tokenize(prompt)
+        prompt_len = len(input_tokens)
+        if max_tokens < 1:
+            yield {
+                "id": completion_id,
+                "object": "text_completion",
+                "created": created,
+                "model": model_name,
+                "choices": [
+                    {
+                        "text": prompt,
+                        "index": 0,
+                        "logprobs": None,
+                        "finish_reason": "length",
+                    }
+                ],
+                "usage":
+                {
+                    "prompt_tokens": prompt_len
+                }
+            }
+        else:
+            n_past = 0
+            output_tokens = []
+            for i in range(max_tokens):
+                token = self.forward(input_ids=input_tokens,
+                                     n_past=n_past,
+                                     top_k=top_k,
+                                     top_p=top_p,
+                                     temperature=temperature)
+                output_tokens.append(token)
+                n_past += len(input_tokens)
+                input_tokens = [token]
+                if token == self.eos_token():
+                    print('\n')
+                    break
+                text = self.detokenize(token)
+                yield {
+                    "id": completion_id,
+                    "object": "text_completion",
+                    "created": created,
+                    "model": model_name,
+                    "choices": [
+                        {
+                            "text": text,
+                            "index": 0,
+                            "logprobs": None,
+                            "finish_reason": None,
+                        }
+                    ],
+                    "usage":
+                    {
+                        "prompt_tokens": prompt_len
+                    }
+                }
+
+    def _tokenize(self, text: str) -> List[int]:
+        """Tokenize a string.
+
+        Args:
+            text: The string to tokenize.
+
+        Raises:
+            RuntimeError: If the tokenization failed.
+
+        Returns:
+            A list of tokens.
+        """
+        return chatglm_tokenize(self.ctx, text)
+
+    def detokenize(self, tokens: List[int]) -> str:
+        """Detokenize a list of tokens.
+
+        Args:
+            tokens: The list of tokens to detokenize.
+
+        Returns:
+            The detokenized string.
+        """
+        if isinstance(tokens, int):
+            tokens = [tokens]
+        return chatglm_detokenize(self.ctx, tokens)
+
+    def forward(self,
+                input_ids: List[int],
+                n_past: int,
+                do_sample: bool = True,
+                top_k: int = 0,
+                top_p: float = 0.7,
+                temperature: float = 0.95,) -> int:
+        return chatglm_eval(ctx=self.ctx,
+                            input_ids=input_ids,
+                            n_past=n_past,
+                            do_sample=do_sample,
+                            top_k=top_k,
+                            top_p=top_p,
+                            temperature=temperature)
+
+    def eos_token(self) -> int:
+        return chatglm_eos_token(self.ctx)
diff --git a/python/llm/src/bigdl/llm/ggml/model/chatglm/chatglm_cpp.py b/python/llm/src/bigdl/llm/ggml/model/chatglm/chatglm_cpp.py
new file mode 100644
index 00000000..12b1a45e
--- /dev/null
+++ b/python/llm/src/bigdl/llm/ggml/model/chatglm/chatglm_cpp.py
@@ -0,0 +1,74 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# This makes sure Python is aware there is more than one sub-package within bigdl,
+# physically located elsewhere.
+# Otherwise there would be a module-not-found error in a non-pip setting, as Python would
+# only search the first bigdl package and end up finding only one sub-package.
+
+
+from typing import List
+from pathlib import Path
+
+from bigdl.llm.libs.chatglm_C import Pipeline, GenerationConfig
+
+
+class ChatGLMContext:
+    def __init__(self, pipeline: Pipeline, config: GenerationConfig):
+        self.pipeline = pipeline
+        self.config = config
+
+
+def chatglm_load(path: str,
+                 n_ctx: int,
+                 n_threads: int,
+                 use_mmap: bool = False,
+                 ) -> ChatGLMContext:
+    path = str(Path(path))
+    pipeline = Pipeline(path, use_mmap)
+    config = GenerationConfig(
+        max_context_length=n_ctx,
+        num_threads=n_threads,
+    )
+    return ChatGLMContext(pipeline, config)
+
+
+def chatglm_tokenize(ctx: ChatGLMContext, prompt: str) -> List[int]:
+    return ctx.pipeline.tokenizer.encode(prompt)
+
+
+def chatglm_detokenize(ctx: ChatGLMContext, input_ids: List[int]) -> str:
+    return ctx.pipeline.tokenizer.decode(input_ids)
+
+
+def chatglm_eval(ctx: ChatGLMContext,
+                 input_ids: List[int],
+                 n_past: int,
+                 do_sample: bool = True,
+                 top_k: int = 0,
+                 top_p: float = 0.7,
+                 temperature: float = 0.95,
+                 ) -> int:
+    ctx.config.do_sample = do_sample
+    ctx.config.top_k = top_k
+    ctx.config.top_p = top_p
+    ctx.config.temperature = temperature
+    return ctx.pipeline.model.generate_next_token(input_ids, ctx.config, n_past,
+                                                  ctx.config.max_context_length)
+
+
+def chatglm_eos_token(ctx: ChatGLMContext) -> int:
+    return ctx.pipeline.model.config.eos_token_id
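
Usage sketch (not part of the patch): the snippet below shows how the new wrapper might be called once the package is installed so that `bigdl.llm.libs.chatglm_C` can be loaded. The model path and prompts are placeholders; the import goes through the `chatglm` module directly because the new `__init__.py` does not re-export the class. The returned dicts follow the OpenAI-style layout produced by `_eval` and `stream` above.

    # Hypothetical usage sketch; the model path and prompts are placeholders.
    from bigdl.llm.ggml.model.chatglm.chatglm import ChatGLM

    # Load a chatglm-family GGML checkpoint with the parameters this PR supports.
    llm = ChatGLM("/path/to/chatglm-ggml-model.bin", n_ctx=512, n_threads=4)

    # One-shot completion: returns an OpenAI-style completion dict
    # (note that "text" includes the prompt followed by the generated text).
    result = llm("What is AI?", max_tokens=64, temperature=0.95, top_p=0.7)
    print(result["choices"][0]["text"])

    # Streaming: yields one OpenAI-style chunk per generated token.
    for chunk in llm("What is AI?", max_tokens=64, stream=True):
        print(chunk["choices"][0]["text"], end="", flush=True)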