From ef08250c2125e99b4dd65ac69db884aaa61317a3 Mon Sep 17 00:00:00 2001
From: Yishuo Wang
Date: Fri, 4 Aug 2023 14:27:29 +0800
Subject: [PATCH] [LLM] chatglm pybinding support (#8672)

---
 python/llm/setup.py                           |   5 +-
 .../bigdl/llm/ggml/model/chatglm/__init__.py  |  20 +
 .../bigdl/llm/ggml/model/chatglm/chatglm.py   | 373 ++++++++++++++++++
 .../llm/ggml/model/chatglm/chatglm_cpp.py     |  74 ++++
 4 files changed, 470 insertions(+), 2 deletions(-)
 create mode 100644 python/llm/src/bigdl/llm/ggml/model/chatglm/__init__.py
 create mode 100644 python/llm/src/bigdl/llm/ggml/model/chatglm/chatglm.py
 create mode 100644 python/llm/src/bigdl/llm/ggml/model/chatglm/chatglm_cpp.py

diff --git a/python/llm/setup.py b/python/llm/setup.py
index 4591d22e..16187aff 100644
--- a/python/llm/setup.py
+++ b/python/llm/setup.py
@@ -80,7 +80,7 @@ windows_binarys = [
     "quantize-gptneox_vnni.exe",
     "quantize-bloom_vnni.exe",
     "quantize-starcoder_vnni.exe",
-
+    "main-chatglm_vnni.exe",
 ]
 linux_binarys = [
@@ -112,7 +112,7 @@ linux_binarys = [
     "main-gptneox",
     "main-bloom",
     "main-starcoder",
-
+    "main-chatglm_vnni",
 ]
 
@@ -220,6 +220,7 @@ def setup_package():
         print(f"Deleting existing libs_dir {libs_dir} ....")
         shutil.rmtree(libs_dir)
     os.makedirs(libs_dir, exist_ok=True)
+    open(os.path.join(libs_dir, "__init__.py"), 'w').close()
 
     # copy built files for github workflow
     for built_file in glob.glob(os.path.join(github_artifact_dir, '*')):
diff --git a/python/llm/src/bigdl/llm/ggml/model/chatglm/__init__.py b/python/llm/src/bigdl/llm/ggml/model/chatglm/__init__.py
new file mode 100644
index 00000000..dbdafd2a
--- /dev/null
+++ b/python/llm/src/bigdl/llm/ggml/model/chatglm/__init__.py
@@ -0,0 +1,20 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# This makes sure Python is aware there is more than one sub-package within bigdl,
+# physically located elsewhere.
+# Otherwise there would be a module-not-found error in a non-pip setting, as Python would
+# only search the first bigdl package and end up finding only one sub-package.
diff --git a/python/llm/src/bigdl/llm/ggml/model/chatglm/chatglm.py b/python/llm/src/bigdl/llm/ggml/model/chatglm/chatglm.py
new file mode 100644
index 00000000..66be18cd
--- /dev/null
+++ b/python/llm/src/bigdl/llm/ggml/model/chatglm/chatglm.py
@@ -0,0 +1,373 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# ===========================================================================
+#
+# This file is adapted from
+# https://github.com/abetlen/llama-cpp-python/blob/main/llama_cpp/llama.py
+#
+# MIT License
+#
+# Copyright (c) 2023 Andrei Betlen
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# This makes sure Python is aware there is more than one sub-package within bigdl,
+# physically located elsewhere.
+# Otherwise there would be a module-not-found error in a non-pip setting, as Python would
+# only search the first bigdl package and end up finding only one sub-package.
+
+
+from .chatglm_cpp import chatglm_load, chatglm_tokenize, chatglm_detokenize, chatglm_eval, \
+    chatglm_eos_token
+from bigdl.llm.utils.common import invalidInputError
+from bigdl.llm.ggml.model.generation import GenerationMixin
+from typing import List, Optional, Generator, Sequence, Union
+import time
+import uuid
+import warnings
+
+
+class ChatGLM:
+    """High-level Python wrapper for a chatglm.cpp model."""
+
+    def __init__(
+        self,
+        model_path: str,
+        n_ctx: int = 512,
+        n_parts: int = -1,
+        n_gpu_layers: int = 0,
+        seed: int = -1,
+        f16_kv: bool = True,
+        logits_all: bool = False,
+        vocab_only: bool = False,
+        use_mmap: bool = False,
+        use_mlock: bool = False,
+        embedding: bool = False,
+        n_threads: Optional[int] = 2,
+        n_batch: int = 512,
+        last_n_tokens_size: int = 64,
+        lora_base: Optional[str] = None,
+        lora_path: Optional[str] = None,
+        verbose: bool = True,
+    ):
+        """Load a chatglm.cpp model from `model_path`.
+
+        Args:
+            model_path: Path to the model.
+            n_ctx: Maximum context size.
+            n_parts: Number of parts to split the model into. If -1, the number of parts
+                is automatically determined.
+            seed: Random seed. If set to the default value -1, the current timestamp is
+                used as the seed.
+            f16_kv: Use half-precision for key/value cache.
+            logits_all: Return logits for all tokens, not just the last token.
+            vocab_only: Only load the vocabulary, not the weights.
+            use_mmap: Use mmap if possible.
+            use_mlock: Force the system to keep the model in RAM.
+            embedding: Embedding mode only.
+            n_threads: Number of threads to use. Defaults to 2.
+            n_batch: Maximum number of prompt tokens to batch together when calling chatglm_eval.
+            last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
+            lora_base: Optional path to base model, useful if using a quantized base model and
+                you want to apply LoRA to an f16 model.
+            lora_path: Path to a LoRA file to apply to the model.
+            verbose: Print verbose output to stderr.
+
+        Raises:
+            ValueError: If the model path does not exist.
+
+        Returns:
+            A ChatGLM instance.
+        """
+
+        self.model_path = model_path
+        self.ctx = chatglm_load(model_path, use_mmap=use_mmap, n_ctx=n_ctx, n_threads=n_threads)
+        self.n_ctx = n_ctx
+        self.n_parts = n_parts
+        self.n_gpu_layers = n_gpu_layers
+        self.f16_kv = f16_kv
+        self.seed = seed
+        self.logits_all = logits_all
+        self.vocab_only = vocab_only
+        self.use_mmap = use_mmap
+        self.use_mlock = use_mlock
+        self.embedding = embedding
+        self.n_threads = n_threads
+        self.n_batch = n_batch
+        self.last_n_tokens_size = last_n_tokens_size
+        self.lora_base = lora_base
+        self.lora_path = lora_path
+        self.verbose = verbose
+        # TODO: Some parameters are temporarily not supported
+        unsupported_arg = {'n_parts': -1, 'n_gpu_layers': 0, 'f16_kv': True, 'logits_all': False,
+                           'vocab_only': False, 'use_mlock': False, 'embedding': False,
+                           'n_batch': 512, 'last_n_tokens_size': 64, 'lora_base': None,
+                           'lora_path': None, 'verbose': True}
+        for arg in unsupported_arg.keys():
+            if getattr(self, arg) != unsupported_arg[arg]:
+                warnings.warn(f"The parameter {arg} is temporarily unsupported, "
+                              "please use the default value.")
+
+    def __call__(
+        self,
+        prompt: str,
+        suffix: Optional[str] = None,
+        max_tokens: int = 128,
+        temperature: float = 0.95,
+        top_p: float = 0.7,
+        logprobs: Optional[int] = None,
+        echo: bool = False,
+        stop: Optional[Union[str, List[str]]] = [],
+        frequency_penalty: float = 0.0,
+        presence_penalty: float = 0.0,
+        repeat_penalty: float = 1.1,
+        top_k: int = 0,
+        stream: bool = False,
+        tfs_z: float = 1.0,
+        mirostat_mode: int = 0,
+        mirostat_tau: float = 5.0,
+        mirostat_eta: float = 0.1,
+        model: Optional[str] = None,
+    ):
+        # TODO: Some parameters are temporarily not supported
+        # Unsupported parameters are checked in `_supported_call`
+        return self._supported_call(prompt, max_tokens, stream, temperature, top_p, top_k,
+                                    stop, model, suffix, logprobs, echo, frequency_penalty,
+                                    presence_penalty, repeat_penalty, tfs_z, mirostat_mode,
+                                    mirostat_tau, mirostat_eta)
+
+    def _supported_call(self, prompt: str, max_tokens: int, stream: bool,
+                        temperature: float, top_p: float, top_k: int,
+                        stop: Optional[List[str]] = [], model: Optional[str] = None, *args):
+        # Check unsupported parameters
+        unsupported_arg = ['suffix', 'logprobs', 'echo',
+                           'frequency_penalty', 'presence_penalty', 'repeat_penalty',
+                           'tfs_z', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', 'model']
+        default_value = {'suffix': None, 'logprobs': None, 'echo': False,
+                         'frequency_penalty': 0.0, 'presence_penalty': 0.0,
+                         'repeat_penalty': 1.1, 'tfs_z': 1.0, 'mirostat_mode': 0,
+                         'mirostat_tau': 5.0, 'mirostat_eta': 0.1}
+        for index in range(len(args)):
+            if args[index] != default_value[unsupported_arg[index]]:
+                warnings.warn(f"The parameter {unsupported_arg[index]} is temporarily "
+                              "unsupported, please use the default value.")
+
+        if stream:
+            return self.stream(prompt, max_tokens, temperature, top_p, top_k, stop, model)
+        else:
+            return self._eval(prompt, max_tokens, temperature, top_p, top_k, stop, model)
+
+    def _eval(self, prompt: str, max_tokens: int, temperature: float, top_p: float, top_k: int,
+              stop: Optional[List[str]] = [], model: Optional[str] = None):
+
+        completion_id: str = f"cmpl-{str(uuid.uuid4())}"
+        created: int = int(time.time())
+        if model is None:
+            model_name = self.model_path
+        else:
+            model_name = model
+
+        input_tokens = self._tokenize(prompt)
+        prompt_len = len(input_tokens)
+        if max_tokens < 1:
+            return {
+                "id": completion_id,
+                "object": "text_completion",
+                "created": created,
+                "model": model_name,
+                "choices": [
+                    {
+                        "text": prompt,
+                        "index": 0,
+                        "logprobs": None,
+                        "finish_reason": "length",
+                    }
+                ],
+                "usage":
+                {
+                    "prompt_tokens": prompt_len,
+                    "completion_tokens": 0,
+                    "total_tokens": prompt_len,
+                }
+            }
+
+        n_past = 0
+        output_tokens = []
+        for i in range(max_tokens):
+            token = self.forward(input_ids=input_tokens,
+                                 n_past=n_past,
+                                 top_k=top_k,
+                                 top_p=top_p,
+                                 temperature=temperature)
+            output_tokens.append(token)
+            n_past += len(input_tokens)
+            input_tokens = [token]
+            if token == self.eos_token():
+                break
+
+        text = self.detokenize(output_tokens)
+        split_text = text
+        if stop != []:
+            for stop_word in stop:
+                split_text = split_text.split(stop_word)[0]
+        if split_text != text:
+            finish_reason = "stop"
+        else:
+            finish_reason = None
+        completion_len = n_past - prompt_len
+        return {
+            "id": completion_id,
+            "object": "text_completion",
+            "created": created,
+            "model": model_name,
+            "choices": [
+                {
+                    "text": prompt + split_text,
+                    "index": 0,
+                    "logprobs": None,
+                    "finish_reason": finish_reason,
+                }
+            ],
+            "usage": {
+                "prompt_tokens": prompt_len,
+                "completion_tokens": completion_len,
+                "total_tokens": prompt_len + completion_len,
+            }
+        }
+
+    def stream(self, prompt: str, max_tokens: int, temperature: float, top_p: float, top_k: int,
+               stop: Optional[List[str]] = [], model: Optional[str] = None):
+        completion_id: str = f"cmpl-{str(uuid.uuid4())}"
+        created: int = int(time.time())
+        if model is None:
+            model_name = self.model_path
+        else:
+            model_name = model
+        input_tokens = self._tokenize(prompt)
+        prompt_len = len(input_tokens)
+        if max_tokens < 1:
+            yield {
+                "id": completion_id,
+                "object": "text_completion",
+                "created": created,
+                "model": model_name,
+                "choices": [
+                    {
+                        "text": prompt,
+                        "index": 0,
+                        "logprobs": None,
+                        "finish_reason": "length",
+                    }
+                ],
+                "usage":
+                {
+                    "prompt_tokens": prompt_len
+                }
+            }
+        else:
+            n_past = 0
+            output_tokens = []
+            for i in range(max_tokens):
+                token = self.forward(input_ids=input_tokens,
+                                     n_past=n_past,
+                                     top_k=top_k,
+                                     top_p=top_p,
+                                     temperature=temperature)
+                output_tokens.append(token)
+                n_past += len(input_tokens)
+                input_tokens = [token]
+                if token == self.eos_token():
+                    print('\n')
+                    break
+                text = self.detokenize(token)
+                yield {
+                    "id": completion_id,
+                    "object": "text_completion",
+                    "created": created,
+                    "model": model_name,
+                    "choices": [
+                        {
+                            "text": text,
+                            "index": 0,
+                            "logprobs": None,
+                            "finish_reason": None,
+                        }
+                    ],
+                    "usage":
+                    {
+                        "prompt_tokens": prompt_len
+                    }
+                }
+
+    def _tokenize(self, text: str) -> List[int]:
+        """Tokenize a string.
+
+        Args:
+            text: The string to tokenize.
+
+        Raises:
+            RuntimeError: If the tokenization failed.
+
+        Returns:
+            A list of tokens.
+        """
+        return chatglm_tokenize(self.ctx, text)
+
+    def detokenize(self, tokens: List[int]) -> str:
+        """Detokenize a list of tokens.
+
+        Args:
+            tokens: The list of tokens to detokenize.
+
+        Returns:
+            The detokenized string.
+        """
+        if isinstance(tokens, int):
+            tokens = [tokens]
+        return chatglm_detokenize(self.ctx, tokens)
+
+    def forward(self,
+                input_ids: List[int],
+                n_past: int,
+                do_sample: bool = True,
+                top_k: int = 0,
+                top_p: float = 0.7,
+                temperature: float = 0.95,) -> int:
+        return chatglm_eval(ctx=self.ctx,
+                            input_ids=input_ids,
+                            n_past=n_past,
+                            do_sample=do_sample,
+                            top_k=top_k,
+                            top_p=top_p,
+                            temperature=temperature)
+
+    def eos_token(self) -> int:
+        return chatglm_eos_token(self.ctx)
diff --git a/python/llm/src/bigdl/llm/ggml/model/chatglm/chatglm_cpp.py b/python/llm/src/bigdl/llm/ggml/model/chatglm/chatglm_cpp.py
new file mode 100644
index 00000000..12b1a45e
--- /dev/null
+++ b/python/llm/src/bigdl/llm/ggml/model/chatglm/chatglm_cpp.py
@@ -0,0 +1,74 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# This makes sure Python is aware there is more than one sub-package within bigdl,
+# physically located elsewhere.
+# Otherwise there would be a module-not-found error in a non-pip setting, as Python would
+# only search the first bigdl package and end up finding only one sub-package.
+
+
+from typing import List
+from pathlib import Path
+
+from bigdl.llm.libs.chatglm_C import Pipeline, GenerationConfig
+
+
+class ChatGLMContext:
+    def __init__(self, pipeline: Pipeline, config: GenerationConfig):
+        self.pipeline = pipeline
+        self.config = config
+
+
+def chatglm_load(path: str,
+                 n_ctx: int,
+                 n_threads: int,
+                 use_mmap: bool = False,
+                 ) -> ChatGLMContext:
+    path = str(Path(path))
+    pipeline = Pipeline(path, use_mmap)
+    config = GenerationConfig(
+        max_context_length=n_ctx,
+        num_threads=n_threads,
+    )
+    return ChatGLMContext(pipeline, config)
+
+
+def chatglm_tokenize(ctx: ChatGLMContext, prompt: str) -> List[int]:
+    return ctx.pipeline.tokenizer.encode(prompt)
+
+
+def chatglm_detokenize(ctx: ChatGLMContext, input_ids: List[int]) -> str:
+    return ctx.pipeline.tokenizer.decode(input_ids)
+
+
+def chatglm_eval(ctx: ChatGLMContext,
+                 input_ids: List[int],
+                 n_past: int,
+                 do_sample: bool = True,
+                 top_k: int = 0,
+                 top_p: float = 0.7,
+                 temperature: float = 0.95,
+                 ) -> int:
+    ctx.config.do_sample = do_sample
+    ctx.config.top_k = top_k
+    ctx.config.top_p = top_p
+    ctx.config.temperature = temperature
+    return ctx.pipeline.model.generate_next_token(input_ids, ctx.config, n_past,
+                                                  ctx.config.max_context_length)
+
+
+def chatglm_eos_token(ctx: ChatGLMContext) -> int:
+    return ctx.pipeline.model.config.eos_token_id
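
Usage sketch (not part of the patch): the snippet below shows how the new wrapper might be called once the package is installed so that `bigdl.llm.libs.chatglm_C` can be loaded. The model path and prompts are placeholders; the import goes through the `chatglm` module directly because the new `__init__.py` does not re-export the class. The returned dicts follow the OpenAI-style layout produced by `_eval` and `stream` above.

    # Hypothetical usage sketch; the model path and prompts are placeholders.
    from bigdl.llm.ggml.model.chatglm.chatglm import ChatGLM

    # Load a chatglm-family GGML checkpoint with the parameters this PR supports.
    llm = ChatGLM("/path/to/chatglm-ggml-model.bin", n_ctx=512, n_threads=4)

    # One-shot completion: returns an OpenAI-style completion dict
    # (note that "text" includes the prompt followed by the generated text).
    result = llm("What is AI?", max_tokens=64, temperature=0.95, top_p=0.7)
    print(result["choices"][0]["text"])

    # Streaming: yields one OpenAI-style chunk per generated token.
    for chunk in llm("What is AI?", max_tokens=64, stream=True):
        print(chunk["choices"][0]["text"], end="", flush=True)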