Remove chatglm_C Module to Eliminate LGPL Dependency (#11178)

* remove chatglm_C.**.pyd to solve ngsolve weak copyleft vulnerability * fix style check error * remove chatglm native int4 from langchain
2024-05-31 17:03:11 +08:00 · 2024-05-31 17:03:11 +08:00 · 401013a630
commit 401013a630
parent 50b5f4476f
14 changed files with 19 additions and 690 deletions
--- a/.github/workflows/llm-binary-build.yml
+++ b/.github/workflows/llm-binary-build.yml
@ -72,12 +72,6 @@ jobs:
          export http_proxy=${HTTP_PROXY}
          export https_proxy=${HTTPS_PROXY}
          yum install -y gcc-toolset-11 cmake git
-          conda remove -n python39 --all -y
-          conda create -n python39 python=3.9 -y
-          conda remove -n python310 --all -y
-          conda create -n python310 python=3.10 -y
-          conda remove -n python311 --all -y
-          conda create -n python311 python=3.11 -y
      - uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # actions/checkout@v3
        with:
          repository: "intel-analytics/llm.cpp"
@ -109,42 +103,6 @@ jobs:
          mv build/libstarcoder-api.so release/libstarcoder-api.so
          mv build/quantize-starcoder release/quantize-starcoder
          mv build/libstarcoder.so release/libstarcoder_avxvnni.so
-      - name: Build Chatglm
-        shell: bash
-        run: |
-          source activate python39 || conda activate python39
-          cd src/chatglm
-          scl enable gcc-toolset-11 "cmake -B build"
-          scl enable gcc-toolset-11 "cmake --build build --config Release -j"
-      - name: Move Chatglm binaries
-        shell: bash
-        run: |
-          mv src/chatglm/build/main release/main-chatglm_vnni
-          mv src/chatglm/build/_C.cpython-39-x86_64-linux-gnu.so release/chatglm_C.cpython-39-x86_64-linux-gnu.so
-      - name: Build Chatglm Py310
-        shell: bash
-        run: |
-          source activate python310 || conda activate python310
-          cd src/chatglm
-          rm -r build
-          scl enable gcc-toolset-11 "cmake -B build"
-          scl enable gcc-toolset-11 "cmake --build build --config Release -j"
-      - name: Move Chatglm binaries Py310
-        shell: bash
-        run: |
-          mv src/chatglm/build/_C.cpython-310-x86_64-linux-gnu.so release/chatglm_C.cpython-310-x86_64-linux-gnu.so
-      - name: Build Chatglm Py311
-        shell: bash
-        run: |
-          source activate python311 || conda activate python311
-          cd src/chatglm
-          rm -r build
-          scl enable gcc-toolset-11 "cmake -B build"
-          scl enable gcc-toolset-11 "cmake --build build --config Release -j"
-      - name: Move Chatglm binaries Py311
-        shell: bash
-        run: |
-          mv src/chatglm/build/_C.cpython-311-x86_64-linux-gnu.so release/chatglm_C.cpython-311-x86_64-linux-gnu.so
      - name: Archive build files
        uses: actions/upload-artifact@v3
        with:
@ -155,9 +113,6 @@ jobs:
        shell: bash
        run: |
          make clean
-          conda remove -n python39 --all -y
-          conda remove -n python310 --all -y
-          conda remove -n python311 --all -y

  check-linux-avx512-artifact:
    if: ${{contains(inputs.platform, 'Linux')}}
@ -286,8 +241,6 @@ jobs:
          export http_proxy=${HTTP_PROXY}
          export https_proxy=${HTTPS_PROXY}
          yum install -y gcc-toolset-11 cmake git
-          conda remove -n python39 --all -y
-          conda create -n python39 python=3.9 -y
      - uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # actions/checkout@v3
        with:
          repository: "intel-analytics/llm.cpp"
@ -299,11 +252,6 @@ jobs:
        run: |
          scl enable gcc-toolset-11 "cmake -DONLYAVX=OFF -DONLYAVX2=OFF -B build"
          scl enable gcc-toolset-11 "cmake --build build --config Release -j"
-          # build chatglm
-          source activate python39 || conda activate python39
-          cd src/chatglm
-          scl enable gcc-toolset-11 "cmake -B build"
-          scl enable gcc-toolset-11 "cmake --build build --config Release -j"
      - name: Move amx release binary
        shell: bash
        run: |
@ -316,9 +264,6 @@ jobs:
          mv build/libgptneox.so amx_release/libgptneox_amx.so
          mv build/quantize-starcoder amx_release/quantize-starcoder_amx
          mv build/libstarcoder.so amx_release/libstarcoder_amx.so
-          # chatglm binary files
-          mv src/chatglm/build/main amx_release/main-chatglm_amx
-          # mv src/chatglm/build/_C.cpython-39-x86_64-linux-gnu.so amx_release/chatglm_C.cpython-39-x86_64-linux-gnu.so
      - name: Archive amx build files
        uses: actions/upload-artifact@v3
        with:
@ -329,7 +274,6 @@ jobs:
        shell: bash
        run: |
          make clean
-          conda remove -n python39 --all -y
          
  check-windows-avx2-artifact:
    if: ${{contains(inputs.platform, 'Windows')}}
@ -393,10 +337,6 @@ jobs:
    needs: check-windows-avx-vnni-artifact
    if: needs.check-windows-avx-vnni-artifact.outputs.if-exists == 'false'
    steps:
-      - name: Set up Python
-        uses: actions/setup-python@v4
-        with:
-          python-version: "3.9"
      - name: Set access token
        run: |
          echo "github_access_token=$env:GITHUB_ACCESS_TOKEN" >> $env:GITHUB_ENV
@ -438,47 +378,6 @@ jobs:
          # mv build/Release/main-starcoder.exe release/main-starcoder_vnni.exe
          mv build/Release/quantize-starcoder.exe release/quantize-starcoder_vnni.exe
          mv build/Release/starcoder.dll release/libstarcoder_vnni.dll
-      - name: Build Chatglm
-        shell: powershell
-        run: |
-          cd src/chatglm
-          cmake -DAVXVNNI=ON -B build
-          cmake --build build --config Release -j
-      - name: Move Chatglm binaries
-        shell: powershell
-        run: |
-          mv src/chatglm/build/Release/main.exe release/main-chatglm_vnni.exe
-          mv src/chatglm/build/Release/_C.cp39-win_amd64.pyd release/chatglm_C.cp39-win_amd64.pyd
-      - name: Set up Python
-        uses: actions/setup-python@v4
-        with:
-          python-version: "3.10"
-      - name: Build Chatglm Py310
-        shell: powershell
-        run: |
-          cd src/chatglm
-          rm -r build
-          cmake -DAVXVNNI=ON -B build
-          cmake --build build --config Release -j
-      - name: Move Chatglm binaries Py310
-        shell: powershell
-        run: |
-          mv src/chatglm/build/Release/_C.cp310-win_amd64.pyd release/chatglm_C.cp310-win_amd64.pyd
-      - name: Set up Python
-        uses: actions/setup-python@v4
-        with:
-          python-version: "3.11"
-      - name: Build Chatglm Py311
-        shell: powershell
-        run: |
-          cd src/chatglm
-          rm -r build
-          cmake -DAVXVNNI=ON -B build
-          cmake --build build --config Release -j
-      - name: Move Chatglm binaries Py311
-        shell: powershell
-        run: |
-          mv src/chatglm/build/Release/_C.cp311-win_amd64.pyd release/chatglm_C.cp311-win_amd64.pyd
      - name: Archive build files
        uses: actions/upload-artifact@v3
        with:
--- a/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/langchain_api.md
+++ b/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/langchain_api.md
@ -31,7 +31,7 @@ You may also convert Hugging Face *Transformers* models into native INT4 format,
 ```eval_rst
 .. note::

-   * Currently only llama/bloom/gptneox/starcoder/chatglm model families are supported; for other models, you may use the Hugging Face ``transformers`` INT4 format as described `above <./langchain_api.html#using-hugging-face-transformers-int4-format>`_.
+   * Currently only llama/bloom/gptneox/starcoder model families are supported; for other models, you may use the Hugging Face ``transformers`` INT4 format as described `above <./langchain_api.html#using-hugging-face-transformers-int4-format>`_.

   * You may choose the corresponding API developed for specific native models to load the converted model.
 ```
@ -41,9 +41,9 @@ from ipex_llm.langchain.llms import LlamaLLM
 from ipex_llm.langchain.embeddings import LlamaEmbeddings
 from langchain.chains.question_answering import load_qa_chain

-# switch to ChatGLMEmbeddings/GptneoxEmbeddings/BloomEmbeddings/StarcoderEmbeddings to load other models
+# switch to GptneoxEmbeddings/BloomEmbeddings/StarcoderEmbeddings to load other models
 embeddings = LlamaEmbeddings(model_path='/path/to/converted/model.bin')
-# switch to ChatGLMLLM/GptneoxLLM/BloomLLM/StarcoderLLM to load other models
+# switch to GptneoxLLM/BloomLLM/StarcoderLLM to load other models
 ipex_llm = LlamaLLM(model_path='/path/to/converted/model.bin')

 doc_chain = load_qa_chain(ipex_llm, ...)
--- a/docs/readthedocs/source/doc/PythonAPI/LLM/langchain.rst
+++ b/docs/readthedocs/source/doc/PythonAPI/LLM/langchain.rst
@ -31,7 +31,7 @@ IPEX-LLM provides ``TransformersLLM`` and ``TransformersPipelineLLM``, which imp
 Native Model
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, you could also use the following LLM wrappers with the native (cpp) implementation for maximum performance.
+For ``llama``/``bloom``/``gptneox``/``starcoder`` model families, you could also use the following LLM wrappers with the native (cpp) implementation for maximum performance.

 .. tabs::

@ -47,18 +47,6 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo
            .. automethod:: stream
            .. automethod:: get_num_tokens

-    .. tab:: ChatGLM
-
-        .. autoclass:: ipex_llm.langchain.llms.ChatGLMLLM
-            :members:
-            :undoc-members:
-            :show-inheritance:
-            :exclude-members: ggml_model, ggml_module, client, model_path, kwargs
-
-            .. automethod:: validate_environment
-            .. automethod:: stream
-            .. automethod:: get_num_tokens
-
    .. tab:: Bloom

        .. autoclass:: ipex_llm.langchain.llms.BloomLLM
--- a/python/llm/example/CPU/Native-Models/native_int4_pipeline.py
+++ b/python/llm/example/CPU/Native-Models/native_int4_pipeline.py
@ -36,8 +36,7 @@ def load(model_path, model_family, n_threads):
        "llama": LlamaForCausalLM,
        "gptneox": GptneoxForCausalLM,
        "bloom": BloomForCausalLM,
-        "starcoder": StarcoderForCausalLM,
-        "chatglm": ChatGLMForCausalLM
+        "starcoder": StarcoderForCausalLM
    }

    if model_family in model_family_to_class:
@ -55,7 +54,7 @@ def load(model_path, model_family, n_threads):

 def inference(llm, repo_id_or_model_path, model_family, prompt):

-    if model_family in ['llama', 'gptneox', 'bloom', 'starcoder', 'chatglm']:
+    if model_family in ['llama', 'gptneox', 'bloom', 'starcoder']:
        # ------ Option 1: Use IPEX-LLM based tokenizer
        print('-'*20, ' IPEX-LLM based tokenizer ', '-'*20)
        st = time.time()
@ -109,9 +108,9 @@ def main():
    parser.add_argument('--thread-num', type=int, default=2, required=True,
                        help='Number of threads to use for inference')
    parser.add_argument('--model-family', type=str, default='llama', required=True,
-                        choices=["llama", "llama2", "bloom", "gptneox", "starcoder", "chatglm"],
+                        choices=["llama", "llama2", "bloom", "gptneox", "starcoder"],
                        help="The model family of the large language model (supported option: 'llama', 'llama2', "
-                             "'gptneox', 'bloom', 'starcoder', 'chatglm')")
+                             "'gptneox', 'bloom', 'starcoder')")
    parser.add_argument('--repo-id-or-model-path', type=str, required=True,
                        help='The path to the huggingface checkpoint folder')
    parser.add_argument('--prompt', type=str, default='Once upon a time, there existed a little girl who liked to have adventures. ',
--- a/python/llm/setup.py
+++ b/python/llm/setup.py
@ -86,12 +86,7 @@ windows_binarys = [
    "quantize-llama_vnni.exe",
    "quantize-gptneox_vnni.exe",
    "quantize-bloom_vnni.exe",
-    "quantize-starcoder_vnni.exe",
-
-    "main-chatglm_vnni.exe",
-    "chatglm_C.cp39-win_amd64.pyd",
-    "chatglm_C.cp310-win_amd64.pyd",
-    "chatglm_C.cp311-win_amd64.pyd"
+    "quantize-starcoder_vnni.exe"
 ]
 linux_binarys = [
    "libllama_avx.so",
@ -125,13 +120,7 @@ linux_binarys = [
    "main-llama",
    "main-gptneox",
    "main-bloom",
-    "main-starcoder",
-
-    "main-chatglm_vnni",
-    "main-chatglm_amx",
-    "chatglm_C.cpython-39-x86_64-linux-gnu.so",
-    "chatglm_C.cpython-310-x86_64-linux-gnu.so",
-    "chatglm_C.cpython-311-x86_64-linux-gnu.so"
+    "main-starcoder"
 ]

 ext_lib_urls = [
--- a/python/llm/src/ipex_llm/ggml/convert.py
+++ b/python/llm/src/ipex_llm/ggml/convert.py
@ -76,10 +76,6 @@ def _convert_starcoder(model_path, outfile_dir, outtype):
    _convert_starcoder_hf_to_ggml(model_path, outfile_dir, outtype)


-def _convert_chatglm(model_path, outfile_dir, outtype):
-    return _convert_chatglm_hf_to_ggml(model_path, outfile_dir, outtype)
-
-
 def _convert_to_ggml(model_path: str, outfile_dir: str,
                     model_family: str = 'llama', outtype: str="fp16"):
    """
--- a/python/llm/src/ipex_llm/ggml/convert_model.py
+++ b/python/llm/src/ipex_llm/ggml/convert_model.py
@ -16,7 +16,7 @@
 import os
 import time
 from pathlib import Path
-from ipex_llm.ggml.convert import _convert_to_ggml, _convert_chatglm
+from ipex_llm.ggml.convert import _convert_to_ggml
 from ipex_llm.ggml.quantize import quantize
 from ipex_llm.utils.common import invalidInputError
 import argparse
@ -54,9 +54,9 @@ def convert_model(input_path: str,
    # make sure directory exists
    os.makedirs(output_path, exist_ok=True)
    # check input value
-    invalidInputError(model_family in ['llama', 'bloom', 'gptneox', 'starcoder', 'chatglm'],
+    invalidInputError(model_family in ['llama', 'bloom', 'gptneox', 'starcoder'],
                      "Now we only support quantization of model \
-                       family('llama', 'bloom', 'gptneox', 'starcoder', 'chatglm')",
+                       family('llama', 'bloom', 'gptneox', 'starcoder')",
                      "{} is not in the list.".format(model_family))
    invalidInputError(os.path.isdir(output_path),
                      "The output_path {} was not a directory".format(output_path))
@ -78,12 +78,6 @@ def convert_model(input_path: str,
                          family('llama', 'gptneox', 'starcoder')",
                          "{} is not in the list.".format(model_family))

-    # chatglm merges convertion and quantization into one operation.
-    if model_family == 'chatglm':
-        return _convert_chatglm(model_path=input_path,
-                                outfile_dir=output_path,
-                                outtype=dtype)
-
    if tmp_path is not None:
        model_name = Path(input_path).stem
        tmp_ggml_file_path = os.path.join(tmp_path, f'{model_name}_{int(time.time())}')
--- a/python/llm/src/ipex_llm/ggml/model/chatglm/__init__.py
+++ b/python/llm/src/ipex_llm/ggml/model/chatglm/__init__.py
@ -1,22 +0,0 @@
-#
-# Copyright 2016 The BigDL Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-# This would makes sure Python is aware there is more than one sub-package within bigdl,
-# physically located elsewhere.
-# Otherwise there would be module not found error in non-pip's setting as Python would
-# only search the first bigdl package and end up finding only one sub-package.
-
-from .chatglm import ChatGLM
--- a/python/llm/src/ipex_llm/ggml/model/chatglm/chatglm.py
+++ b/python/llm/src/ipex_llm/ggml/model/chatglm/chatglm.py
@ -1,428 +0,0 @@
-#
-# Copyright 2016 The BigDL Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# ===========================================================================
-#
-# This file is adapted from
-# https://github.com/abetlen/llama-cpp-python/blob/main/llama_cpp/llama.py
-#
-# MIT License
-#
-# Copyright (c) 2023 Andrei Betlen
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-
-# This would makes sure Python is aware there is more than one sub-package within bigdl,
-# physically located elsewhere.
-# Otherwise there would be module not found error in non-pip's setting as Python would
-# only search the first bigdl package and end up finding only one sub-package.
-
-
-from .chatglm_cpp import chatglm_load, chatglm_tokenize, chatglm_detokenize, \
-    chatglm_forward, chatglm_eos_token
-from ipex_llm.utils.common import invalidInputError
-from ipex_llm.ggml.model.generation import GenerationMixin
-from typing import List, Optional, Generator, Sequence, Union
-import time
-import uuid
-import warnings
-
-
-class ChatGLM(GenerationMixin):
-    """High-level Python wrapper for a chatglm.cpp model."""
-
-    def __init__(
-        self,
-        model_path: str,
-        n_ctx: int = 512,
-        n_parts: int = -1,
-        n_gpu_layers: int = 0,
-        seed: int = -1,
-        f16_kv: bool = True,
-        logits_all: bool = False,
-        vocab_only: bool = False,
-        use_mmap: bool = False,
-        use_mlock: bool = False,
-        embedding: bool = False,
-        n_threads: Optional[int] = -1,
-        n_batch: int = 512,
-        last_n_tokens_size: int = 64,
-        lora_base: Optional[str] = None,
-        lora_path: Optional[str] = None,
-        verbose: bool = True,
-    ):
-        """Load a chatglm.cpp model from `model_path`.
-
-        Args:
-            model_path: Path to the model.
-            n_ctx: Maximum context size.
-            n_parts: Number of parts to split the model into. If -1, the number of parts
-            is automatically determined.
-            seed: Random seed. For default value -1, current timestamp is used as seed.
-            f16_kv: Use half-precision for key/value cache.
-            logits_all: Return logits for all tokens, not just the last token.
-            vocab_only: Only load the vocabulary no weights.
-            use_mmap: Use mmap if possible.
-            use_mlock: Force the system to keep the model in RAM.
-            embedding: Embedding mode only.
-            n_threads: Number of threads to use. Default to be -1, means auto.
-            n_batch: Maximum number of prompt tokens to batch together when calling chatglm_eval.
-            last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
-            lora_base: Optional path to base model, useful if using a quantized base model and
-            you want to apply LoRA to an f16 model.
-            lora_path: Path to a LoRA file to apply to the model.
-            verbose: Print verbose output to stderr.
-
-        Raises:
-            ValueError: If the model path does not exist.
-
-        Returns:
-            A ChatGLM instance.
-        """
-
-        self.model_path = model_path
-        self.ctx = chatglm_load(model_path, use_mmap=use_mmap, n_ctx=n_ctx, n_threads=n_threads)
-        self.n_ctx = n_ctx
-        self.n_parts = n_parts
-        self.n_gpu_layers = n_gpu_layers
-        self.f16_kv = f16_kv
-        self.seed = seed
-        self.logits_all = logits_all
-        self.vocab_only = vocab_only
-        self.use_mmap = use_mmap
-        self.use_mlock = use_mlock
-        self.embedding = embedding
-        self.n_threads = n_threads
-        self.n_batch = n_batch
-        self.last_n_tokens_size = last_n_tokens_size
-        self.lora_base = lora_base
-        self.lora_path = lora_path
-        self.verbose = verbose
-        # TODO: Some parameters are temporarily not supported
-        unsupported_arg = {'n_parts': -1, 'n_gpu_layers': 0, 'f16_kv': True, 'logits_all': False,
-                           'vocab_only': False, 'use_mlock': False, 'embedding': False,
-                           'n_batch': 512, 'last_n_tokens_size': 64, 'lora_base': None,
-                           'lora_path': None, 'verbose': True}
-        for arg in unsupported_arg.keys():
-            if getattr(self, arg) != unsupported_arg[arg]:
-                warnings.warn(f"The parameter {arg} is temporarily unsupported, "
-                              "please use the default value.")
-
-    def __call__(
-        self,
-        prompt: str,
-        suffix: Optional[str] = None,
-        max_tokens: int = 128,
-        temperature: float = 0.95,
-        top_p: float = 0.7,
-        logprobs: Optional[int] = None,
-        echo: bool = False,
-        stop: Optional[Union[str, List[str]]]=[],
-        frequency_penalty: float = 0.0,
-        presence_penalty: float = 0.0,
-        repeat_penalty: float = 1.1,
-        top_k: int = 0,
-        stream: bool = False,
-        tfs_z: float = 1.0,
-        mirostat_mode: int = 0,
-        mirostat_tau: float = 5.0,
-        mirostat_eta: float = 0.1,
-        model: Optional[str] = None,
-    ):
-        # TODO: Some parameters are temporarily not supported
-        # Unsupported parameters are checked in `_supported_call`
-        return self._supported_call(prompt, max_tokens, stream, temperature, top_p, top_k,
-                                    stop, model, suffix, logprobs, echo, frequency_penalty,
-                                    presence_penalty, repeat_penalty, tfs_z, mirostat_mode,
-                                    mirostat_tau, mirostat_eta)
-
-    def _supported_call(self, prompt: str, max_tokens: int, stream: bool,
-                        temperature: float, top_p: float, top_k: int,
-                        stop: Optional[List[str]] = [], model: Optional[str] = None, *args):
-        # Check unsupporeted parameters
-        unsupported_arg = ['suffix', 'logprobs', 'echo',
-                           'frequency_penalty', 'presence_penalty', 'repeat_penalty',
-                           'tfs_z', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', 'model']
-        defult_value = {'suffix': None, 'logprobs': None, 'echo': False,
-                        'frequency_penalty': 0.0, 'presence_penalty': 0.0,
-                        'repeat_penalty': 1.1, 'tfs_z': 1.0, 'mirostat_mode': 0,
-                        'mirostat_tau': 5.0, 'mirostat_eta': 0.1}
-        for index in range(len(args)):
-            if args[index] != defult_value[unsupported_arg[index]]:
-                warnings.warn(f"The parameter {unsupported_arg[index]} is temporarily "
-                              "unsupported, please use the default value.")
-
-        if stream:
-            return self.stream(prompt, max_tokens, temperature, top_p, top_k, stop, model)
-        else:
-            return self._eval(prompt, max_tokens, temperature, top_p, top_k, stop, model)
-
-    def _eval(self, prompt: str, max_tokens: int, temperature: float, top_p: float, top_k: int,
-              stop: Optional[List[str]] = [], model: Optional[str] = None):
-
-        completion_id: str = f"cmpl-{str(uuid.uuid4())}"
-        created: int = int(time.time())
-        if model is None:
-            model_name = self.model_path
-        else:
-            model_name = model
-
-        input_tokens = self._tokenize(prompt)
-        prompt_len = len(input_tokens)
-        if max_tokens < 1:
-            return {
-                "id": completion_id,
-                "object": "text_completion",
-                "created": created,
-                "model": model_name,
-                "choices": [
-                    {
-                        "text": prompt,
-                        "index": 0,
-                        "logprobs": None,
-                        "finish_reason": "length",
-                    }
-                ],
-                "usage":
-                {
-                    "prompt_tokens": prompt_len,
-                    "completion_tokens": 0,
-                    "total_tokens": prompt_len,
-                }
-            }
-
-        for i in range(max_tokens):
-            token = self.forward(input_ids=input_tokens,
-                                 top_k=top_k,
-                                 top_p=top_p,
-                                 temperature=temperature)
-            input_tokens.append(token)
-            if token == self.eos_token():
-                break
-
-        text = self.detokenize(input_tokens)
-        split_text = text[len(prompt):]
-        split_text.rstrip('<EFBFBD>')  # remove partial emoji
-        if stop != []:
-            for stop_word in stop:
-                split_text = split_text.split(stop_word)[0]
-        if split_text != text:
-            finish_reason = "stop"
-        else:
-            finish_reason = None
-        completion_len = len(input_tokens) - prompt_len
-        return {
-            "id": completion_id,
-            "object": "text_completion",
-            "created": created,
-            "model": model_name,
-            "choices": [
-                {
-                    "text": prompt + split_text,
-                    "index": 0,
-                    "logprobs": None,
-                    "finish_reason": finish_reason,
-                }
-            ],
-            "usage": {
-                "prompt_tokens": prompt_len,
-                "completion_tokens": completion_len,
-                "total_tokens": prompt_len + completion_len,
-            }
-        }
-
-    def stream(self, prompt: str, max_tokens: int, temperature: float, top_p: float, top_k: int,
-               stop: Optional[List[str]] = [], model: Optional[str] = None):
-        completion_id: str = f"cmpl-{str(uuid.uuid4())}"
-        created: int = int(time.time())
-        if model is None:
-            model_name = self.model_path
-        else:
-            model_name = model
-        input_tokens = self._tokenize(prompt)
-        prompt_len = len(input_tokens)
-        if max_tokens < 1:
-            yield {
-                "id": completion_id,
-                "object": "text_completion",
-                "created": created,
-                "model": model_name,
-                "choices": [
-                    {
-                        "text": prompt,
-                        "index": 0,
-                        "logprobs": None,
-                        "finish_reason": "length",
-                    }
-                ],
-                "usage": {
-                    "prompt_tokens": prompt_len
-                }
-            }
-        else:
-            history_text = prompt
-            for i in range(max_tokens):
-                token = self.forward(input_ids=input_tokens,
-                                     top_k=top_k,
-                                     top_p=top_p,
-                                     temperature=temperature)
-                input_tokens.append(token)
-                if token == self.eos_token():
-                    print('\n')
-                    break
-                text = self.detokenize(input_tokens)
-                if text.endswith('<EFBFBD>'):
-                    # generated new token is part of an emoji
-                    # (some emoji consists of multiple tokens)
-                    # continue to generate more tokens to decode this emoji
-                    continue
-                text = text[len(history_text):]
-                history_text += text
-                yield {
-                    "id": completion_id,
-                    "object": "text_completion",
-                    "created": created,
-                    "model": model_name,
-                    "choices": [
-                        {
-                            "text": text,
-                            "index": 0,
-                            "logprobs": None,
-                            "finish_reason": None,
-                        }
-                    ],
-                    "usage": {
-                        "prompt_tokens": prompt_len
-                    }
-                }
-
-    def _tokenize(self, text: str, *args) -> List[int]:
-        """Tokenize a string.
-
-        Args:
-            text: The string to tokenize.
-
-        Raises:
-            RuntimeError: If the tokenization failed.
-
-        Returns:
-            A list of tokens.
-        """
-        warnings.warn("The parameter `add_bos` is unsupported, please use the default value.")
-        return chatglm_tokenize(self.ctx, text)
-
-    def detokenize(self, tokens: List[int]) -> str:
-        """Detokenize a list of tokens.
-
-        Args:
-            tokens: The list of tokens to detokenize.
-
-        Returns:
-            The detokenized string.
-        """
-        if isinstance(tokens, int):
-            tokens = [tokens]
-        return chatglm_detokenize(self.ctx, tokens)
-
-    def forward(self,
-                input_ids: List[int],
-                do_sample: bool = True,
-                top_k: int = 0,
-                top_p: float = 0.7,
-                temperature: float = 0.95,) -> int:
-        return chatglm_forward(ctx=self.ctx,
-                               input_ids=input_ids,
-                               do_sample=do_sample,
-                               top_k=top_k,
-                               top_p=top_p,
-                               temperature=temperature)
-
-    def eos_token(self) -> int:
-        return chatglm_eos_token(self.ctx)
-
-    def _generate(
-        self,
-        tokens: Sequence[int],
-        top_k: int = 0,
-        top_p: float = 0.7,
-        temp: float = 0.95,
-        repeat_penalty: float = 1.1,
-        reset: bool = True,
-        frequency_penalty: float = 0.0,
-        presence_penalty: float = 0.0,
-        tfs_z: float = 1.0,
-        mirostat_mode: int = 0,
-        mirostat_tau: float = 5.0,
-        mirostat_eta: float = 0.1,
-    ) -> Generator[int, Optional[Sequence[int]], None]:
-        """Create a generator of tokens from a prompt.
-
-        Examples:
-            >>> llm = ChatGLM(your_model_path)
-            >>> tokens = llm._tokenize(b"Learning English is")
-            >>> for token in llm._generate(tokens):
-            >>>     print(llm.detokenize([token]).decode("utf-8", errors="ignore"))
-
-        Args:
-            tokens: The prompt tokens.
-
-        Yields:
-            The generated tokens.
-        """
-        # TODO: Some parameters are temporarily not supported
-        # Unsupported parameters are checked in `_supported_generate`
-        return self._supported_generate(tokens, top_k, top_p, temp, repeat_penalty, reset,
-                                        frequency_penalty, presence_penalty, tfs_z, mirostat_mode,
-                                        mirostat_tau, mirostat_eta)
-
-    def _supported_generate(self, tokens: Sequence[int], top_k: int = 0, top_p: float = 0.7,
-                            temp: float = 0.95, *args):
-        # Check unsupporeted parameters
-        unsupported_arg = ['repeat_penalty', 'reset', 'frequency_penalty', 'presence_penalty',
-                           'tfs_z', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta']
-        defult_value = {'repeat_penalty': 1.1, 'reset': True, 'frequency_penalty': 0.0,
-                        'presence_penalty': 0.0, 'tfs_z': 1.0, 'mirostat_mode': 0,
-                        'mirostat_tau': 5.0, 'mirostat_eta': 0.1}
-        for index in range(len(args)):
-            if args[index] != defult_value[unsupported_arg[index]]:
-                warnings.warn(f"The parameter {unsupported_arg[index]} is temporarily "
-                              "unsupported, please use the default value.")
-
-        invalidInputError(self.ctx is not None, "The attribute `ctx` of `ChatGLM` object is None.")
-        while True:
-            token = self.forward(input_ids=tokens,
-                                 top_k=top_k,
-                                 top_p=top_p,
-                                 temperature=temp)
-            tokens_or_none = yield token
-            tokens.append(token)
-            if tokens_or_none is not None:
-                tokens.extend(tokens_or_none)
--- a/python/llm/src/ipex_llm/ggml/model/chatglm/chatglm_cpp.py
+++ b/python/llm/src/ipex_llm/ggml/model/chatglm/chatglm_cpp.py
@ -1,72 +0,0 @@
-#
-# Copyright 2016 The BigDL Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-# This would makes sure Python is aware there is more than one sub-package within bigdl,
-# physically located elsewhere.
-# Otherwise there would be module not found error in non-pip's setting as Python would
-# only search the first bigdl package and end up finding only one sub-package.
-
-
-from typing import List
-from pathlib import Path
-
-from ipex_llm.libs.chatglm_C import Pipeline, GenerationConfig
-
-
-class ChatGLMContext:
-    def __init__(self, pipeline: Pipeline, config: GenerationConfig):
-        self.pipeline = pipeline
-        self.config = config
-
-
-def chatglm_load(path: str,
-                 n_ctx: int,
-                 n_threads: int,
-                 use_mmap: bool = False,
-                 ) -> ChatGLMContext:
-    path = str(Path(path))
-    pipeline = Pipeline(path, use_mmap)
-    config = GenerationConfig(
-        max_length=n_ctx,
-        num_threads=n_threads,
-    )
-    return ChatGLMContext(pipeline, config)
-
-
-def chatglm_tokenize(ctx: ChatGLMContext, prompt: str) -> List[int]:
-    return ctx.pipeline.tokenizer.encode(prompt)
-
-
-def chatglm_detokenize(ctx: ChatGLMContext, input_ids: List[int]) -> str:
-    return ctx.pipeline.tokenizer.decode(input_ids)
-
-
-def chatglm_forward(ctx: ChatGLMContext,
-                    input_ids: List[int],
-                    do_sample: bool = True,
-                    top_k: int = 0,
-                    top_p: float = 0.7,
-                    temperature: float = 0.95,
-                    ) -> int:
-    ctx.config.do_sample = do_sample
-    ctx.config.top_k = top_k
-    ctx.config.top_p = top_p
-    ctx.config.temperature = temperature
-    return ctx.pipeline.forward(input_ids, ctx.config)
-
-
-def chatglm_eos_token(ctx: ChatGLMContext):
-    return ctx.pipeline.model.config.eos_token_id
--- a/python/llm/src/ipex_llm/langchain/llms/__init__.py
+++ b/python/llm/src/ipex_llm/langchain/llms/__init__.py
@ -32,7 +32,6 @@ __all__ = [
    "LlamaLLM",
    "BloomLLM",
    "GptneoxLLM",
-    "ChatGLMLLM",
    "StarcoderLLM",
    "TransformersLLM",
    "TransformersPipelineLLM"
@ -43,7 +42,6 @@ type_to_cls_dict: Dict[str, Type[BaseLLM]] = {
    "LlamaLLM": LlamaLLM,
    "BloomLLM": BloomLLM,
    "GptneoxLLM": GptneoxLLM,
-    "ChatGLMLLM": ChatGLMLLM,
    "StarcoderLLM": StarcoderLLM,
    "TransformersPipelineLLM": TransformersPipelineLLM,
    "TransformersLLM": TransformersLLM
--- a/python/llm/src/ipex_llm/langchain/llms/bigdlllm.py
+++ b/python/llm/src/ipex_llm/langchain/llms/bigdlllm.py
@ -70,14 +70,13 @@ class BigdlNativeLLM(LLM):
                    "please switch to the new LLM API for sepcific models.")

    model_family: str = "llama"
-    """The model family: currently supports llama, gptneox, bloom, starcoder and chatglm."""
+    """The model family: currently supports llama, gptneox, bloom, starcoder."""

    family_info = {
        'llama': {'module': "ipex_llm.models" , 'class': "Llama"},
        'bloom': {'module': "ipex_llm.models", 'class': "Bloom"},
        'gptneox': {'module': "ipex_llm.models", 'class': "Gptneox"},
        'starcoder': {'module':"ipex_llm.models", 'class': "Starcoder"},
-        'chatglm': {'module':"ipex_llm.ggml.model.chatglm", 'class': "ChatGLM"},
    }  #: :meta private:
    """Info necessary for different model families initiation and configure."""

@ -688,11 +687,6 @@ class GptneoxLLM(_BaseCausalLM):
    ggml_module = "ipex_llm.models"


-class ChatGLMLLM(_BaseCausalLM):
-    ggml_model = "ChatGLM"
-    ggml_module = "ipex_llm.ggml.model.chatglm"
-
-
 class StarcoderLLM(_BaseCausalLM):
    ggml_model = "Starcoder"
    ggml_module = "ipex_llm.models"
--- a/python/llm/src/ipex_llm/models.py
+++ b/python/llm/src/ipex_llm/models.py
@ -23,5 +23,3 @@ from ipex_llm.ggml.model.llama import Llama
 from ipex_llm.ggml.model.gptneox import Gptneox
 from ipex_llm.ggml.model.bloom import Bloom
 from ipex_llm.ggml.model.starcoder import Starcoder
-# temporarily disable until linux binary file for chatglm ready
-# from ipex_llm.ggml.model.chatglm import ChatGLM
--- a/python/llm/src/ipex_llm/transformers/modelling_bigdl.py
+++ b/python/llm/src/ipex_llm/transformers/modelling_bigdl.py
@ -42,8 +42,7 @@ class BigdlNativeForCausalLM:
        :param pretrained_model_name_or_path: Path for converted BigDL-LLM optimized ggml
               binary checkpoint. The checkpoint should be converted by ``ipex_llm.llm_convert``.
        :param model_family: The model family of the pretrained checkpoint.
-               Currently we support ``"llama"``, ``"bloom"``, ``"gptneox"``, ``"starcoder"``
-               and ``"chatglm"``.
+               Currently we support ``"llama"``, ``"bloom"``, ``"gptneox"``, ``"starcoder"``.
        :param dtype: Which quantized precision will be converted.
                Now only `int4` and `int8` are supported, and `int8` only works for `llama`
                , `gptneox` and `starcoder`.
@ -58,9 +57,9 @@ class BigdlNativeForCausalLM:
        """
        logging.warning("BigdlNativeForCausalLM has been deprecated, "
                        "please switch to the new CausalLM API for sepcific models.")
-        invalidInputError(model_family in ['llama', 'gptneox', 'bloom', 'starcoder', 'chatglm'],
+        invalidInputError(model_family in ['llama', 'gptneox', 'bloom', 'starcoder'],
                          "Now we only support model family: 'llama', 'gptneox', 'bloom',"
-                          " 'starcoder', 'chatglm', '{}' is not in the list.".format(model_family))
+                          " 'starcoder', '{}' is not in the list.".format(model_family))
        invalidInputError(dtype.lower() in ['int4', 'int8'],
                          "Now we only support int4 and int8 as date type for weight")

@ -78,9 +77,6 @@ class BigdlNativeForCausalLM:
        elif model_family == 'starcoder':
            from ipex_llm.ggml.model.starcoder import Starcoder
            return Starcoder(model_path=ggml_model_path, **kwargs)
-        elif model_family == 'chatglm':
-            from ipex_llm.ggml.model.chatglm import ChatGLM
-            return ChatGLM(model_path=ggml_model_path, **kwargs)


 class _BaseGGMLClass:
@ -110,9 +106,9 @@ class _BaseGGMLClass:
        :return: a model instance
        """
        try:
-            module = importlib.import_module(cls.GGML_Module)
-            class_ = getattr(module, cls.GGML_Model)
            if native:
+                module = importlib.import_module(cls.GGML_Module)
+                class_ = getattr(module, cls.GGML_Model)
                invalidInputError(dtype.lower() in ['int4', 'int8'],
                                  "Now we only support int4 and int8 as date type for weight")
                ggml_model_path = pretrained_model_name_or_path