diff --git a/python/llm/src/bigdl/llm/convert_model.py b/python/llm/src/bigdl/llm/convert_model.py
index 2600cb80..026dcae6 100644
--- a/python/llm/src/bigdl/llm/convert_model.py
+++ b/python/llm/src/bigdl/llm/convert_model.py
@@ -124,7 +124,7 @@ def main():
                         help=("outfile,save path of output quantized model."))
     parser.add_argument('-x', '--model-family', type=str, required=True,
                         help=("--model-family: Which model family your input model belongs to."
-                              "Now only `llama`/`bloom`/`gptneox` are supported."))
+                              "Now only `llama`/`bloom`/`gptneox`/`chatglm` are supported."))
     parser.add_argument('-f', '--model-format', type=str, required=True,
                         help=("The model type to be convert to a ggml compatible file."
                               "Now only `pth`/`gptq` are supported."))
diff --git a/python/llm/src/bigdl/llm/ggml/convert.py b/python/llm/src/bigdl/llm/ggml/convert.py
index 4ee2ce17..a332caf1 100644
--- a/python/llm/src/bigdl/llm/ggml/convert.py
+++ b/python/llm/src/bigdl/llm/ggml/convert.py
@@ -77,7 +77,7 @@ def _convert_starcoder(model_path, outfile_dir, outtype):
 
 
 def _convert_chatglm(model_path, outfile_dir, outtype):
-    _convert_chatglm_hf_to_ggml(model_path, outfile_dir, outtype)
+    return _convert_chatglm_hf_to_ggml(model_path, outfile_dir, outtype)
 
 
 def _convert_to_ggml(model_path: str, outfile_dir: str,
diff --git a/python/llm/src/bigdl/llm/ggml/convert_model.py b/python/llm/src/bigdl/llm/ggml/convert_model.py
index d17ee0a6..5b76dca7 100644
--- a/python/llm/src/bigdl/llm/ggml/convert_model.py
+++ b/python/llm/src/bigdl/llm/ggml/convert_model.py
@@ -80,10 +80,9 @@ def convert_model(input_path: str,
 
     # chatglm merges convertion and quantization into one operation.
     if model_family == 'chatglm':
-        _convert_chatglm(model_path=input_path,
-                         outfile_dir=output_path,
-                         outtype=dtype)
-        return
+        return _convert_chatglm(model_path=input_path,
+                                outfile_dir=output_path,
+                                outtype=dtype)
 
     if tmp_path is not None:
         model_name = Path(input_path).stem
diff --git a/python/llm/src/bigdl/llm/ggml/model/chatglm/__init__.py b/python/llm/src/bigdl/llm/ggml/model/chatglm/__init__.py
index dbdafd2a..ac7933c5 100644
--- a/python/llm/src/bigdl/llm/ggml/model/chatglm/__init__.py
+++ b/python/llm/src/bigdl/llm/ggml/model/chatglm/__init__.py
@@ -18,3 +18,5 @@
 # physically located elsewhere.
 # Otherwise there would be module not found error in non-pip's setting as Python would
 # only search the first bigdl package and end up finding only one sub-package.
+
+from .chatglm import ChatGLM
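With the hunks above, chatglm conversion is wired into `convert_model`, and the path of the saved ggml file is now returned to the caller instead of being dropped. A minimal sketch of the intended call, assuming the keyword names visible in the hunk (`input_path`, `output_path`, `model_family`, `dtype`) and hypothetical local paths:

    from bigdl.llm.ggml.convert_model import convert_model

    # chatglm merges conversion and quantization into one step; the return
    # value is now the path of the written ggml file (paths are hypothetical).
    ggml_path = convert_model(input_path="./chatglm2-6b-hf",
                              output_path="./converted",
                              model_family="chatglm",
                              dtype="int4")
    print(ggml_path)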
diff --git a/python/llm/src/bigdl/llm/ggml/model/chatglm/chatglm.py b/python/llm/src/bigdl/llm/ggml/model/chatglm/chatglm.py
index 66be18cd..a6891705 100644
--- a/python/llm/src/bigdl/llm/ggml/model/chatglm/chatglm.py
+++ b/python/llm/src/bigdl/llm/ggml/model/chatglm/chatglm.py
@@ -56,7 +56,7 @@ import uuid
 import warnings
 
 
-class ChatGLM:
+class ChatGLM(GenerationMixin):
     """High-level Python wrapper for a chatglm.cpp model."""
 
     def __init__(
@@ -327,7 +327,7 @@ class ChatGLM:
             }
         }
 
-    def _tokenize(self, text: bytes) -> List[int]:
+    def _tokenize(self, text: bytes, *args) -> List[int]:
         """Tokenize a string.
 
         Args:
@@ -339,9 +339,10 @@ class ChatGLM:
         Returns:
             A list of tokens.
         """
+        warnings.warn("The parameter `add_bos` is unsupported, please use the default value.")
         return chatglm_tokenize(self.ctx, text)
 
-    def detokenize(self, tokens: List[int]) -> bytes:
+    def detokenize(self, tokens: List[int]) -> str:
         """Detokenize a list of tokens.
 
         Args:
@@ -371,3 +372,65 @@ class ChatGLM:
 
     def eos_token(self) -> int:
         return chatglm_eos_token(self.ctx)
+
+    def _generate(
+        self,
+        tokens: Sequence[int],
+        top_k: int = 0,
+        top_p: float = 0.7,
+        temp: float = 0.95,
+        repeat_penalty: float = 1.1,
+        reset: bool = True,
+        frequency_penalty: float = 0.0,
+        presence_penalty: float = 0.0,
+        tfs_z: float = 1.0,
+        mirostat_mode: int = 0,
+        mirostat_tau: float = 5.0,
+        mirostat_eta: float = 0.1,
+    ) -> Generator[int, Optional[Sequence[int]], None]:
+        """Create a generator of tokens from a prompt.
+
+        Examples:
+            >>> llm = ChatGLM(your_model_path)
+            >>> tokens = llm._tokenize(b"Learning English is")
+            >>> for token in llm._generate(tokens):
+            ...     print(llm.detokenize([token]))
+
+        Args:
+            tokens: The prompt tokens.
+
+        Yields:
+            The generated tokens.
+        """
+        # TODO: Some parameters are temporarily not supported
+        # Unsupported parameters are checked in `_supported_generate`
+        return self._supported_generate(tokens, top_k, top_p, temp, repeat_penalty, reset,
+                                        frequency_penalty, presence_penalty, tfs_z, mirostat_mode,
+                                        mirostat_tau, mirostat_eta)
+
+    def _supported_generate(self, tokens: Sequence[int], top_k: int = 0, top_p: float = 0.7,
+                            temp: float = 0.95, *args):
+        # Check unsupported parameters
+        unsupported_arg = ['repeat_penalty', 'reset', 'frequency_penalty', 'presence_penalty',
+                           'tfs_z', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta']
+        default_value = {'repeat_penalty': 1.1, 'reset': True, 'frequency_penalty': 0.0,
+                         'presence_penalty': 0.0, 'tfs_z': 1.0, 'mirostat_mode': 0,
+                         'mirostat_tau': 5.0, 'mirostat_eta': 0.1}
+        for index in range(len(args)):
+            if args[index] != default_value[unsupported_arg[index]]:
+                warnings.warn(f"The parameter {unsupported_arg[index]} is temporarily "
+                              "unsupported, please use the default value.")
+
+        invalidInputError(self.ctx is not None, "The attribute `ctx` of `ChatGLM` object is None.")
+        n_past = 0
+        while True:
+            token = self.forward(input_ids=tokens,
+                                 n_past=n_past,
+                                 top_k=top_k,
+                                 top_p=top_p,
+                                 temperature=temp)
+            n_past += len(tokens)
+            tokens_or_none = yield token
+            tokens = [token]
+            if tokens_or_none is not None:
+                tokens.extend(tokens_or_none)
diff --git a/python/llm/src/bigdl/llm/ggml/model/generation/utils.py b/python/llm/src/bigdl/llm/ggml/model/generation/utils.py
index a033d2e9..5e61bdfc 100644
--- a/python/llm/src/bigdl/llm/ggml/model/generation/utils.py
+++ b/python/llm/src/bigdl/llm/ggml/model/generation/utils.py
@@ -72,7 +72,11 @@ class GenerationMixin:
         :param tokens: list of ids that indicates the tokens, mostly generated by generate
         :return: decoded string
         '''
-        return self.detokenize(tokens).decode()
+        output = self.detokenize(tokens)
+        if isinstance(output, str):
+            return output
+        else:
+            return output.decode()
 
     def batch_decode(self,
                      tokens: Union[List[int], List[List[int]]]) -> str:
diff --git a/python/llm/src/bigdl/llm/models.py b/python/llm/src/bigdl/llm/models.py
index e81444b5..0a1a2123 100644
--- a/python/llm/src/bigdl/llm/models.py
+++ b/python/llm/src/bigdl/llm/models.py
@@ -23,3 +23,5 @@ from bigdl.llm.ggml.model.llama import Llama
 from bigdl.llm.ggml.model.gptneox import Gptneox
 from bigdl.llm.ggml.model.bloom import Bloom
 from bigdl.llm.ggml.model.starcoder import Starcoder
+# temporarily disabled until the linux binary file for chatglm is ready
+# from bigdl.llm.ggml.model.chatglm import ChatGLM
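The `chatglm.py` hunks above add a llama-cpp-python-style token generator, and `detokenize` now returns `str` rather than `bytes` (the `GenerationMixin.decode` change keeps `decode` working for both return types across model families). A short usage sketch under the new API, with a hypothetical model path; note that no `.decode()` is needed on the output any more:

    from bigdl.llm.ggml.model.chatglm import ChatGLM

    llm = ChatGLM(model_path="./converted/chatglm2-ggml-q4_0.bin")  # hypothetical path
    tokens = llm._tokenize(b"Learning English is")

    # _generate yields token ids indefinitely; the caller decides when to stop.
    for i, token in enumerate(llm._generate(tokens, top_k=0, top_p=0.7, temp=0.95)):
        if i >= 32:
            break
        print(llm.detokenize([token]), end="", flush=True)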
diff --git a/python/llm/src/bigdl/llm/transformers/modelling_bigdl.py b/python/llm/src/bigdl/llm/transformers/modelling_bigdl.py
index dc89b573..9170f5ec 100644
--- a/python/llm/src/bigdl/llm/transformers/modelling_bigdl.py
+++ b/python/llm/src/bigdl/llm/transformers/modelling_bigdl.py
@@ -38,12 +38,13 @@ class BigdlNativeForCausalLM:
         :param pretrained_model_name_or_path: Path for converted BigDL-LLM optimized ggml
                binary checkpoint. The checkpoint should be converted by ``bigdl.llm.llm_convert``.
         :param model_family: The model family of the pretrained checkpoint.
-               Currently we support ``"llama"``, ``"bloom"``, ``"gptneox"`` and ``"starcoder"``.
+               Currently we support ``"llama"``, ``"bloom"``, ``"gptneox"``, ``"starcoder"``
+               and ``"chatglm"``.
         :param dtype: Which quantized precision will be converted.
                Now only `int4` and `int8` are supported, and `int8` only works for `llama`
                , `gptneox` and `starcoder`.
         :param cache_dir: (optional) This parameter will only be used when
-               ``pretrained_model_name_or_path`` is a hugginface checkpoint or hub repo id.
+               ``pretrained_model_name_or_path`` is a huggingface checkpoint or hub repo id.
               It indicates the saving path for the converted low precision model.
         :param tmp_path: (optional) Which path to store the intermediate fp16 model during
                the conversion process. Default to `None` so that intermediate model will not be saved.
@@ -51,9 +52,9 @@ class BigdlNativeForCausalLM:
         :return: a model instance
         """
 
-        invalidInputError(model_family in ['llama', 'gptneox', 'bloom', 'starcoder'],
+        invalidInputError(model_family in ['llama', 'gptneox', 'bloom', 'starcoder', 'chatglm'],
                           "Now we only support model family: 'llama', 'gptneox', 'bloom',"
-                          " 'starcoder', '{}' is not in the list.".format(model_family))
+                          " 'starcoder', 'chatglm', '{}' is not in the list.".format(model_family))
         invalidInputError(dtype.lower() in ['int4', 'int8'],
                           "Now we only support int4 and int8 as date type for weight")
@@ -71,3 +72,6 @@ class BigdlNativeForCausalLM:
         elif model_family == 'starcoder':
             from bigdl.llm.ggml.model.starcoder import Starcoder
             return Starcoder(model_path=ggml_model_path, **kwargs)
+        elif model_family == 'chatglm':
+            from bigdl.llm.ggml.model.chatglm import ChatGLM
+            return ChatGLM(model_path=ggml_model_path, **kwargs)
diff --git a/python/llm/src/bigdl/llm/utils/convert_chatglm.py b/python/llm/src/bigdl/llm/utils/convert_chatglm.py
index e8759207..17698024 100644
--- a/python/llm/src/bigdl/llm/utils/convert_chatglm.py
+++ b/python/llm/src/bigdl/llm/utils/convert_chatglm.py
@@ -261,6 +261,7 @@ class BaseConverter:
             cls.dump_model(f, model, ggml_type)
 
         print(f"{cls.MODEL_TYPE.name} GGML model saved to {save_path}")
+        return save_path
 
 
 class ChatGLMConverter(BaseConverter):
@@ -397,9 +398,9 @@ def _convert_chatglm_hf_to_ggml_(model_path, outfile_dir, outtype):
     model = AutoModel.from_pretrained(model_path, trust_remote_code=True)
 
     if hasattr(model.config, "multi_query_attention"):
-        ChatGLM2Converter.convert(model, tokenizer, ggml_type, outfile_dir)
+        return ChatGLM2Converter.convert(model, tokenizer, ggml_type, outfile_dir)
     else:
-        ChatGLMConverter.convert(model, tokenizer, ggml_type, outfile_dir)
+        return ChatGLMConverter.convert(model, tokenizer, ggml_type, outfile_dir)
 
 
 def main():
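`BaseConverter.convert` now returns `save_path`, and `_convert_chatglm_hf_to_ggml_` forwards it, so every layer of the conversion stack can report where the ggml file landed. A sketch of this lower-level entry point, with hypothetical paths and one of the quantization types (`q4_0`) named in the hunk below:

    from bigdl.llm.utils.convert_chatglm import _convert_chatglm_hf_to_ggml_

    saved = _convert_chatglm_hf_to_ggml_("./chatglm2-6b-hf", "./converted", "q4_0")
    print(f"GGML model saved to {saved}")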
diff --git a/python/llm/src/bigdl/llm/utils/convert_util.py b/python/llm/src/bigdl/llm/utils/convert_util.py
index d4b0e38c..8acfd541 100644
--- a/python/llm/src/bigdl/llm/utils/convert_util.py
+++ b/python/llm/src/bigdl/llm/utils/convert_util.py
@@ -1596,6 +1596,6 @@ def _convert_chatglm_hf_to_ggml(model_path, outfile_dir, outtype):
                        "For now we only support quantization type 'q4_0' and 'q4_1' "
                        "in chatglm family.")
     from bigdl.llm.utils.convert_chatglm import _convert_chatglm_hf_to_ggml_
-    _convert_chatglm_hf_to_ggml_(model_path,
-                                 outfile,
-                                 outtype)
+    return _convert_chatglm_hf_to_ggml_(model_path,
+                                        outfile,
+                                        outtype)
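Taken together, a converted chatglm checkpoint can now be loaded through the same entry point as the other native families. A sketch, assuming a previously converted ggml file at a hypothetical path:

    from bigdl.llm.transformers.modelling_bigdl import BigdlNativeForCausalLM

    llm = BigdlNativeForCausalLM.from_pretrained("./converted/chatglm2-ggml-q4_0.bin",
                                                 model_family="chatglm")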