diff --git a/python/llm/src/bigdl/llm/ggml/model/generation/utils.py b/python/llm/src/bigdl/llm/ggml/model/generation/utils.py
index 7aa0cee0..5e8da4f0 100644
--- a/python/llm/src/bigdl/llm/ggml/model/generation/utils.py
+++ b/python/llm/src/bigdl/llm/ggml/model/generation/utils.py
@@ -31,6 +31,30 @@ class GenerationMixin:
     Pass custom parameter values to 'generate' .
     """
 
+    def tokenize(self, text: str, add_bos: bool = True) -> List[int]:
+        '''
+        Tokenize a string into a list of token ids
+
+        :param text: The text to be tokenized
+        :param add_bos: whether to prepend the beginning-of-sequence (bos) token
+
+        :return: list of token ids that represent the text
+        '''
+        if isinstance(text, str):
+            bstr = text.encode()
+        else:
+            bstr = text
+        return self._tokenize(bstr, add_bos)
+
+    def decode(self, tokens: List[int]) -> str:
+        '''
+        Decode a list of token ids into a string
+
+        :param tokens: list of token ids, usually produced by generate
+        :return: decoded string
+        '''
+        return self.detokenize(tokens).decode()
+
     def generate(
         self,
         inputs: Union[Optional[Sequence[int]], Sequence[gptneox_cpp.gptneox_token]]=None,
@@ -46,18 +70,18 @@ class GenerationMixin:
         mirostat_mode: int = 0,
         mirostat_tau: float = 5.0,
         mirostat_eta: float = 0.1,
-        stop: Optional[Union[str, List[str]]]=[],
+        stop: Optional[Union[str, List[str]]]=[],  # TODO: rebase to support stopping_criteria
         **kwargs,
     ) -> Union[Optional[Sequence[int]], Optional[Sequence[gptneox_cpp.gptneox_token]], None]:
         # TODO: modify docs
         """Create a generator of tokens from a prompt.
 
         Examples:
-            >>> llama = Llama("models/ggml-7b.bin")
-            >>> tokens = llama.tokenize(b"Hello, world!")
-            >>> for token in llama.generate(tokens, top_k=40, top_p=0.95,
-            >>>                             temp=1.0, repeat_penalty=1.1):
-            ...     print(llama.detokenize([token]))
+            >>> llm = AutoModelForCausalLM.from_pretrained("gpt4all-model-q4_0.bin",
+            ...                                            model_family="llama")
+            >>> tokens = llm.tokenize("Q: Tell me something about Intel. A:")
+            >>> tokens_id = llm.generate(tokens, max_new_tokens=32)
+            >>> llm.decode(tokens_id)
 
         Args:
             tokens: The prompt tokens.
@@ -70,17 +94,24 @@ class GenerationMixin:
         Yields:
             The generated tokens.
         """
-        # TODO: stop & max_token
-        self._generate(tokens=inputs,
-                       top_k=top_k,
-                       top_p=top_p,
-                       temp=temperature,
-                       repeat_penalty=repetition_penalty,
-                       reset=reset,
-                       frequency_penalty=frequency_penalty,
-                       presence_penalty=presence_penalty,
-                       tfs_z=tfs_z,
-                       mirostat_mode=mirostat_mode,
-                       mirostat_tau=mirostat_tau,
-                       mirostat_eta=mirostat_eta,
-                       **kwargs)
+        tokens = self._generate(tokens=inputs,
+                                top_k=top_k,
+                                top_p=top_p,
+                                temp=temperature,
+                                repeat_penalty=repetition_penalty,
+                                reset=reset,
+                                frequency_penalty=frequency_penalty,
+                                presence_penalty=presence_penalty,
+                                tfs_z=tfs_z,
+                                mirostat_mode=mirostat_mode,
+                                mirostat_tau=mirostat_tau,
+                                mirostat_eta=mirostat_eta,
+                                **kwargs)
+        res_list = []
+        word_count = 0
+        for token in tokens:
+            if word_count >= max_new_tokens:
+                break
+            res_list.append(token)
+            word_count += 1
+        return res_list
diff --git a/python/llm/src/bigdl/llm/ggml/model/gptneox/gptneox.py b/python/llm/src/bigdl/llm/ggml/model/gptneox/gptneox.py
index 93a09c1b..1c9a5e3c 100644
--- a/python/llm/src/bigdl/llm/ggml/model/gptneox/gptneox.py
+++ b/python/llm/src/bigdl/llm/ggml/model/gptneox/gptneox.py
@@ -225,7 +225,7 @@ class Gptneox:
         if self.verbose:
             print(gptneox_cpp.gptneox_print_system_info().decode("utf-8"), file=sys.stderr)
 
-    def tokenize(
+    def _tokenize(
         self, text: bytes, add_bos: bool = True
     ) -> List[gptneox_cpp.gptneox_token]:
         """Tokenize a string.
diff --git a/python/llm/src/bigdl/llm/ggml/model/llama/llama.py b/python/llm/src/bigdl/llm/ggml/model/llama/llama.py
index 7ae0bb00..99a81d3d 100644
--- a/python/llm/src/bigdl/llm/ggml/model/llama/llama.py
+++ b/python/llm/src/bigdl/llm/ggml/model/llama/llama.py
@@ -250,7 +250,7 @@ class Llama(GenerationMixin):
         self._token_nl = Llama.token_nl()
         self._token_eos = Llama.token_eos()
 
-    def tokenize(self, text: bytes, add_bos: bool = True) -> List[int]:
+    def _tokenize(self, text: bytes, add_bos: bool = True) -> List[int]:
         """Tokenize a string.
 
         Args:
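
For reference, the end-to-end flow enabled by the new tokenize/decode helpers and the max_new_tokens handling in generate is roughly the sketch below; the import path, model file name, and model_family value are assumptions carried over from the docstring example, not a verified configuration on this branch.

    # Hypothetical usage sketch; import path and model file are assumptions.
    from bigdl.llm.ggml.transformers import AutoModelForCausalLM

    llm = AutoModelForCausalLM.from_pretrained("gpt4all-model-q4_0.bin",
                                               model_family="llama")
    tokens = llm.tokenize("Q: Tell me something about Intel. A:")  # str -> list of token ids
    token_ids = llm.generate(tokens, max_new_tokens=32)            # at most 32 new tokens
    print(llm.decode(token_ids))                                   # token ids -> str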