From be2ae6eb7cabd40b53a29769017aabd0f92d289e Mon Sep 17 00:00:00 2001
From: binbin Deng <108676127+plusbang@users.noreply.github.com>
Date: Mon, 14 Aug 2023 17:23:33 +0800
Subject: [PATCH] LLM: fix langchain native int4 voiceassistant example (#8750)

---
 python/llm/example/langchain/README.md               |  3 ++-
 .../example/langchain/native_int4/voiceassistant.py  | 11 +++++++++--
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/python/llm/example/langchain/README.md b/python/llm/example/langchain/README.md
index 22340524..9764c2d0 100644
--- a/python/llm/example/langchain/README.md
+++ b/python/llm/example/langchain/README.md
@@ -58,13 +58,14 @@ pip install soundfile
 ```
 
 ```bash
-python native_int4/voiceassistant.py -x MODEL_FAMILY -m CONVERTED_MODEL_PATH -t THREAD_NUM
+python native_int4/voiceassistant.py -x MODEL_FAMILY -m CONVERTED_MODEL_PATH -t THREAD_NUM -c CONTEXT_SIZE
 ```
 
 arguments info:
 - `-m CONVERTED_MODEL_PATH`: **required**, path to the converted model
 - `-x MODEL_FAMILY`: **required**, the model family of the model specified in `-m`, available options are `llama`, `gptneox` and `bloom`
 - `-t THREAD_NUM`: specify the number of threads to use for inference. Default is `2`.
+- `-c CONTEXT_SIZE`: specify the maximum context size. Default is `512`.
 
 When you see output says
 > listening now...

diff --git a/python/llm/example/langchain/native_int4/voiceassistant.py b/python/llm/example/langchain/native_int4/voiceassistant.py
index 0d275549..f8ce8e8a 100644
--- a/python/llm/example/langchain/native_int4/voiceassistant.py
+++ b/python/llm/example/langchain/native_int4/voiceassistant.py
@@ -37,10 +37,13 @@ def prepare_chain(args):
     model_path = args.model_path
     model_family = args.model_family
     n_threads = args.thread_num
+    n_ctx = args.context_size
 
     # Use a easy prompt could bring good-enough result
+    # You could tune the prompt based on your own model for better results
     template = """
     {history}
+
     Q: {human_input}
     A:"""
     prompt = PromptTemplate(input_variables=["history", "human_input"], template=template)
@@ -51,8 +54,10 @@ def prepare_chain(args):
         model_path=model_path,
         model_family=model_family,
         n_threads=n_threads,
-        callback_manager=callback_manager,
-        verbose=True
+        callback_manager=callback_manager,
+        verbose=True,
+        n_ctx=n_ctx,
+        stop=['\n\n'] # You could tune the stop words based on your own model for better results
     )
 
     # Following code are complete the same as the use-case
@@ -116,6 +121,8 @@ if __name__ == '__main__':
                         help='the path to the converted llm model')
     parser.add_argument('-t','--thread-num', type=int, default=2,
                         help='Number of threads to use for inference')
+    parser.add_argument('-c','--context-size', type=int, default=512,
+                        help='Maximum context size')
 
     args = parser.parse_args()
     main(args)
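
For anyone trying the patch locally, here is a minimal invocation sketch of the updated script. The flags and the `512` default come from the patch itself; the model path and the values `8`/`1024` are hypothetical placeholders:

```bash
# Hypothetical paths/values; the flags are the ones this patch adds or documents.
# -x: model family (llama, gptneox, or bloom)
# -m: path to the converted native int4 model
# -t: number of inference threads (default 2)
# -c: maximum context size (default 512 when omitted)
python native_int4/voiceassistant.py -x llama -m ./converted-llama-q4_0.bin -t 8 -c 1024
```

Raising `-c` is useful here because the prompt template feeds the accumulated `{history}` back into the model, so a long conversation can outgrow the default context size of 512.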