From be2ae6eb7cabd40b53a29769017aabd0f92d289e Mon Sep 17 00:00:00 2001
From: binbin Deng <108676127+plusbang@users.noreply.github.com>
Date: Mon, 14 Aug 2023 17:23:33 +0800
Subject: [PATCH] LLM: fix langchain native int4 voiceassistant example (#8750)

---
 python/llm/example/langchain/README.md               |  3 ++-
 .../example/langchain/native_int4/voiceassistant.py  | 11 +++++++++--
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/python/llm/example/langchain/README.md b/python/llm/example/langchain/README.md
index 22340524..9764c2d0 100644
--- a/python/llm/example/langchain/README.md
+++ b/python/llm/example/langchain/README.md
@@ -58,13 +58,14 @@ pip install soundfile
 ```
 
 ```bash
-python native_int4/voiceassistant.py -x MODEL_FAMILY -m CONVERTED_MODEL_PATH -t THREAD_NUM
+python native_int4/voiceassistant.py -x MODEL_FAMILY -m CONVERTED_MODEL_PATH -t THREAD_NUM -c CONTEXT_SIZE
 ```
 
 arguments info:
 - `-m CONVERTED_MODEL_PATH`: **required**, path to the converted model
 - `-x MODEL_FAMILY`: **required**, the model family of the model specified in `-m`, available options are `llama`, `gptneox` and `bloom`
 - `-t THREAD_NUM`: specify the number of threads to use for inference. Default is `2`.
+- `-c CONTEXT_SIZE`: specify the maximum context size. Default is `512`.
 
 When you see output says
 > listening now...

diff --git a/python/llm/example/langchain/native_int4/voiceassistant.py b/python/llm/example/langchain/native_int4/voiceassistant.py
index 0d275549..f8ce8e8a 100644
--- a/python/llm/example/langchain/native_int4/voiceassistant.py
+++ b/python/llm/example/langchain/native_int4/voiceassistant.py
@@ -37,10 +37,13 @@ def prepare_chain(args):
     model_path = args.model_path
     model_family = args.model_family
     n_threads = args.thread_num
+    n_ctx = args.context_size
 
     # Use a easy prompt could bring good-enough result
+    # You could tune the prompt based on your own model for better results
     template = """
     {history}
+
     Q: {human_input}
     A:"""
     prompt = PromptTemplate(input_variables=["history", "human_input"], template=template)
@@ -51,8 +54,10 @@ def prepare_chain(args):
         model_path=model_path,
         model_family=model_family,
         n_threads=n_threads,
-        callback_manager=callback_manager,
-        verbose=True
+        callback_manager=callback_manager,
+        verbose=True,
+        n_ctx=n_ctx,
+        stop=['\n\n'] # You could tune the stop words based on your own model for better results
     )
 
     # Following code are complete the same as the use-case
@@ -116,6 +121,8 @@ if __name__ == '__main__':
                         help='the path to the converted llm model')
     parser.add_argument('-t','--thread-num', type=int, default=2,
                         help='Number of threads to use for inference')
+    parser.add_argument('-c','--context-size', type=int, default=512,
+                        help='Maximum context size')
 
     args = parser.parse_args()
     main(args)
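
For anyone trying the patch locally, here is a minimal invocation sketch of the updated script. The flags and the `512` default come from the patch itself; the model path and the values `8`/`1024` are hypothetical placeholders:

```bash
# Hypothetical paths/values; the flags are the ones this patch adds or documents.
# -x: model family (llama, gptneox, or bloom)
# -m: path to the converted native int4 model
# -t: number of inference threads (default 2)
# -c: maximum context size (default 512 when omitted)
python native_int4/voiceassistant.py -x llama -m ./converted-llama-q4_0.bin -t 8 -c 1024
```

Raising `-c` is useful here because the prompt template feeds the accumulated `{history}` back into the model, so a long conversation can outgrow the default context size of 512.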