diff --git a/python/llm/example/cpp-python/README.md b/python/llm/example/cpp-python/README.md
new file mode 100644
index 00000000..60d51707
--- /dev/null
+++ b/python/llm/example/cpp-python/README.md
@@ -0,0 +1,28 @@
+# BigDL-LLM INT4 Inference Using Llama-Cpp-Python Format API
+
+In this example, we show how to run inference on a converted INT4 model using the llama-cpp-python format API.
+
+> **Note**: Currently the LLaMA, GPT-NeoX, BLOOM and StarCoder model families are supported.
+
+## Prepare Environment
+We suggest using conda to manage the environment:
+```bash
+conda create -n llm python=3.9
+conda activate llm
+
+pip install --pre --upgrade bigdl-llm[all]
+```
+
+## Convert Models using bigdl-llm
+Follow the instructions in [Convert model](https://github.com/intel-analytics/BigDL/tree/main/python/llm#convert-model).
+
+
+## Run the example
+```bash
+python ./int4_inference.py -m CONVERTED_MODEL_PATH -x MODEL_FAMILY -p PROMPT -t THREAD_NUM
+```
+arguments info:
+- `-m CONVERTED_MODEL_PATH`: **required**, path to the converted model
+- `-x MODEL_FAMILY`: **required**, the model family of the model specified in `-m`, available options are `llama`, `gptneox`, `bloom` and `starcoder`
+- `-p PROMPT`: question to ask. Default is `What is AI?`.
+- `-t THREAD_NUM`: specify the number of threads to use for inference. Default is `2`.
diff --git a/python/llm/example/simple.py b/python/llm/example/cpp-python/int4_inference.py
similarity index 91%
rename from python/llm/example/simple.py
rename to python/llm/example/cpp-python/int4_inference.py
index 9e7b8ee1..b7edcb68 100644
--- a/python/llm/example/simple.py
+++ b/python/llm/example/cpp-python/int4_inference.py
@@ -36,6 +36,9 @@ def main(args):
     if model_family == "gptneox":
         from bigdl.llm.models import Gptneox
         modelclass = Gptneox
+    if model_family == "starcoder":
+        from bigdl.llm.models import Starcoder
+        modelclass = Starcoder
 
     model = modelclass(model_path, n_threads=n_threads)
     response=model(prompt)
@@ -44,6 +47,7 @@ if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='Llama-CPP-Python style API Simple Example')
     parser.add_argument('-x','--model-family', type=str, required=True,
+                        choices=["llama", "bloom", "gptneox", "starcoder"],
                         help='the model family')
     parser.add_argument('-m','--model-path', type=str, required=True,
                         help='the path to the converted llm model')
@@ -53,4 +57,4 @@ if __name__ == '__main__':
                         help='number of threads to use for inference')
     args = parser.parse_args()
 
-    main(args)
\ No newline at end of file
+    main(args)
diff --git a/python/llm/example/langchain/README.md b/python/llm/example/langchain/README.md
index c41084a3..88591cb6 100644
--- a/python/llm/example/langchain/README.md
+++ b/python/llm/example/langchain/README.md
@@ -3,7 +3,7 @@
 The examples here shows how to use langchain with `bigdl-llm`.
 
 ## Install bigdl-llm
-Follow the instructions in [bigdl-llm docs: Install]().
+Follow the instructions in [Install](https://github.com/intel-analytics/BigDL/tree/main/python/llm#install).
 
 ## Install Required Dependencies for langchain examples.
 
@@ -17,7 +17,7 @@ Note that typing_extensions==4.5.0 is required, or you may encounter error `Type
 
 
 ## Convert Models using bigdl-llm
-Follow the instructions in [bigdl-llm docs: Convert Models]().
+Follow the instructions in [Convert model](https://github.com/intel-analytics/BigDL/tree/main/python/llm#convert-model).
 
 ## Run the examples
 
@@ -25,22 +25,22 @@ Follow the instructions in [Convert model](https://github.com/intel-analytics/BigDL/tree/main/python/llm#convert-model).
 ### 1. Streaming Chat
 
 ```bash
-python ./streamchat.py -m MODEL_PATH -x MODEL_FAMILY -t THREAD_NUM -q "What is AI?"
+python ./streamchat.py -m CONVERTED_MODEL_PATH -x MODEL_FAMILY -q QUESTION -t THREAD_NUM
 ```
 arguments info:
-- `-m MODEL_PATH`: path to the converted model
-- `-x MODEL_FAMILY`: the model family of the model specified in `-m`, available options are `llama`, `gptneox`
-- `-q QUESTION `: question to ask. Default is `What is AI?`.
-- `-t THREAD_NUM`: required argument defining the number of threads to use for inference. Default is `2`.
+- `-m CONVERTED_MODEL_PATH`: **required**, path to the converted model
+- `-x MODEL_FAMILY`: **required**, the model family of the model specified in `-m`, available options are `llama`, `gptneox` and `bloom`
+- `-q QUESTION`: question to ask. Default is `What is AI?`.
+- `-t THREAD_NUM`: specify the number of threads to use for inference. Default is `2`.
 
 ### 2. Question Answering over Docs
 ```bash
-python ./docqa.py --t THREAD_NUM -m -x
+python ./docqa.py -m CONVERTED_MODEL_PATH -x MODEL_FAMILY -i DOC_PATH -q QUESTION -c CONTEXT_SIZE -t THREAD_NUM
 ```
 arguments info:
-- `-m CONVERTED_MODEL_PATH`: path to the converted model in above step
-- `-x MODEL_FAMILY`: the model family of the model specified in `-m`, available options are `llama`, `gptneox`
-- `-q QUESTION `: question to ask, default question is `What is AI?`.
-- `-t THREAD_NUM`: required argument defining the number of threads to use for inference. Default is `2`.
-
-
+- `-m CONVERTED_MODEL_PATH`: **required**, path to the converted model in above step
+- `-x MODEL_FAMILY`: **required**, the model family of the model specified in `-m`, available options are `llama`, `gptneox` and `bloom`
+- `-i DOC_PATH`: **required**, path to the input document
+- `-q QUESTION`: question to ask. Default is `What is AI?`.
+- `-c CONTEXT_SIZE`: specify the maximum context size. Default is `2048`.
+- `-t THREAD_NUM`: specify the number of threads to use for inference. Default is `2`.
diff --git a/python/llm/example/langchain/docqa.py b/python/llm/example/langchain/docqa.py
index 839edd53..7f508fb8 100644
--- a/python/llm/example/langchain/docqa.py
+++ b/python/llm/example/langchain/docqa.py
@@ -71,17 +71,18 @@ def main(args):
 
 
 if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description='Llama-CPP-Python style API Simple Example')
+    parser = argparse.ArgumentParser(description='BigDL-LLM Langchain Question Answering over Docs Example')
     parser.add_argument('-x','--model-family', type=str, required=True,
+                        choices=["llama", "bloom", "gptneox"],
                         help='the model family')
     parser.add_argument('-m','--model-path', type=str, required=True,
                         help='the path to the converted llm model')
-    parser.add_argument('-i', '--input-path', type=str,
+    parser.add_argument('-i', '--input-path', type=str, required=True,
                         help='the path to the input doc.')
     parser.add_argument('-q', '--question', type=str, default='What is AI?',
                         help='qustion you want to ask.')
     parser.add_argument('-c','--n-ctx', type=int, default=2048,
-                        help='number of threads to use for inference')
+                        help='the maximum context size')
     parser.add_argument('-t','--thread-num', type=int, default=2,
                         help='number of threads to use for inference')
     args = parser.parse_args()
diff --git a/python/llm/example/langchain/streamchat.py b/python/llm/example/langchain/streamchat.py
index 4ee80211..b070a642 100644
--- a/python/llm/example/langchain/streamchat.py
+++ b/python/llm/example/langchain/streamchat.py
@@ -56,8 +56,9 @@ def main(args):
 
 
 if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description='Llama-CPP-Python style API Simple Example')
+    parser = argparse.ArgumentParser(description='BigDL-LLM Langchain Streaming Chat Example')
     parser.add_argument('-x','--model-family', type=str, required=True,
+                        choices=["llama", "bloom", "gptneox"],
                         help='the model family')
     parser.add_argument('-m','--model-path', type=str, required=True,
                         help='the path to the converted llm model')
diff --git a/python/llm/example/transformers/native_int4_pipeline.py b/python/llm/example/transformers/native_int4_pipeline.py
index 520e455c..1d0b5856 100644
--- a/python/llm/example/transformers/native_int4_pipeline.py
+++ b/python/llm/example/transformers/native_int4_pipeline.py
@@ -95,6 +95,7 @@ def main():
     parser.add_argument('--thread-num', type=int, default=2, required=True,
                         help='Number of threads to use for inference')
     parser.add_argument('--model-family', type=str, default='llama', required=True,
+                        choices=["llama", "bloom", "gptneox", "starcoder"],
                         help="The model family of the large language model (supported option: 'llama', "
                              "'gptneox', 'bloom', 'starcoder')")
     parser.add_argument('--repo-id-or-model-path', type=str, required=True,
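For readers skimming the diff, here is a minimal sketch of the llama-cpp-python format API that `int4_inference.py` wraps. It assumes `bigdl.llm.models.Llama` follows the same constructor/call pattern as the `Gptneox` and `Starcoder` classes shown above, and the model path is a placeholder for the INT4 file produced by the "Convert Models using bigdl-llm" step.

```python
# Minimal sketch (not part of the diff): BigDL-LLM's llama-cpp-python format API,
# assuming bigdl.llm.models.Llama mirrors the Gptneox/Starcoder classes used in
# int4_inference.py. "CONVERTED_MODEL_PATH" is a placeholder for the converted model.
from bigdl.llm.models import Llama

model = Llama("CONVERTED_MODEL_PATH", n_threads=2)  # -t THREAD_NUM maps to n_threads
response = model("What is AI?")                     # -p PROMPT is passed directly to the model
print(response)
```

The `-x MODEL_FAMILY` flag in the example script simply selects which of these model classes gets imported.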