diff --git a/python/llm/example/CPU/LangChain/README.md b/python/llm/example/CPU/LangChain/README.md
index 0d5be536..775083a2 100644
--- a/python/llm/example/CPU/LangChain/README.md
+++ b/python/llm/example/CPU/LangChain/README.md
@@ -18,47 +18,47 @@ pip install -U pandas==2.0.3
 
 ### Example: Chat
 
-The chat example ([chat.py](./transformers_int4/chat.py)) shows how to use `LLMChain` to build a chat pipeline.
+The chat example ([chat.py](./chat.py)) shows how to use `LLMChain` to build a chat pipeline.
 
 To run the example, execute the following command in the current directory:
 
 ```bash
-python transformers_int4/chat.py -m <path_to_model> [-q <your_question>]
+python chat.py -m <path_to_model> [-q <your_question>]
 ```
 
 > Note: if `-q` is not specified, it will use `What is AI` by default.
 
 ### Example: RAG (Retrival Augmented Generation)
 
-The RAG example ([rag.py](./transformers_int4/rag.py)) shows how to load the input text into vector database, and then use `load_qa_chain` to build a retrival pipeline.
+The RAG example ([rag.py](./rag.py)) shows how to load the input text into a vector database, and then use `load_qa_chain` to build a retrieval pipeline.
 
 To run the example, execute the following command in the current directory:
 
 ```bash
-python transformers_int4/rag.py -m <path_to_model> [-q <your_question>] [-i <path_to_input>]
+python rag.py -m <path_to_model> [-q <your_question>] [-i <path_to_input>]
 ```
 
 > Note: If `-i` is not specified, it will use a short introduction to Big-DL as input by default. if `-q` is not specified, `What is IPEX LLM?` will be used by default.
 
 ### Example: Math
 
-The math example ([math.py](./transformers_int4/llm_math.py)) shows how to build a chat pipeline specialized in solving math questions. For example, you can ask `What is 13 raised to the .3432 power?`
+The math example ([math.py](./llm_math.py)) shows how to build a chat pipeline specialized in solving math questions. For example, you can ask `What is 13 raised to the .3432 power?`
 
 To run the exmaple, execute the following command in the current directory:
 
 ```bash
-python transformers_int4/llm_math.py -m <path_to_model> [-q <your_question>]
+python llm_math.py -m <path_to_model> [-q <your_question>]
 ```
 
 > Note: if `-q` is not specified, it will use `What is 13 raised to the .3432 power?` by default.
 
 ### Example: Voice Assistant
 
-The voice assistant example ([voiceassistant.py](./transformers_int4/voiceassistant.py)) showcases how to use langchain to build a pipeline that takes in your speech as input in realtime, use an ASR model (e.g. [Whisper-Medium](https://huggingface.co/openai/whisper-medium)) to turn speech into text, and then feed the text into large language model to get response.
+The voice assistant example ([voiceassistant.py](./voiceassistant.py)) showcases how to use LangChain to build a pipeline that takes your speech as input in real time, uses an ASR model (e.g. [Whisper-Medium](https://huggingface.co/openai/whisper-medium)) to turn speech into text, and then feeds the text into a large language model to get a response.
 
 To run the exmaple, execute the following command in the current directory:
 
 ```bash
-python transformers_int4/voiceassistant.py -m <path_to_model> [-q <your_question>]
+python voiceassistant.py -m <path_to_model> [-q <your_question>]
 ```
 
 **Runtime Arguments Explained**:
 - `-m MODEL_PATH`: **Required**, the path to the
@@ -67,6 +67,23 @@ python voiceassistant.py -m <path_to_model> [-q <your_question>]
 
+
+### Example: Low Bit
+
+The low_bit example ([low_bit.py](./low_bit.py)) showcases how to use LangChain with a low-bit optimized model.
+By `save_low_bit` we save the weights of the low-bit model into the target folder.
+> Note: `save_low_bit` only saves the weights of the model.
+> Users could copy the tokenizer model into the target folder or specify `tokenizer_id` during initialization.
+```bash
+python low_bit.py -m <path_to_model> -t <target_path> [-q <your_question>]
+```
+**Runtime Arguments Explained**:
+- `-m MODEL_PATH`: **Required**, the path to the model
+- `-t TARGET_PATH`: **Required**, the path to save the low_bit model
+- `-q QUESTION`: the question
+
+
+
 ### Legacy (Native INT4 examples)
 
 IPEX-LLM also provides langchain integrations using native INT4 mode. Those examples can be foud in [native_int4](./native_int4/) folder. For detailed instructions of settting up and running `native_int4` examples, refer to [Native INT4 Examples README](./README_nativeint4.md).
 
diff --git a/python/llm/example/CPU/LangChain/low_bit.py b/python/llm/example/CPU/LangChain/low_bit.py
new file mode 100644
index 00000000..f1d45ce0
--- /dev/null
+++ b/python/llm/example/CPU/LangChain/low_bit.py
@@ -0,0 +1,60 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+
+import argparse
+from ipex_llm.langchain.llms import TransformersLLM, TransformersPipelineLLM
+from langchain import PromptTemplate, LLMChain
+from langchain import HuggingFacePipeline
+
+
+def main(args):
+    question = args.question
+    model_path = args.model_path
+    low_bit_model_path = args.target_path
+    template = """{question}"""
+
+    prompt = PromptTemplate(template=template, input_variables=["question"])
+
+    llm = TransformersLLM.from_model_id(
+        model_id=model_path,
+        model_kwargs={"temperature": 0, "max_length": 64, "trust_remote_code": True},
+    )
+    llm.model.save_low_bit(low_bit_model_path)
+    del llm
+    low_bit_llm = TransformersLLM.from_model_id_low_bit(
+        model_id=low_bit_model_path,
+        tokenizer_id=model_path,
+        model_kwargs={"temperature": 0, "max_length": 64, "trust_remote_code": True}
+    )
+    llm_chain = LLMChain(prompt=prompt, llm=low_bit_llm)
+
+    output = llm_chain.run(question)
+    print("====output=====")
+    print(output)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='TransformersLLM LangChain Low-Bit Example')
+    parser.add_argument('-m', '--model-path', type=str, required=True,
+                        help='the path to the transformers model')
+    parser.add_argument('-t', '--target-path', type=str, required=True,
+                        help='the path to save the low bit model')
+    parser.add_argument('-q', '--question', type=str, default='What is AI?',
+                        help='question you want to ask.')
+    args = parser.parse_args()
+
+    main(args)
\ No newline at end of file
diff --git a/python/llm/example/GPU/LangChain/README.md b/python/llm/example/GPU/LangChain/README.md
index 9fd9489e..3115bdaf 100644
--- a/python/llm/example/GPU/LangChain/README.md
+++ b/python/llm/example/GPU/LangChain/README.md
@@ -100,4 +100,19 @@ python rag.py -m <path_to_model> [-q QUESTION] [-i INPUT_PATH]
 arguments info:
 - `-m MODEL_PATH`: **required**, path to the model.
 - `-q QUESTION`: question to ask. Default is `What is IPEX?`.
-- `-i INPUT_PATH`: path to the input doc.
\ No newline at end of file
+- `-i INPUT_PATH`: path to the input doc.
+
+
+#### 5.2. Low Bit
+
+The low_bit example ([low_bit.py](./low_bit.py)) showcases how to use LangChain with a low-bit optimized model.
+By `save_low_bit` we save the weights of the low-bit model into the target folder.
+> Note: `save_low_bit` only saves the weights of the model.
+> Users could copy the tokenizer model into the target folder or specify `tokenizer_id` during initialization.
+```bash
+python low_bit.py -m <path_to_model> -t <target_path> [-q <your_question>]
+```
+**Runtime Arguments Explained**:
+- `-m MODEL_PATH`: **Required**, the path to the model
+- `-t TARGET_PATH`: **Required**, the path to save the low_bit model
+- `-q QUESTION`: the question
\ No newline at end of file
diff --git a/python/llm/example/GPU/LangChain/low_bit.py b/python/llm/example/GPU/LangChain/low_bit.py
new file mode 100644
index 00000000..dbbda235
--- /dev/null
+++ b/python/llm/example/GPU/LangChain/low_bit.py
@@ -0,0 +1,64 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+
+import argparse
+
+from ipex_llm.langchain.llms import TransformersLLM, TransformersPipelineLLM
+from langchain import PromptTemplate, LLMChain
+from langchain import HuggingFacePipeline
+from torch import device
+
+
+def main(args):
+    question = args.question
+    model_path = args.model_path
+    low_bit_model_path = args.target_path
+    template = """{question}"""
+
+    prompt = PromptTemplate(template=template, input_variables=["question"])
+
+    llm = TransformersLLM.from_model_id(
+        model_id=model_path,
+        model_kwargs={"temperature": 0, "max_length": 64, "trust_remote_code": True},
+        device_map='xpu'
+    )
+    llm.model.save_low_bit(low_bit_model_path)
+    del llm
+    low_bit_llm = TransformersLLM.from_model_id_low_bit(
+        model_id=low_bit_model_path,
+        tokenizer_id=model_path,
+        device_map='xpu',
+        model_kwargs={"temperature": 0, "max_length": 64, "trust_remote_code": True}
+    )
+    llm_chain = LLMChain(prompt=prompt, llm=low_bit_llm)
+
+    output = llm_chain.run(question)
+    print("====output=====")
+    print(output)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='TransformersLLM LangChain Low-Bit Example')
+    parser.add_argument('-m', '--model-path', type=str, required=True,
+                        help='the path to the transformers model')
+    parser.add_argument('-t', '--target-path', type=str, required=True,
+                        help='the path to save the low bit model')
+    parser.add_argument('-q', '--question', type=str, default='What is AI?',
+                        help='question you want to ask.')
+    args = parser.parse_args()
+
+    main(args)
\ No newline at end of file
diff --git a/python/llm/src/ipex_llm/langchain/llms/transformersllm.py b/python/llm/src/ipex_llm/langchain/llms/transformersllm.py
index f3498e9d..8a19a03a 100644
--- a/python/llm/src/ipex_llm/langchain/llms/transformersllm.py
+++ b/python/llm/src/ipex_llm/langchain/llms/transformersllm.py
@@ -48,7 +48,7 @@ import importlib.util
 import logging
 from typing import Any, List, Mapping, Optional
 
-
+from ipex_llm.utils.common.log4Error import invalidInputError
 from pydantic import Extra
 
 from langchain.callbacks.manager import CallbackManagerForLLMRun
@@ -90,13 +90,14 @@ class TransformersLLM(LLM):
         model_id: str,
         model_kwargs: Optional[dict] = None,
         device_map: str = 'cpu',
+        tokenizer_id: str = None,
         **kwargs: Any,
     ) -> LLM:
         """
         Construct object from model_id
-
+
         Args:
-
+
             model_id: Path for the huggingface repo id to be downloaded or
                       the huggingface checkpoint folder.
             model_kwargs: Keyword arguments that will be passed to the model and tokenizer.
@@ -114,21 +115,28 @@ class TransformersLLM(LLM):
             from transformers import AutoTokenizer, LlamaTokenizer
 
         except ImportError:
-            raise ValueError(
+            invalidInputError(
                 "Could not import transformers python package. "
                 "Please install it with `pip install transformers`."
             )
 
         _model_kwargs = model_kwargs or {}
         # TODO: may refactore this code in the future
-        try:
-            tokenizer = AutoTokenizer.from_pretrained(model_id, **_model_kwargs)
-        except:
-            tokenizer = LlamaTokenizer.from_pretrained(model_id, **_model_kwargs)
+        if tokenizer_id is not None:
+            try:
+                tokenizer = AutoTokenizer.from_pretrained(tokenizer_id, **_model_kwargs)
+            except:
+                tokenizer = LlamaTokenizer.from_pretrained(tokenizer_id, **_model_kwargs)
+        else:
+            try:
+                tokenizer = AutoTokenizer.from_pretrained(model_id, **_model_kwargs)
+            except:
+                tokenizer = LlamaTokenizer.from_pretrained(model_id, **_model_kwargs)
 
         # TODO: may refactore this code in the future
         try:
-            model = AutoModelForCausalLM.from_pretrained(model_id, load_in_4bit=True, **_model_kwargs)
+            model = AutoModelForCausalLM.from_pretrained(model_id, load_in_4bit=True,
+                                                         **_model_kwargs)
         except:
             model = AutoModel.from_pretrained(model_id, load_in_4bit=True, **_model_kwargs)
 
@@ -155,13 +163,12 @@ class TransformersLLM(LLM):
         model_id: str,
         model_kwargs: Optional[dict] = None,
         device_map: str = 'cpu',
+        tokenizer_id: str = None,
         **kwargs: Any,
     ) -> LLM:
         """
         Construct low_bit object from model_id
-
         Args:
-
             model_id: Path for the bigdl transformers low-bit model checkpoint folder.
             model_kwargs: Keyword arguments that will be passed to the model and tokenizer.
             kwargs: Extra arguments that will be passed to the model and tokenizer.
@@ -177,24 +184,29 @@
             from transformers import AutoTokenizer, LlamaTokenizer
 
         except ImportError:
-            raise ValueError(
+            invalidInputError(
                 "Could not import transformers python package. "
                 "Please install it with `pip install transformers`."
             )
 
         _model_kwargs = model_kwargs or {}
         # TODO: may refactore this code in the future
-        try:
-            tokenizer = AutoTokenizer.from_pretrained(model_id, **_model_kwargs)
-        except:
-            tokenizer = LlamaTokenizer.from_pretrained(model_id, **_model_kwargs)
+        if tokenizer_id is not None:
+            try:
+                tokenizer = AutoTokenizer.from_pretrained(tokenizer_id, **_model_kwargs)
+            except:
+                tokenizer = LlamaTokenizer.from_pretrained(tokenizer_id, **_model_kwargs)
+        else:
+            try:
+                tokenizer = AutoTokenizer.from_pretrained(model_id, **_model_kwargs)
+            except:
+                tokenizer = LlamaTokenizer.from_pretrained(model_id, **_model_kwargs)
 
         # TODO: may refactore this code in the future
         try:
             model = AutoModelForCausalLM.load_low_bit(model_id, **_model_kwargs)
         except:
             model = AutoModel.load_low_bit(model_id, **_model_kwargs)
-
         # TODO: may refactore this code in the future
         if 'xpu' in device_map:
             model = model.to(device_map)
@@ -260,5 +272,5 @@ class TransformersLLM(LLM):
         else:
             stopping_criteria = None
         output = self.model.generate(input_ids, stopping_criteria=stopping_criteria, **kwargs)
-        text = self.tokenizer.decode(output[0], skip_special_tokens=True)[len(prompt) :]
+        text = self.tokenizer.decode(output[0], skip_special_tokens=True)[len(prompt):]
         return text
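For readers trying the new API outside the bundled examples, the sketch below walks through the workflow this change enables: save the low-bit weights once with `save_low_bit`, then reload them together with the original tokenizer via the new `tokenizer_id` argument. It is a minimal sketch only; the two paths are placeholders, and it assumes `ipex-llm` and `langchain` are installed as described in the READMEs above.

```python
# Minimal sketch of the save-once / reload-low-bit workflow added by this PR.
# Both paths below are placeholders; point them at a local checkpoint and an
# empty target folder of your choice.
from ipex_llm.langchain.llms import TransformersLLM
from langchain import PromptTemplate, LLMChain

model_path = "/path/to/original/model"          # full-precision HF checkpoint (placeholder)
low_bit_path = "/path/to/saved/low_bit_model"   # where the low-bit weights will be stored (placeholder)

# 1. Load once with INT4 optimization and persist only the low-bit weights.
llm = TransformersLLM.from_model_id(
    model_id=model_path,
    model_kwargs={"temperature": 0, "max_length": 64, "trust_remote_code": True},
)
llm.model.save_low_bit(low_bit_path)
del llm

# 2. Later, reload the low-bit weights. Since save_low_bit does not copy the
#    tokenizer, point tokenizer_id back at the original checkpoint (or copy the
#    tokenizer files into low_bit_path instead).
low_bit_llm = TransformersLLM.from_model_id_low_bit(
    model_id=low_bit_path,
    tokenizer_id=model_path,
    model_kwargs={"temperature": 0, "max_length": 64, "trust_remote_code": True},
)

# 3. Use the reloaded model in an ordinary LangChain pipeline.
prompt = PromptTemplate(template="{question}", input_variables=["question"])
chain = LLMChain(prompt=prompt, llm=low_bit_llm)
print(chain.run("What is AI?"))
```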