Refactor bigdl.llm to ipex_llm (#24)
* Rename bigdl/llm to ipex_llm
* rm python/llm/src/bigdl
* from bigdl.llm to from ipex_llm
parent cc5806f4bc
commit 9df70d95eb
464 changed files with 918 additions and 940 deletions
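The change is a mechanical package rename: every `bigdl.llm` import path in the hunks below becomes `ipex_llm`, while the class and function names themselves are unchanged. A minimal migration sketch (assuming the renamed `ipex_llm` package is installed; the model path is a placeholder):

```python
# Before the rename the import was:
#   from bigdl.llm.transformers import AutoModelForCausalLM
# After the rename only the package prefix changes:
from ipex_llm.transformers import AutoModelForCausalLM

# Loading a Hugging Face checkpoint with INT4 optimization works as before;
# '/path/to/model/' is a placeholder for a local model directory.
model = AutoModelForCausalLM.from_pretrained('/path/to/model/', load_in_4bit=True)
```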
@@ -86,7 +86,7 @@ You may apply INT4 optimizations to any Hugging Face *Transformers* models as fo

 ```python
 #load Hugging Face Transformers model with INT4 optimizations
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 model = AutoModelForCausalLM.from_pretrained('/path/to/model/', load_in_4bit=True)

 #run the optimized model on CPU
@@ -113,7 +113,7 @@ You may apply INT4 optimizations to any Hugging Face *Transformers* models as fo

 ```python
 #load Hugging Face Transformers model with INT4 optimizations
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 model = AutoModelForCausalLM.from_pretrained('/path/to/model/', load_in_4bit=True)

 #run the optimized model on Intel GPU
@@ -223,7 +223,7 @@ This controller manages the distributed workers.

 ##### Launch the model worker(s)
 ```bash
-python3 -m bigdl.llm.serving.model_worker --model-path lmsys/vicuna-7b-v1.3 --device cpu
+python3 -m ipex_llm.serving.model_worker --model-path lmsys/vicuna-7b-v1.3 --device cpu
 ```
 Wait until the process finishes loading the model and you see "Uvicorn running on ...". The model worker will register itself to the controller.

@@ -252,7 +252,7 @@ python3 -m fastchat.serve.controller
 Then, launch the model worker(s):

 ```bash
-python3 -m bigdl.llm.serving.model_worker --model-path lmsys/vicuna-7b-v1.3 --device cpu
+python3 -m ipex_llm.serving.model_worker --model-path lmsys/vicuna-7b-v1.3 --device cpu
 ```

 Finally, launch the RESTful API server
@@ -319,7 +319,7 @@ This controller manages the distributed workers.

 ##### Launch the model worker(s)
 ```bash
-python3 -m bigdl.llm.serving.model_worker --model-path lmsys/vicuna-7b-v1.3 --device xpu
+python3 -m ipex_llm.serving.model_worker --model-path lmsys/vicuna-7b-v1.3 --device xpu
 ```
 Wait until the process finishes loading the model and you see "Uvicorn running on ...". The model worker will register itself to the controller.

@@ -346,7 +346,7 @@ python3 -m fastchat.serve.controller
 Then, launch the model worker(s):

 ```bash
-python3 -m bigdl.llm.serving.model_worker --model-path lmsys/vicuna-7b-v1.3 --device xpu
+python3 -m ipex_llm.serving.model_worker --model-path lmsys/vicuna-7b-v1.3 --device xpu
 ```

 Finally, launch the RESTful API server
@@ -23,7 +23,7 @@ from transformers import TextIteratorStreamer
 from transformers.tools.agents import StopSequenceCriteria
 from transformers.generation.stopping_criteria import StoppingCriteriaList
 from colorama import Fore
-from bigdl.llm import optimize_model
+from ipex_llm import optimize_model
 SYSTEM_PROMPT = "A chat between a curious human <human> and an artificial intelligence assistant <bot>.\
 The assistant gives helpful, detailed, and polite answers to the human's questions."
 HUMAN_ID = "<human>"
@@ -135,9 +135,9 @@ else
 done

 if [ "$worker_type" == "model_worker" ]; then
-worker_type="bigdl.llm.serving.model_worker"
+worker_type="ipex_llm.serving.model_worker"
 elif [ "$worker_type" == "vllm_worker" ]; then
-worker_type="bigdl.llm.serving.vllm_worker"
+worker_type="ipex_llm.serving.vllm_worker"
 fi

 if [[ -n $CONTROLLER_HOST ]]; then
@@ -220,9 +220,9 @@ else
 echo "Worker type: $worker_type"
 echo "Worker address: $worker_address"
 echo "Controller address: $controller_address"
-if [ "$worker_type" == "bigdl.llm.serving.model_worker" ]; then
+if [ "$worker_type" == "ipex_llm.serving.model_worker" ]; then
 python3 -m "$worker_type" --model-path $model_path --device cpu --host $worker_host --port $worker_port --worker-address $worker_address --controller-address $controller_address --stream-interval $stream_interval
-elif [ "$worker_type" == "bigdl.llm.serving.vllm_worker" ]; then
+elif [ "$worker_type" == "ipex_llm.serving.vllm_worker" ]; then
 python3 -m "$worker_type" --model-path $model_path --device cpu --host $worker_host --port $worker_port --worker-address $worker_address --controller-address $controller_address
 fi
 fi
@@ -9,7 +9,7 @@
 generation_config = GenerationConfig.from_pretrained(
 model_path, trust_remote_code=True
 )
-+ from bigdl.llm.transformers import AutoModelForCausalLM
++ from ipex_llm.transformers import AutoModelForCausalLM
 model = AutoModelForCausalLM.from_pretrained(
 model_path,
 config=config,
@@ -66,9 +66,9 @@ else
 done

 if [ "$worker_type" == "model_worker" ]; then
-worker_type="bigdl.llm.serving.model_worker"
+worker_type="ipex_llm.serving.model_worker"
 elif [ "$worker_type" == "vllm_worker" ]; then
-worker_type="bigdl.llm.serving.vllm_worker"
+worker_type="ipex_llm.serving.vllm_worker"
 fi

 if [[ -n $CONTROLLER_HOST ]]; then
@@ -127,9 +127,9 @@ else
 echo "Worker address: $worker_address"
 echo "Controller address: $controller_address"

-if [ "$worker_type" == "bigdl.llm.serving.model_worker" ]; then
+if [ "$worker_type" == "ipex_llm.serving.model_worker" ]; then
 python3 -m "$worker_type" --model-path $model_path --device xpu --host $worker_host --port $worker_port --worker-address $worker_address --controller-address $controller_address --stream-interval $stream_interval
-elif [ "$worker_type" == "bigdl.llm.serving.vllm_worker" ]; then
+elif [ "$worker_type" == "ipex_llm.serving.vllm_worker" ]; then
 python3 -m "$worker_type" --model-path $model_path --device xpu --host $worker_host --port $worker_port --worker-address $worker_address --controller-address $controller_address
 fi
 fi
@@ -21,7 +21,7 @@ To help you better understand the finetuning process, here we use model [Llama-2
 First, load model using `transformers`-style API and **set it to `to('xpu')`**. We specify `load_in_low_bit="nf4"` here to apply 4-bit NormalFloat optimization. According to the [QLoRA paper](https://arxiv.org/pdf/2305.14314.pdf), using `"nf4"` could yield better model quality than `"int4"`.

 ```python
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM

 model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf",
 load_in_low_bit="nf4",
@@ -33,14 +33,14 @@ model = model.to('xpu')

 Then, we have to apply some preprocessing to the model to prepare it for training.
 ```python
-from bigdl.llm.transformers.qlora import prepare_model_for_kbit_training
+from ipex_llm.transformers.qlora import prepare_model_for_kbit_training
 model.gradient_checkpointing_enable()
 model = prepare_model_for_kbit_training(model)
 ```

 Next, we can obtain a Peft model from the optimized model and a configuration object containing the parameters as follows:
 ```python
-from bigdl.llm.transformers.qlora import get_peft_model
+from ipex_llm.transformers.qlora import get_peft_model
 from peft import LoraConfig
 config = LoraConfig(r=8,
 lora_alpha=32,
@@ -54,7 +54,7 @@ model = get_peft_model(model, config)
 ```eval_rst
 .. important::

-Instead of ``from peft import prepare_model_for_kbit_training, get_peft_model`` as we did for regular QLoRA using bitandbytes and cuda, we import them from ``bigdl.llm.transformers.qlora`` here to get a BigDL-LLM compatible Peft model. And the rest is just the same as regular LoRA finetuning process using ``peft``.
+Instead of ``from peft import prepare_model_for_kbit_training, get_peft_model`` as we did for regular QLoRA using bitandbytes and cuda, we import them from ``ipex_llm.transformers.qlora`` here to get a BigDL-LLM compatible Peft model. And the rest is just the same as regular LoRA finetuning process using ``peft``.
 ```

 ```eval_rst
@@ -5,7 +5,7 @@ You may apply INT4 optimizations to any Hugging Face *Transformers* models as fo

 ```python
 # load Hugging Face Transformers model with INT4 optimizations
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM

 model = AutoModelForCausalLM.from_pretrained('/path/to/model/', load_in_4bit=True)
 ```
@@ -29,7 +29,7 @@ You could choose to use [PyTorch API](./optimize_model.html) or [`transformers`-

 # Take Llama-2-7b-chat-hf as an example
 from transformers import LlamaForCausalLM
-from bigdl.llm import optimize_model
+from ipex_llm import optimize_model

 model = LlamaForCausalLM.from_pretrained('meta-llama/Llama-2-7b-chat-hf', torch_dtype='auto', low_cpu_mem_usage=True)
 model = optimize_model(model) # With only one line to enable BigDL-LLM INT4 optimization
@@ -40,14 +40,14 @@ You could choose to use [PyTorch API](./optimize_model.html) or [`transformers`-

 When running LLMs on Intel iGPUs for Windows users, we recommend setting ``cpu_embedding=True`` in the ``optimize_model`` function. This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU.

-See the `API doc <../../../PythonAPI/LLM/optimize.html#bigdl.llm.optimize_model>`_ for ``optimize_model`` to find more information.
+See the `API doc <../../../PythonAPI/LLM/optimize.html#ipex_llm.optimize_model>`_ for ``optimize_model`` to find more information.

 Especially, if you have saved the optimized model following setps `here <./optimize_model.html#save>`_, the loading process on Intel GPUs maybe as follows:

 .. code-block:: python

 from transformers import LlamaForCausalLM
-from bigdl.llm.optimize import low_memory_init, load_low_bit
+from ipex_llm.optimize import low_memory_init, load_low_bit

 saved_dir='./llama-2-bigdl-llm-4-bit'
 with low_memory_init(): # Fast and low cost by loading model on meta device
@@ -65,7 +65,7 @@ You could choose to use [PyTorch API](./optimize_model.html) or [`transformers`-
 .. code-block:: python

 # Take Llama-2-7b-chat-hf as an example
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM

 # Load model in 4 bit, which convert the relevant layers in the model into INT4 format
 model = AutoModelForCausalLM.from_pretrained('meta-llama/Llama-2-7b-chat-hf', load_in_4bit=True)
@@ -82,7 +82,7 @@ You could choose to use [PyTorch API](./optimize_model.html) or [`transformers`-

 .. code-block:: python

-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM

 saved_dir='./llama-2-bigdl-llm-4-bit'
 model = AutoModelForCausalLM.load_low_bit(saved_dir) # Load the optimized model
@@ -7,8 +7,8 @@ You may run the models using the LangChain API in `bigdl-llm`.
 You may run any Hugging Face *Transformers* model (with INT4 optimiztions applied) using the LangChain API as follows:

 ```python
-from bigdl.llm.langchain.llms import TransformersLLM
-from bigdl.llm.langchain.embeddings import TransformersEmbeddings
+from ipex_llm.langchain.llms import TransformersLLM
+from ipex_llm.langchain.embeddings import TransformersEmbeddings
 from langchain.chains.question_answering import load_qa_chain

 embeddings = TransformersEmbeddings.from_model_id(model_id=model_path)
@@ -37,8 +37,8 @@ You may also convert Hugging Face *Transformers* models into native INT4 format,
 ```

 ```python
-from bigdl.llm.langchain.llms import LlamaLLM
-from bigdl.llm.langchain.embeddings import LlamaEmbeddings
+from ipex_llm.langchain.llms import LlamaLLM
+from ipex_llm.langchain.embeddings import LlamaEmbeddings
 from langchain.chains.question_answering import load_qa_chain

 # switch to ChatGLMEmbeddings/GptneoxEmbeddings/BloomEmbeddings/StarcoderEmbeddings to load other models
@@ -10,13 +10,13 @@ You may also convert Hugging Face *Transformers* models into native INT4 format

 ```python
 # convert the model
-from bigdl.llm import llm_convert
+from ipex_llm import llm_convert
 bigdl_llm_path = llm_convert(model='/path/to/model/',
 outfile='/path/to/output/', outtype='int4', model_family="llama")

 # load the converted model
 # switch to ChatGLMForCausalLM/GptneoxForCausalLM/BloomForCausalLM/StarcoderForCausalLM to load other models
-from bigdl.llm.transformers import LlamaForCausalLM
+from ipex_llm.transformers import LlamaForCausalLM
 llm = LlamaForCausalLM.from_pretrained("/path/to/output/model.bin", native=True, ...)

 # run the converted model
@@ -14,7 +14,7 @@ model = LlamaForCausalLM.from_pretrained('meta-llama/Llama-2-7b-chat-hf', torch_

 Then, just need to call `optimize_model` to optimize the loaded model and INT4 optimization is applied on model by default:
 ```python
-from bigdl.llm import optimize_model
+from ipex_llm import optimize_model

 # With only one line to enable BigDL-LLM INT4 optimization
 model = optimize_model(model)
@@ -31,7 +31,7 @@ Currently, ``low_bit`` supports options 'sym_int4', 'asym_int4', 'sym_int5', 'as
 You may apply symmetric INT8 optimization as follows:

 ```python
-from bigdl.llm import optimize_model
+from ipex_llm import optimize_model

 # Apply symmetric INT8 optimization
 model = optimize_model(model, low_bit="sym_int8")
@@ -51,7 +51,7 @@ model.save_low_bit(saved_dir)

 We recommend to use the context manager `low_memory_init` to quickly initiate a model instance with low cost, and then use `load_low_bit` to load the optimized low-bit model as follows:
 ```python
-from bigdl.llm.optimize import low_memory_init, load_low_bit
+from ipex_llm.optimize import low_memory_init, load_low_bit
 with low_memory_init(): # Fast and low cost by loading model on meta device
 model = LlamaForCausalLM.from_pretrained(saved_dir,
 torch_dtype="auto",
@@ -11,7 +11,7 @@ Here, let's take a relatively small LLM model, i.e [open_llama_3b_v2](https://hu
 Simply use one-line `transformers`-style API in `bigdl-llm` to load `open_llama_3b_v2` with INT4 optimization (by specifying `load_in_4bit=True`) as follows:

 ```python
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM

 model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path="openlm-research/open_llama_3b_v2",
 load_in_4bit=True)
@@ -112,7 +112,7 @@ Install the Miniconda as follows if you don't have conda installed on your machi

 python

-> from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
+> from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
 ```

 > <img src="https://llm-assets.readthedocs.io/en/latest/_images/verify_bigdl_import.png" alt="image-20240221102252562" width=100%; />
@@ -170,7 +170,7 @@ Now let's play with a real LLM. We'll be using the [phi-1.5](https://huggingface
 ```python
 # Copy/Paste the contents to a new file demo.py
 import torch
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import AutoTokenizer, GenerationConfig
 generation_config = GenerationConfig(use_cache = True)
@@ -130,7 +130,7 @@ You can verify if `bigdl-llm` is successfully installed by simply running a few
 * Step 5: Copy following code to Anaconda prompt **line by line** and press Enter **after copying each line**.
 ```python
 import torch
-from bigdl.llm.transformers import AutoModel,AutoModelForCausalLM
+from ipex_llm.transformers import AutoModel,AutoModelForCausalLM
 tensor_1 = torch.randn(1, 1, 40, 128).to('xpu')
 tensor_2 = torch.randn(1, 1, 128, 40).to('xpu')
 print(torch.matmul(tensor_1, tensor_2).size())
@@ -200,7 +200,7 @@ Now let's play with a real LLM. We'll be using the [Qwen-1.8B-Chat](https://hugg

 # Copy/Paste the contents to a new file demo.py
 import torch
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import AutoTokenizer, GenerationConfig
 generation_config = GenerationConfig(use_cache=True)
@@ -260,7 +260,7 @@ Now let's play with a real LLM. We'll be using the [Qwen-1.8B-Chat](https://hugg

 # Copy/Paste the contents to a new file demo.py
 import torch
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import GenerationConfig
 from modelscope import AutoTokenizer
 generation_config = GenerationConfig(use_cache=True)
@@ -13,7 +13,7 @@ BigDL-LLM provides ``TransformersLLM`` and ``TransformersPipelineLLM``, which im

 .. tab:: AutoModel

-.. automodule:: bigdl.llm.langchain.llms.transformersllm
+.. automodule:: ipex_llm.langchain.llms.transformersllm
 :members:
 :undoc-members:
 :show-inheritance:
@@ -21,7 +21,7 @@ BigDL-LLM provides ``TransformersLLM`` and ``TransformersPipelineLLM``, which im

 .. tab:: pipeline

-.. automodule:: bigdl.llm.langchain.llms.transformerspipelinellm
+.. automodule:: ipex_llm.langchain.llms.transformerspipelinellm
 :members:
 :undoc-members:
 :show-inheritance:
@@ -37,7 +37,7 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo

 .. tab:: Llama

-.. autoclass:: bigdl.llm.langchain.llms.bigdlllm.LlamaLLM
+.. autoclass:: ipex_llm.langchain.llms.bigdlllm.LlamaLLM
 :members:
 :undoc-members:
 :show-inheritance:
@@ -49,7 +49,7 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo

 .. tab:: ChatGLM

-.. autoclass:: bigdl.llm.langchain.llms.bigdlllm.ChatGLMLLM
+.. autoclass:: ipex_llm.langchain.llms.bigdlllm.ChatGLMLLM
 :members:
 :undoc-members:
 :show-inheritance:
@@ -61,7 +61,7 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo

 .. tab:: Bloom

-.. autoclass:: bigdl.llm.langchain.llms.bigdlllm.BloomLLM
+.. autoclass:: ipex_llm.langchain.llms.bigdlllm.BloomLLM
 :members:
 :undoc-members:
 :show-inheritance:
@@ -73,7 +73,7 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo

 .. tab:: Gptneox

-.. autoclass:: bigdl.llm.langchain.llms.bigdlllm.GptneoxLLM
+.. autoclass:: ipex_llm.langchain.llms.bigdlllm.GptneoxLLM
 :members:
 :undoc-members:
 :show-inheritance:
@@ -85,7 +85,7 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo

 .. tab:: Starcoder

-.. autoclass:: bigdl.llm.langchain.llms.bigdlllm.StarcoderLLM
+.. autoclass:: ipex_llm.langchain.llms.bigdlllm.StarcoderLLM
 :members:
 :undoc-members:
 :show-inheritance:
@@ -102,7 +102,7 @@ Embeddings Wrapper of LangChain
 Hugging Face ``transformers`` AutoModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. automodule:: bigdl.llm.langchain.embeddings.transformersembeddings
+.. automodule:: ipex_llm.langchain.embeddings.transformersembeddings
 :members:
 :undoc-members:
 :show-inheritance:
@@ -117,7 +117,7 @@ For ``llama``/``bloom``/``gptneox``/``starcoder`` model families, you could also

 .. tab:: Llama

-.. autoclass:: bigdl.llm.langchain.embeddings.bigdlllm.LlamaEmbeddings
+.. autoclass:: ipex_llm.langchain.embeddings.bigdlllm.LlamaEmbeddings
 :members:
 :undoc-members:
 :show-inheritance:
@@ -129,7 +129,7 @@ For ``llama``/``bloom``/``gptneox``/``starcoder`` model families, you could also

 .. tab:: Bloom

-.. autoclass:: bigdl.llm.langchain.embeddings.bigdlllm.BloomEmbeddings
+.. autoclass:: ipex_llm.langchain.embeddings.bigdlllm.BloomEmbeddings
 :members:
 :undoc-members:
 :show-inheritance:
@@ -141,7 +141,7 @@ For ``llama``/``bloom``/``gptneox``/``starcoder`` model families, you could also

 .. tab:: Gptneox

-.. autoclass:: bigdl.llm.langchain.embeddings.bigdlllm.GptneoxEmbeddings
+.. autoclass:: ipex_llm.langchain.embeddings.bigdlllm.GptneoxEmbeddings
 :members:
 :undoc-members:
 :show-inheritance:
@@ -153,7 +153,7 @@ For ``llama``/``bloom``/``gptneox``/``starcoder`` model families, you could also

 .. tab:: Starcoder

-.. autoclass:: bigdl.llm.langchain.embeddings.bigdlllm.StarcoderEmbeddings
+.. autoclass:: ipex_llm.langchain.embeddings.bigdlllm.StarcoderEmbeddings
 :members:
 :undoc-members:
 :show-inheritance:
@@ -6,7 +6,7 @@ Optimize Model

 You can run any PyTorch model with ``optimize_model`` through only one-line code change to benefit from BigDL-LLM optimization, regardless of the library or API you are using.

-.. automodule:: bigdl.llm
+.. automodule:: ipex_llm
 :members: optimize_model
 :undoc-members:
 :show-inheritance:
@@ -18,7 +18,7 @@ Load Optimized Model

 To avoid high resource consumption during the loading processes of the original model, we provide save/load API to support the saving of model after low-bit optimization and the loading of the saved low-bit model. Saving and loading operations are platform-independent, regardless of their operating systems.

-.. automodule:: bigdl.llm.optimize
+.. automodule:: ipex_llm.optimize
 :members: load_low_bit
 :undoc-members:
 :show-inheritance:
@@ -10,7 +10,7 @@ You can apply BigDL-LLM optimizations on any Hugging Face Transformers models by
 AutoModelForCausalLM
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: bigdl.llm.transformers.AutoModelForCausalLM
+.. autoclass:: ipex_llm.transformers.AutoModelForCausalLM
 :members:
 :undoc-members:
 :show-inheritance:
@@ -22,7 +22,7 @@ AutoModelForCausalLM
 AutoModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: bigdl.llm.transformers.AutoModel
+.. autoclass:: ipex_llm.transformers.AutoModel
 :members:
 :undoc-members:
 :show-inheritance:
@@ -34,7 +34,7 @@ AutoModel
 AutoModelForSpeechSeq2Seq
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: bigdl.llm.transformers.AutoModelForSpeechSeq2Seq
+.. autoclass:: ipex_llm.transformers.AutoModelForSpeechSeq2Seq
 :members:
 :undoc-members:
 :show-inheritance:
@@ -46,7 +46,7 @@ AutoModelForSpeechSeq2Seq
 AutoModelForSeq2SeqLM
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: bigdl.llm.transformers.AutoModelForSeq2SeqLM
+.. autoclass:: ipex_llm.transformers.AutoModelForSeq2SeqLM
 :members:
 :undoc-members:
 :show-inheritance:
@@ -67,7 +67,7 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo

 .. tab:: Llama

-.. autoclass:: bigdl.llm.transformers.LlamaForCausalLM
+.. autoclass:: ipex_llm.transformers.LlamaForCausalLM
 :members:
 :undoc-members:
 :show-inheritance:
@@ -77,7 +77,7 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo

 .. tab:: ChatGLM

-.. autoclass:: bigdl.llm.transformers.ChatGLMForCausalLM
+.. autoclass:: ipex_llm.transformers.ChatGLMForCausalLM
 :members:
 :undoc-members:
 :show-inheritance:
@@ -87,7 +87,7 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo

 .. tab:: Gptneox

-.. autoclass:: bigdl.llm.transformers.GptneoxForCausalLM
+.. autoclass:: ipex_llm.transformers.GptneoxForCausalLM
 :members:
 :undoc-members:
 :show-inheritance:
@@ -96,7 +96,7 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo
 .. automethod:: from_pretrained

 .. tab:: Bloom
-.. autoclass:: bigdl.llm.transformers.BloomForCausalLM
+.. autoclass:: ipex_llm.transformers.BloomForCausalLM
 :members:
 :undoc-members:
 :show-inheritance:
@@ -106,7 +106,7 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo

 .. tab:: Starcoder

-.. autoclass:: bigdl.llm.transformers.StarcoderForCausalLM
+.. autoclass:: ipex_llm.transformers.StarcoderForCausalLM
 :members:
 :undoc-members:
 :show-inheritance:
@@ -112,7 +112,7 @@ You can then apply INT4 optimizations to any Hugging Face *Transformers* models
 .. code-block:: python

 #load Hugging Face Transformers model with INT4 optimizations
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 model = AutoModelForCausalLM.from_pretrained('/path/to/model/', load_in_4bit=True)

 #run the optimized model on Intel CPU
@@ -146,7 +146,7 @@ You can then apply INT4 optimizations to any Hugging Face *Transformers* models
 .. code-block:: python

 #load Hugging Face Transformers model with INT4 optimizations
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 model = AutoModelForCausalLM.from_pretrained('/path/to/model/', load_in_4bit=True)

 #run the optimized model on Intel GPU
@@ -146,7 +146,7 @@ You may apply INT4 optimizations to any Hugging Face *Transformers* model on Int

 ```python
 #load Hugging Face Transformers model with INT4 optimizations
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 model = AutoModelForCausalLM.from_pretrained('/path/to/model/', load_in_4bit=True)

 #run the optimized model on Intel CPU
@@ -164,7 +164,7 @@ You may apply INT4 optimizations to any Hugging Face *Transformers* model on Int

 ```python
 #load Hugging Face Transformers model with INT4 optimizations
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 import intel_extension_for_pytorch
 model = AutoModelForCausalLM.from_pretrained('/path/to/model/', load_in_4bit=True)

@@ -206,13 +206,13 @@ You may also convert Hugging Face *Transformers* models into native INT4 model f

 ```python
 #convert the model
-from bigdl.llm import llm_convert
+from ipex_llm import llm_convert
 bigdl_llm_path = llm_convert(model='/path/to/model/',
 outfile='/path/to/output/', outtype='int4', model_family="llama")

 #load the converted model
 #switch to ChatGLMForCausalLM/GptneoxForCausalLM/BloomForCausalLM/StarcoderForCausalLM to load other models
-from bigdl.llm.transformers import LlamaForCausalLM
+from ipex_llm.transformers import LlamaForCausalLM
 llm = LlamaForCausalLM.from_pretrained("/path/to/output/model.bin", native=True, ...)

 #run the converted model
@@ -231,8 +231,8 @@ You may run the models using the LangChain API in `bigdl-llm`.
 You may run any Hugging Face *Transformers* model (with INT4 optimiztions applied) using the LangChain API as follows:

 ```python
-from bigdl.llm.langchain.llms import TransformersLLM
-from bigdl.llm.langchain.embeddings import TransformersEmbeddings
+from ipex_llm.langchain.llms import TransformersLLM
+from ipex_llm.langchain.embeddings import TransformersEmbeddings
 from langchain.chains.question_answering import load_qa_chain

 embeddings = TransformersEmbeddings.from_model_id(model_id=model_path)
@@ -250,8 +250,8 @@ You may run the models using the LangChain API in `bigdl-llm`.
 >**Notes**:* Currently only llama/bloom/gptneox/starcoder/chatglm model families are supported; for other models, you may use the Hugging Face `transformers` model format as described above).

 ```python
-from bigdl.llm.langchain.llms import LlamaLLM
-from bigdl.llm.langchain.embeddings import LlamaEmbeddings
+from ipex_llm.langchain.llms import LlamaLLM
+from ipex_llm.langchain.embeddings import LlamaEmbeddings
 from langchain.chains.question_answering import load_qa_chain

 #switch to ChatGLMEmbeddings/GptneoxEmbeddings/BloomEmbeddings/StarcoderEmbeddings to load other models
@@ -7,7 +7,7 @@ Just put this file into your benchmark directory, and then wrap your transformer
 Take `chatglm-6b` as an example:
 ```python
 import torch
-from bigdl.llm.transformers import AutoModel
+from ipex_llm.transformers import AutoModel
 from transformers import AutoTokenizer
 from benchmark_util import BenchmarkWrapper

@@ -35,7 +35,7 @@ Take `chatglm-6b` as an example:
 ```python
 import torch
 import intel_extension_for_pytorch as ipex
-from bigdl.llm.transformers import AutoModel
+from ipex_llm.transformers import AutoModel
 from transformers import AutoTokenizer
 from benchmark_util import BenchmarkWrapper
@@ -31,7 +31,7 @@ benchmark_util_path = os.path.join(current_dir, '..')
 import sys
 sys.path.append(benchmark_util_path)
 from benchmark_util import BenchmarkWrapper
-from bigdl.llm.utils.common.log4Error import invalidInputError
+from ipex_llm.utils.common.log4Error import invalidInputError

 LLAMA_IDS = ['meta-llama/Llama-2-7b-chat-hf','meta-llama/Llama-2-13b-chat-hf',
 'meta-llama/Llama-2-70b-chat-hf','decapoda-research/llama-7b-hf',
@@ -85,7 +85,7 @@ def run_transformer_int4(repo_id,
 num_trials,
 num_beams,
 low_bit):
-from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
+from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
 from transformers import AutoTokenizer, LlamaTokenizer

 model_path = get_model_path(repo_id, local_model_hub)
@@ -149,7 +149,7 @@ def run_transformer_int4_gpu(repo_id,
 num_trials,
 num_beams,
 low_bit):
-from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
+from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
 from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
 import intel_extension_for_pytorch as ipex
 reserved_mem_list = []
@@ -32,7 +32,7 @@ benchmark_util_path = os.path.join(current_dir, '..')
 import sys
 sys.path.append(benchmark_util_path)
 from benchmark_util import BenchmarkWrapper
-from bigdl.llm.utils.common.log4Error import invalidInputError
+from ipex_llm.utils.common.log4Error import invalidInputError

 LLAMA_IDS = ['meta-llama/Llama-2-7b-chat-hf','meta-llama/Llama-2-13b-chat-hf',
 'meta-llama/Llama-2-70b-chat-hf','decapoda-research/llama-7b-hf',
@@ -143,8 +143,8 @@ def run_native_int4(repo_id,
 warm_up,
 num_trials):
 model_path = get_model_path(repo_id, local_model_hub)
-from bigdl.llm.transformers import BigdlNativeForCausalLM
-from bigdl.llm import llm_convert
+from ipex_llm.transformers import BigdlNativeForCausalLM
+from ipex_llm import llm_convert
 if "chatglm" in repo_id.lower():
 family = "chatglm"
 elif "llama" in repo_id.lower():
@@ -184,7 +184,7 @@ def run_transformer_int4(repo_id,
 num_beams,
 low_bit,
 batch_size):
-from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
+from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
 from transformers import AutoTokenizer, LlamaTokenizer

 model_path = get_model_path(repo_id, local_model_hub)
@@ -319,7 +319,7 @@ def run_optimize_model(repo_id,
 low_bit,
 batch_size):
 from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer
-from bigdl.llm import optimize_model
+from ipex_llm import optimize_model

 model_path = get_model_path(repo_id, local_model_hub)
 # Load model in 4 bit,
@@ -389,7 +389,7 @@ def run_transformer_int4_gpu(repo_id,
 num_beams,
 low_bit,
 batch_size):
-from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
+from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
 from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
 import intel_extension_for_pytorch as ipex
 model_path = get_model_path(repo_id, local_model_hub)
@@ -495,7 +495,7 @@ def run_optimize_model_gpu(repo_id,
 low_bit,
 batch_size):
 from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
-from bigdl.llm import optimize_model
+from ipex_llm import optimize_model
 import intel_extension_for_pytorch as ipex
 model_path = get_model_path(repo_id, local_model_hub)
 # Load model in 4 bit,
@@ -651,7 +651,7 @@ def run_bigdl_fp16_gpu(repo_id,
 num_trials,
 num_beams,
 batch_size):
-from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
+from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
 from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
 import intel_extension_for_pytorch as ipex
 model_path = get_model_path(repo_id, local_model_hub)
@@ -731,7 +731,7 @@ def run_deepspeed_transformer_int4_cpu(repo_id,
 batch_size):
 from transformers import AutoModelForCausalLM, LlamaTokenizer, AutoTokenizer
 import deepspeed
-from bigdl.llm import optimize_model
+from ipex_llm import optimize_model
 import argparse
 # parser is for deepspeed subprocesses' inline parameter
 parser = argparse.ArgumentParser(description='Predict Tokens using `generate()` API for Llama2 model')
@@ -822,7 +822,7 @@ def run_transformer_int4_gpu_win(repo_id,
 cpu_embedding,
 batch_size,
 streaming):
-from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
+from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
 from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer, TextStreamer
 import intel_extension_for_pytorch as ipex
 model_path = get_model_path(repo_id, local_model_hub)
@@ -928,7 +928,7 @@ def run_transformer_int4_fp16_gpu_win(repo_id,
 cpu_embedding,
 batch_size,
 streaming):
-from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
+from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
 from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer, TextStreamer
 import intel_extension_for_pytorch as ipex
 model_path = get_model_path(repo_id, local_model_hub)
@@ -1038,7 +1038,7 @@ def run_transformer_int4_loadlowbit_gpu_win(repo_id,
 cpu_embedding,
 batch_size,
 streaming):
-from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
+from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
 from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer, TextStreamer
 import intel_extension_for_pytorch as ipex
 model_path = get_model_path(repo_id, local_model_hub)
@@ -1140,7 +1140,7 @@ def run_transformer_autocast_bf16( repo_id,
 num_trials,
 num_beams,
 batch_size):
-from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
+from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
 from transformers import AutoTokenizer, LlamaTokenizer

 model_path = get_model_path(repo_id, local_model_hub)
@@ -1209,7 +1209,7 @@ def run_bigdl_ipex_bf16(repo_id,
 num_trials,
 num_beams,
 batch_size):
-from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
+from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
 from transformers import AutoTokenizer, LlamaTokenizer

 os.environ["BIGDL_OPT_IPEX"] = "true"
@@ -1280,7 +1280,7 @@ def run_bigdl_ipex_int4(repo_id,
 num_trials,
 num_beams,
 batch_size):
-from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
+from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
 from transformers import AutoTokenizer, LlamaTokenizer

 os.environ["BIGDL_OPT_IPEX"] = "true"
@@ -1350,7 +1350,7 @@ def run_bigdl_ipex_int8(repo_id,
 num_trials,
 num_beams,
 batch_size):
-from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
+from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
 from transformers import AutoTokenizer, LlamaTokenizer

 os.environ["BIGDL_OPT_IPEX"] = "true"
@ -1434,7 +1434,7 @@ def run_deepspeed_optimize_model_gpu(repo_id,
|
||||||
os.environ["MASTER_PORT"] = os.environ.get("MASTER_PORT", "29500")
|
os.environ["MASTER_PORT"] = os.environ.get("MASTER_PORT", "29500")
|
||||||
|
|
||||||
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
|
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
|
||||||
from bigdl.llm import optimize_model
|
from ipex_llm import optimize_model
|
||||||
import intel_extension_for_pytorch as ipex
|
import intel_extension_for_pytorch as ipex
|
||||||
import deepspeed
|
import deepspeed
|
||||||
from deepspeed.accelerator.cpu_accelerator import CPU_Accelerator
|
from deepspeed.accelerator.cpu_accelerator import CPU_Accelerator
|
||||||
|
|
@ -1535,9 +1535,9 @@ def run_speculative_cpu(repo_id,
|
||||||
num_trials,
|
num_trials,
|
||||||
num_beams,
|
num_beams,
|
||||||
batch_size):
|
batch_size):
|
||||||
from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
|
from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
|
||||||
from transformers import AutoTokenizer, LlamaTokenizer
|
from transformers import AutoTokenizer, LlamaTokenizer
|
||||||
from bigdl.llm.transformers.convert import get_enable_ipex
|
from ipex_llm.transformers.convert import get_enable_ipex
|
||||||
|
|
||||||
_enable_ipex = get_enable_ipex()
|
_enable_ipex = get_enable_ipex()
|
||||||
|
|
||||||
|
|
@ -1615,7 +1615,7 @@ def run_speculative_gpu(repo_id,
|
||||||
num_trials,
|
num_trials,
|
||||||
num_beams,
|
num_beams,
|
||||||
batch_size):
|
batch_size):
|
||||||
from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
|
from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
|
||||||
from transformers import AutoTokenizer, LlamaTokenizer
|
from transformers import AutoTokenizer, LlamaTokenizer
|
||||||
|
|
||||||
model_path = get_model_path(repo_id, local_model_hub)
|
model_path = get_model_path(repo_id, local_model_hub)
|
||||||
|
|
|
||||||
|
|
@ -30,7 +30,7 @@ current_dir = os.path.dirname(os.path.realpath(__file__))
|
||||||
def save_model_in_low_bit(repo_id,
|
def save_model_in_low_bit(repo_id,
|
||||||
local_model_hub,
|
local_model_hub,
|
||||||
low_bit):
|
low_bit):
|
||||||
from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
|
from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
|
||||||
from transformers import AutoTokenizer, LlamaTokenizer
|
from transformers import AutoTokenizer, LlamaTokenizer
|
||||||
model_path = get_model_path(repo_id, local_model_hub)
|
model_path = get_model_path(repo_id, local_model_hub)
|
||||||
# Load model in 4 bit,
|
# Load model in 4 bit,
|
||||||
|
|
|
||||||
|
|
@ -21,7 +21,7 @@ import torch
|
||||||
import json
|
import json
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
from bigdl.llm.utils.common.log4Error import invalidInputError
|
from ipex_llm.utils.common.log4Error import invalidInputError
|
||||||
from evaluators.qwen import QwenEvaluator
|
from evaluators.qwen import QwenEvaluator
|
||||||
from evaluators.llama import LlamaEvaluator
|
from evaluators.llama import LlamaEvaluator
|
||||||
from evaluators.chatglm import ChatGLMEvaluator
|
from evaluators.chatglm import ChatGLMEvaluator
|
||||||
|
|
|
||||||
|
|
@ -22,7 +22,7 @@ from thefuzz import process
|
||||||
from transformers import AutoTokenizer
|
from transformers import AutoTokenizer
|
||||||
|
|
||||||
from evaluators.evaluator import Evaluator
|
from evaluators.evaluator import Evaluator
|
||||||
from bigdl.llm.transformers import AutoModel
|
from ipex_llm.transformers import AutoModel
|
||||||
from transformers.generation.utils import LogitsProcessorList
|
from transformers.generation.utils import LogitsProcessorList
|
||||||
from transformers.generation.logits_process import LogitsProcessor
|
from transformers.generation.logits_process import LogitsProcessor
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -22,7 +22,7 @@ import numpy as np
|
||||||
import torch
|
import torch
|
||||||
from transformers import LlamaTokenizer, GenerationConfig
|
from transformers import LlamaTokenizer, GenerationConfig
|
||||||
|
|
||||||
from bigdl.llm.transformers import AutoModelForCausalLM
|
from ipex_llm.transformers import AutoModelForCausalLM
|
||||||
from evaluators.evaluator import Evaluator
|
from evaluators.evaluator import Evaluator
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -22,7 +22,7 @@ from thefuzz import process
|
||||||
from transformers import AutoTokenizer
|
from transformers import AutoTokenizer
|
||||||
from transformers.generation import GenerationConfig
|
from transformers.generation import GenerationConfig
|
||||||
|
|
||||||
from bigdl.llm.transformers import AutoModelForCausalLM
|
from ipex_llm.transformers import AutoModelForCausalLM
|
||||||
from evaluators.evaluator import Evaluator
|
from evaluators.evaluator import Evaluator
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -14,7 +14,7 @@
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
#
|
#
|
||||||
|
|
||||||
from bigdl.llm.transformers import AutoModelForCausalLM
|
from ipex_llm.transformers import AutoModelForCausalLM
|
||||||
|
|
||||||
import inspect
|
import inspect
|
||||||
from lm_eval.models.huggingface import AutoCausalLM
|
from lm_eval.models.huggingface import AutoCausalLM
|
||||||
|
|
|
||||||
|
|
@ -20,7 +20,7 @@ from torch.nn import CrossEntropyLoss
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
import gc
|
import gc
|
||||||
|
|
||||||
from bigdl.llm.transformers import AutoModelForCausalLM, AutoModel
|
from ipex_llm.transformers import AutoModelForCausalLM, AutoModel
|
||||||
|
|
||||||
class BigDLPPL:
|
class BigDLPPL:
|
||||||
def __init__(self, model_path, device, **model_kwargs) -> None:
|
def __init__(self, model_path, device, **model_kwargs) -> None:
|
||||||
|
|
|
||||||
|
|
@ -21,7 +21,7 @@ from datasets import concatenate_datasets, load_dataset
|
||||||
from transformers import AutoTokenizer
|
from transformers import AutoTokenizer
|
||||||
|
|
||||||
from ppl import BigDLPPL
|
from ppl import BigDLPPL
|
||||||
from bigdl.llm.ggml.quantize import ggml_tensor_qtype
|
from ipex_llm.ggml.quantize import ggml_tensor_qtype
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import json
|
import json
|
||||||
|
|
|
||||||
|
|
@ -15,7 +15,7 @@
|
||||||
#
|
#
|
||||||
|
|
||||||
from datasets import load_dataset
|
from datasets import load_dataset
|
||||||
from bigdl.llm.transformers import AutoModelForSpeechSeq2Seq
|
from ipex_llm.transformers import AutoModelForSpeechSeq2Seq
|
||||||
from transformers import WhisperProcessor
|
from transformers import WhisperProcessor
|
||||||
import torch
|
import torch
|
||||||
from evaluate import load
|
from evaluate import load
|
||||||
|
|
|
||||||
|
|
@ -69,11 +69,11 @@ conda activate autogen
|
||||||
cd autogen
|
cd autogen
|
||||||
|
|
||||||
# load the local model with cpu with your downloaded model
|
# load the local model with cpu with your downloaded model
|
||||||
python -m bigdl.llm.serving.model_worker --model-path ... --device cpu
|
python -m ipex_llm.serving.model_worker --model-path ... --device cpu
|
||||||
```
|
```
|
||||||
|
|
||||||
Change the Model Name:
|
Change the Model Name:
|
||||||
> Assume you use the model `Mistral-7B-Instruct-v0.2` and your model is downloaded to `autogen/model/Mistral-7B-Instruct-v0.2`. You should rename the model to `autogen/model/bigdl` and run `python -m bigdl.llm.serving.model_worker --model-path ... --device cpu`. This ensures the proper usage of the BigDL-adapted FastChat.
|
> Assume you use the model `Mistral-7B-Instruct-v0.2` and your model is downloaded to `autogen/model/Mistral-7B-Instruct-v0.2`. You should rename the model to `autogen/model/bigdl` and run `python -m ipex_llm.serving.model_worker --model-path ... --device cpu`. This ensures the proper usage of the BigDL-adapted FastChat.
|
||||||
|
|
||||||
Potential Error Note:
|
Potential Error Note:
|
||||||
> If you get `RuntimeError: Error register to Controller` in the worker terminal, please set `export no_proxy='localhost'` to ensure the registration
|
> If you get `RuntimeError: Error register to Controller` in the worker terminal, please set `export no_proxy='localhost'` to ensure the registration
|
||||||
|
|
|
||||||
|
|
@ -19,7 +19,7 @@ import argparse
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
from transformers import AutoTokenizer, LocalAgent
|
from transformers import AutoTokenizer, LocalAgent
|
||||||
|
|
||||||
from bigdl.llm.transformers import AutoModelForCausalLM
|
from ipex_llm.transformers import AutoModelForCausalLM
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
parser = argparse.ArgumentParser(description="Run agent using vicuna model")
|
parser = argparse.ArgumentParser(description="Run agent using vicuna model")
|
||||||
|
|
|
||||||
|
|
@ -3,7 +3,7 @@
|
||||||
In this example, we apply low-bit optimizations to [Streaming-LLM](https://github.com/mit-han-lab/streaming-llm/tree/main#efficient-streaming-language-models-with-attention-sinks) using BigDL-LLM, which can deploy low-bit(including FP4/INT4/FP8/INT8) LLMs for infinite-length inputs.
|
In this example, we apply low-bit optimizations to [Streaming-LLM](https://github.com/mit-han-lab/streaming-llm/tree/main#efficient-streaming-language-models-with-attention-sinks) using BigDL-LLM, which can deploy low-bit(including FP4/INT4/FP8/INT8) LLMs for infinite-length inputs.
|
||||||
Only one code change is needed to load the model using bigdl-llm as follows:
|
Only one code change is needed to load the model using bigdl-llm as follows:
|
||||||
```python
|
```python
|
||||||
from bigdl.llm.transformers import AutoModelForCausalLM
|
from ipex_llm.transformers import AutoModelForCausalLM
|
||||||
model = AutoModelForCausalLM.from_pretrained(model_name_or_path, load_in_4bit=True, trust_remote_code=True, optimize_model=False)
|
model = AutoModelForCausalLM.from_pretrained(model_name_or_path, load_in_4bit=True, trust_remote_code=True, optimize_model=False)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
|
||||||
|
|
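For orientation while reading these rename-only hunks, here is a minimal usage sketch of the renamed import; the model path, prompt, and generation length are placeholders rather than values taken from this diff:

```python
# Minimal sketch (not part of this diff): load a model through the renamed
# ipex_llm API and run a short generation with the standard transformers flow.
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

model_name_or_path = "/path/to/model"  # placeholder
model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
                                             load_in_4bit=True,
                                             trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)

inputs = tokenizer("What is AI?", return_tensors="pt")   # placeholder prompt
output = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```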
@@ -49,7 +49,7 @@ import urllib.request
 import os
 import json
 # code change to import from bigdl-llm API instead of using transformers API
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import LlamaTokenizer
 import intel_extension_for_pytorch as ipex

@@ -39,7 +39,7 @@ Distributed model managed by deepspeed can be further optimized with BigDL low-b

 ```python
 # Apply BigDL-LLM INT4 optimizations on transformers
-from bigdl.llm import optimize_model
+from ipex_llm import optimize_model

 model = optimize_model(model.module.to(f'cpu'), low_bit='sym_int4')
 model = model.to(f'cpu:{local_rank}') # move partial model to local rank
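The same `optimize_model` entry point shown in the hunk above also applies to an ordinary Hugging Face model outside the DeepSpeed setting; a minimal sketch, with the model path as a placeholder:

```python
# Minimal sketch (assumption: a generic Hugging Face causal LM; the path is a placeholder).
# optimize_model converts the relevant layers of an already-loaded model,
# as an alternative to the from_pretrained(load_in_4bit=True) path.
from transformers import AutoModelForCausalLM
from ipex_llm import optimize_model

model = AutoModelForCausalLM.from_pretrained("/path/to/model",  # placeholder
                                             trust_remote_code=True)
model = optimize_model(model, low_bit='sym_int4')  # apply low-bit optimizations in place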
@@ -45,7 +45,7 @@ import os
 import torch
 from transformers import AutoModelForCausalLM, LlamaTokenizer, AutoTokenizer
 import deepspeed
-from bigdl.llm import optimize_model
+from ipex_llm import optimize_model
 import torch
 import intel_extension_for_pytorch as ipex
 import time

@@ -18,7 +18,7 @@ import torch
 import time
 import argparse

-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import AutoTokenizer

 # you could tune the prompt based on your own model,

@@ -18,7 +18,7 @@ import torch
 import time
 import argparse

-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM

 # you could tune the prompt based on your own model,
 # here the prompt tuning refers to https://huggingface.co/georgesung/llama2_7b_chat_uncensored#prompt-style

@@ -18,7 +18,7 @@ import torch
 import time
 import argparse

-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import LlamaTokenizer, GPTQConfig

 # you could tune the prompt based on your own model,

@@ -18,7 +18,7 @@ import torch
 import time
 import argparse

-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import AutoTokenizer

 # you could tune the prompt based on your own model,

@@ -18,7 +18,7 @@ import torch
 import time
 import argparse

-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import AutoTokenizer

 # you could tune the prompt based on your own model,

@@ -19,7 +19,7 @@ import time
 import argparse
 import numpy as np

-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import AutoTokenizer

 # you could tune the prompt based on your own model,

@@ -19,7 +19,7 @@ import time
 import argparse
 import numpy as np

-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import AutoTokenizer

 # prompt format referred from https://github.com/baichuan-inc/Baichuan2/issues/227

@@ -19,7 +19,7 @@ import time
 import argparse
 import numpy as np

-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import AutoTokenizer

 # you could tune the prompt based on your own model,

@@ -19,7 +19,7 @@ import time
 import argparse
 import numpy as np

-from bigdl.llm.transformers import AutoModel
+from ipex_llm.transformers import AutoModel
 from transformers import AutoTokenizer

 # you could tune the prompt based on your own model,

@@ -19,7 +19,7 @@ import time
 import argparse
 import numpy as np

-from bigdl.llm.transformers import AutoModel
+from ipex_llm.transformers import AutoModel
 from transformers import AutoTokenizer

 # you could tune the prompt based on your own model,

@@ -19,7 +19,7 @@ import time
 import argparse
 import numpy as np

-from bigdl.llm.transformers import AutoModel
+from ipex_llm.transformers import AutoModel
 from transformers import AutoTokenizer

@@ -19,7 +19,7 @@ import time
 import argparse
 import numpy as np

-from bigdl.llm.transformers import AutoModel
+from ipex_llm.transformers import AutoModel
 from transformers import AutoTokenizer

 # you could tune the prompt based on your own model,

@@ -19,7 +19,7 @@ import time
 import argparse
 import numpy as np

-from bigdl.llm.transformers import AutoModel
+from ipex_llm.transformers import AutoModel
 from transformers import AutoTokenizer

@@ -18,7 +18,7 @@ import torch
 import time
 import argparse

-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import CodeLlamaTokenizer

 # you could tune the prompt based on your own model,

@@ -19,7 +19,7 @@ import time
 import argparse
 import numpy as np

-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import AutoTokenizer

 # you could tune the prompt based on your own model,

@@ -19,7 +19,7 @@ import time
 import argparse

 from transformers import AutoTokenizer
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM

 # you could tune the prompt based on your own model,
 # here the prompt tuning refers to https://huggingface.co/Deci/DeciLM-7B-instruct#prompt-template

@@ -36,7 +36,7 @@ if __name__ == '__main__':

 # Load model in 4 bit,
 # which convert the relevant layers in the model into INT4 format
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, load_in_4bit=True).eval()
 model.generation_config = GenerationConfig.from_pretrained(model_path)
 model.generation_config.pad_token_id = model.generation_config.eos_token_id

@@ -18,7 +18,7 @@ import torch
 import time
 import argparse

-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import AutoTokenizer

 # you could tune the prompt based on your own model,

@@ -17,7 +17,7 @@
 import time
 import argparse

-from bigdl.llm.transformers import AutoModelForSpeechSeq2Seq
+from ipex_llm.transformers import AutoModelForSpeechSeq2Seq
 from datasets import load_dataset
 from transformers import pipeline
 from transformers.models.whisper import WhisperFeatureExtractor, WhisperTokenizer

@@ -19,7 +19,7 @@ import time
 import argparse
 import numpy as np

-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import AutoTokenizer

 # you could tune the prompt based on your own model,

@@ -19,7 +19,7 @@ import time
 import argparse
 import numpy as np

-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import AutoTokenizer

 # you could tune the prompt based on your own model,

@@ -301,7 +301,7 @@ class Attention(nn.Module):
 # resize qk to 4D to match apply_rotary_pos_emb_no_cache_xpu's requirements.
 query_layer = query_layer.reshape(batch_size, self.num_heads, q_length, self.head_dim)
 key_layer = key_layer.reshape(batch_size, self.num_kv, q_length, self.head_dim)
-from bigdl.llm.transformers.models.utils import apply_rotary_pos_emb_no_cache_xpu
+from ipex_llm.transformers.models.utils import apply_rotary_pos_emb_no_cache_xpu
 query_layer, key_layer = apply_rotary_pos_emb_no_cache_xpu(query_layer,
 key_layer,
 position_ids,

@@ -18,7 +18,7 @@ import torch
 import time
 import argparse

-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import AutoTokenizer

 # you could tune the prompt based on your own model,

@@ -19,7 +19,7 @@ import time
 import argparse
 import numpy as np

-from bigdl.llm.transformers import AutoModelForSeq2SeqLM
+from ipex_llm.transformers import AutoModelForSeq2SeqLM
 from transformers import AutoTokenizer

 # you could tune the prompt based on your own model,

@@ -19,7 +19,7 @@ import torch
 import argparse
 import time
 from PIL import Image
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM

 if __name__ == '__main__':
 parser = argparse.ArgumentParser(description='Predict Tokens using `generate()` API for Fuyu model')

@@ -18,7 +18,7 @@ import torch
 import time
 import argparse

-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import AutoTokenizer

 # The instruction-tuned models use a chat template that must be adhered to for conversational use.

@@ -14,14 +14,14 @@
 # limitations under the License.
 #

-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import AutoTokenizer
 from transformers.generation import GenerationConfig
 import torch
 import time
 import os
 import argparse
-from bigdl.llm import optimize_model
+from ipex_llm import optimize_model

 if __name__ == '__main__':
 parser = argparse.ArgumentParser(description='Predict Tokens using `chat()` API for InternLM-XComposer model')

@@ -19,7 +19,7 @@ import time
 import argparse
 import numpy as np

-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import AutoTokenizer

 # you could tune the prompt based on your own model,

@@ -39,7 +39,7 @@ if __name__ == '__main__':
 model_path = args.repo_id_or_model_path


-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 model = AutoModelForCausalLM.from_pretrained(model_path,
 load_in_4bit=True,
 trust_remote_code=True)

@@ -18,7 +18,7 @@ import torch
 import time
 import argparse

-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import LlamaTokenizer

 # you could tune the prompt based on your own model,

@@ -18,7 +18,7 @@ import torch
 import time
 import argparse

-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import AutoTokenizer

 # you could tune the prompt based on your own model,

@@ -18,7 +18,7 @@ import torch
 import time
 import argparse

-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import AutoTokenizer

 # you could tune the prompt based on your own model,

@@ -18,7 +18,7 @@ import torch
 import time
 import argparse

-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import AutoTokenizer

 # you could tune the prompt based on your own model,

@@ -18,7 +18,7 @@ import torch
 import time
 import argparse

-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import AutoTokenizer, GenerationConfig

 # you could tune the prompt based on your own model,

@@ -19,7 +19,7 @@ import time
 import argparse
 import numpy as np

-from bigdl.llm.transformers import AutoModel,AutoModelForCausalLM
+from ipex_llm.transformers import AutoModel,AutoModelForCausalLM
 from transformers import AutoTokenizer, GenerationConfig

 # you could tune the prompt based on your own model,

@@ -19,7 +19,7 @@ import time
 import argparse
 import numpy as np

-from bigdl.llm.transformers import AutoModel,AutoModelForCausalLM
+from ipex_llm.transformers import AutoModel,AutoModelForCausalLM
 from transformers import AutoTokenizer, GenerationConfig

 # you could tune the prompt based on your own model,

@@ -20,7 +20,7 @@ import argparse
 import numpy as np

 from transformers import AutoTokenizer, GenerationConfig
-from bigdl.llm import optimize_model
+from ipex_llm import optimize_model
 # you could tune the prompt based on your own model,
 # here the prompt tuning refers to # TODO: https://huggingface.co/microsoft/phi-1_5/blob/main/modeling_mixformer_sequential.py
 PHI1_5_PROMPT_FORMAT = " Question:{prompt}\n\n Answer:"

@@ -41,7 +41,7 @@ if __name__ == '__main__':

 # Load model in 4 bit,
 # which convert the relevant layers in the model into INT4 format
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 model = AutoModelForCausalLM.from_pretrained(model_path,
 load_in_4bit=True,
 trust_remote_code=True)

@@ -18,7 +18,7 @@ import torch
 import time
 import argparse

-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import AutoTokenizer

 # you could revise it based on the Phoenix model you choose to use

@@ -14,14 +14,14 @@
 # limitations under the License.
 #

-from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
+from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
 from transformers import AutoTokenizer, LlamaTokenizer
 from transformers.generation import GenerationConfig
 import torch
 import time
 import os
 import argparse
-from bigdl.llm import optimize_model
+from ipex_llm import optimize_model
 torch.manual_seed(1234)

 if __name__ == '__main__':

@@ -19,7 +19,7 @@ import time
 import argparse
 import numpy as np

-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import AutoTokenizer

 # you could tune the prompt based on your own model

@@ -36,7 +36,7 @@ if __name__ == '__main__':
 model_path = args.repo_id_or_model_path


-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 model = AutoModelForCausalLM.from_pretrained(model_path,
 load_in_4bit=True,
 trust_remote_code=True)

@@ -18,7 +18,7 @@ import torch
 import time
 import argparse

-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import AutoTokenizer

 # you could tune the prompt based on your own model,

@@ -18,7 +18,7 @@ import torch
 import time
 import argparse

-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import AutoTokenizer

 # you could tune the prompt based on your own model,

@@ -18,7 +18,7 @@ import torch
 import time
 import argparse

-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import AutoTokenizer

 # you could tune the prompt based on your own model,

@@ -19,7 +19,7 @@ import time
 import argparse
 import numpy as np

-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import AutoTokenizer

 # you could tune the prompt based on your own model,

@@ -18,7 +18,7 @@ import torch
 import time
 import argparse

-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import AutoTokenizer

 # you could tune the prompt based on your own model,

@@ -18,7 +18,7 @@ import torch
 import time
 import argparse

-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import LlamaTokenizer

 # you could tune the prompt based on your own model,

@@ -19,7 +19,7 @@ import librosa
 import argparse

 from transformers import pipeline
-from bigdl.llm.transformers import AutoModelForSpeechSeq2Seq
+from ipex_llm.transformers import AutoModelForSpeechSeq2Seq
 from transformers.models.whisper import WhisperFeatureExtractor, WhisperTokenizer

@@ -18,7 +18,7 @@ import torch
 import time
 import argparse

-from bigdl.llm.transformers import AutoModelForSpeechSeq2Seq
+from ipex_llm.transformers import AutoModelForSpeechSeq2Seq
 from transformers import WhisperProcessor
 from datasets import load_dataset
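As a reference for the speech examples touched above, here is a minimal transcription sketch with the renamed `AutoModelForSpeechSeq2Seq`; the checkpoint name and dataset are placeholders, and 4-bit loading is assumed to work the same way as for the causal-LM classes:

```python
# Minimal sketch (placeholders: whisper-tiny checkpoint and a dummy LibriSpeech sample).
from ipex_llm.transformers import AutoModelForSpeechSeq2Seq
from transformers import WhisperProcessor
from datasets import load_dataset

model = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-tiny", load_in_4bit=True)
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")

ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
sample = ds[0]["audio"]
input_features = processor(sample["array"],
                           sampling_rate=sample["sampling_rate"],
                           return_tensors="pt").input_features

predicted_ids = model.generate(input_features)          # greedy transcription
print(processor.batch_decode(predicted_ids, skip_special_tokens=True))
```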
@@ -18,7 +18,7 @@ import torch
 import time
 import argparse

-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import LlamaTokenizer

 WIZARDCODERPYTHON_PROMPT_FORMAT = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

@@ -18,7 +18,7 @@ import torch
 import time
 import argparse

-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import AutoTokenizer

 # Refer to https://huggingface.co/01-ai/Yi-6B-Chat#31-use-the-chat-model

@@ -18,7 +18,7 @@ import torch, transformers
 import sys, os, time
 import argparse
 from transformers import LlamaTokenizer
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM

 # Refer to https://huggingface.co/IEITYuan/Yuan2-2B-hf#Usage
 YUAN2_PROMPT_FORMAT = """

@@ -39,7 +39,7 @@ if __name__ == '__main__':
 model_path = args.repo_id_or_model_path


-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 # enabling `use_cache=True` allows the model to utilize the previous
 # key/values attentions to speed up decoding;
 # to obtain optimal performance with BigDL-LLM INT4 optimizations,
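A minimal sketch of the loading pattern those comments recommend (the model path is a placeholder); `use_cache=True` is passed through `from_pretrained` alongside the INT4 option:

```python
# Minimal sketch: enable the KV cache together with 4-bit loading, as the
# comments in the hunk above suggest for best decoding performance.
from ipex_llm.transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("/path/to/model",   # placeholder
                                             load_in_4bit=True,
                                             use_cache=True,
                                             trust_remote_code=True)
```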
@@ -15,7 +15,7 @@
 #

 import argparse
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import LlamaTokenizer, TextGenerationPipeline

 if __name__ == '__main__':

@@ -38,7 +38,7 @@ if __name__ == '__main__':
 model = AutoModelForCausalLM.load_low_bit(load_path)
 tokenizer = LlamaTokenizer.from_pretrained(load_path)
 else:
-# load_in_low_bit in bigdl.llm.transformers will convert
+# load_in_low_bit in ipex_llm.transformers will convert
 # the relevant layers in the model into corresponding int X format
 model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True)
 tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
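A minimal save-then-reload sketch built on the `load_in_low_bit` / `load_low_bit` pair shown above; `save_low_bit` is assumed to be the matching save API for the converted weights, and the paths are placeholders:

```python
# Minimal sketch (assumption: save_low_bit is the counterpart of load_low_bit;
# all paths below are placeholders).
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import LlamaTokenizer

model_path = "/path/to/model"            # placeholder
save_path = "/path/to/low-bit-model"     # placeholder
low_bit = "sym_int4"

# convert once and persist the low-bit weights
model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True)
model.save_low_bit(save_path)            # assumed save API
LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True).save_pretrained(save_path)

# later runs skip the conversion step
model = AutoModelForCausalLM.load_low_bit(save_path)
tokenizer = LlamaTokenizer.from_pretrained(save_path)
```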
@@ -15,7 +15,7 @@
 #

 import argparse
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import LlamaTokenizer, TextGenerationPipeline

 if __name__ == '__main__':

@@ -38,7 +38,7 @@ if __name__ == '__main__':
 model = AutoModelForCausalLM.load_low_bit(load_path)
 tokenizer = LlamaTokenizer.from_pretrained(load_path)
 else:
-# load_in_low_bit in bigdl.llm.transformers will convert
+# load_in_low_bit in ipex_llm.transformers will convert
 # the relevant layers in the model into corresponding int X format
 model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True)
 tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)

@@ -31,8 +31,8 @@ from langchain.chains.question_answering import load_qa_chain
 from langchain.callbacks.manager import CallbackManager
 from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

-from bigdl.llm.langchain.llms import *
+from ipex_llm.langchain.llms import *
-from bigdl.llm.langchain.embeddings import *
+from ipex_llm.langchain.embeddings import *


 def main(args):

@@ -21,7 +21,7 @@
 import argparse

-from bigdl.llm.langchain.llms import *
+from ipex_llm.langchain.llms import *
 from langchain import PromptTemplate, LLMChain
 from langchain.callbacks.manager import CallbackManager
 from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

@@ -23,7 +23,7 @@

 from langchain import LLMChain, PromptTemplate
-from bigdl.llm.langchain.llms import *
+from ipex_llm.langchain.llms import *
 from langchain.memory import ConversationBufferWindowMemory
 from langchain.callbacks.manager import CallbackManager
 from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

@@ -21,7 +21,7 @@
 import argparse

-from bigdl.llm.langchain.llms import TransformersLLM, TransformersPipelineLLM
+from ipex_llm.langchain.llms import TransformersLLM, TransformersPipelineLLM
 from langchain import PromptTemplate, LLMChain
 from langchain import HuggingFacePipeline
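A minimal LangChain sketch using the renamed `TransformersLLM`; it assumes a `from_model_id` constructor mirroring LangChain's `HuggingFacePipeline`, and the model path and question are placeholders:

```python
# Minimal sketch (assumptions: TransformersLLM.from_model_id exists and accepts
# a model_kwargs pass-through; the model path and question are placeholders).
from ipex_llm.langchain.llms import TransformersLLM
from langchain import PromptTemplate, LLMChain

template = "Q: {question}\nA:"
prompt = PromptTemplate(template=template, input_variables=["question"])

llm = TransformersLLM.from_model_id(
    model_id="/path/to/model",                 # placeholder
    model_kwargs={"trust_remote_code": True},  # assumed kwargs pass-through
)
chain = LLMChain(prompt=prompt, llm=llm)
print(chain.run("What is BigDL?"))             # placeholder question
```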
@@ -25,7 +25,7 @@
 import argparse

 from langchain.chains import LLMMathChain
-from bigdl.llm.langchain.llms import TransformersLLM, TransformersPipelineLLM
+from ipex_llm.langchain.llms import TransformersLLM, TransformersPipelineLLM


 def main(args):

@@ -30,8 +30,8 @@ from langchain.text_splitter import CharacterTextSplitter
 from langchain.chains.question_answering import load_qa_chain
 from langchain.callbacks.manager import CallbackManager

-from bigdl.llm.langchain.llms import TransformersLLM
+from ipex_llm.langchain.llms import TransformersLLM
-from bigdl.llm.langchain.embeddings import TransformersEmbeddings
+from ipex_llm.langchain.embeddings import TransformersEmbeddings

 text_doc = '''
 BigDL seamlessly scales your data analytics & AI applications from laptop to cloud, with the following libraries:

@@ -23,9 +23,9 @@

 from langchain import LLMChain, PromptTemplate
-from bigdl.llm.langchain.llms import TransformersLLM
+from ipex_llm.langchain.llms import TransformersLLM
 from langchain.memory import ConversationBufferWindowMemory
-from bigdl.llm.transformers import AutoModelForSpeechSeq2Seq
+from ipex_llm.transformers import AutoModelForSpeechSeq2Seq
 from transformers import WhisperProcessor
 import speech_recognition as sr
 import numpy as np
Some files were not shown because too many files have changed in this diff.