Refactor bigdl.llm to ipex_llm (#24)

* Rename bigdl/llm to ipex_llm

* rm python/llm/src/bigdl

* Change `from bigdl.llm` imports to `from ipex_llm`
Wang, Jian4 2024-03-22 15:41:21 +08:00 committed by GitHub
parent cc5806f4bc
commit 9df70d95eb
464 changed files with 918 additions and 940 deletions

View file

@@ -86,7 +86,7 @@ You may apply INT4 optimizations to any Hugging Face *Transformers* models as fo
```python
#load Hugging Face Transformers model with INT4 optimizations
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained('/path/to/model/', load_in_4bit=True)
#run the optimized model on CPU
@@ -113,7 +113,7 @@ You may apply INT4 optimizations to any Hugging Face *Transformers* models as fo
```python
#load Hugging Face Transformers model with INT4 optimizations
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained('/path/to/model/', load_in_4bit=True)
#run the optimized model on Intel GPU
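For orientation, a minimal self-contained sketch of the renamed CPU path shown above; the model path and prompt are placeholders, and the tokenizer/`generate` calls come from standard `transformers` rather than from this diff:

```python
# Hedged usage sketch of the renamed import; '/path/to/model/' is a placeholder.
from ipex_llm.transformers import AutoModelForCausalLM  # renamed from bigdl.llm.transformers
from transformers import AutoTokenizer

# Load with INT4 optimizations applied, as in the README snippet above
model = AutoModelForCausalLM.from_pretrained('/path/to/model/', load_in_4bit=True)
tokenizer = AutoTokenizer.from_pretrained('/path/to/model/')

# Run the optimized model on CPU
inputs = tokenizer("What is AI?", return_tensors="pt")
output_ids = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```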

View file

@@ -223,7 +223,7 @@ This controller manages the distributed workers.
##### Launch the model worker(s)
```bash
-python3 -m bigdl.llm.serving.model_worker --model-path lmsys/vicuna-7b-v1.3 --device cpu
+python3 -m ipex_llm.serving.model_worker --model-path lmsys/vicuna-7b-v1.3 --device cpu
```
Wait until the process finishes loading the model and you see "Uvicorn running on ...". The model worker will register itself to the controller.
@@ -252,7 +252,7 @@ python3 -m fastchat.serve.controller
Then, launch the model worker(s):
```bash
-python3 -m bigdl.llm.serving.model_worker --model-path lmsys/vicuna-7b-v1.3 --device cpu
+python3 -m ipex_llm.serving.model_worker --model-path lmsys/vicuna-7b-v1.3 --device cpu
```
Finally, launch the RESTful API server
@@ -319,7 +319,7 @@ This controller manages the distributed workers.
##### Launch the model worker(s)
```bash
-python3 -m bigdl.llm.serving.model_worker --model-path lmsys/vicuna-7b-v1.3 --device xpu
+python3 -m ipex_llm.serving.model_worker --model-path lmsys/vicuna-7b-v1.3 --device xpu
```
Wait until the process finishes loading the model and you see "Uvicorn running on ...". The model worker will register itself to the controller.
@@ -346,7 +346,7 @@ python3 -m fastchat.serve.controller
Then, launch the model worker(s):
```bash
-python3 -m bigdl.llm.serving.model_worker --model-path lmsys/vicuna-7b-v1.3 --device xpu
+python3 -m ipex_llm.serving.model_worker --model-path lmsys/vicuna-7b-v1.3 --device xpu
```
Finally, launch the RESTful API server
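Taken together, the hunks above describe a three-step launch. A hedged command sketch follows; the `fastchat.serve.openai_api_server` module name and the host/port values are assumptions drawn from upstream FastChat rather than from this diff:

```bash
# Illustrative serving flow; run each step in its own terminal or background it.
# 1. Start the controller that manages the distributed workers
python3 -m fastchat.serve.controller

# 2. Start a model worker via the renamed module (use --device xpu for Intel GPU)
python3 -m ipex_llm.serving.model_worker --model-path lmsys/vicuna-7b-v1.3 --device cpu

# 3. Launch the RESTful API server (assumed FastChat entry point)
python3 -m fastchat.serve.openai_api_server --host localhost --port 8000
```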

View file

@@ -23,7 +23,7 @@ from transformers import TextIteratorStreamer
from transformers.tools.agents import StopSequenceCriteria
from transformers.generation.stopping_criteria import StoppingCriteriaList
from colorama import Fore
-from bigdl.llm import optimize_model
+from ipex_llm import optimize_model
SYSTEM_PROMPT = "A chat between a curious human <human> and an artificial intelligence assistant <bot>.\
The assistant gives helpful, detailed, and polite answers to the human's questions."
HUMAN_ID = "<human>"

View file

@@ -135,9 +135,9 @@ else
done
if [ "$worker_type" == "model_worker" ]; then
-worker_type="bigdl.llm.serving.model_worker"
+worker_type="ipex_llm.serving.model_worker"
elif [ "$worker_type" == "vllm_worker" ]; then
-worker_type="bigdl.llm.serving.vllm_worker"
+worker_type="ipex_llm.serving.vllm_worker"
fi
if [[ -n $CONTROLLER_HOST ]]; then
@@ -220,9 +220,9 @@ else
echo "Worker type: $worker_type"
echo "Worker address: $worker_address"
echo "Controller address: $controller_address"
-if [ "$worker_type" == "bigdl.llm.serving.model_worker" ]; then
+if [ "$worker_type" == "ipex_llm.serving.model_worker" ]; then
python3 -m "$worker_type" --model-path $model_path --device cpu --host $worker_host --port $worker_port --worker-address $worker_address --controller-address $controller_address --stream-interval $stream_interval
-elif [ "$worker_type" == "bigdl.llm.serving.vllm_worker" ]; then
+elif [ "$worker_type" == "ipex_llm.serving.vllm_worker" ]; then
python3 -m "$worker_type" --model-path $model_path --device cpu --host $worker_host --port $worker_port --worker-address $worker_address --controller-address $controller_address
fi
fi

View file

@@ -9,7 +9,7 @@
generation_config = GenerationConfig.from_pretrained(
model_path, trust_remote_code=True
)
-+ from bigdl.llm.transformers import AutoModelForCausalLM
++ from ipex_llm.transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(
model_path,
config=config,

View file

@@ -66,9 +66,9 @@ else
done
if [ "$worker_type" == "model_worker" ]; then
-worker_type="bigdl.llm.serving.model_worker"
+worker_type="ipex_llm.serving.model_worker"
elif [ "$worker_type" == "vllm_worker" ]; then
-worker_type="bigdl.llm.serving.vllm_worker"
+worker_type="ipex_llm.serving.vllm_worker"
fi
if [[ -n $CONTROLLER_HOST ]]; then
@@ -127,9 +127,9 @@ else
echo "Worker address: $worker_address"
echo "Controller address: $controller_address"
-if [ "$worker_type" == "bigdl.llm.serving.model_worker" ]; then
+if [ "$worker_type" == "ipex_llm.serving.model_worker" ]; then
python3 -m "$worker_type" --model-path $model_path --device xpu --host $worker_host --port $worker_port --worker-address $worker_address --controller-address $controller_address --stream-interval $stream_interval
-elif [ "$worker_type" == "bigdl.llm.serving.vllm_worker" ]; then
+elif [ "$worker_type" == "ipex_llm.serving.vllm_worker" ]; then
python3 -m "$worker_type" --model-path $model_path --device xpu --host $worker_host --port $worker_port --worker-address $worker_address --controller-address $controller_address
fi
fi

View file

@@ -21,7 +21,7 @@ To help you better understand the finetuning process, here we use model [Llama-2
First, load model using `transformers`-style API and **set it to `to('xpu')`**. We specify `load_in_low_bit="nf4"` here to apply 4-bit NormalFloat optimization. According to the [QLoRA paper](https://arxiv.org/pdf/2305.14314.pdf), using `"nf4"` could yield better model quality than `"int4"`.
```python
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf",
load_in_low_bit="nf4",
@@ -33,14 +33,14 @@ model = model.to('xpu')
Then, we have to apply some preprocessing to the model to prepare it for training.
```python
-from bigdl.llm.transformers.qlora import prepare_model_for_kbit_training
+from ipex_llm.transformers.qlora import prepare_model_for_kbit_training
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)
```
Next, we can obtain a Peft model from the optimized model and a configuration object containing the parameters as follows:
```python
-from bigdl.llm.transformers.qlora import get_peft_model
+from ipex_llm.transformers.qlora import get_peft_model
from peft import LoraConfig
config = LoraConfig(r=8,
lora_alpha=32,
@@ -54,7 +54,7 @@ model = get_peft_model(model, config)
```eval_rst
.. important::
-Instead of ``from peft import prepare_model_for_kbit_training, get_peft_model`` as we did for regular QLoRA using bitandbytes and cuda, we import them from ``bigdl.llm.transformers.qlora`` here to get a BigDL-LLM compatible Peft model. And the rest is just the same as regular LoRA finetuning process using ``peft``.
+Instead of ``from peft import prepare_model_for_kbit_training, get_peft_model`` as we did for regular QLoRA using bitandbytes and cuda, we import them from ``ipex_llm.transformers.qlora`` here to get a BigDL-LLM compatible Peft model. And the rest is just the same as regular LoRA finetuning process using ``peft``.
```
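Condensing the hunks above, a hedged sketch of the renamed QLoRA flow; the dataset and `Trainer` setup are omitted, and the `lora_dropout`/`task_type` fields are illustrative additions not shown in this diff:

```python
# Condensed sketch of the QLoRA steps shown in this file; training loop omitted.
from ipex_llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers.qlora import prepare_model_for_kbit_training, get_peft_model
from peft import LoraConfig

# Load in 4-bit NormalFloat and move to Intel GPU
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf",
                                             load_in_low_bit="nf4")
model = model.to('xpu')

# Prepare for k-bit training, then wrap as a Peft model
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)
config = LoraConfig(r=8,
                    lora_alpha=32,
                    lora_dropout=0.05,        # illustrative value
                    task_type="CAUSAL_LM")    # illustrative value
model = get_peft_model(model, config)
```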
```eval_rst

View file

@@ -5,7 +5,7 @@ You may apply INT4 optimizations to any Hugging Face *Transformers* models as fo
```python
# load Hugging Face Transformers model with INT4 optimizations
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained('/path/to/model/', load_in_4bit=True)
```

View file

@@ -29,7 +29,7 @@ You could choose to use [PyTorch API](./optimize_model.html) or [`transformers`-
# Take Llama-2-7b-chat-hf as an example
from transformers import LlamaForCausalLM
-from bigdl.llm import optimize_model
+from ipex_llm import optimize_model
model = LlamaForCausalLM.from_pretrained('meta-llama/Llama-2-7b-chat-hf', torch_dtype='auto', low_cpu_mem_usage=True)
model = optimize_model(model) # With only one line to enable BigDL-LLM INT4 optimization
@@ -40,14 +40,14 @@ You could choose to use [PyTorch API](./optimize_model.html) or [`transformers`-
When running LLMs on Intel iGPUs for Windows users, we recommend setting ``cpu_embedding=True`` in the ``optimize_model`` function. This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU.
-See the `API doc <../../../PythonAPI/LLM/optimize.html#bigdl.llm.optimize_model>`_ for ``optimize_model`` to find more information.
+See the `API doc <../../../PythonAPI/LLM/optimize.html#ipex_llm.optimize_model>`_ for ``optimize_model`` to find more information.
Especially, if you have saved the optimized model following setps `here <./optimize_model.html#save>`_, the loading process on Intel GPUs maybe as follows:
.. code-block:: python
from transformers import LlamaForCausalLM
-from bigdl.llm.optimize import low_memory_init, load_low_bit
+from ipex_llm.optimize import low_memory_init, load_low_bit
saved_dir='./llama-2-bigdl-llm-4-bit'
with low_memory_init(): # Fast and low cost by loading model on meta device
@@ -65,7 +65,7 @@ You could choose to use [PyTorch API](./optimize_model.html) or [`transformers`-
.. code-block:: python
# Take Llama-2-7b-chat-hf as an example
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
# Load model in 4 bit, which convert the relevant layers in the model into INT4 format
model = AutoModelForCausalLM.from_pretrained('meta-llama/Llama-2-7b-chat-hf', load_in_4bit=True)
@@ -82,7 +82,7 @@ You could choose to use [PyTorch API](./optimize_model.html) or [`transformers`-
.. code-block:: python
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
saved_dir='./llama-2-bigdl-llm-4-bit'
model = AutoModelForCausalLM.load_low_bit(saved_dir) # Load the optimized model
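Combining the hunks above for the GPU path, a hedged save-then-reload sketch; `model.save_low_bit(saved_dir)` mirrors a later hunk header in this commit, and the exact `load_low_bit(model, saved_dir)` call signature is an assumption:

```python
# Sketch: optimize once, save the low-bit weights, then reload them cheaply on Intel GPU.
from transformers import LlamaForCausalLM
from ipex_llm import optimize_model
from ipex_llm.optimize import low_memory_init, load_low_bit

saved_dir = './llama-2-bigdl-llm-4-bit'

# First run: optimize and save
model = LlamaForCausalLM.from_pretrained('meta-llama/Llama-2-7b-chat-hf',
                                         torch_dtype='auto', low_cpu_mem_usage=True)
model = optimize_model(model)      # INT4 optimization by default
model.save_low_bit(saved_dir)      # as referenced in the @ -51 hunk header below

# Later runs: initialize on the meta device, then load the saved low-bit weights
with low_memory_init():            # fast and low cost by loading model on meta device
    model = LlamaForCausalLM.from_pretrained(saved_dir, torch_dtype="auto")
model = load_low_bit(model, saved_dir)  # assumed signature
model = model.to('xpu')
```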

View file

@@ -7,8 +7,8 @@ You may run the models using the LangChain API in `bigdl-llm`.
You may run any Hugging Face *Transformers* model (with INT4 optimiztions applied) using the LangChain API as follows:
```python
-from bigdl.llm.langchain.llms import TransformersLLM
-from bigdl.llm.langchain.embeddings import TransformersEmbeddings
+from ipex_llm.langchain.llms import TransformersLLM
+from ipex_llm.langchain.embeddings import TransformersEmbeddings
from langchain.chains.question_answering import load_qa_chain
embeddings = TransformersEmbeddings.from_model_id(model_id=model_path)
@@ -37,8 +37,8 @@ You may also convert Hugging Face *Transformers* models into native INT4 format,
```
```python
-from bigdl.llm.langchain.llms import LlamaLLM
-from bigdl.llm.langchain.embeddings import LlamaEmbeddings
+from ipex_llm.langchain.llms import LlamaLLM
+from ipex_llm.langchain.embeddings import LlamaEmbeddings
from langchain.chains.question_answering import load_qa_chain
# switch to ChatGLMEmbeddings/GptneoxEmbeddings/BloomEmbeddings/StarcoderEmbeddings to load other models
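A hedged sketch of the Transformers-based LangChain path from the first hunk above; the `TransformersLLM.from_model_id` constructor and the `chain_type`/question values are assumptions not confirmed by this diff:

```python
# Sketch: QA over documents with the renamed LangChain wrappers; model_path/docs are placeholders.
from ipex_llm.langchain.llms import TransformersLLM
from ipex_llm.langchain.embeddings import TransformersEmbeddings
from langchain.chains.question_answering import load_qa_chain

model_path = '/path/to/model/'
embeddings = TransformersEmbeddings.from_model_id(model_id=model_path)
llm = TransformersLLM.from_model_id(model_id=model_path)  # assumed constructor, mirroring the embeddings call

chain = load_qa_chain(llm, chain_type="stuff")
# `docs` would be LangChain Document objects retrieved via the embeddings
# answer = chain.run(input_documents=docs, question="What is BigDL-LLM?")
```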

View file

@@ -10,13 +10,13 @@ You may also convert Hugging Face *Transformers* models into native INT4 format
```python
# convert the model
-from bigdl.llm import llm_convert
+from ipex_llm import llm_convert
bigdl_llm_path = llm_convert(model='/path/to/model/',
outfile='/path/to/output/', outtype='int4', model_family="llama")
# load the converted model
# switch to ChatGLMForCausalLM/GptneoxForCausalLM/BloomForCausalLM/StarcoderForCausalLM to load other models
-from bigdl.llm.transformers import LlamaForCausalLM
+from ipex_llm.transformers import LlamaForCausalLM
llm = LlamaForCausalLM.from_pretrained("/path/to/output/model.bin", native=True, ...)
# run the converted model

View file

@@ -14,7 +14,7 @@ model = LlamaForCausalLM.from_pretrained('meta-llama/Llama-2-7b-chat-hf', torch_
Then, just need to call `optimize_model` to optimize the loaded model and INT4 optimization is applied on model by default:
```python
-from bigdl.llm import optimize_model
+from ipex_llm import optimize_model
# With only one line to enable BigDL-LLM INT4 optimization
model = optimize_model(model)
@@ -31,7 +31,7 @@ Currently, ``low_bit`` supports options 'sym_int4', 'asym_int4', 'sym_int5', 'as
You may apply symmetric INT8 optimization as follows:
```python
-from bigdl.llm import optimize_model
+from ipex_llm import optimize_model
# Apply symmetric INT8 optimization
model = optimize_model(model, low_bit="sym_int8")
@@ -51,7 +51,7 @@ model.save_low_bit(saved_dir)
We recommend to use the context manager `low_memory_init` to quickly initiate a model instance with low cost, and then use `load_low_bit` to load the optimized low-bit model as follows:
```python
-from bigdl.llm.optimize import low_memory_init, load_low_bit
+from ipex_llm.optimize import low_memory_init, load_low_bit
with low_memory_init(): # Fast and low cost by loading model on meta device
model = LlamaForCausalLM.from_pretrained(saved_dir,
torch_dtype="auto",
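As a compact illustration of the low-bit option above, a sketch using the renamed PyTorch API; the model choice is a placeholder:

```python
# Sketch: apply symmetric INT8 instead of the default INT4 via the renamed PyTorch API.
from transformers import LlamaForCausalLM
from ipex_llm import optimize_model

model = LlamaForCausalLM.from_pretrained('meta-llama/Llama-2-7b-chat-hf',
                                         torch_dtype='auto', low_cpu_mem_usage=True)
# 'sym_int4', 'asym_int4', 'sym_int5', ... are the other low_bit options listed in the hunk header
model = optimize_model(model, low_bit="sym_int8")
```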

View file

@@ -11,7 +11,7 @@ Here, let's take a relatively small LLM model, i.e [open_llama_3b_v2](https://hu
Simply use one-line `transformers`-style API in `bigdl-llm` to load `open_llama_3b_v2` with INT4 optimization (by specifying `load_in_4bit=True`) as follows:
```python
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path="openlm-research/open_llama_3b_v2",
load_in_4bit=True)

View file

@@ -112,7 +112,7 @@ Install the Miniconda as follows if you don't have conda installed on your machi
python
-> from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
+> from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
```
> <img src="https://llm-assets.readthedocs.io/en/latest/_images/verify_bigdl_import.png" alt="image-20240221102252562" width=100%; />
@@ -170,7 +170,7 @@ Now let's play with a real LLM. We'll be using the [phi-1.5](https://huggingface
```python
# Copy/Paste the contents to a new file demo.py
import torch
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer, GenerationConfig
generation_config = GenerationConfig(use_cache = True)

View file

@@ -130,7 +130,7 @@ You can verify if `bigdl-llm` is successfully installed by simply running a few
* Step 5: Copy following code to Anaconda prompt **line by line** and press Enter **after copying each line**.
```python
import torch
-from bigdl.llm.transformers import AutoModel,AutoModelForCausalLM
+from ipex_llm.transformers import AutoModel,AutoModelForCausalLM
tensor_1 = torch.randn(1, 1, 40, 128).to('xpu')
tensor_2 = torch.randn(1, 1, 128, 40).to('xpu')
print(torch.matmul(tensor_1, tensor_2).size())
@@ -200,7 +200,7 @@ Now let's play with a real LLM. We'll be using the [Qwen-1.8B-Chat](https://hugg
# Copy/Paste the contents to a new file demo.py
import torch
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer, GenerationConfig
generation_config = GenerationConfig(use_cache=True)
@@ -260,7 +260,7 @@ Now let's play with a real LLM. We'll be using the [Qwen-1.8B-Chat](https://hugg
# Copy/Paste the contents to a new file demo.py
import torch
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
from transformers import GenerationConfig
from modelscope import AutoTokenizer
generation_config = GenerationConfig(use_cache=True)

View file

@@ -13,7 +13,7 @@ BigDL-LLM provides ``TransformersLLM`` and ``TransformersPipelineLLM``, which im
.. tab:: AutoModel
-.. automodule:: bigdl.llm.langchain.llms.transformersllm
+.. automodule:: ipex_llm.langchain.llms.transformersllm
:members:
:undoc-members:
:show-inheritance:
@@ -21,7 +21,7 @@ BigDL-LLM provides ``TransformersLLM`` and ``TransformersPipelineLLM``, which im
.. tab:: pipeline
-.. automodule:: bigdl.llm.langchain.llms.transformerspipelinellm
+.. automodule:: ipex_llm.langchain.llms.transformerspipelinellm
:members:
:undoc-members:
:show-inheritance:
@@ -37,7 +37,7 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo
.. tab:: Llama
-.. autoclass:: bigdl.llm.langchain.llms.bigdlllm.LlamaLLM
+.. autoclass:: ipex_llm.langchain.llms.bigdlllm.LlamaLLM
:members:
:undoc-members:
:show-inheritance:
@@ -49,7 +49,7 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo
.. tab:: ChatGLM
-.. autoclass:: bigdl.llm.langchain.llms.bigdlllm.ChatGLMLLM
+.. autoclass:: ipex_llm.langchain.llms.bigdlllm.ChatGLMLLM
:members:
:undoc-members:
:show-inheritance:
@@ -61,7 +61,7 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo
.. tab:: Bloom
-.. autoclass:: bigdl.llm.langchain.llms.bigdlllm.BloomLLM
+.. autoclass:: ipex_llm.langchain.llms.bigdlllm.BloomLLM
:members:
:undoc-members:
:show-inheritance:
@@ -73,7 +73,7 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo
.. tab:: Gptneox
-.. autoclass:: bigdl.llm.langchain.llms.bigdlllm.GptneoxLLM
+.. autoclass:: ipex_llm.langchain.llms.bigdlllm.GptneoxLLM
:members:
:undoc-members:
:show-inheritance:
@@ -85,7 +85,7 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo
.. tab:: Starcoder
-.. autoclass:: bigdl.llm.langchain.llms.bigdlllm.StarcoderLLM
+.. autoclass:: ipex_llm.langchain.llms.bigdlllm.StarcoderLLM
:members:
:undoc-members:
:show-inheritance:
@@ -102,7 +102,7 @@ Embeddings Wrapper of LangChain
Hugging Face ``transformers`` AutoModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. automodule:: bigdl.llm.langchain.embeddings.transformersembeddings
+.. automodule:: ipex_llm.langchain.embeddings.transformersembeddings
:members:
:undoc-members:
:show-inheritance:
@@ -117,7 +117,7 @@ For ``llama``/``bloom``/``gptneox``/``starcoder`` model families, you could also
.. tab:: Llama
-.. autoclass:: bigdl.llm.langchain.embeddings.bigdlllm.LlamaEmbeddings
+.. autoclass:: ipex_llm.langchain.embeddings.bigdlllm.LlamaEmbeddings
:members:
:undoc-members:
:show-inheritance:
@@ -129,7 +129,7 @@ For ``llama``/``bloom``/``gptneox``/``starcoder`` model families, you could also
.. tab:: Bloom
-.. autoclass:: bigdl.llm.langchain.embeddings.bigdlllm.BloomEmbeddings
+.. autoclass:: ipex_llm.langchain.embeddings.bigdlllm.BloomEmbeddings
:members:
:undoc-members:
:show-inheritance:
@@ -141,7 +141,7 @@ For ``llama``/``bloom``/``gptneox``/``starcoder`` model families, you could also
.. tab:: Gptneox
-.. autoclass:: bigdl.llm.langchain.embeddings.bigdlllm.GptneoxEmbeddings
+.. autoclass:: ipex_llm.langchain.embeddings.bigdlllm.GptneoxEmbeddings
:members:
:undoc-members:
:show-inheritance:
@@ -153,7 +153,7 @@ For ``llama``/``bloom``/``gptneox``/``starcoder`` model families, you could also
.. tab:: Starcoder
-.. autoclass:: bigdl.llm.langchain.embeddings.bigdlllm.StarcoderEmbeddings
+.. autoclass:: ipex_llm.langchain.embeddings.bigdlllm.StarcoderEmbeddings
:members:
:undoc-members:
:show-inheritance:

View file

@@ -6,7 +6,7 @@ Optimize Model
You can run any PyTorch model with ``optimize_model`` through only one-line code change to benefit from BigDL-LLM optimization, regardless of the library or API you are using.
-.. automodule:: bigdl.llm
+.. automodule:: ipex_llm
:members: optimize_model
:undoc-members:
:show-inheritance:
@@ -18,7 +18,7 @@ Load Optimized Model
To avoid high resource consumption during the loading processes of the original model, we provide save/load API to support the saving of model after low-bit optimization and the loading of the saved low-bit model. Saving and loading operations are platform-independent, regardless of their operating systems.
-.. automodule:: bigdl.llm.optimize
+.. automodule:: ipex_llm.optimize
:members: load_low_bit
:undoc-members:
:show-inheritance:

View file

@@ -10,7 +10,7 @@ You can apply BigDL-LLM optimizations on any Hugging Face Transformers models by
AutoModelForCausalLM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: bigdl.llm.transformers.AutoModelForCausalLM
+.. autoclass:: ipex_llm.transformers.AutoModelForCausalLM
:members:
:undoc-members:
:show-inheritance:
@@ -22,7 +22,7 @@ AutoModelForCausalLM
AutoModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: bigdl.llm.transformers.AutoModel
+.. autoclass:: ipex_llm.transformers.AutoModel
:members:
:undoc-members:
:show-inheritance:
@@ -34,7 +34,7 @@ AutoModel
AutoModelForSpeechSeq2Seq
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: bigdl.llm.transformers.AutoModelForSpeechSeq2Seq
+.. autoclass:: ipex_llm.transformers.AutoModelForSpeechSeq2Seq
:members:
:undoc-members:
:show-inheritance:
@@ -46,7 +46,7 @@ AutoModelForSpeechSeq2Seq
AutoModelForSeq2SeqLM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: bigdl.llm.transformers.AutoModelForSeq2SeqLM
+.. autoclass:: ipex_llm.transformers.AutoModelForSeq2SeqLM
:members:
:undoc-members:
:show-inheritance:
@@ -67,7 +67,7 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo
.. tab:: Llama
-.. autoclass:: bigdl.llm.transformers.LlamaForCausalLM
+.. autoclass:: ipex_llm.transformers.LlamaForCausalLM
:members:
:undoc-members:
:show-inheritance:
@@ -77,7 +77,7 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo
.. tab:: ChatGLM
-.. autoclass:: bigdl.llm.transformers.ChatGLMForCausalLM
+.. autoclass:: ipex_llm.transformers.ChatGLMForCausalLM
:members:
:undoc-members:
:show-inheritance:
@@ -87,7 +87,7 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo
.. tab:: Gptneox
-.. autoclass:: bigdl.llm.transformers.GptneoxForCausalLM
+.. autoclass:: ipex_llm.transformers.GptneoxForCausalLM
:members:
:undoc-members:
:show-inheritance:
@@ -96,7 +96,7 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo
.. automethod:: from_pretrained
.. tab:: Bloom
-.. autoclass:: bigdl.llm.transformers.BloomForCausalLM
+.. autoclass:: ipex_llm.transformers.BloomForCausalLM
:members:
:undoc-members:
:show-inheritance:
@@ -106,7 +106,7 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo
.. tab:: Starcoder
-.. autoclass:: bigdl.llm.transformers.StarcoderForCausalLM
+.. autoclass:: ipex_llm.transformers.StarcoderForCausalLM
:members:
:undoc-members:
:show-inheritance:

View file

@@ -112,7 +112,7 @@ You can then apply INT4 optimizations to any Hugging Face *Transformers* models
.. code-block:: python
#load Hugging Face Transformers model with INT4 optimizations
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained('/path/to/model/', load_in_4bit=True)
#run the optimized model on Intel CPU
@@ -146,7 +146,7 @@ You can then apply INT4 optimizations to any Hugging Face *Transformers* models
.. code-block:: python
#load Hugging Face Transformers model with INT4 optimizations
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained('/path/to/model/', load_in_4bit=True)
#run the optimized model on Intel GPU

View file

@@ -146,7 +146,7 @@ You may apply INT4 optimizations to any Hugging Face *Transformers* model on Int
```python
#load Hugging Face Transformers model with INT4 optimizations
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained('/path/to/model/', load_in_4bit=True)
#run the optimized model on Intel CPU
@@ -164,7 +164,7 @@ You may apply INT4 optimizations to any Hugging Face *Transformers* model on Int
```python
#load Hugging Face Transformers model with INT4 optimizations
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
import intel_extension_for_pytorch
model = AutoModelForCausalLM.from_pretrained('/path/to/model/', load_in_4bit=True)
@@ -206,13 +206,13 @@ You may also convert Hugging Face *Transformers* models into native INT4 model f
```python
#convert the model
-from bigdl.llm import llm_convert
+from ipex_llm import llm_convert
bigdl_llm_path = llm_convert(model='/path/to/model/',
outfile='/path/to/output/', outtype='int4', model_family="llama")
#load the converted model
#switch to ChatGLMForCausalLM/GptneoxForCausalLM/BloomForCausalLM/StarcoderForCausalLM to load other models
-from bigdl.llm.transformers import LlamaForCausalLM
+from ipex_llm.transformers import LlamaForCausalLM
llm = LlamaForCausalLM.from_pretrained("/path/to/output/model.bin", native=True, ...)
#run the converted model
@@ -231,8 +231,8 @@ You may run the models using the LangChain API in `bigdl-llm`.
You may run any Hugging Face *Transformers* model (with INT4 optimiztions applied) using the LangChain API as follows:
```python
-from bigdl.llm.langchain.llms import TransformersLLM
-from bigdl.llm.langchain.embeddings import TransformersEmbeddings
+from ipex_llm.langchain.llms import TransformersLLM
+from ipex_llm.langchain.embeddings import TransformersEmbeddings
from langchain.chains.question_answering import load_qa_chain
embeddings = TransformersEmbeddings.from_model_id(model_id=model_path)
@@ -250,8 +250,8 @@ You may run the models using the LangChain API in `bigdl-llm`.
>**Notes**:* Currently only llama/bloom/gptneox/starcoder/chatglm model families are supported; for other models, you may use the Hugging Face `transformers` model format as described above).
```python
-from bigdl.llm.langchain.llms import LlamaLLM
-from bigdl.llm.langchain.embeddings import LlamaEmbeddings
+from ipex_llm.langchain.llms import LlamaLLM
+from ipex_llm.langchain.embeddings import LlamaEmbeddings
from langchain.chains.question_answering import load_qa_chain
#switch to ChatGLMEmbeddings/GptneoxEmbeddings/BloomEmbeddings/StarcoderEmbeddings to load other models

View file

@@ -7,7 +7,7 @@ Just put this file into your benchmark directory, and then wrap your transformer
Take `chatglm-6b` as an example:
```python
import torch
-from bigdl.llm.transformers import AutoModel
+from ipex_llm.transformers import AutoModel
from transformers import AutoTokenizer
from benchmark_util import BenchmarkWrapper
@@ -35,7 +35,7 @@ Take `chatglm-6b` as an example:
```python
import torch
import intel_extension_for_pytorch as ipex
-from bigdl.llm.transformers import AutoModel
+from ipex_llm.transformers import AutoModel
from transformers import AutoTokenizer
from benchmark_util import BenchmarkWrapper
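A hedged sketch of how these imports fit together; wrapping the model as `BenchmarkWrapper(model)` is an assumption inferred from the truncated sentence in the hunk header, not shown in this diff:

```python
# Sketch: measure generation latency for chatglm-6b with the renamed import.
import torch
from ipex_llm.transformers import AutoModel
from transformers import AutoTokenizer
from benchmark_util import BenchmarkWrapper

model = AutoModel.from_pretrained('THUDM/chatglm-6b', load_in_4bit=True, trust_remote_code=True)
model = BenchmarkWrapper(model)   # assumed wrapper usage; reports per-token timings
tokenizer = AutoTokenizer.from_pretrained('THUDM/chatglm-6b', trust_remote_code=True)

inputs = tokenizer("What is AI?", return_tensors="pt")
with torch.inference_mode():
    output = model.generate(**inputs, max_new_tokens=32)
```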

View file

@@ -31,7 +31,7 @@ benchmark_util_path = os.path.join(current_dir, '..')
import sys
sys.path.append(benchmark_util_path)
from benchmark_util import BenchmarkWrapper
-from bigdl.llm.utils.common.log4Error import invalidInputError
+from ipex_llm.utils.common.log4Error import invalidInputError
LLAMA_IDS = ['meta-llama/Llama-2-7b-chat-hf','meta-llama/Llama-2-13b-chat-hf',
'meta-llama/Llama-2-70b-chat-hf','decapoda-research/llama-7b-hf',
@@ -85,7 +85,7 @@ def run_transformer_int4(repo_id,
num_trials,
num_beams,
low_bit):
-from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
+from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
from transformers import AutoTokenizer, LlamaTokenizer
model_path = get_model_path(repo_id, local_model_hub)
@@ -149,7 +149,7 @@ def run_transformer_int4_gpu(repo_id,
num_trials,
num_beams,
low_bit):
-from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
+from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
import intel_extension_for_pytorch as ipex
reserved_mem_list = []

View file

@@ -32,7 +32,7 @@ benchmark_util_path = os.path.join(current_dir, '..')
import sys
sys.path.append(benchmark_util_path)
from benchmark_util import BenchmarkWrapper
-from bigdl.llm.utils.common.log4Error import invalidInputError
+from ipex_llm.utils.common.log4Error import invalidInputError
LLAMA_IDS = ['meta-llama/Llama-2-7b-chat-hf','meta-llama/Llama-2-13b-chat-hf',
'meta-llama/Llama-2-70b-chat-hf','decapoda-research/llama-7b-hf',
@@ -143,8 +143,8 @@ def run_native_int4(repo_id,
warm_up,
num_trials):
model_path = get_model_path(repo_id, local_model_hub)
-from bigdl.llm.transformers import BigdlNativeForCausalLM
-from bigdl.llm import llm_convert
+from ipex_llm.transformers import BigdlNativeForCausalLM
+from ipex_llm import llm_convert
if "chatglm" in repo_id.lower():
family = "chatglm"
elif "llama" in repo_id.lower():
@@ -184,7 +184,7 @@ def run_transformer_int4(repo_id,
num_beams,
low_bit,
batch_size):
-from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
+from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
from transformers import AutoTokenizer, LlamaTokenizer
model_path = get_model_path(repo_id, local_model_hub)
@@ -319,7 +319,7 @@ def run_optimize_model(repo_id,
low_bit,
batch_size):
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer
-from bigdl.llm import optimize_model
+from ipex_llm import optimize_model
model_path = get_model_path(repo_id, local_model_hub)
# Load model in 4 bit,
@@ -389,7 +389,7 @@ def run_transformer_int4_gpu(repo_id,
num_beams,
low_bit,
batch_size):
-from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
+from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
import intel_extension_for_pytorch as ipex
model_path = get_model_path(repo_id, local_model_hub)
@@ -495,7 +495,7 @@ def run_optimize_model_gpu(repo_id,
low_bit,
batch_size):
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
-from bigdl.llm import optimize_model
+from ipex_llm import optimize_model
import intel_extension_for_pytorch as ipex
model_path = get_model_path(repo_id, local_model_hub)
# Load model in 4 bit,
@@ -651,7 +651,7 @@ def run_bigdl_fp16_gpu(repo_id,
num_trials,
num_beams,
batch_size):
-from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
+from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
import intel_extension_for_pytorch as ipex
model_path = get_model_path(repo_id, local_model_hub)
@@ -731,7 +731,7 @@ def run_deepspeed_transformer_int4_cpu(repo_id,
batch_size):
from transformers import AutoModelForCausalLM, LlamaTokenizer, AutoTokenizer
import deepspeed
-from bigdl.llm import optimize_model
+from ipex_llm import optimize_model
import argparse
# parser is for deepspeed subprocesses' inline parameter
parser = argparse.ArgumentParser(description='Predict Tokens using `generate()` API for Llama2 model')
@@ -822,7 +822,7 @@ def run_transformer_int4_gpu_win(repo_id,
cpu_embedding,
batch_size,
streaming):
-from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
+from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer, TextStreamer
import intel_extension_for_pytorch as ipex
model_path = get_model_path(repo_id, local_model_hub)
@@ -928,7 +928,7 @@ def run_transformer_int4_fp16_gpu_win(repo_id,
cpu_embedding,
batch_size,
streaming):
-from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
+from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer, TextStreamer
import intel_extension_for_pytorch as ipex
model_path = get_model_path(repo_id, local_model_hub)
@@ -1038,7 +1038,7 @@ def run_transformer_int4_loadlowbit_gpu_win(repo_id,
cpu_embedding,
batch_size,
streaming):
-from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
+from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer, TextStreamer
import intel_extension_for_pytorch as ipex
model_path = get_model_path(repo_id, local_model_hub)
@@ -1140,7 +1140,7 @@ def run_transformer_autocast_bf16( repo_id,
num_trials,
num_beams,
batch_size):
-from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
+from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
from transformers import AutoTokenizer, LlamaTokenizer
model_path = get_model_path(repo_id, local_model_hub)
@@ -1209,7 +1209,7 @@ def run_bigdl_ipex_bf16(repo_id,
num_trials,
num_beams,
batch_size):
-from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
+from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
from transformers import AutoTokenizer, LlamaTokenizer
os.environ["BIGDL_OPT_IPEX"] = "true"
@@ -1280,7 +1280,7 @@ def run_bigdl_ipex_int4(repo_id,
num_trials,
num_beams,
batch_size):
-from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
+from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
from transformers import AutoTokenizer, LlamaTokenizer
os.environ["BIGDL_OPT_IPEX"] = "true"
@@ -1350,7 +1350,7 @@ def run_bigdl_ipex_int8(repo_id,
num_trials,
num_beams,
batch_size):
-from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
+from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
from transformers import AutoTokenizer, LlamaTokenizer
os.environ["BIGDL_OPT_IPEX"] = "true"
@@ -1434,7 +1434,7 @@ def run_deepspeed_optimize_model_gpu(repo_id,
os.environ["MASTER_PORT"] = os.environ.get("MASTER_PORT", "29500")
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
-from bigdl.llm import optimize_model
+from ipex_llm import optimize_model
import intel_extension_for_pytorch as ipex
import deepspeed
from deepspeed.accelerator.cpu_accelerator import CPU_Accelerator
@@ -1535,9 +1535,9 @@ def run_speculative_cpu(repo_id,
num_trials,
num_beams,
batch_size):
-from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
+from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
from transformers import AutoTokenizer, LlamaTokenizer
-from bigdl.llm.transformers.convert import get_enable_ipex
+from ipex_llm.transformers.convert import get_enable_ipex
_enable_ipex = get_enable_ipex()
@@ -1615,7 +1615,7 @@ def run_speculative_gpu(repo_id,
num_trials,
num_beams,
batch_size):
-from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
+from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
from transformers import AutoTokenizer, LlamaTokenizer
model_path = get_model_path(repo_id, local_model_hub)

View file

@@ -30,7 +30,7 @@ current_dir = os.path.dirname(os.path.realpath(__file__))
def save_model_in_low_bit(repo_id,
local_model_hub,
low_bit):
-from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
+from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
from transformers import AutoTokenizer, LlamaTokenizer
model_path = get_model_path(repo_id, local_model_hub)
# Load model in 4 bit,

View file

@@ -21,7 +21,7 @@ import torch
import json
from tqdm import tqdm
-from bigdl.llm.utils.common.log4Error import invalidInputError
+from ipex_llm.utils.common.log4Error import invalidInputError
from evaluators.qwen import QwenEvaluator
from evaluators.llama import LlamaEvaluator
from evaluators.chatglm import ChatGLMEvaluator

View file

@@ -22,7 +22,7 @@ from thefuzz import process
from transformers import AutoTokenizer
from evaluators.evaluator import Evaluator
-from bigdl.llm.transformers import AutoModel
+from ipex_llm.transformers import AutoModel
from transformers.generation.utils import LogitsProcessorList
from transformers.generation.logits_process import LogitsProcessor

View file

@@ -22,7 +22,7 @@ import numpy as np
import torch
from transformers import LlamaTokenizer, GenerationConfig
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
from evaluators.evaluator import Evaluator

View file

@@ -22,7 +22,7 @@ from thefuzz import process
from transformers import AutoTokenizer
from transformers.generation import GenerationConfig
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
from evaluators.evaluator import Evaluator

View file

@@ -14,7 +14,7 @@
# limitations under the License.
#
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
import inspect
from lm_eval.models.huggingface import AutoCausalLM

View file

@@ -20,7 +20,7 @@ from torch.nn import CrossEntropyLoss
from tqdm import tqdm
import gc
-from bigdl.llm.transformers import AutoModelForCausalLM, AutoModel
+from ipex_llm.transformers import AutoModelForCausalLM, AutoModel
class BigDLPPL:
def __init__(self, model_path, device, **model_kwargs) -> None:

View file

@@ -21,7 +21,7 @@ from datasets import concatenate_datasets, load_dataset
from transformers import AutoTokenizer
from ppl import BigDLPPL
-from bigdl.llm.ggml.quantize import ggml_tensor_qtype
+from ipex_llm.ggml.quantize import ggml_tensor_qtype
import os
import json

View file

@@ -15,7 +15,7 @@
#
from datasets import load_dataset
-from bigdl.llm.transformers import AutoModelForSpeechSeq2Seq
+from ipex_llm.transformers import AutoModelForSpeechSeq2Seq
from transformers import WhisperProcessor
import torch
from evaluate import load

View file

@@ -69,11 +69,11 @@ conda activate autogen
cd autogen
# load the local model with cpu with your downloaded model
-python -m bigdl.llm.serving.model_worker --model-path ... --device cpu
+python -m ipex_llm.serving.model_worker --model-path ... --device cpu
```
Change the Model Name:
-> Assume you use the model `Mistral-7B-Instruct-v0.2` and your model is downloaded to `autogen/model/Mistral-7B-Instruct-v0.2`. You should rename the model to `autogen/model/bigdl` and run `python -m bigdl.llm.serving.model_worker --model-path ... --device cpu`. This ensures the proper usage of the BigDL-adapted FastChat.
+> Assume you use the model `Mistral-7B-Instruct-v0.2` and your model is downloaded to `autogen/model/Mistral-7B-Instruct-v0.2`. You should rename the model to `autogen/model/bigdl` and run `python -m ipex_llm.serving.model_worker --model-path ... --device cpu`. This ensures the proper usage of the BigDL-adapted FastChat.
Potential Error Note:
> If you get `RuntimeError: Error register to Controller` in the worker terminal, please set `export no_proxy='localhost'` to ensure the registration

View file

@@ -19,7 +19,7 @@ import argparse
from PIL import Image
from transformers import AutoTokenizer, LocalAgent
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Run agent using vicuna model")

View file

@@ -3,7 +3,7 @@
In this example, we apply low-bit optimizations to [Streaming-LLM](https://github.com/mit-han-lab/streaming-llm/tree/main#efficient-streaming-language-models-with-attention-sinks) using BigDL-LLM, which can deploy low-bit(including FP4/INT4/FP8/INT8) LLMs for infinite-length inputs.
Only one code change is needed to load the model using bigdl-llm as follows:
```python
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(model_name_or_path, load_in_4bit=True, trust_remote_code=True, optimize_model=False)
```
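For the other low-bit formats mentioned above (FP4/FP8/INT8), a minimal sketch would swap `load_in_4bit` for the `load_in_low_bit` argument that also appears in the save/load examples later in this diff; the exact value strings are assumptions and should be checked against the installed version:
```python
from ipex_llm.transformers import AutoModelForCausalLM

# illustrative: load with FP8 weights instead of INT4
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    load_in_low_bit="fp8",    # assumed values include "sym_int4", "sym_int8", "fp4", "fp8"
    trust_remote_code=True,
    optimize_model=False,     # as in the Streaming-LLM example above
)
```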

View file

@@ -49,7 +49,7 @@ import urllib.request
import os
import json
# code change to import from bigdl-llm API instead of using transformers API
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
from transformers import LlamaTokenizer
import intel_extension_for_pytorch as ipex

View file

@@ -39,7 +39,7 @@ Distributed model managed by deepspeed can be further optimized with BigDL low-b
```python
# Apply BigDL-LLM INT4 optimizations on transformers
-from bigdl.llm import optimize_model
+from ipex_llm import optimize_model
model = optimize_model(model.module.to(f'cpu'), low_bit='sym_int4')
model = model.to(f'cpu:{local_rank}') # move partial model to local rank
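Outside of the DeepSpeed setting, the same `optimize_model` entry point can be applied to a plain Hugging Face model; a minimal sketch (the model id is illustrative):
```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from ipex_llm import optimize_model

model_path = "meta-llama/Llama-2-7b-chat-hf"  # illustrative model id
model = AutoModelForCausalLM.from_pretrained(model_path)

# replace the relevant layers with low-bit (here symmetric INT4) equivalents
model = optimize_model(model, low_bit='sym_int4')

tokenizer = AutoTokenizer.from_pretrained(model_path)
inputs = tokenizer("Once upon a time", return_tensors="pt")
with torch.inference_mode():
    output = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```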

View file

@@ -45,7 +45,7 @@ import os
import torch
from transformers import AutoModelForCausalLM, LlamaTokenizer, AutoTokenizer
import deepspeed
-from bigdl.llm import optimize_model
+from ipex_llm import optimize_model
import torch
import intel_extension_for_pytorch as ipex
import time

View file

@@ -18,7 +18,7 @@ import torch
import time
import argparse
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
# you could tune the prompt based on your own model,

View file

@@ -18,7 +18,7 @@ import torch
import time
import argparse
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
# you could tune the prompt based on your own model,
# here the prompt tuning refers to https://huggingface.co/georgesung/llama2_7b_chat_uncensored#prompt-style

View file

@@ -18,7 +18,7 @@ import torch
import time
import argparse
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
from transformers import LlamaTokenizer, GPTQConfig
# you could tune the prompt based on your own model,

View file

@@ -18,7 +18,7 @@ import torch
import time
import argparse
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
# you could tune the prompt based on your own model,

View file

@@ -18,7 +18,7 @@ import torch
import time
import argparse
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
# you could tune the prompt based on your own model,

View file

@@ -19,7 +19,7 @@ import time
import argparse
import numpy as np
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
# you could tune the prompt based on your own model,

View file

@@ -19,7 +19,7 @@ import time
import argparse
import numpy as np
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
# prompt format referred from https://github.com/baichuan-inc/Baichuan2/issues/227

View file

@@ -19,7 +19,7 @@ import time
import argparse
import numpy as np
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
# you could tune the prompt based on your own model,

View file

@@ -19,7 +19,7 @@ import time
import argparse
import numpy as np
-from bigdl.llm.transformers import AutoModel
+from ipex_llm.transformers import AutoModel
from transformers import AutoTokenizer
# you could tune the prompt based on your own model,

View file

@@ -19,7 +19,7 @@ import time
import argparse
import numpy as np
-from bigdl.llm.transformers import AutoModel
+from ipex_llm.transformers import AutoModel
from transformers import AutoTokenizer
# you could tune the prompt based on your own model,

View file

@@ -19,7 +19,7 @@ import time
import argparse
import numpy as np
-from bigdl.llm.transformers import AutoModel
+from ipex_llm.transformers import AutoModel
from transformers import AutoTokenizer

View file

@@ -19,7 +19,7 @@ import time
import argparse
import numpy as np
-from bigdl.llm.transformers import AutoModel
+from ipex_llm.transformers import AutoModel
from transformers import AutoTokenizer
# you could tune the prompt based on your own model,

View file

@@ -19,7 +19,7 @@ import time
import argparse
import numpy as np
-from bigdl.llm.transformers import AutoModel
+from ipex_llm.transformers import AutoModel
from transformers import AutoTokenizer

View file

@@ -18,7 +18,7 @@ import torch
import time
import argparse
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
from transformers import CodeLlamaTokenizer
# you could tune the prompt based on your own model,

View file

@@ -19,7 +19,7 @@ import time
import argparse
import numpy as np
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
# you could tune the prompt based on your own model,

View file

@@ -19,7 +19,7 @@ import time
import argparse
from transformers import AutoTokenizer
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
# you could tune the prompt based on your own model,
# here the prompt tuning refers to https://huggingface.co/Deci/DeciLM-7B-instruct#prompt-template

View file

@@ -36,7 +36,7 @@ if __name__ == '__main__':
# Load model in 4 bit,
# which convert the relevant layers in the model into INT4 format
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, load_in_4bit=True).eval()
model.generation_config = GenerationConfig.from_pretrained(model_path)
model.generation_config.pad_token_id = model.generation_config.eos_token_id

View file

@@ -18,7 +18,7 @@ import torch
import time
import argparse
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
# you could tune the prompt based on your own model,

View file

@@ -17,7 +17,7 @@
import time
import argparse
-from bigdl.llm.transformers import AutoModelForSpeechSeq2Seq
+from ipex_llm.transformers import AutoModelForSpeechSeq2Seq
from datasets import load_dataset
from transformers import pipeline
from transformers.models.whisper import WhisperFeatureExtractor, WhisperTokenizer

View file

@@ -19,7 +19,7 @@ import time
import argparse
import numpy as np
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
# you could tune the prompt based on your own model,

View file

@@ -19,7 +19,7 @@ import time
import argparse
import numpy as np
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
# you could tune the prompt based on your own model,

View file

@@ -301,7 +301,7 @@ class Attention(nn.Module):
# resize qk to 4D to match apply_rotary_pos_emb_no_cache_xpu's requirements.
query_layer = query_layer.reshape(batch_size, self.num_heads, q_length, self.head_dim)
key_layer = key_layer.reshape(batch_size, self.num_kv, q_length, self.head_dim)
-from bigdl.llm.transformers.models.utils import apply_rotary_pos_emb_no_cache_xpu
+from ipex_llm.transformers.models.utils import apply_rotary_pos_emb_no_cache_xpu
query_layer, key_layer = apply_rotary_pos_emb_no_cache_xpu(query_layer,
                                                           key_layer,
                                                           position_ids,

View file

@@ -18,7 +18,7 @@ import torch
import time
import argparse
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
# you could tune the prompt based on your own model,

View file

@@ -19,7 +19,7 @@ import time
import argparse
import numpy as np
-from bigdl.llm.transformers import AutoModelForSeq2SeqLM
+from ipex_llm.transformers import AutoModelForSeq2SeqLM
from transformers import AutoTokenizer
# you could tune the prompt based on your own model,

View file

@@ -19,7 +19,7 @@ import torch
import argparse
import time
from PIL import Image
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Predict Tokens using `generate()` API for Fuyu model')

View file

@@ -18,7 +18,7 @@ import torch
import time
import argparse
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
# The instruction-tuned models use a chat template that must be adhered to for conversational use.

View file

@@ -14,14 +14,14 @@
# limitations under the License.
#
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
from transformers.generation import GenerationConfig
import torch
import time
import os
import argparse
-from bigdl.llm import optimize_model
+from ipex_llm import optimize_model
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Predict Tokens using `chat()` API for InternLM-XComposer model')

View file

@@ -19,7 +19,7 @@ import time
import argparse
import numpy as np
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
# you could tune the prompt based on your own model,

View file

@@ -39,7 +39,7 @@ if __name__ == '__main__':
model_path = args.repo_id_or_model_path
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             load_in_4bit=True,
                                             trust_remote_code=True)

View file

@@ -18,7 +18,7 @@ import torch
import time
import argparse
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
from transformers import LlamaTokenizer
# you could tune the prompt based on your own model,

View file

@@ -18,7 +18,7 @@ import torch
import time
import argparse
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
# you could tune the prompt based on your own model,

View file

@@ -18,7 +18,7 @@ import torch
import time
import argparse
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
# you could tune the prompt based on your own model,

View file

@@ -18,7 +18,7 @@ import torch
import time
import argparse
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
# you could tune the prompt based on your own model,

View file

@@ -18,7 +18,7 @@ import torch
import time
import argparse
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer, GenerationConfig
# you could tune the prompt based on your own model,

View file

@@ -19,7 +19,7 @@ import time
import argparse
import numpy as np
-from bigdl.llm.transformers import AutoModel,AutoModelForCausalLM
+from ipex_llm.transformers import AutoModel,AutoModelForCausalLM
from transformers import AutoTokenizer, GenerationConfig
# you could tune the prompt based on your own model,

View file

@@ -19,7 +19,7 @@ import time
import argparse
import numpy as np
-from bigdl.llm.transformers import AutoModel,AutoModelForCausalLM
+from ipex_llm.transformers import AutoModel,AutoModelForCausalLM
from transformers import AutoTokenizer, GenerationConfig
# you could tune the prompt based on your own model,

View file

@@ -20,7 +20,7 @@ import argparse
import numpy as np
from transformers import AutoTokenizer, GenerationConfig
-from bigdl.llm import optimize_model
+from ipex_llm import optimize_model
# you could tune the prompt based on your own model,
# here the prompt tuning refers to # TODO: https://huggingface.co/microsoft/phi-1_5/blob/main/modeling_mixformer_sequential.py
PHI1_5_PROMPT_FORMAT = " Question:{prompt}\n\n Answer:"
@@ -41,7 +41,7 @@ if __name__ == '__main__':
# Load model in 4 bit,
# which convert the relevant layers in the model into INT4 format
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             load_in_4bit=True,
                                             trust_remote_code=True)

View file

@@ -18,7 +18,7 @@ import torch
import time
import argparse
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
# you could revise it based on the Phoenix model you choose to use

View file

@@ -14,14 +14,14 @@
# limitations under the License.
#
-from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
+from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
from transformers import AutoTokenizer, LlamaTokenizer
from transformers.generation import GenerationConfig
import torch
import time
import os
import argparse
-from bigdl.llm import optimize_model
+from ipex_llm import optimize_model
torch.manual_seed(1234)
if __name__ == '__main__':

View file

@@ -19,7 +19,7 @@ import time
import argparse
import numpy as np
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
# you could tune the prompt based on your own model

View file

@@ -36,7 +36,7 @@ if __name__ == '__main__':
model_path = args.repo_id_or_model_path
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             load_in_4bit=True,
                                             trust_remote_code=True)

View file

@@ -18,7 +18,7 @@ import torch
import time
import argparse
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
# you could tune the prompt based on your own model,

View file

@@ -18,7 +18,7 @@ import torch
import time
import argparse
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
# you could tune the prompt based on your own model,

View file

@@ -18,7 +18,7 @@ import torch
import time
import argparse
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
# you could tune the prompt based on your own model,

View file

@@ -19,7 +19,7 @@ import time
import argparse
import numpy as np
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
# you could tune the prompt based on your own model,

View file

@@ -18,7 +18,7 @@ import torch
import time
import argparse
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
# you could tune the prompt based on your own model,

View file

@@ -18,7 +18,7 @@ import torch
import time
import argparse
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
from transformers import LlamaTokenizer
# you could tune the prompt based on your own model,

View file

@@ -19,7 +19,7 @@ import librosa
import argparse
from transformers import pipeline
-from bigdl.llm.transformers import AutoModelForSpeechSeq2Seq
+from ipex_llm.transformers import AutoModelForSpeechSeq2Seq
from transformers.models.whisper import WhisperFeatureExtractor, WhisperTokenizer

View file

@@ -18,7 +18,7 @@ import torch
import time
import argparse
-from bigdl.llm.transformers import AutoModelForSpeechSeq2Seq
+from ipex_llm.transformers import AutoModelForSpeechSeq2Seq
from transformers import WhisperProcessor
from datasets import load_dataset

View file

@@ -18,7 +18,7 @@ import torch
import time
import argparse
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
from transformers import LlamaTokenizer
WIZARDCODERPYTHON_PROMPT_FORMAT = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

View file

@@ -18,7 +18,7 @@ import torch
import time
import argparse
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
# Refer to https://huggingface.co/01-ai/Yi-6B-Chat#31-use-the-chat-model

View file

@@ -18,7 +18,7 @@ import torch, transformers
import sys, os, time
import argparse
from transformers import LlamaTokenizer
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
# Refer to https://huggingface.co/IEITYuan/Yuan2-2B-hf#Usage
YUAN2_PROMPT_FORMAT = """

View file

@@ -39,7 +39,7 @@ if __name__ == '__main__':
model_path = args.repo_id_or_model_path
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
# enabling `use_cache=True` allows the model to utilize the previous
# key/values attentions to speed up decoding;
# to obtain optimal performance with BigDL-LLM INT4 optimizations,
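Put together, the comment above corresponds to a call along these lines — a minimal sketch, with the surrounding argument parsing taken from the example scripts in this diff:
```python
from ipex_llm.transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(model_path,
                                             load_in_4bit=True,
                                             use_cache=True,        # reuse key/value attention cache while decoding
                                             trust_remote_code=True)
```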

View file

@@ -15,7 +15,7 @@
#
import argparse
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
from transformers import LlamaTokenizer, TextGenerationPipeline
if __name__ == '__main__':
@@ -38,7 +38,7 @@ if __name__ == '__main__':
    model = AutoModelForCausalLM.load_low_bit(load_path)
    tokenizer = LlamaTokenizer.from_pretrained(load_path)
else:
-    # load_in_low_bit in bigdl.llm.transformers will convert
+    # load_in_low_bit in ipex_llm.transformers will convert
    # the relevant layers in the model into corresponding int X format
    model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True)
    tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
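The branch above implies a convert-once, reload-later round trip; a minimal sketch, assuming a `save_low_bit` helper that mirrors the `load_low_bit` call shown in the hunk:
```python
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import LlamaTokenizer

# first run: convert to low-bit and persist the converted weights
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             load_in_low_bit=low_bit,  # e.g. 'sym_int4'
                                             trust_remote_code=True)
model.save_low_bit(save_path)  # assumed helper for persisting the converted model
LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True).save_pretrained(save_path)

# later runs: reload the already-converted weights directly
model = AutoModelForCausalLM.load_low_bit(save_path)
tokenizer = LlamaTokenizer.from_pretrained(save_path)
```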

View file

@@ -15,7 +15,7 @@
#
import argparse
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
from transformers import LlamaTokenizer, TextGenerationPipeline
if __name__ == '__main__':
@@ -38,7 +38,7 @@ if __name__ == '__main__':
    model = AutoModelForCausalLM.load_low_bit(load_path)
    tokenizer = LlamaTokenizer.from_pretrained(load_path)
else:
-    # load_in_low_bit in bigdl.llm.transformers will convert
+    # load_in_low_bit in ipex_llm.transformers will convert
    # the relevant layers in the model into corresponding int X format
    model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True)
    tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)

View file

@@ -31,8 +31,8 @@ from langchain.chains.question_answering import load_qa_chain
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
-from bigdl.llm.langchain.llms import *
-from bigdl.llm.langchain.embeddings import *
+from ipex_llm.langchain.llms import *
+from ipex_llm.langchain.embeddings import *
def main(args):

View file

@@ -21,7 +21,7 @@
import argparse
-from bigdl.llm.langchain.llms import *
+from ipex_llm.langchain.llms import *
from langchain import PromptTemplate, LLMChain
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

View file

@@ -23,7 +23,7 @@
from langchain import LLMChain, PromptTemplate
-from bigdl.llm.langchain.llms import *
+from ipex_llm.langchain.llms import *
from langchain.memory import ConversationBufferWindowMemory
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

View file

@@ -21,7 +21,7 @@
import argparse
-from bigdl.llm.langchain.llms import TransformersLLM, TransformersPipelineLLM
+from ipex_llm.langchain.llms import TransformersLLM, TransformersPipelineLLM
from langchain import PromptTemplate, LLMChain
from langchain import HuggingFacePipeline

View file

@@ -25,7 +25,7 @@
import argparse
from langchain.chains import LLMMathChain
-from bigdl.llm.langchain.llms import TransformersLLM, TransformersPipelineLLM
+from ipex_llm.langchain.llms import TransformersLLM, TransformersPipelineLLM
def main(args):

View file

@@ -30,8 +30,8 @@ from langchain.text_splitter import CharacterTextSplitter
from langchain.chains.question_answering import load_qa_chain
from langchain.callbacks.manager import CallbackManager
-from bigdl.llm.langchain.llms import TransformersLLM
-from bigdl.llm.langchain.embeddings import TransformersEmbeddings
+from ipex_llm.langchain.llms import TransformersLLM
+from ipex_llm.langchain.embeddings import TransformersEmbeddings
text_doc = '''
BigDL seamlessly scales your data analytics & AI applications from laptop to cloud, with the following libraries:
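A minimal question-answering sketch built from these imports might look as follows; the `from_model_id` constructor and its arguments are assumptions about the `ipex_llm.langchain` wrappers rather than verbatim code from this repository, and the model id is illustrative:
```python
from langchain.chains.question_answering import load_qa_chain
from langchain.docstore.document import Document
from ipex_llm.langchain.llms import TransformersLLM

# assumed constructor name and arguments for the low-bit LLM wrapper
llm = TransformersLLM.from_model_id(
    model_id="lmsys/vicuna-7b-v1.5",
    model_kwargs={"trust_remote_code": True},
)

docs = [Document(page_content=text_doc)]       # text_doc as defined in the example above
chain = load_qa_chain(llm, chain_type="stuff")
print(chain.run(input_documents=docs, question="What does BigDL scale?"))
```
The companion `TransformersEmbeddings` import would come into play once the documents are indexed into a vector store for retrieval instead of being stuffed directly into the prompt.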

View file

@@ -23,9 +23,9 @@
from langchain import LLMChain, PromptTemplate
-from bigdl.llm.langchain.llms import TransformersLLM
+from ipex_llm.langchain.llms import TransformersLLM
from langchain.memory import ConversationBufferWindowMemory
-from bigdl.llm.transformers import AutoModelForSpeechSeq2Seq
+from ipex_llm.transformers import AutoModelForSpeechSeq2Seq
from transformers import WhisperProcessor
import speech_recognition as sr
import numpy as np

Some files were not shown because too many files have changed in this diff.