Refactor bigdl.llm to ipex_llm (#24)

* Rename bigdl/llm to ipex_llm

* rm python/llm/src/bigdl

* from bigdl.llm to from ipex_llm
Wang, Jian4 2024-03-22 15:41:21 +08:00 committed by GitHub
parent cc5806f4bc
commit 9df70d95eb
464 changed files with 918 additions and 940 deletions


@ -86,7 +86,7 @@ You may apply INT4 optimizations to any Hugging Face *Transformers* models as fo
```python
#load Hugging Face Transformers model with INT4 optimizations
from bigdl.llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained('/path/to/model/', load_in_4bit=True)
#run the optimized model on CPU
@ -113,7 +113,7 @@ You may apply INT4 optimizations to any Hugging Face *Transformers* models as fo
```python
#load Hugging Face Transformers model with INT4 optimizations
from bigdl.llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained('/path/to/model/', load_in_4bit=True)
#run the optimized model on Intel GPU
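
# --- Editor's illustrative continuation (not part of this commit's diff) ---
# A minimal sketch of the pattern this hunk documents, under the renamed package.
# The model path, prompt, and token budget below are assumptions for illustration.
import torch
import intel_extension_for_pytorch as ipex  # needed for the 'xpu' device, as other snippets in this diff show
from transformers import AutoTokenizer
from ipex_llm.transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained('/path/to/model/', load_in_4bit=True)
model = model.to('xpu')                      # run the optimized model on Intel GPU
tokenizer = AutoTokenizer.from_pretrained('/path/to/model/')

inputs = tokenizer("What is AI?", return_tensors="pt").to('xpu')
with torch.inference_mode():
    output = model.generate(inputs.input_ids, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))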


@ -223,7 +223,7 @@ This controller manages the distributed workers.
##### Launch the model worker(s)
```bash
python3 -m bigdl.llm.serving.model_worker --model-path lmsys/vicuna-7b-v1.3 --device cpu
python3 -m ipex_llm.serving.model_worker --model-path lmsys/vicuna-7b-v1.3 --device cpu
```
Wait until the process finishes loading the model and you see "Uvicorn running on ...". The model worker will register itself to the controller.
@ -252,7 +252,7 @@ python3 -m fastchat.serve.controller
Then, launch the model worker(s):
```bash
python3 -m bigdl.llm.serving.model_worker --model-path lmsys/vicuna-7b-v1.3 --device cpu
python3 -m ipex_llm.serving.model_worker --model-path lmsys/vicuna-7b-v1.3 --device cpu
```
Finally, launch the RESTful API server
@ -319,7 +319,7 @@ This controller manages the distributed workers.
##### Launch the model worker(s)
```bash
python3 -m bigdl.llm.serving.model_worker --model-path lmsys/vicuna-7b-v1.3 --device xpu
python3 -m ipex_llm.serving.model_worker --model-path lmsys/vicuna-7b-v1.3 --device xpu
```
Wait until the process finishes loading the model and you see "Uvicorn running on ...". The model worker will register itself to the controller.
@ -346,7 +346,7 @@ python3 -m fastchat.serve.controller
Then, launch the model worker(s):
```bash
python3 -m bigdl.llm.serving.model_worker --model-path lmsys/vicuna-7b-v1.3 --device xpu
python3 -m ipex_llm.serving.model_worker --model-path lmsys/vicuna-7b-v1.3 --device xpu
```
Finally, launch the RESTful API server
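
The RESTful API server command itself falls outside the hunks shown here, but once it is running, the FastChat REST layer is typically OpenAI-compatible. As a rough sketch only (host, port, and model name are assumptions, not values from this diff), a client could query it like this:

```python
# Hypothetical client for the OpenAI-compatible REST endpoint that FastChat
# usually exposes; the URL and model name below are assumptions.
import requests

resp = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "model": "vicuna-7b-v1.3",
        "messages": [{"role": "user", "content": "Hello, who are you?"}],
        "temperature": 0.7,
    },
    timeout=120,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```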


@ -23,7 +23,7 @@ from transformers import TextIteratorStreamer
from transformers.tools.agents import StopSequenceCriteria
from transformers.generation.stopping_criteria import StoppingCriteriaList
from colorama import Fore
from bigdl.llm import optimize_model
from ipex_llm import optimize_model
SYSTEM_PROMPT = "A chat between a curious human <human> and an artificial intelligence assistant <bot>.\
The assistant gives helpful, detailed, and polite answers to the human's questions."
HUMAN_ID = "<human>"


@ -135,9 +135,9 @@ else
done
if [ "$worker_type" == "model_worker" ]; then
worker_type="bigdl.llm.serving.model_worker"
worker_type="ipex_llm.serving.model_worker"
elif [ "$worker_type" == "vllm_worker" ]; then
worker_type="bigdl.llm.serving.vllm_worker"
worker_type="ipex_llm.serving.vllm_worker"
fi
if [[ -n $CONTROLLER_HOST ]]; then
@ -220,9 +220,9 @@ else
echo "Worker type: $worker_type"
echo "Worker address: $worker_address"
echo "Controller address: $controller_address"
if [ "$worker_type" == "bigdl.llm.serving.model_worker" ]; then
if [ "$worker_type" == "ipex_llm.serving.model_worker" ]; then
python3 -m "$worker_type" --model-path $model_path --device cpu --host $worker_host --port $worker_port --worker-address $worker_address --controller-address $controller_address --stream-interval $stream_interval
elif [ "$worker_type" == "bigdl.llm.serving.vllm_worker" ]; then
elif [ "$worker_type" == "ipex_llm.serving.vllm_worker" ]; then
python3 -m "$worker_type" --model-path $model_path --device cpu --host $worker_host --port $worker_port --worker-address $worker_address --controller-address $controller_address
fi
fi


@ -9,7 +9,7 @@
generation_config = GenerationConfig.from_pretrained(
model_path, trust_remote_code=True
)
+ from bigdl.llm.transformers import AutoModelForCausalLM
+ from ipex_llm.transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(
model_path,
config=config,


@ -66,9 +66,9 @@ else
done
if [ "$worker_type" == "model_worker" ]; then
worker_type="bigdl.llm.serving.model_worker"
worker_type="ipex_llm.serving.model_worker"
elif [ "$worker_type" == "vllm_worker" ]; then
worker_type="bigdl.llm.serving.vllm_worker"
worker_type="ipex_llm.serving.vllm_worker"
fi
if [[ -n $CONTROLLER_HOST ]]; then
@ -127,9 +127,9 @@ else
echo "Worker address: $worker_address"
echo "Controller address: $controller_address"
if [ "$worker_type" == "bigdl.llm.serving.model_worker" ]; then
if [ "$worker_type" == "ipex_llm.serving.model_worker" ]; then
python3 -m "$worker_type" --model-path $model_path --device xpu --host $worker_host --port $worker_port --worker-address $worker_address --controller-address $controller_address --stream-interval $stream_interval
elif [ "$worker_type" == "bigdl.llm.serving.vllm_worker" ]; then
elif [ "$worker_type" == "ipex_llm.serving.vllm_worker" ]; then
python3 -m "$worker_type" --model-path $model_path --device xpu --host $worker_host --port $worker_port --worker-address $worker_address --controller-address $controller_address
fi
fi


@ -21,7 +21,7 @@ To help you better understand the finetuning process, here we use model [Llama-2
First, load model using `transformers`-style API and **set it to `to('xpu')`**. We specify `load_in_low_bit="nf4"` here to apply 4-bit NormalFloat optimization. According to the [QLoRA paper](https://arxiv.org/pdf/2305.14314.pdf), using `"nf4"` could yield better model quality than `"int4"`.
```python
from bigdl.llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf",
load_in_low_bit="nf4",
@ -33,14 +33,14 @@ model = model.to('xpu')
Then, we have to apply some preprocessing to the model to prepare it for training.
```python
from bigdl.llm.transformers.qlora import prepare_model_for_kbit_training
from ipex_llm.transformers.qlora import prepare_model_for_kbit_training
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)
```
Next, we can obtain a Peft model from the optimized model and a configuration object containing the parameters as follows:
```python
from bigdl.llm.transformers.qlora import get_peft_model
from ipex_llm.transformers.qlora import get_peft_model
from peft import LoraConfig
config = LoraConfig(r=8,
lora_alpha=32,
@ -54,7 +54,7 @@ model = get_peft_model(model, config)
```eval_rst
.. important::
Instead of ``from peft import prepare_model_for_kbit_training, get_peft_model`` as we did for regular QLoRA using bitsandbytes and CUDA, we import them from ``bigdl.llm.transformers.qlora`` here to get a BigDL-LLM compatible Peft model. And the rest is just the same as the regular LoRA finetuning process using ``peft``.
Instead of ``from peft import prepare_model_for_kbit_training, get_peft_model`` as we did for regular QLoRA using bitsandbytes and CUDA, we import them from ``ipex_llm.transformers.qlora`` here to get a BigDL-LLM compatible Peft model. And the rest is just the same as the regular LoRA finetuning process using ``peft``.
```
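
The training loop itself is not part of the lines shown above. Assuming a tokenized dataset `train_data` and the model's `tokenizer` are already prepared, a sketch of handing the resulting Peft model to a standard `transformers.Trainer` might look like this (every hyperparameter is an illustrative placeholder):

```python
# Illustrative sketch only: `train_data` and `tokenizer` are assumed to exist,
# and the hyperparameters are placeholders rather than documented values.
import transformers

trainer = transformers.Trainer(
    model=model,  # the Peft model returned by get_peft_model(...) above
    train_dataset=train_data,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        warmup_steps=20,
        max_steps=200,
        learning_rate=2e-4,
        output_dir="outputs",
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False  # avoid cache warnings while training
trainer.train()
```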
```eval_rst


@ -5,7 +5,7 @@ You may apply INT4 optimizations to any Hugging Face *Transformers* models as fo
```python
# load Hugging Face Transformers model with INT4 optimizations
from bigdl.llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained('/path/to/model/', load_in_4bit=True)
```


@ -29,7 +29,7 @@ You could choose to use [PyTorch API](./optimize_model.html) or [`transformers`-
# Take Llama-2-7b-chat-hf as an example
from transformers import LlamaForCausalLM
from bigdl.llm import optimize_model
from ipex_llm import optimize_model
model = LlamaForCausalLM.from_pretrained('meta-llama/Llama-2-7b-chat-hf', torch_dtype='auto', low_cpu_mem_usage=True)
model = optimize_model(model) # With only one line to enable BigDL-LLM INT4 optimization
@ -40,14 +40,14 @@ You could choose to use [PyTorch API](./optimize_model.html) or [`transformers`-
When running LLMs on Intel iGPUs for Windows users, we recommend setting ``cpu_embedding=True`` in the ``optimize_model`` function. This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU.
See the `API doc <../../../PythonAPI/LLM/optimize.html#bigdl.llm.optimize_model>`_ for ``optimize_model`` to find more information.
See the `API doc <../../../PythonAPI/LLM/optimize.html#ipex_llm.optimize_model>`_ for ``optimize_model`` to find more information.
In particular, if you have saved the optimized model following the steps `here <./optimize_model.html#save>`_, the loading process on Intel GPUs may be as follows:
.. code-block:: python
from transformers import LlamaForCausalLM
from bigdl.llm.optimize import low_memory_init, load_low_bit
from ipex_llm.optimize import low_memory_init, load_low_bit
saved_dir='./llama-2-bigdl-llm-4-bit'
with low_memory_init(): # Fast and low cost by loading model on meta device
@ -65,7 +65,7 @@ You could choose to use [PyTorch API](./optimize_model.html) or [`transformers`-
.. code-block:: python
# Take Llama-2-7b-chat-hf as an example
from bigdl.llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
# Load model in 4 bit, which convert the relevant layers in the model into INT4 format
model = AutoModelForCausalLM.from_pretrained('meta-llama/Llama-2-7b-chat-hf', load_in_4bit=True)
@ -82,7 +82,7 @@ You could choose to use [PyTorch API](./optimize_model.html) or [`transformers`-
.. code-block:: python
from bigdl.llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
saved_dir='./llama-2-bigdl-llm-4-bit'
model = AutoModelForCausalLM.load_low_bit(saved_dir) # Load the optimized model
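
Tying this section together, a sketch of the Windows-iGPU recommendation quoted above, written against the renamed package, could read as follows; the model id mirrors the Llama-2 example used earlier in this section and everything else is an assumption:

```python
# Sketch of the cpu_embedding recommendation under the renamed package name;
# the model id follows the Llama-2 example above, the rest is illustrative.
from transformers import LlamaForCausalLM
from ipex_llm import optimize_model

model = LlamaForCausalLM.from_pretrained('meta-llama/Llama-2-7b-chat-hf',
                                         torch_dtype='auto', low_cpu_mem_usage=True)
# keep the memory-intensive embedding layer on the CPU when targeting an Intel iGPU
model = optimize_model(model, cpu_embedding=True)
model = model.to('xpu')
```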


@ -7,8 +7,8 @@ You may run the models using the LangChain API in `bigdl-llm`.
You may run any Hugging Face *Transformers* model (with INT4 optimizations applied) using the LangChain API as follows:
```python
from bigdl.llm.langchain.llms import TransformersLLM
from bigdl.llm.langchain.embeddings import TransformersEmbeddings
from ipex_llm.langchain.llms import TransformersLLM
from ipex_llm.langchain.embeddings import TransformersEmbeddings
from langchain.chains.question_answering import load_qa_chain
embeddings = TransformersEmbeddings.from_model_id(model_id=model_path)
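
# --- Editor's illustrative continuation (not part of this commit's diff) ---
# One way to finish this snippet with the renamed imports above; `model_path`,
# the document text, and the question are placeholders, and the
# TransformersLLM.from_model_id constructor is assumed to mirror the
# TransformersEmbeddings call on the previous line.
from langchain.docstore.document import Document

llm = TransformersLLM.from_model_id(model_id=model_path)
docs = [Document(page_content="BigDL-LLM was renamed to IPEX-LLM.")]
chain = load_qa_chain(llm, chain_type="stuff")
print(chain.run(input_documents=docs, question="What was bigdl-llm renamed to?"))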
@ -37,8 +37,8 @@ You may also convert Hugging Face *Transformers* models into native INT4 format,
```
```python
from bigdl.llm.langchain.llms import LlamaLLM
from bigdl.llm.langchain.embeddings import LlamaEmbeddings
from ipex_llm.langchain.llms import LlamaLLM
from ipex_llm.langchain.embeddings import LlamaEmbeddings
from langchain.chains.question_answering import load_qa_chain
# switch to ChatGLMEmbeddings/GptneoxEmbeddings/BloomEmbeddings/StarcoderEmbeddings to load other models


@ -10,13 +10,13 @@ You may also convert Hugging Face *Transformers* models into native INT4 format
```python
# convert the model
from bigdl.llm import llm_convert
from ipex_llm import llm_convert
bigdl_llm_path = llm_convert(model='/path/to/model/',
outfile='/path/to/output/', outtype='int4', model_family="llama")
# load the converted model
# switch to ChatGLMForCausalLM/GptneoxForCausalLM/BloomForCausalLM/StarcoderForCausalLM to load other models
from bigdl.llm.transformers import LlamaForCausalLM
from ipex_llm.transformers import LlamaForCausalLM
llm = LlamaForCausalLM.from_pretrained("/path/to/output/model.bin", native=True, ...)
# run the converted model


@ -14,7 +14,7 @@ model = LlamaForCausalLM.from_pretrained('meta-llama/Llama-2-7b-chat-hf', torch_
Then, you just need to call `optimize_model` to optimize the loaded model; INT4 optimization is applied to the model by default:
```python
from bigdl.llm import optimize_model
from ipex_llm import optimize_model
# With only one line to enable BigDL-LLM INT4 optimization
model = optimize_model(model)
@ -31,7 +31,7 @@ Currently, ``low_bit`` supports options 'sym_int4', 'asym_int4', 'sym_int5', 'as
You may apply symmetric INT8 optimization as follows:
```python
from bigdl.llm import optimize_model
from ipex_llm import optimize_model
# Apply symmetric INT8 optimization
model = optimize_model(model, low_bit="sym_int8")
@ -51,7 +51,7 @@ model.save_low_bit(saved_dir)
We recommend to use the context manager `low_memory_init` to quickly initiate a model instance with low cost, and then use `load_low_bit` to load the optimized low-bit model as follows:
```python
from bigdl.llm.optimize import low_memory_init, load_low_bit
from ipex_llm.optimize import low_memory_init, load_low_bit
with low_memory_init(): # Fast and low cost by loading model on meta device
model = LlamaForCausalLM.from_pretrained(saved_dir,
torch_dtype="auto",


@ -11,7 +11,7 @@ Here, let's take a relatively small LLM model, i.e [open_llama_3b_v2](https://hu
Simply use one-line `transformers`-style API in `bigdl-llm` to load `open_llama_3b_v2` with INT4 optimization (by specifying `load_in_4bit=True`) as follows:
```python
from bigdl.llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path="openlm-research/open_llama_3b_v2",
load_in_4bit=True)


@ -112,7 +112,7 @@ Install the Miniconda as follows if you don't have conda installed on your machi
python
> from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
> from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
```
> <img src="https://llm-assets.readthedocs.io/en/latest/_images/verify_bigdl_import.png" alt="image-20240221102252562" width=100%; />
@ -170,7 +170,7 @@ Now let's play with a real LLM. We'll be using the [phi-1.5](https://huggingface
```python
# Copy/Paste the contents to a new file demo.py
import torch
from bigdl.llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer, GenerationConfig
generation_config = GenerationConfig(use_cache = True)


@ -130,7 +130,7 @@ You can verify if `bigdl-llm` is successfully installed by simply running a few
* Step 5: Copy following code to Anaconda prompt **line by line** and press Enter **after copying each line**.
```python
import torch
from bigdl.llm.transformers import AutoModel,AutoModelForCausalLM
from ipex_llm.transformers import AutoModel,AutoModelForCausalLM
tensor_1 = torch.randn(1, 1, 40, 128).to('xpu')
tensor_2 = torch.randn(1, 1, 128, 40).to('xpu')
print(torch.matmul(tensor_1, tensor_2).size())
@ -200,7 +200,7 @@ Now let's play with a real LLM. We'll be using the [Qwen-1.8B-Chat](https://hugg
# Copy/Paste the contents to a new file demo.py
import torch
from bigdl.llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer, GenerationConfig
generation_config = GenerationConfig(use_cache=True)
@ -260,7 +260,7 @@ Now let's play with a real LLM. We'll be using the [Qwen-1.8B-Chat](https://hugg
# Copy/Paste the contents to a new file demo.py
import torch
from bigdl.llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import GenerationConfig
from modelscope import AutoTokenizer
generation_config = GenerationConfig(use_cache=True)


@ -13,7 +13,7 @@ BigDL-LLM provides ``TransformersLLM`` and ``TransformersPipelineLLM``, which im
.. tab:: AutoModel
.. automodule:: bigdl.llm.langchain.llms.transformersllm
.. automodule:: ipex_llm.langchain.llms.transformersllm
:members:
:undoc-members:
:show-inheritance:
@ -21,7 +21,7 @@ BigDL-LLM provides ``TransformersLLM`` and ``TransformersPipelineLLM``, which im
.. tab:: pipeline
.. automodule:: bigdl.llm.langchain.llms.transformerspipelinellm
.. automodule:: ipex_llm.langchain.llms.transformerspipelinellm
:members:
:undoc-members:
:show-inheritance:
@ -37,7 +37,7 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo
.. tab:: Llama
.. autoclass:: bigdl.llm.langchain.llms.bigdlllm.LlamaLLM
.. autoclass:: ipex_llm.langchain.llms.bigdlllm.LlamaLLM
:members:
:undoc-members:
:show-inheritance:
@ -49,7 +49,7 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo
.. tab:: ChatGLM
.. autoclass:: bigdl.llm.langchain.llms.bigdlllm.ChatGLMLLM
.. autoclass:: ipex_llm.langchain.llms.bigdlllm.ChatGLMLLM
:members:
:undoc-members:
:show-inheritance:
@ -61,7 +61,7 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo
.. tab:: Bloom
.. autoclass:: bigdl.llm.langchain.llms.bigdlllm.BloomLLM
.. autoclass:: ipex_llm.langchain.llms.bigdlllm.BloomLLM
:members:
:undoc-members:
:show-inheritance:
@ -73,7 +73,7 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo
.. tab:: Gptneox
.. autoclass:: bigdl.llm.langchain.llms.bigdlllm.GptneoxLLM
.. autoclass:: ipex_llm.langchain.llms.bigdlllm.GptneoxLLM
:members:
:undoc-members:
:show-inheritance:
@ -85,7 +85,7 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo
.. tab:: Starcoder
.. autoclass:: bigdl.llm.langchain.llms.bigdlllm.StarcoderLLM
.. autoclass:: ipex_llm.langchain.llms.bigdlllm.StarcoderLLM
:members:
:undoc-members:
:show-inheritance:
@ -102,7 +102,7 @@ Embeddings Wrapper of LangChain
Hugging Face ``transformers`` AutoModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. automodule:: bigdl.llm.langchain.embeddings.transformersembeddings
.. automodule:: ipex_llm.langchain.embeddings.transformersembeddings
:members:
:undoc-members:
:show-inheritance:
@ -117,7 +117,7 @@ For ``llama``/``bloom``/``gptneox``/``starcoder`` model families, you could also
.. tab:: Llama
.. autoclass:: bigdl.llm.langchain.embeddings.bigdlllm.LlamaEmbeddings
.. autoclass:: ipex_llm.langchain.embeddings.bigdlllm.LlamaEmbeddings
:members:
:undoc-members:
:show-inheritance:
@ -129,7 +129,7 @@ For ``llama``/``bloom``/``gptneox``/``starcoder`` model families, you could also
.. tab:: Bloom
.. autoclass:: bigdl.llm.langchain.embeddings.bigdlllm.BloomEmbeddings
.. autoclass:: ipex_llm.langchain.embeddings.bigdlllm.BloomEmbeddings
:members:
:undoc-members:
:show-inheritance:
@ -141,7 +141,7 @@ For ``llama``/``bloom``/``gptneox``/``starcoder`` model families, you could also
.. tab:: Gptneox
.. autoclass:: bigdl.llm.langchain.embeddings.bigdlllm.GptneoxEmbeddings
.. autoclass:: ipex_llm.langchain.embeddings.bigdlllm.GptneoxEmbeddings
:members:
:undoc-members:
:show-inheritance:
@ -153,7 +153,7 @@ For ``llama``/``bloom``/``gptneox``/``starcoder`` model families, you could also
.. tab:: Starcoder
.. autoclass:: bigdl.llm.langchain.embeddings.bigdlllm.StarcoderEmbeddings
.. autoclass:: ipex_llm.langchain.embeddings.bigdlllm.StarcoderEmbeddings
:members:
:undoc-members:
:show-inheritance:


@ -6,7 +6,7 @@ Optimize Model
You can run any PyTorch model with ``optimize_model`` through only one-line code change to benefit from BigDL-LLM optimization, regardless of the library or API you are using.
.. automodule:: bigdl.llm
.. automodule:: ipex_llm
:members: optimize_model
:undoc-members:
:show-inheritance:
@ -18,7 +18,7 @@ Load Optimized Model
To avoid high resource consumption during the loading processes of the original model, we provide save/load API to support the saving of model after low-bit optimization and the loading of the saved low-bit model. Saving and loading operations are platform-independent, regardless of their operating systems.
.. automodule:: bigdl.llm.optimize
.. automodule:: ipex_llm.optimize
:members: load_low_bit
:undoc-members:
:show-inheritance:


@ -10,7 +10,7 @@ You can apply BigDL-LLM optimizations on any Hugging Face Transformers models by
AutoModelForCausalLM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: bigdl.llm.transformers.AutoModelForCausalLM
.. autoclass:: ipex_llm.transformers.AutoModelForCausalLM
:members:
:undoc-members:
:show-inheritance:
@ -22,7 +22,7 @@ AutoModelForCausalLM
AutoModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: bigdl.llm.transformers.AutoModel
.. autoclass:: ipex_llm.transformers.AutoModel
:members:
:undoc-members:
:show-inheritance:
@ -34,7 +34,7 @@ AutoModel
AutoModelForSpeechSeq2Seq
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: bigdl.llm.transformers.AutoModelForSpeechSeq2Seq
.. autoclass:: ipex_llm.transformers.AutoModelForSpeechSeq2Seq
:members:
:undoc-members:
:show-inheritance:
@ -46,7 +46,7 @@ AutoModelForSpeechSeq2Seq
AutoModelForSeq2SeqLM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: bigdl.llm.transformers.AutoModelForSeq2SeqLM
.. autoclass:: ipex_llm.transformers.AutoModelForSeq2SeqLM
:members:
:undoc-members:
:show-inheritance:
@ -67,7 +67,7 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo
.. tab:: Llama
.. autoclass:: bigdl.llm.transformers.LlamaForCausalLM
.. autoclass:: ipex_llm.transformers.LlamaForCausalLM
:members:
:undoc-members:
:show-inheritance:
@ -77,7 +77,7 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo
.. tab:: ChatGLM
.. autoclass:: bigdl.llm.transformers.ChatGLMForCausalLM
.. autoclass:: ipex_llm.transformers.ChatGLMForCausalLM
:members:
:undoc-members:
:show-inheritance:
@ -87,7 +87,7 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo
.. tab:: Gptneox
.. autoclass:: bigdl.llm.transformers.GptneoxForCausalLM
.. autoclass:: ipex_llm.transformers.GptneoxForCausalLM
:members:
:undoc-members:
:show-inheritance:
@ -96,7 +96,7 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo
.. automethod:: from_pretrained
.. tab:: Bloom
.. autoclass:: bigdl.llm.transformers.BloomForCausalLM
.. autoclass:: ipex_llm.transformers.BloomForCausalLM
:members:
:undoc-members:
:show-inheritance:
@ -106,7 +106,7 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo
.. tab:: Starcoder
.. autoclass:: bigdl.llm.transformers.StarcoderForCausalLM
.. autoclass:: ipex_llm.transformers.StarcoderForCausalLM
:members:
:undoc-members:
:show-inheritance:


@ -112,7 +112,7 @@ You can then apply INT4 optimizations to any Hugging Face *Transformers* models
.. code-block:: python
#load Hugging Face Transformers model with INT4 optimizations
from bigdl.llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained('/path/to/model/', load_in_4bit=True)
#run the optimized model on Intel CPU
@ -146,7 +146,7 @@ You can then apply INT4 optimizations to any Hugging Face *Transformers* models
.. code-block:: python
#load Hugging Face Transformers model with INT4 optimizations
from bigdl.llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained('/path/to/model/', load_in_4bit=True)
#run the optimized model on Intel GPU


@ -146,7 +146,7 @@ You may apply INT4 optimizations to any Hugging Face *Transformers* model on Int
```python
#load Hugging Face Transformers model with INT4 optimizations
from bigdl.llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained('/path/to/model/', load_in_4bit=True)
#run the optimized model on Intel CPU
@ -164,7 +164,7 @@ You may apply INT4 optimizations to any Hugging Face *Transformers* model on Int
```python
#load Hugging Face Transformers model with INT4 optimizations
from bigdl.llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
import intel_extension_for_pytorch
model = AutoModelForCausalLM.from_pretrained('/path/to/model/', load_in_4bit=True)
@ -206,13 +206,13 @@ You may also convert Hugging Face *Transformers* models into native INT4 model f
```python
#convert the model
from bigdl.llm import llm_convert
from ipex_llm import llm_convert
bigdl_llm_path = llm_convert(model='/path/to/model/',
outfile='/path/to/output/', outtype='int4', model_family="llama")
#load the converted model
#switch to ChatGLMForCausalLM/GptneoxForCausalLM/BloomForCausalLM/StarcoderForCausalLM to load other models
from bigdl.llm.transformers import LlamaForCausalLM
from ipex_llm.transformers import LlamaForCausalLM
llm = LlamaForCausalLM.from_pretrained("/path/to/output/model.bin", native=True, ...)
#run the converted model
@ -231,8 +231,8 @@ You may run the models using the LangChain API in `bigdl-llm`.
You may run any Hugging Face *Transformers* model (with INT4 optimizations applied) using the LangChain API as follows:
```python
from bigdl.llm.langchain.llms import TransformersLLM
from bigdl.llm.langchain.embeddings import TransformersEmbeddings
from ipex_llm.langchain.llms import TransformersLLM
from ipex_llm.langchain.embeddings import TransformersEmbeddings
from langchain.chains.question_answering import load_qa_chain
embeddings = TransformersEmbeddings.from_model_id(model_id=model_path)
@ -250,8 +250,8 @@ You may run the models using the LangChain API in `bigdl-llm`.
>**Note**: Currently only llama/bloom/gptneox/starcoder/chatglm model families are supported; for other models, you may use the Hugging Face `transformers` model format as described above.
```python
from bigdl.llm.langchain.llms import LlamaLLM
from bigdl.llm.langchain.embeddings import LlamaEmbeddings
from ipex_llm.langchain.llms import LlamaLLM
from ipex_llm.langchain.embeddings import LlamaEmbeddings
from langchain.chains.question_answering import load_qa_chain
#switch to ChatGLMEmbeddings/GptneoxEmbeddings/BloomEmbeddings/StarcoderEmbeddings to load other models


@ -7,7 +7,7 @@ Just put this file into your benchmark directory, and then wrap your transformer
Take `chatglm-6b` as an example:
```python
import torch
from bigdl.llm.transformers import AutoModel
from ipex_llm.transformers import AutoModel
from transformers import AutoTokenizer
from benchmark_util import BenchmarkWrapper
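
# --- Editor's illustrative sketch (not part of this commit's diff) ---
# The wrapping step referred to above presumably looks roughly like this;
# the model id, prompt, and token count are placeholders.
model = AutoModel.from_pretrained('THUDM/chatglm-6b', load_in_4bit=True, trust_remote_code=True)
model = BenchmarkWrapper(model)   # the wrapper is expected to report per-token generation latency
tokenizer = AutoTokenizer.from_pretrained('THUDM/chatglm-6b', trust_remote_code=True)
inputs = tokenizer("What is AI?", return_tensors="pt")
with torch.inference_mode():
    output = model.generate(inputs.input_ids, max_new_tokens=32)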
@ -35,7 +35,7 @@ Take `chatglm-6b` as an example:
```python
import torch
import intel_extension_for_pytorch as ipex
from bigdl.llm.transformers import AutoModel
from ipex_llm.transformers import AutoModel
from transformers import AutoTokenizer
from benchmark_util import BenchmarkWrapper


@ -31,7 +31,7 @@ benchmark_util_path = os.path.join(current_dir, '..')
import sys
sys.path.append(benchmark_util_path)
from benchmark_util import BenchmarkWrapper
from bigdl.llm.utils.common.log4Error import invalidInputError
from ipex_llm.utils.common.log4Error import invalidInputError
LLAMA_IDS = ['meta-llama/Llama-2-7b-chat-hf','meta-llama/Llama-2-13b-chat-hf',
'meta-llama/Llama-2-70b-chat-hf','decapoda-research/llama-7b-hf',
@ -85,7 +85,7 @@ def run_transformer_int4(repo_id,
num_trials,
num_beams,
low_bit):
from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
from transformers import AutoTokenizer, LlamaTokenizer
model_path = get_model_path(repo_id, local_model_hub)
@ -149,7 +149,7 @@ def run_transformer_int4_gpu(repo_id,
num_trials,
num_beams,
low_bit):
from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
import intel_extension_for_pytorch as ipex
reserved_mem_list = []


@ -32,7 +32,7 @@ benchmark_util_path = os.path.join(current_dir, '..')
import sys
sys.path.append(benchmark_util_path)
from benchmark_util import BenchmarkWrapper
from bigdl.llm.utils.common.log4Error import invalidInputError
from ipex_llm.utils.common.log4Error import invalidInputError
LLAMA_IDS = ['meta-llama/Llama-2-7b-chat-hf','meta-llama/Llama-2-13b-chat-hf',
'meta-llama/Llama-2-70b-chat-hf','decapoda-research/llama-7b-hf',
@ -143,8 +143,8 @@ def run_native_int4(repo_id,
warm_up,
num_trials):
model_path = get_model_path(repo_id, local_model_hub)
from bigdl.llm.transformers import BigdlNativeForCausalLM
from bigdl.llm import llm_convert
from ipex_llm.transformers import BigdlNativeForCausalLM
from ipex_llm import llm_convert
if "chatglm" in repo_id.lower():
family = "chatglm"
elif "llama" in repo_id.lower():
@ -184,7 +184,7 @@ def run_transformer_int4(repo_id,
num_beams,
low_bit,
batch_size):
from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
from transformers import AutoTokenizer, LlamaTokenizer
model_path = get_model_path(repo_id, local_model_hub)
@ -319,7 +319,7 @@ def run_optimize_model(repo_id,
low_bit,
batch_size):
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer
from bigdl.llm import optimize_model
from ipex_llm import optimize_model
model_path = get_model_path(repo_id, local_model_hub)
# Load model in 4 bit,
@ -389,7 +389,7 @@ def run_transformer_int4_gpu(repo_id,
num_beams,
low_bit,
batch_size):
from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
import intel_extension_for_pytorch as ipex
model_path = get_model_path(repo_id, local_model_hub)
@ -495,7 +495,7 @@ def run_optimize_model_gpu(repo_id,
low_bit,
batch_size):
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
from bigdl.llm import optimize_model
from ipex_llm import optimize_model
import intel_extension_for_pytorch as ipex
model_path = get_model_path(repo_id, local_model_hub)
# Load model in 4 bit,
@ -651,7 +651,7 @@ def run_bigdl_fp16_gpu(repo_id,
num_trials,
num_beams,
batch_size):
from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
import intel_extension_for_pytorch as ipex
model_path = get_model_path(repo_id, local_model_hub)
@ -731,7 +731,7 @@ def run_deepspeed_transformer_int4_cpu(repo_id,
batch_size):
from transformers import AutoModelForCausalLM, LlamaTokenizer, AutoTokenizer
import deepspeed
from bigdl.llm import optimize_model
from ipex_llm import optimize_model
import argparse
# parser is for deepspeed subprocesses' inline parameter
parser = argparse.ArgumentParser(description='Predict Tokens using `generate()` API for Llama2 model')
@ -822,7 +822,7 @@ def run_transformer_int4_gpu_win(repo_id,
cpu_embedding,
batch_size,
streaming):
from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer, TextStreamer
import intel_extension_for_pytorch as ipex
model_path = get_model_path(repo_id, local_model_hub)
@ -928,7 +928,7 @@ def run_transformer_int4_fp16_gpu_win(repo_id,
cpu_embedding,
batch_size,
streaming):
from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer, TextStreamer
import intel_extension_for_pytorch as ipex
model_path = get_model_path(repo_id, local_model_hub)
@ -1038,7 +1038,7 @@ def run_transformer_int4_loadlowbit_gpu_win(repo_id,
cpu_embedding,
batch_size,
streaming):
from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer, TextStreamer
import intel_extension_for_pytorch as ipex
model_path = get_model_path(repo_id, local_model_hub)
@ -1140,7 +1140,7 @@ def run_transformer_autocast_bf16( repo_id,
num_trials,
num_beams,
batch_size):
from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
from transformers import AutoTokenizer, LlamaTokenizer
model_path = get_model_path(repo_id, local_model_hub)
@ -1209,7 +1209,7 @@ def run_bigdl_ipex_bf16(repo_id,
num_trials,
num_beams,
batch_size):
from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
from transformers import AutoTokenizer, LlamaTokenizer
os.environ["BIGDL_OPT_IPEX"] = "true"
@ -1280,7 +1280,7 @@ def run_bigdl_ipex_int4(repo_id,
num_trials,
num_beams,
batch_size):
from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
from transformers import AutoTokenizer, LlamaTokenizer
os.environ["BIGDL_OPT_IPEX"] = "true"
@ -1350,7 +1350,7 @@ def run_bigdl_ipex_int8(repo_id,
num_trials,
num_beams,
batch_size):
from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
from transformers import AutoTokenizer, LlamaTokenizer
os.environ["BIGDL_OPT_IPEX"] = "true"
@ -1434,7 +1434,7 @@ def run_deepspeed_optimize_model_gpu(repo_id,
os.environ["MASTER_PORT"] = os.environ.get("MASTER_PORT", "29500")
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
from bigdl.llm import optimize_model
from ipex_llm import optimize_model
import intel_extension_for_pytorch as ipex
import deepspeed
from deepspeed.accelerator.cpu_accelerator import CPU_Accelerator
@ -1535,9 +1535,9 @@ def run_speculative_cpu(repo_id,
num_trials,
num_beams,
batch_size):
from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
from transformers import AutoTokenizer, LlamaTokenizer
from bigdl.llm.transformers.convert import get_enable_ipex
from ipex_llm.transformers.convert import get_enable_ipex
_enable_ipex = get_enable_ipex()
@ -1615,7 +1615,7 @@ def run_speculative_gpu(repo_id,
num_trials,
num_beams,
batch_size):
from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
from transformers import AutoTokenizer, LlamaTokenizer
model_path = get_model_path(repo_id, local_model_hub)


@ -30,7 +30,7 @@ current_dir = os.path.dirname(os.path.realpath(__file__))
def save_model_in_low_bit(repo_id,
local_model_hub,
low_bit):
from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
from transformers import AutoTokenizer, LlamaTokenizer
model_path = get_model_path(repo_id, local_model_hub)
# Load model in 4 bit,


@ -21,7 +21,7 @@ import torch
import json
from tqdm import tqdm
from bigdl.llm.utils.common.log4Error import invalidInputError
from ipex_llm.utils.common.log4Error import invalidInputError
from evaluators.qwen import QwenEvaluator
from evaluators.llama import LlamaEvaluator
from evaluators.chatglm import ChatGLMEvaluator


@ -22,7 +22,7 @@ from thefuzz import process
from transformers import AutoTokenizer
from evaluators.evaluator import Evaluator
from bigdl.llm.transformers import AutoModel
from ipex_llm.transformers import AutoModel
from transformers.generation.utils import LogitsProcessorList
from transformers.generation.logits_process import LogitsProcessor


@ -22,7 +22,7 @@ import numpy as np
import torch
from transformers import LlamaTokenizer, GenerationConfig
from bigdl.llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
from evaluators.evaluator import Evaluator


@ -22,7 +22,7 @@ from thefuzz import process
from transformers import AutoTokenizer
from transformers.generation import GenerationConfig
from bigdl.llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
from evaluators.evaluator import Evaluator


@ -14,7 +14,7 @@
# limitations under the License.
#
from bigdl.llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
import inspect
from lm_eval.models.huggingface import AutoCausalLM


@ -20,7 +20,7 @@ from torch.nn import CrossEntropyLoss
from tqdm import tqdm
import gc
from bigdl.llm.transformers import AutoModelForCausalLM, AutoModel
from ipex_llm.transformers import AutoModelForCausalLM, AutoModel
class BigDLPPL:
def __init__(self, model_path, device, **model_kwargs) -> None:


@ -21,7 +21,7 @@ from datasets import concatenate_datasets, load_dataset
from transformers import AutoTokenizer
from ppl import BigDLPPL
from bigdl.llm.ggml.quantize import ggml_tensor_qtype
from ipex_llm.ggml.quantize import ggml_tensor_qtype
import os
import json


@ -15,7 +15,7 @@
#
from datasets import load_dataset
from bigdl.llm.transformers import AutoModelForSpeechSeq2Seq
from ipex_llm.transformers import AutoModelForSpeechSeq2Seq
from transformers import WhisperProcessor
import torch
from evaluate import load


@ -69,11 +69,11 @@ conda activate autogen
cd autogen
# load the local model with cpu with your downloaded model
python -m bigdl.llm.serving.model_worker --model-path ... --device cpu
python -m ipex_llm.serving.model_worker --model-path ... --device cpu
```
Change the Model Name:
> Assume you use the model `Mistral-7B-Instruct-v0.2` and your model is downloaded to `autogen/model/Mistral-7B-Instruct-v0.2`. You should rename the model to `autogen/model/bigdl` and run `python -m bigdl.llm.serving.model_worker --model-path ... --device cpu`. This ensures the proper usage of the BigDL-adapted FastChat.
> Assume you use the model `Mistral-7B-Instruct-v0.2` and your model is downloaded to `autogen/model/Mistral-7B-Instruct-v0.2`. You should rename the model to `autogen/model/bigdl` and run `python -m ipex_llm.serving.model_worker --model-path ... --device cpu`. This ensures the proper usage of the BigDL-adapted FastChat.
Potential Error Note:
> If you get `RuntimeError: Error register to Controller` in the worker terminal, please set `export no_proxy='localhost'` to ensure the registration


@ -19,7 +19,7 @@ import argparse
from PIL import Image
from transformers import AutoTokenizer, LocalAgent
from bigdl.llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Run agent using vicuna model")


@ -3,7 +3,7 @@
In this example, we apply low-bit optimizations to [Streaming-LLM](https://github.com/mit-han-lab/streaming-llm/tree/main#efficient-streaming-language-models-with-attention-sinks) using BigDL-LLM, which can deploy low-bit(including FP4/INT4/FP8/INT8) LLMs for infinite-length inputs.
Only one code change is needed to load the model using bigdl-llm as follows:
```python
from bigdl.llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(model_name_or_path, load_in_4bit=True, trust_remote_code=True, optimize_model=False)
```


@ -49,7 +49,7 @@ import urllib.request
import os
import json
# code change to import from bigdl-llm API instead of using transformers API
from bigdl.llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import LlamaTokenizer
import intel_extension_for_pytorch as ipex


@ -39,7 +39,7 @@ Distributed model managed by deepspeed can be further optimized with BigDL low-b
```python
# Apply BigDL-LLM INT4 optimizations on transformers
from bigdl.llm import optimize_model
from ipex_llm import optimize_model
model = optimize_model(model.module.to(f'cpu'), low_bit='sym_int4')
model = model.to(f'cpu:{local_rank}') # move partial model to local rank


@ -45,7 +45,7 @@ import os
import torch
from transformers import AutoModelForCausalLM, LlamaTokenizer, AutoTokenizer
import deepspeed
from bigdl.llm import optimize_model
from ipex_llm import optimize_model
import torch
import intel_extension_for_pytorch as ipex
import time


@ -18,7 +18,7 @@ import torch
import time
import argparse
from bigdl.llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
# you could tune the prompt based on your own model,


@ -18,7 +18,7 @@ import torch
import time
import argparse
from bigdl.llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
# you could tune the prompt based on your own model,
# here the prompt tuning refers to https://huggingface.co/georgesung/llama2_7b_chat_uncensored#prompt-style


@ -18,7 +18,7 @@ import torch
import time
import argparse
from bigdl.llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import LlamaTokenizer, GPTQConfig
# you could tune the prompt based on your own model,


@ -18,7 +18,7 @@ import torch
import time
import argparse
from bigdl.llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
# you could tune the prompt based on your own model,


@ -18,7 +18,7 @@ import torch
import time
import argparse
from bigdl.llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
# you could tune the prompt based on your own model,


@ -19,7 +19,7 @@ import time
import argparse
import numpy as np
from bigdl.llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
# you could tune the prompt based on your own model,


@ -19,7 +19,7 @@ import time
import argparse
import numpy as np
from bigdl.llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
# prompt format referred from https://github.com/baichuan-inc/Baichuan2/issues/227


@ -19,7 +19,7 @@ import time
import argparse
import numpy as np
from bigdl.llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
# you could tune the prompt based on your own model,


@ -19,7 +19,7 @@ import time
import argparse
import numpy as np
from bigdl.llm.transformers import AutoModel
from ipex_llm.transformers import AutoModel
from transformers import AutoTokenizer
# you could tune the prompt based on your own model,


@ -19,7 +19,7 @@ import time
import argparse
import numpy as np
from bigdl.llm.transformers import AutoModel
from ipex_llm.transformers import AutoModel
from transformers import AutoTokenizer
# you could tune the prompt based on your own model,


@ -19,7 +19,7 @@ import time
import argparse
import numpy as np
from bigdl.llm.transformers import AutoModel
from ipex_llm.transformers import AutoModel
from transformers import AutoTokenizer


@ -19,7 +19,7 @@ import time
import argparse
import numpy as np
from bigdl.llm.transformers import AutoModel
from ipex_llm.transformers import AutoModel
from transformers import AutoTokenizer
# you could tune the prompt based on your own model,


@ -19,7 +19,7 @@ import time
import argparse
import numpy as np
from bigdl.llm.transformers import AutoModel
from ipex_llm.transformers import AutoModel
from transformers import AutoTokenizer


@ -18,7 +18,7 @@ import torch
import time
import argparse
from bigdl.llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import CodeLlamaTokenizer
# you could tune the prompt based on your own model,


@ -19,7 +19,7 @@ import time
import argparse
import numpy as np
from bigdl.llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
# you could tune the prompt based on your own model,


@ -19,7 +19,7 @@ import time
import argparse
from transformers import AutoTokenizer
from bigdl.llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
# you could tune the prompt based on your own model,
# here the prompt tuning refers to https://huggingface.co/Deci/DeciLM-7B-instruct#prompt-template


@ -36,7 +36,7 @@ if __name__ == '__main__':
# Load model in 4 bit,
# which convert the relevant layers in the model into INT4 format
from bigdl.llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, load_in_4bit=True).eval()
model.generation_config = GenerationConfig.from_pretrained(model_path)
model.generation_config.pad_token_id = model.generation_config.eos_token_id
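
# --- Editor's illustrative follow-up (not part of this commit's diff) ---
# Chat-style checkpoints loaded with trust_remote_code (Qwen is one example)
# typically ship a `chat()` helper in their remote code; the prompt below is a
# placeholder and the helper itself is an assumption about the model in use.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
response, history = model.chat(tokenizer, "What is AI?", history=None)
print(response)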


@ -18,7 +18,7 @@ import torch
import time
import argparse
from bigdl.llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
# you could tune the prompt based on your own model,


@ -17,7 +17,7 @@
import time
import argparse
from bigdl.llm.transformers import AutoModelForSpeechSeq2Seq
from ipex_llm.transformers import AutoModelForSpeechSeq2Seq
from datasets import load_dataset
from transformers import pipeline
from transformers.models.whisper import WhisperFeatureExtractor, WhisperTokenizer


@ -19,7 +19,7 @@ import time
import argparse
import numpy as np
from bigdl.llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
# you could tune the prompt based on your own model,


@ -19,7 +19,7 @@ import time
import argparse
import numpy as np
from bigdl.llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
# you could tune the prompt based on your own model,


@ -301,7 +301,7 @@ class Attention(nn.Module):
# resize qk to 4D to match apply_rotary_pos_emb_no_cache_xpu's requirements.
query_layer = query_layer.reshape(batch_size, self.num_heads, q_length, self.head_dim)
key_layer = key_layer.reshape(batch_size, self.num_kv, q_length, self.head_dim)
from bigdl.llm.transformers.models.utils import apply_rotary_pos_emb_no_cache_xpu
from ipex_llm.transformers.models.utils import apply_rotary_pos_emb_no_cache_xpu
query_layer, key_layer = apply_rotary_pos_emb_no_cache_xpu(query_layer,
key_layer,
position_ids,


@ -18,7 +18,7 @@ import torch
import time
import argparse
from bigdl.llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
# you could tune the prompt based on your own model,


@ -19,7 +19,7 @@ import time
import argparse
import numpy as np
from bigdl.llm.transformers import AutoModelForSeq2SeqLM
from ipex_llm.transformers import AutoModelForSeq2SeqLM
from transformers import AutoTokenizer
# you could tune the prompt based on your own model,


@ -19,7 +19,7 @@ import torch
import argparse
import time
from PIL import Image
from bigdl.llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Predict Tokens using `generate()` API for Fuyu model')


@ -18,7 +18,7 @@ import torch
import time
import argparse
from bigdl.llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
# The instruction-tuned models use a chat template that must be adhered to for conversational use.


@ -14,14 +14,14 @@
# limitations under the License.
#
from bigdl.llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
from transformers.generation import GenerationConfig
import torch
import time
import os
import argparse
from bigdl.llm import optimize_model
from ipex_llm import optimize_model
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Predict Tokens using `chat()` API for InternLM-XComposer model')


@ -19,7 +19,7 @@ import time
import argparse
import numpy as np
from bigdl.llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
# you could tune the prompt based on your own model,


@ -39,7 +39,7 @@ if __name__ == '__main__':
model_path = args.repo_id_or_model_path
from bigdl.llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(model_path,
load_in_4bit=True,
trust_remote_code=True)


@ -18,7 +18,7 @@ import torch
import time
import argparse
from bigdl.llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import LlamaTokenizer
# you could tune the prompt based on your own model,


@ -18,7 +18,7 @@ import torch
import time
import argparse
from bigdl.llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
# you could tune the prompt based on your own model,


@ -18,7 +18,7 @@ import torch
import time
import argparse
from bigdl.llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
# you could tune the prompt based on your own model,


@ -18,7 +18,7 @@ import torch
import time
import argparse
from bigdl.llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
# you could tune the prompt based on your own model,


@ -18,7 +18,7 @@ import torch
import time
import argparse
from bigdl.llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer, GenerationConfig
# you could tune the prompt based on your own model,


@ -19,7 +19,7 @@ import time
import argparse
import numpy as np
from bigdl.llm.transformers import AutoModel,AutoModelForCausalLM
from ipex_llm.transformers import AutoModel,AutoModelForCausalLM
from transformers import AutoTokenizer, GenerationConfig
# you could tune the prompt based on your own model,


@ -19,7 +19,7 @@ import time
import argparse
import numpy as np
from bigdl.llm.transformers import AutoModel,AutoModelForCausalLM
from ipex_llm.transformers import AutoModel,AutoModelForCausalLM
from transformers import AutoTokenizer, GenerationConfig
# you could tune the prompt based on your own model,


@ -20,7 +20,7 @@ import argparse
import numpy as np
from transformers import AutoTokenizer, GenerationConfig
from bigdl.llm import optimize_model
from ipex_llm import optimize_model
# you could tune the prompt based on your own model,
# here the prompt tuning refers to # TODO: https://huggingface.co/microsoft/phi-1_5/blob/main/modeling_mixformer_sequential.py
PHI1_5_PROMPT_FORMAT = " Question:{prompt}\n\n Answer:"
@ -41,7 +41,7 @@ if __name__ == '__main__':
# Load model in 4 bit,
# which convert the relevant layers in the model into INT4 format
from bigdl.llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(model_path,
load_in_4bit=True,
trust_remote_code=True)
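
# --- Editor's illustrative continuation (not part of this commit's diff) ---
# How the PHI1_5_PROMPT_FORMAT template above is typically applied; the prompt
# text and token budget are placeholders.
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
prompt = PHI1_5_PROMPT_FORMAT.format(prompt="What is AI?")
input_ids = tokenizer.encode(prompt, return_tensors="pt")
output = model.generate(input_ids, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))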


@ -18,7 +18,7 @@ import torch
import time
import argparse
from bigdl.llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
# you could revise it based on the Phoenix model you choose to use


@ -14,14 +14,14 @@
# limitations under the License.
#
from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
from transformers import AutoTokenizer, LlamaTokenizer
from transformers.generation import GenerationConfig
import torch
import time
import os
import argparse
from bigdl.llm import optimize_model
from ipex_llm import optimize_model
torch.manual_seed(1234)
if __name__ == '__main__':


@ -19,7 +19,7 @@ import time
import argparse
import numpy as np
from bigdl.llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
# you could tune the prompt based on your own model


@ -36,7 +36,7 @@ if __name__ == '__main__':
model_path = args.repo_id_or_model_path
from bigdl.llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(model_path,
load_in_4bit=True,
trust_remote_code=True)


@ -18,7 +18,7 @@ import torch
import time
import argparse
from bigdl.llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
# you could tune the prompt based on your own model,


@ -18,7 +18,7 @@ import torch
import time
import argparse
from bigdl.llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
# you could tune the prompt based on your own model,


@ -18,7 +18,7 @@ import torch
import time
import argparse
from bigdl.llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
# you could tune the prompt based on your own model,


@ -19,7 +19,7 @@ import time
import argparse
import numpy as np
from bigdl.llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
# you could tune the prompt based on your own model,


@ -18,7 +18,7 @@ import torch
import time
import argparse
from bigdl.llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
# you could tune the prompt based on your own model,


@ -18,7 +18,7 @@ import torch
import time
import argparse
from bigdl.llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import LlamaTokenizer
# you could tune the prompt based on your own model,


@ -19,7 +19,7 @@ import librosa
import argparse
from transformers import pipeline
from bigdl.llm.transformers import AutoModelForSpeechSeq2Seq
from ipex_llm.transformers import AutoModelForSpeechSeq2Seq
from transformers.models.whisper import WhisperFeatureExtractor, WhisperTokenizer


@ -18,7 +18,7 @@ import torch
import time
import argparse
from bigdl.llm.transformers import AutoModelForSpeechSeq2Seq
from ipex_llm.transformers import AutoModelForSpeechSeq2Seq
from transformers import WhisperProcessor
from datasets import load_dataset
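
# --- Editor's illustrative sketch (not part of this commit's diff) ---
# The usual Whisper flow implied by the imports above; the checkpoint and the
# dummy dataset sample are placeholders, not values from this diff.
model = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-tiny", load_in_4bit=True)
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")

sample = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")[0]
inputs = processor(sample["audio"]["array"],
                   sampling_rate=sample["audio"]["sampling_rate"],
                   return_tensors="pt")
predicted_ids = model.generate(inputs.input_features)
print(processor.batch_decode(predicted_ids, skip_special_tokens=True))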


@ -18,7 +18,7 @@ import torch
import time
import argparse
from bigdl.llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import LlamaTokenizer
WIZARDCODERPYTHON_PROMPT_FORMAT = """Below is an instruction that describes a task. Write a response that appropriately completes the request.


@ -18,7 +18,7 @@ import torch
import time
import argparse
from bigdl.llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
# Refer to https://huggingface.co/01-ai/Yi-6B-Chat#31-use-the-chat-model


@ -18,7 +18,7 @@ import torch, transformers
import sys, os, time
import argparse
from transformers import LlamaTokenizer
from bigdl.llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
# Refer to https://huggingface.co/IEITYuan/Yuan2-2B-hf#Usage
YUAN2_PROMPT_FORMAT = """


@ -39,7 +39,7 @@ if __name__ == '__main__':
model_path = args.repo_id_or_model_path
from bigdl.llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
# enabling `use_cache=True` allows the model to utilize the previous
# key/values attentions to speed up decoding;
# to obtain optimal performance with BigDL-LLM INT4 optimizations,


@ -15,7 +15,7 @@
#
import argparse
from bigdl.llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import LlamaTokenizer, TextGenerationPipeline
if __name__ == '__main__':
@ -38,7 +38,7 @@ if __name__ == '__main__':
model = AutoModelForCausalLM.load_low_bit(load_path)
tokenizer = LlamaTokenizer.from_pretrained(load_path)
else:
# load_in_low_bit in bigdl.llm.transformers will convert
# load_in_low_bit in ipex_llm.transformers will convert
# the relevant layers in the model into corresponding int X format
model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True)
tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)


@ -15,7 +15,7 @@
#
import argparse
from bigdl.llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import LlamaTokenizer, TextGenerationPipeline
if __name__ == '__main__':
@ -38,7 +38,7 @@ if __name__ == '__main__':
model = AutoModelForCausalLM.load_low_bit(load_path)
tokenizer = LlamaTokenizer.from_pretrained(load_path)
else:
# load_in_low_bit in bigdl.llm.transformers will convert
# load_in_low_bit in ipex_llm.transformers will convert
# the relevant layers in the model into corresponding int X format
model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True)
tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)


@ -31,8 +31,8 @@ from langchain.chains.question_answering import load_qa_chain
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from bigdl.llm.langchain.llms import *
from bigdl.llm.langchain.embeddings import *
from ipex_llm.langchain.llms import *
from ipex_llm.langchain.embeddings import *
def main(args):


@ -21,7 +21,7 @@
import argparse
from bigdl.llm.langchain.llms import *
from ipex_llm.langchain.llms import *
from langchain import PromptTemplate, LLMChain
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler


@ -23,7 +23,7 @@
from langchain import LLMChain, PromptTemplate
from bigdl.llm.langchain.llms import *
from ipex_llm.langchain.llms import *
from langchain.memory import ConversationBufferWindowMemory
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler


@ -21,7 +21,7 @@
import argparse
from bigdl.llm.langchain.llms import TransformersLLM, TransformersPipelineLLM
from ipex_llm.langchain.llms import TransformersLLM, TransformersPipelineLLM
from langchain import PromptTemplate, LLMChain
from langchain import HuggingFacePipeline


@ -25,7 +25,7 @@
import argparse
from langchain.chains import LLMMathChain
from bigdl.llm.langchain.llms import TransformersLLM, TransformersPipelineLLM
from ipex_llm.langchain.llms import TransformersLLM, TransformersPipelineLLM
def main(args):


@ -30,8 +30,8 @@ from langchain.text_splitter import CharacterTextSplitter
from langchain.chains.question_answering import load_qa_chain
from langchain.callbacks.manager import CallbackManager
from bigdl.llm.langchain.llms import TransformersLLM
from bigdl.llm.langchain.embeddings import TransformersEmbeddings
from ipex_llm.langchain.llms import TransformersLLM
from ipex_llm.langchain.embeddings import TransformersEmbeddings
text_doc = '''
BigDL seamlessly scales your data analytics & AI applications from laptop to cloud, with the following libraries:


@ -23,9 +23,9 @@
from langchain import LLMChain, PromptTemplate
from bigdl.llm.langchain.llms import TransformersLLM
from ipex_llm.langchain.llms import TransformersLLM
from langchain.memory import ConversationBufferWindowMemory
from bigdl.llm.transformers import AutoModelForSpeechSeq2Seq
from ipex_llm.transformers import AutoModelForSpeechSeq2Seq
from transformers import WhisperProcessor
import speech_recognition as sr
import numpy as np

Some files were not shown because too many files have changed in this diff.