Refactor bigdl.llm to ipex_llm (#24)

* Rename bigdl/llm to ipex_llm
* rm python/llm/src/bigdl
* from bigdl.llm to from ipex_llm

parent cc5806f4bc
commit 9df70d95eb

464 changed files with 918 additions and 940 deletions
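The change is mechanical: every `bigdl.llm` import or module path becomes `ipex_llm`, while class names and arguments stay the same. A minimal before/after sketch of the pattern repeated throughout the hunks below (the model path is a placeholder, not part of this commit):

```python
# Before this commit, the optimized model classes lived under bigdl.llm:
#   from bigdl.llm.transformers import AutoModelForCausalLM
# After this commit, the same classes live under ipex_llm:
from ipex_llm.transformers import AutoModelForCausalLM

# Usage is unchanged; '/path/to/model/' is a placeholder checkpoint path.
model = AutoModelForCausalLM.from_pretrained('/path/to/model/', load_in_4bit=True)
```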
@@ -86,7 +86,7 @@ You may apply INT4 optimizations to any Hugging Face *Transformers* models as fo
 ```python
 #load Hugging Face Transformers model with INT4 optimizations
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 model = AutoModelForCausalLM.from_pretrained('/path/to/model/', load_in_4bit=True)

 #run the optimized model on CPU

@@ -113,7 +113,7 @@ You may apply INT4 optimizations to any Hugging Face *Transformers* models as fo
 ```python
 #load Hugging Face Transformers model with INT4 optimizations
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 model = AutoModelForCausalLM.from_pretrained('/path/to/model/', load_in_4bit=True)

 #run the optimized model on Intel GPU

@@ -223,7 +223,7 @@ This controller manages the distributed workers.
 ##### Launch the model worker(s)
 ```bash
-python3 -m bigdl.llm.serving.model_worker --model-path lmsys/vicuna-7b-v1.3 --device cpu
+python3 -m ipex_llm.serving.model_worker --model-path lmsys/vicuna-7b-v1.3 --device cpu
 ```
 Wait until the process finishes loading the model and you see "Uvicorn running on ...". The model worker will register itself to the controller.

@@ -252,7 +252,7 @@ python3 -m fastchat.serve.controller
 Then, launch the model worker(s):

 ```bash
-python3 -m bigdl.llm.serving.model_worker --model-path lmsys/vicuna-7b-v1.3 --device cpu
+python3 -m ipex_llm.serving.model_worker --model-path lmsys/vicuna-7b-v1.3 --device cpu
 ```

 Finally, launch the RESTful API server

@@ -319,7 +319,7 @@ This controller manages the distributed workers.
 ##### Launch the model worker(s)
 ```bash
-python3 -m bigdl.llm.serving.model_worker --model-path lmsys/vicuna-7b-v1.3 --device xpu
+python3 -m ipex_llm.serving.model_worker --model-path lmsys/vicuna-7b-v1.3 --device xpu
 ```
 Wait until the process finishes loading the model and you see "Uvicorn running on ...". The model worker will register itself to the controller.

@@ -346,7 +346,7 @@ python3 -m fastchat.serve.controller
 Then, launch the model worker(s):

 ```bash
-python3 -m bigdl.llm.serving.model_worker --model-path lmsys/vicuna-7b-v1.3 --device xpu
+python3 -m ipex_llm.serving.model_worker --model-path lmsys/vicuna-7b-v1.3 --device xpu
 ```

 Finally, launch the RESTful API server
@@ -23,7 +23,7 @@ from transformers import TextIteratorStreamer
 from transformers.tools.agents import StopSequenceCriteria
 from transformers.generation.stopping_criteria import StoppingCriteriaList
 from colorama import Fore
-from bigdl.llm import optimize_model
+from ipex_llm import optimize_model
 SYSTEM_PROMPT = "A chat between a curious human <human> and an artificial intelligence assistant <bot>.\
 The assistant gives helpful, detailed, and polite answers to the human's questions."
 HUMAN_ID = "<human>"

@@ -135,9 +135,9 @@ else
 done

 if [ "$worker_type" == "model_worker" ]; then
-worker_type="bigdl.llm.serving.model_worker"
+worker_type="ipex_llm.serving.model_worker"
 elif [ "$worker_type" == "vllm_worker" ]; then
-worker_type="bigdl.llm.serving.vllm_worker"
+worker_type="ipex_llm.serving.vllm_worker"
 fi

 if [[ -n $CONTROLLER_HOST ]]; then

@@ -220,9 +220,9 @@ else
 echo "Worker type: $worker_type"
 echo "Worker address: $worker_address"
 echo "Controller address: $controller_address"
-if [ "$worker_type" == "bigdl.llm.serving.model_worker" ]; then
+if [ "$worker_type" == "ipex_llm.serving.model_worker" ]; then
 python3 -m "$worker_type" --model-path $model_path --device cpu --host $worker_host --port $worker_port --worker-address $worker_address --controller-address $controller_address --stream-interval $stream_interval
-elif [ "$worker_type" == "bigdl.llm.serving.vllm_worker" ]; then
+elif [ "$worker_type" == "ipex_llm.serving.vllm_worker" ]; then
 python3 -m "$worker_type" --model-path $model_path --device cpu --host $worker_host --port $worker_port --worker-address $worker_address --controller-address $controller_address
 fi
 fi
@@ -9,7 +9,7 @@
 generation_config = GenerationConfig.from_pretrained(
 model_path, trust_remote_code=True
 )
-+ from bigdl.llm.transformers import AutoModelForCausalLM
++ from ipex_llm.transformers import AutoModelForCausalLM
 model = AutoModelForCausalLM.from_pretrained(
 model_path,
 config=config,
@@ -66,9 +66,9 @@ else
 done

 if [ "$worker_type" == "model_worker" ]; then
-worker_type="bigdl.llm.serving.model_worker"
+worker_type="ipex_llm.serving.model_worker"
 elif [ "$worker_type" == "vllm_worker" ]; then
-worker_type="bigdl.llm.serving.vllm_worker"
+worker_type="ipex_llm.serving.vllm_worker"
 fi

 if [[ -n $CONTROLLER_HOST ]]; then

@@ -127,9 +127,9 @@ else
 echo "Worker address: $worker_address"
 echo "Controller address: $controller_address"

-if [ "$worker_type" == "bigdl.llm.serving.model_worker" ]; then
+if [ "$worker_type" == "ipex_llm.serving.model_worker" ]; then
 python3 -m "$worker_type" --model-path $model_path --device xpu --host $worker_host --port $worker_port --worker-address $worker_address --controller-address $controller_address --stream-interval $stream_interval
-elif [ "$worker_type" == "bigdl.llm.serving.vllm_worker" ]; then
+elif [ "$worker_type" == "ipex_llm.serving.vllm_worker" ]; then
 python3 -m "$worker_type" --model-path $model_path --device xpu --host $worker_host --port $worker_port --worker-address $worker_address --controller-address $controller_address
 fi
 fi
@@ -21,7 +21,7 @@ To help you better understand the finetuning process, here we use model [Llama-2
 First, load model using `transformers`-style API and **set it to `to('xpu')`**. We specify `load_in_low_bit="nf4"` here to apply 4-bit NormalFloat optimization. According to the [QLoRA paper](https://arxiv.org/pdf/2305.14314.pdf), using `"nf4"` could yield better model quality than `"int4"`.

 ```python
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM

 model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf",
                                              load_in_low_bit="nf4",

@@ -33,14 +33,14 @@ model = model.to('xpu')

 Then, we have to apply some preprocessing to the model to prepare it for training.
 ```python
-from bigdl.llm.transformers.qlora import prepare_model_for_kbit_training
+from ipex_llm.transformers.qlora import prepare_model_for_kbit_training
 model.gradient_checkpointing_enable()
 model = prepare_model_for_kbit_training(model)
 ```

 Next, we can obtain a Peft model from the optimized model and a configuration object containing the parameters as follows:
 ```python
-from bigdl.llm.transformers.qlora import get_peft_model
+from ipex_llm.transformers.qlora import get_peft_model
 from peft import LoraConfig
 config = LoraConfig(r=8,
                     lora_alpha=32,

@@ -54,7 +54,7 @@ model = get_peft_model(model, config)
 ```eval_rst
 .. important::

-   Instead of ``from peft import prepare_model_for_kbit_training, get_peft_model`` as we did for regular QLoRA using bitandbytes and cuda, we import them from ``bigdl.llm.transformers.qlora`` here to get a BigDL-LLM compatible Peft model. And the rest is just the same as regular LoRA finetuning process using ``peft``.
+   Instead of ``from peft import prepare_model_for_kbit_training, get_peft_model`` as we did for regular QLoRA using bitandbytes and cuda, we import them from ``ipex_llm.transformers.qlora`` here to get a BigDL-LLM compatible Peft model. And the rest is just the same as regular LoRA finetuning process using ``peft``.
 ```

 ```eval_rst
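Pieced together, the QLoRA preparation flow documented in the hunks above reads roughly as follows under the new namespace. This is a minimal sketch: the model name, `load_in_low_bit="nf4"`, `r=8` and `lora_alpha=32` come from the documentation shown above, while the remaining `LoraConfig` fields (`target_modules`, `lora_dropout`, `bias`, `task_type`) are illustrative assumptions, not part of this diff.

```python
# Minimal sketch of the QLoRA preparation flow under the ipex_llm namespace.
from ipex_llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers.qlora import prepare_model_for_kbit_training, get_peft_model
from peft import LoraConfig

# Load the base model with 4-bit NormalFloat weights and move it to an Intel GPU.
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf",
                                             load_in_low_bit="nf4")
model = model.to('xpu')

# Prepare the low-bit model for training.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

# Wrap it as a Peft model with the LoRA settings from the guide above.
config = LoraConfig(r=8,
                    lora_alpha=32,
                    target_modules=["q_proj", "k_proj", "v_proj"],  # assumed, not from the diff
                    lora_dropout=0.05,                              # assumed, not from the diff
                    bias="none",
                    task_type="CAUSAL_LM")
model = get_peft_model(model, config)
```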
@@ -5,7 +5,7 @@ You may apply INT4 optimizations to any Hugging Face *Transformers* models as fo

 ```python
 # load Hugging Face Transformers model with INT4 optimizations
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM

 model = AutoModelForCausalLM.from_pretrained('/path/to/model/', load_in_4bit=True)
 ```
@@ -29,7 +29,7 @@ You could choose to use [PyTorch API](./optimize_model.html) or [`transformers`-

 # Take Llama-2-7b-chat-hf as an example
 from transformers import LlamaForCausalLM
-from bigdl.llm import optimize_model
+from ipex_llm import optimize_model

 model = LlamaForCausalLM.from_pretrained('meta-llama/Llama-2-7b-chat-hf', torch_dtype='auto', low_cpu_mem_usage=True)
 model = optimize_model(model) # With only one line to enable BigDL-LLM INT4 optimization

@@ -40,14 +40,14 @@ You could choose to use [PyTorch API](./optimize_model.html) or [`transformers`-

 When running LLMs on Intel iGPUs for Windows users, we recommend setting ``cpu_embedding=True`` in the ``optimize_model`` function. This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU.

-See the `API doc <../../../PythonAPI/LLM/optimize.html#bigdl.llm.optimize_model>`_ for ``optimize_model`` to find more information.
+See the `API doc <../../../PythonAPI/LLM/optimize.html#ipex_llm.optimize_model>`_ for ``optimize_model`` to find more information.

 Especially, if you have saved the optimized model following setps `here <./optimize_model.html#save>`_, the loading process on Intel GPUs maybe as follows:

 .. code-block:: python

    from transformers import LlamaForCausalLM
-   from bigdl.llm.optimize import low_memory_init, load_low_bit
+   from ipex_llm.optimize import low_memory_init, load_low_bit

    saved_dir='./llama-2-bigdl-llm-4-bit'
    with low_memory_init(): # Fast and low cost by loading model on meta device

@@ -65,7 +65,7 @@ You could choose to use [PyTorch API](./optimize_model.html) or [`transformers`-
 .. code-block:: python

    # Take Llama-2-7b-chat-hf as an example
-   from bigdl.llm.transformers import AutoModelForCausalLM
+   from ipex_llm.transformers import AutoModelForCausalLM

    # Load model in 4 bit, which convert the relevant layers in the model into INT4 format
    model = AutoModelForCausalLM.from_pretrained('meta-llama/Llama-2-7b-chat-hf', load_in_4bit=True)

@@ -82,7 +82,7 @@ You could choose to use [PyTorch API](./optimize_model.html) or [`transformers`-

 .. code-block:: python

-   from bigdl.llm.transformers import AutoModelForCausalLM
+   from ipex_llm.transformers import AutoModelForCausalLM

    saved_dir='./llama-2-bigdl-llm-4-bit'
    model = AutoModelForCausalLM.load_low_bit(saved_dir) # Load the optimized model
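For reference, the GPU-side loading path touched above, stitched into one short sketch under the new namespace: the saved directory name is the one used in the documentation, and running on `'xpu'` assumes an Intel GPU setup with `intel_extension_for_pytorch` installed, as the surrounding guide does.

```python
# Minimal sketch: load a previously saved low-bit model and run it on an Intel GPU.
# Assumes the model was saved beforehand with save_low_bit().
import intel_extension_for_pytorch as ipex  # noqa: F401  (enables the 'xpu' device)
from ipex_llm.transformers import AutoModelForCausalLM

saved_dir = './llama-2-bigdl-llm-4-bit'
model = AutoModelForCausalLM.load_low_bit(saved_dir)  # load the optimized low-bit model
model = model.to('xpu')                               # move it to the Intel GPU
```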
@@ -7,8 +7,8 @@ You may run the models using the LangChain API in `bigdl-llm`.
 You may run any Hugging Face *Transformers* model (with INT4 optimiztions applied) using the LangChain API as follows:

 ```python
-from bigdl.llm.langchain.llms import TransformersLLM
-from bigdl.llm.langchain.embeddings import TransformersEmbeddings
+from ipex_llm.langchain.llms import TransformersLLM
+from ipex_llm.langchain.embeddings import TransformersEmbeddings
 from langchain.chains.question_answering import load_qa_chain

 embeddings = TransformersEmbeddings.from_model_id(model_id=model_path)

@@ -37,8 +37,8 @@ You may also convert Hugging Face *Transformers* models into native INT4 format,
 ```

 ```python
-from bigdl.llm.langchain.llms import LlamaLLM
-from bigdl.llm.langchain.embeddings import LlamaEmbeddings
+from ipex_llm.langchain.llms import LlamaLLM
+from ipex_llm.langchain.embeddings import LlamaEmbeddings
 from langchain.chains.question_answering import load_qa_chain

 # switch to ChatGLMEmbeddings/GptneoxEmbeddings/BloomEmbeddings/StarcoderEmbeddings to load other models
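To show the renamed LangChain wrappers in context, here is a hedged sketch of how they are typically wired together. `TransformersEmbeddings.from_model_id` appears in the hunk above; the `TransformersLLM.from_model_id` constructor and the `load_qa_chain` setup are assumptions about the surrounding API rather than part of this diff, and `model_path` is a placeholder.

```python
# Minimal sketch of using the renamed LangChain wrappers; not part of this commit.
from ipex_llm.langchain.llms import TransformersLLM
from ipex_llm.langchain.embeddings import TransformersEmbeddings
from langchain.chains.question_answering import load_qa_chain

model_path = '/path/to/model/'  # placeholder local checkpoint

# Build low-bit embeddings and an LLM wrapper from the same checkpoint.
embeddings = TransformersEmbeddings.from_model_id(model_id=model_path)
llm = TransformersLLM.from_model_id(model_id=model_path)  # assumed constructor

# Assemble a simple question-answering chain on top of the low-bit LLM.
chain = load_qa_chain(llm, chain_type="stuff")
```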
@@ -10,13 +10,13 @@ You may also convert Hugging Face *Transformers* models into native INT4 format

 ```python
 # convert the model
-from bigdl.llm import llm_convert
+from ipex_llm import llm_convert
 bigdl_llm_path = llm_convert(model='/path/to/model/',
         outfile='/path/to/output/', outtype='int4', model_family="llama")

 # load the converted model
 # switch to ChatGLMForCausalLM/GptneoxForCausalLM/BloomForCausalLM/StarcoderForCausalLM to load other models
-from bigdl.llm.transformers import LlamaForCausalLM
+from ipex_llm.transformers import LlamaForCausalLM
 llm = LlamaForCausalLM.from_pretrained("/path/to/output/model.bin", native=True, ...)

 # run the converted model
@@ -14,7 +14,7 @@ model = LlamaForCausalLM.from_pretrained('meta-llama/Llama-2-7b-chat-hf', torch_

 Then, just need to call `optimize_model` to optimize the loaded model and INT4 optimization is applied on model by default:
 ```python
-from bigdl.llm import optimize_model
+from ipex_llm import optimize_model

 # With only one line to enable BigDL-LLM INT4 optimization
 model = optimize_model(model)

@@ -31,7 +31,7 @@ Currently, ``low_bit`` supports options 'sym_int4', 'asym_int4', 'sym_int5', 'as
 You may apply symmetric INT8 optimization as follows:

 ```python
-from bigdl.llm import optimize_model
+from ipex_llm import optimize_model

 # Apply symmetric INT8 optimization
 model = optimize_model(model, low_bit="sym_int8")

@@ -51,7 +51,7 @@ model.save_low_bit(saved_dir)

 We recommend to use the context manager `low_memory_init` to quickly initiate a model instance with low cost, and then use `load_low_bit` to load the optimized low-bit model as follows:
 ```python
-from bigdl.llm.optimize import low_memory_init, load_low_bit
+from ipex_llm.optimize import low_memory_init, load_low_bit
 with low_memory_init(): # Fast and low cost by loading model on meta device
     model = LlamaForCausalLM.from_pretrained(saved_dir,
                                              torch_dtype="auto",
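As a companion to the save/load hunks above, a minimal sketch of the full optimize–save–reload cycle under the new namespace. Everything here mirrors the snippets shown in this diff (`optimize_model`, `save_low_bit`, `low_memory_init`, `load_low_bit`, the `./llama-2-bigdl-llm-4-bit` directory); no new API is introduced, and the exact `load_low_bit` call shape is assumed from the surrounding documentation.

```python
# Minimal sketch: optimize a model to low bit, save it, then re-load it cheaply.
from transformers import LlamaForCausalLM
from ipex_llm import optimize_model
from ipex_llm.optimize import low_memory_init, load_low_bit

# Optimize the FP model (INT4 by default) and save the low-bit weights.
model = LlamaForCausalLM.from_pretrained('meta-llama/Llama-2-7b-chat-hf',
                                         torch_dtype='auto', low_cpu_mem_usage=True)
model = optimize_model(model)
saved_dir = './llama-2-bigdl-llm-4-bit'
model.save_low_bit(saved_dir)

# Later: instantiate on the meta device at low cost, then load the saved low-bit weights.
with low_memory_init():
    model = LlamaForCausalLM.from_pretrained(saved_dir,
                                             torch_dtype="auto")
model = load_low_bit(model, saved_dir)  # assumed call shape, per the guide above
```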
@@ -11,7 +11,7 @@ Here, let's take a relatively small LLM model, i.e [open_llama_3b_v2](https://hu
 Simply use one-line `transformers`-style API in `bigdl-llm` to load `open_llama_3b_v2` with INT4 optimization (by specifying `load_in_4bit=True`) as follows:

 ```python
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM

 model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path="openlm-research/open_llama_3b_v2",
                                              load_in_4bit=True)
@@ -112,7 +112,7 @@ Install the Miniconda as follows if you don't have conda installed on your machi

 python

-> from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
+> from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
 ```

 > <img src="https://llm-assets.readthedocs.io/en/latest/_images/verify_bigdl_import.png" alt="image-20240221102252562" width=100%; />

@@ -170,7 +170,7 @@ Now let's play with a real LLM. We'll be using the [phi-1.5](https://huggingface
 ```python
 # Copy/Paste the contents to a new file demo.py
 import torch
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import AutoTokenizer, GenerationConfig
 generation_config = GenerationConfig(use_cache = True)

@@ -130,7 +130,7 @@ You can verify if `bigdl-llm` is successfully installed by simply running a few
 * Step 5: Copy following code to Anaconda prompt **line by line** and press Enter **after copying each line**.
 ```python
 import torch
-from bigdl.llm.transformers import AutoModel,AutoModelForCausalLM
+from ipex_llm.transformers import AutoModel,AutoModelForCausalLM
 tensor_1 = torch.randn(1, 1, 40, 128).to('xpu')
 tensor_2 = torch.randn(1, 1, 128, 40).to('xpu')
 print(torch.matmul(tensor_1, tensor_2).size())

@@ -200,7 +200,7 @@ Now let's play with a real LLM. We'll be using the [Qwen-1.8B-Chat](https://hugg

 # Copy/Paste the contents to a new file demo.py
 import torch
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import AutoTokenizer, GenerationConfig
 generation_config = GenerationConfig(use_cache=True)

@@ -260,7 +260,7 @@ Now let's play with a real LLM. We'll be using the [Qwen-1.8B-Chat](https://hugg

 # Copy/Paste the contents to a new file demo.py
 import torch
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import GenerationConfig
 from modelscope import AutoTokenizer
 generation_config = GenerationConfig(use_cache=True)
@ -13,7 +13,7 @@ BigDL-LLM provides ``TransformersLLM`` and ``TransformersPipelineLLM``, which im
|
|||
|
||||
.. tab:: AutoModel
|
||||
|
||||
.. automodule:: bigdl.llm.langchain.llms.transformersllm
|
||||
.. automodule:: ipex_llm.langchain.llms.transformersllm
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
|
@ -21,7 +21,7 @@ BigDL-LLM provides ``TransformersLLM`` and ``TransformersPipelineLLM``, which im
|
|||
|
||||
.. tab:: pipeline
|
||||
|
||||
.. automodule:: bigdl.llm.langchain.llms.transformerspipelinellm
|
||||
.. automodule:: ipex_llm.langchain.llms.transformerspipelinellm
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
|
@ -37,7 +37,7 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo
|
|||
|
||||
.. tab:: Llama
|
||||
|
||||
.. autoclass:: bigdl.llm.langchain.llms.bigdlllm.LlamaLLM
|
||||
.. autoclass:: ipex_llm.langchain.llms.bigdlllm.LlamaLLM
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
|
@ -49,7 +49,7 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo
|
|||
|
||||
.. tab:: ChatGLM
|
||||
|
||||
.. autoclass:: bigdl.llm.langchain.llms.bigdlllm.ChatGLMLLM
|
||||
.. autoclass:: ipex_llm.langchain.llms.bigdlllm.ChatGLMLLM
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
|
@ -61,7 +61,7 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo
|
|||
|
||||
.. tab:: Bloom
|
||||
|
||||
.. autoclass:: bigdl.llm.langchain.llms.bigdlllm.BloomLLM
|
||||
.. autoclass:: ipex_llm.langchain.llms.bigdlllm.BloomLLM
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
|
@ -73,7 +73,7 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo
|
|||
|
||||
.. tab:: Gptneox
|
||||
|
||||
.. autoclass:: bigdl.llm.langchain.llms.bigdlllm.GptneoxLLM
|
||||
.. autoclass:: ipex_llm.langchain.llms.bigdlllm.GptneoxLLM
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
|
@ -85,7 +85,7 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo
|
|||
|
||||
.. tab:: Starcoder
|
||||
|
||||
.. autoclass:: bigdl.llm.langchain.llms.bigdlllm.StarcoderLLM
|
||||
.. autoclass:: ipex_llm.langchain.llms.bigdlllm.StarcoderLLM
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
|
@ -102,7 +102,7 @@ Embeddings Wrapper of LangChain
|
|||
Hugging Face ``transformers`` AutoModel
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. automodule:: bigdl.llm.langchain.embeddings.transformersembeddings
|
||||
.. automodule:: ipex_llm.langchain.embeddings.transformersembeddings
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
|
@ -117,7 +117,7 @@ For ``llama``/``bloom``/``gptneox``/``starcoder`` model families, you could also
|
|||
|
||||
.. tab:: Llama
|
||||
|
||||
.. autoclass:: bigdl.llm.langchain.embeddings.bigdlllm.LlamaEmbeddings
|
||||
.. autoclass:: ipex_llm.langchain.embeddings.bigdlllm.LlamaEmbeddings
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
|
@ -129,7 +129,7 @@ For ``llama``/``bloom``/``gptneox``/``starcoder`` model families, you could also
|
|||
|
||||
.. tab:: Bloom
|
||||
|
||||
.. autoclass:: bigdl.llm.langchain.embeddings.bigdlllm.BloomEmbeddings
|
||||
.. autoclass:: ipex_llm.langchain.embeddings.bigdlllm.BloomEmbeddings
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
|
@ -141,7 +141,7 @@ For ``llama``/``bloom``/``gptneox``/``starcoder`` model families, you could also
|
|||
|
||||
.. tab:: Gptneox
|
||||
|
||||
.. autoclass:: bigdl.llm.langchain.embeddings.bigdlllm.GptneoxEmbeddings
|
||||
.. autoclass:: ipex_llm.langchain.embeddings.bigdlllm.GptneoxEmbeddings
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
|
@ -153,7 +153,7 @@ For ``llama``/``bloom``/``gptneox``/``starcoder`` model families, you could also
|
|||
|
||||
.. tab:: Starcoder
|
||||
|
||||
.. autoclass:: bigdl.llm.langchain.embeddings.bigdlllm.StarcoderEmbeddings
|
||||
.. autoclass:: ipex_llm.langchain.embeddings.bigdlllm.StarcoderEmbeddings
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
|
|
|||
|
|
@ -6,7 +6,7 @@ Optimize Model
|
|||
|
||||
You can run any PyTorch model with ``optimize_model`` through only one-line code change to benefit from BigDL-LLM optimization, regardless of the library or API you are using.
|
||||
|
||||
.. automodule:: bigdl.llm
|
||||
.. automodule:: ipex_llm
|
||||
:members: optimize_model
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
|
@ -18,7 +18,7 @@ Load Optimized Model
|
|||
|
||||
To avoid high resource consumption during the loading processes of the original model, we provide save/load API to support the saving of model after low-bit optimization and the loading of the saved low-bit model. Saving and loading operations are platform-independent, regardless of their operating systems.
|
||||
|
||||
.. automodule:: bigdl.llm.optimize
|
||||
.. automodule:: ipex_llm.optimize
|
||||
:members: load_low_bit
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
|
|
|||
|
|
@ -10,7 +10,7 @@ You can apply BigDL-LLM optimizations on any Hugging Face Transformers models by
|
|||
AutoModelForCausalLM
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: bigdl.llm.transformers.AutoModelForCausalLM
|
||||
.. autoclass:: ipex_llm.transformers.AutoModelForCausalLM
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
|
@ -22,7 +22,7 @@ AutoModelForCausalLM
|
|||
AutoModel
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: bigdl.llm.transformers.AutoModel
|
||||
.. autoclass:: ipex_llm.transformers.AutoModel
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
|
@ -34,7 +34,7 @@ AutoModel
|
|||
AutoModelForSpeechSeq2Seq
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: bigdl.llm.transformers.AutoModelForSpeechSeq2Seq
|
||||
.. autoclass:: ipex_llm.transformers.AutoModelForSpeechSeq2Seq
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
|
@ -46,7 +46,7 @@ AutoModelForSpeechSeq2Seq
|
|||
AutoModelForSeq2SeqLM
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: bigdl.llm.transformers.AutoModelForSeq2SeqLM
|
||||
.. autoclass:: ipex_llm.transformers.AutoModelForSeq2SeqLM
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
|
@ -67,7 +67,7 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo
|
|||
|
||||
.. tab:: Llama
|
||||
|
||||
.. autoclass:: bigdl.llm.transformers.LlamaForCausalLM
|
||||
.. autoclass:: ipex_llm.transformers.LlamaForCausalLM
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
|
@ -77,7 +77,7 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo
|
|||
|
||||
.. tab:: ChatGLM
|
||||
|
||||
.. autoclass:: bigdl.llm.transformers.ChatGLMForCausalLM
|
||||
.. autoclass:: ipex_llm.transformers.ChatGLMForCausalLM
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
|
@ -87,7 +87,7 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo
|
|||
|
||||
.. tab:: Gptneox
|
||||
|
||||
.. autoclass:: bigdl.llm.transformers.GptneoxForCausalLM
|
||||
.. autoclass:: ipex_llm.transformers.GptneoxForCausalLM
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
|
@ -96,7 +96,7 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo
|
|||
.. automethod:: from_pretrained
|
||||
|
||||
.. tab:: Bloom
|
||||
.. autoclass:: bigdl.llm.transformers.BloomForCausalLM
|
||||
.. autoclass:: ipex_llm.transformers.BloomForCausalLM
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
|
@ -106,7 +106,7 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo
|
|||
|
||||
.. tab:: Starcoder
|
||||
|
||||
.. autoclass:: bigdl.llm.transformers.StarcoderForCausalLM
|
||||
.. autoclass:: ipex_llm.transformers.StarcoderForCausalLM
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
|
|
|||
|
|
@ -112,7 +112,7 @@ You can then apply INT4 optimizations to any Hugging Face *Transformers* models
|
|||
.. code-block:: python
|
||||
|
||||
#load Hugging Face Transformers model with INT4 optimizations
|
||||
from bigdl.llm.transformers import AutoModelForCausalLM
|
||||
from ipex_llm.transformers import AutoModelForCausalLM
|
||||
model = AutoModelForCausalLM.from_pretrained('/path/to/model/', load_in_4bit=True)
|
||||
|
||||
#run the optimized model on Intel CPU
|
||||
|
|
@ -146,7 +146,7 @@ You can then apply INT4 optimizations to any Hugging Face *Transformers* models
|
|||
.. code-block:: python
|
||||
|
||||
#load Hugging Face Transformers model with INT4 optimizations
|
||||
from bigdl.llm.transformers import AutoModelForCausalLM
|
||||
from ipex_llm.transformers import AutoModelForCausalLM
|
||||
model = AutoModelForCausalLM.from_pretrained('/path/to/model/', load_in_4bit=True)
|
||||
|
||||
#run the optimized model on Intel GPU
|
||||
|
|
|
|||
|
|
@ -146,7 +146,7 @@ You may apply INT4 optimizations to any Hugging Face *Transformers* model on Int
|
|||
|
||||
```python
|
||||
#load Hugging Face Transformers model with INT4 optimizations
|
||||
from bigdl.llm.transformers import AutoModelForCausalLM
|
||||
from ipex_llm.transformers import AutoModelForCausalLM
|
||||
model = AutoModelForCausalLM.from_pretrained('/path/to/model/', load_in_4bit=True)
|
||||
|
||||
#run the optimized model on Intel CPU
|
||||
|
|
@ -164,7 +164,7 @@ You may apply INT4 optimizations to any Hugging Face *Transformers* model on Int
|
|||
|
||||
```python
|
||||
#load Hugging Face Transformers model with INT4 optimizations
|
||||
from bigdl.llm.transformers import AutoModelForCausalLM
|
||||
from ipex_llm.transformers import AutoModelForCausalLM
|
||||
import intel_extension_for_pytorch
|
||||
model = AutoModelForCausalLM.from_pretrained('/path/to/model/', load_in_4bit=True)
|
||||
|
||||
|
|
@ -206,13 +206,13 @@ You may also convert Hugging Face *Transformers* models into native INT4 model f
|
|||
|
||||
```python
|
||||
#convert the model
|
||||
from bigdl.llm import llm_convert
|
||||
from ipex_llm import llm_convert
|
||||
bigdl_llm_path = llm_convert(model='/path/to/model/',
|
||||
outfile='/path/to/output/', outtype='int4', model_family="llama")
|
||||
|
||||
#load the converted model
|
||||
#switch to ChatGLMForCausalLM/GptneoxForCausalLM/BloomForCausalLM/StarcoderForCausalLM to load other models
|
||||
from bigdl.llm.transformers import LlamaForCausalLM
|
||||
from ipex_llm.transformers import LlamaForCausalLM
|
||||
llm = LlamaForCausalLM.from_pretrained("/path/to/output/model.bin", native=True, ...)
|
||||
|
||||
#run the converted model
|
||||
|
|
@ -231,8 +231,8 @@ You may run the models using the LangChain API in `bigdl-llm`.
|
|||
You may run any Hugging Face *Transformers* model (with INT4 optimiztions applied) using the LangChain API as follows:
|
||||
|
||||
```python
|
||||
from bigdl.llm.langchain.llms import TransformersLLM
|
||||
from bigdl.llm.langchain.embeddings import TransformersEmbeddings
|
||||
from ipex_llm.langchain.llms import TransformersLLM
|
||||
from ipex_llm.langchain.embeddings import TransformersEmbeddings
|
||||
from langchain.chains.question_answering import load_qa_chain
|
||||
|
||||
embeddings = TransformersEmbeddings.from_model_id(model_id=model_path)
|
||||
|
|
@ -250,8 +250,8 @@ You may run the models using the LangChain API in `bigdl-llm`.
|
|||
>**Notes**:* Currently only llama/bloom/gptneox/starcoder/chatglm model families are supported; for other models, you may use the Hugging Face `transformers` model format as described above).
|
||||
|
||||
```python
|
||||
from bigdl.llm.langchain.llms import LlamaLLM
|
||||
from bigdl.llm.langchain.embeddings import LlamaEmbeddings
|
||||
from ipex_llm.langchain.llms import LlamaLLM
|
||||
from ipex_llm.langchain.embeddings import LlamaEmbeddings
|
||||
from langchain.chains.question_answering import load_qa_chain
|
||||
|
||||
#switch to ChatGLMEmbeddings/GptneoxEmbeddings/BloomEmbeddings/StarcoderEmbeddings to load other models
|
||||
|
|
|
|||
|
|
@ -7,7 +7,7 @@ Just put this file into your benchmark directory, and then wrap your transformer
|
|||
Take `chatglm-6b` as an example:
|
||||
```python
|
||||
import torch
|
||||
from bigdl.llm.transformers import AutoModel
|
||||
from ipex_llm.transformers import AutoModel
|
||||
from transformers import AutoTokenizer
|
||||
from benchmark_util import BenchmarkWrapper
|
||||
|
||||
|
|
@ -35,7 +35,7 @@ Take `chatglm-6b` as an example:
|
|||
```python
|
||||
import torch
|
||||
import intel_extension_for_pytorch as ipex
|
||||
from bigdl.llm.transformers import AutoModel
|
||||
from ipex_llm.transformers import AutoModel
|
||||
from transformers import AutoTokenizer
|
||||
from benchmark_util import BenchmarkWrapper
|
||||
|
||||
|
|
|
|||
|
|
@ -31,7 +31,7 @@ benchmark_util_path = os.path.join(current_dir, '..')
|
|||
import sys
|
||||
sys.path.append(benchmark_util_path)
|
||||
from benchmark_util import BenchmarkWrapper
|
||||
from bigdl.llm.utils.common.log4Error import invalidInputError
|
||||
from ipex_llm.utils.common.log4Error import invalidInputError
|
||||
|
||||
LLAMA_IDS = ['meta-llama/Llama-2-7b-chat-hf','meta-llama/Llama-2-13b-chat-hf',
|
||||
'meta-llama/Llama-2-70b-chat-hf','decapoda-research/llama-7b-hf',
|
||||
|
|
@ -85,7 +85,7 @@ def run_transformer_int4(repo_id,
|
|||
num_trials,
|
||||
num_beams,
|
||||
low_bit):
|
||||
from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
|
||||
from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
|
||||
from transformers import AutoTokenizer, LlamaTokenizer
|
||||
|
||||
model_path = get_model_path(repo_id, local_model_hub)
|
||||
|
|
@ -149,7 +149,7 @@ def run_transformer_int4_gpu(repo_id,
|
|||
num_trials,
|
||||
num_beams,
|
||||
low_bit):
|
||||
from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
|
||||
from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
|
||||
from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
|
||||
import intel_extension_for_pytorch as ipex
|
||||
reserved_mem_list = []
|
||||
|
|
|
|||
|
|
@ -32,7 +32,7 @@ benchmark_util_path = os.path.join(current_dir, '..')
|
|||
import sys
|
||||
sys.path.append(benchmark_util_path)
|
||||
from benchmark_util import BenchmarkWrapper
|
||||
from bigdl.llm.utils.common.log4Error import invalidInputError
|
||||
from ipex_llm.utils.common.log4Error import invalidInputError
|
||||
|
||||
LLAMA_IDS = ['meta-llama/Llama-2-7b-chat-hf','meta-llama/Llama-2-13b-chat-hf',
|
||||
'meta-llama/Llama-2-70b-chat-hf','decapoda-research/llama-7b-hf',
|
||||
|
|
@ -143,8 +143,8 @@ def run_native_int4(repo_id,
|
|||
warm_up,
|
||||
num_trials):
|
||||
model_path = get_model_path(repo_id, local_model_hub)
|
||||
from bigdl.llm.transformers import BigdlNativeForCausalLM
|
||||
from bigdl.llm import llm_convert
|
||||
from ipex_llm.transformers import BigdlNativeForCausalLM
|
||||
from ipex_llm import llm_convert
|
||||
if "chatglm" in repo_id.lower():
|
||||
family = "chatglm"
|
||||
elif "llama" in repo_id.lower():
|
||||
|
|
@ -184,7 +184,7 @@ def run_transformer_int4(repo_id,
|
|||
num_beams,
|
||||
low_bit,
|
||||
batch_size):
|
||||
from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
|
||||
from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
|
||||
from transformers import AutoTokenizer, LlamaTokenizer
|
||||
|
||||
model_path = get_model_path(repo_id, local_model_hub)
|
||||
|
|
@ -319,7 +319,7 @@ def run_optimize_model(repo_id,
|
|||
low_bit,
|
||||
batch_size):
|
||||
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer
|
||||
from bigdl.llm import optimize_model
|
||||
from ipex_llm import optimize_model
|
||||
|
||||
model_path = get_model_path(repo_id, local_model_hub)
|
||||
# Load model in 4 bit,
|
||||
|
|
@ -389,7 +389,7 @@ def run_transformer_int4_gpu(repo_id,
|
|||
num_beams,
|
||||
low_bit,
|
||||
batch_size):
|
||||
from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
|
||||
from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
|
||||
from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
|
||||
import intel_extension_for_pytorch as ipex
|
||||
model_path = get_model_path(repo_id, local_model_hub)
|
||||
|
|
@ -495,7 +495,7 @@ def run_optimize_model_gpu(repo_id,
|
|||
low_bit,
|
||||
batch_size):
|
||||
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
|
||||
from bigdl.llm import optimize_model
|
||||
from ipex_llm import optimize_model
|
||||
import intel_extension_for_pytorch as ipex
|
||||
model_path = get_model_path(repo_id, local_model_hub)
|
||||
# Load model in 4 bit,
|
||||
|
|
@ -651,7 +651,7 @@ def run_bigdl_fp16_gpu(repo_id,
|
|||
num_trials,
|
||||
num_beams,
|
||||
batch_size):
|
||||
from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
|
||||
from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
|
||||
from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
|
||||
import intel_extension_for_pytorch as ipex
|
||||
model_path = get_model_path(repo_id, local_model_hub)
|
||||
|
|
@ -731,7 +731,7 @@ def run_deepspeed_transformer_int4_cpu(repo_id,
|
|||
batch_size):
|
||||
from transformers import AutoModelForCausalLM, LlamaTokenizer, AutoTokenizer
|
||||
import deepspeed
|
||||
from bigdl.llm import optimize_model
|
||||
from ipex_llm import optimize_model
|
||||
import argparse
|
||||
# parser is for deepspeed subprocesses' inline parameter
|
||||
parser = argparse.ArgumentParser(description='Predict Tokens using `generate()` API for Llama2 model')
|
||||
|
|
@ -822,7 +822,7 @@ def run_transformer_int4_gpu_win(repo_id,
|
|||
cpu_embedding,
|
||||
batch_size,
|
||||
streaming):
|
||||
from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
|
||||
from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
|
||||
from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer, TextStreamer
|
||||
import intel_extension_for_pytorch as ipex
|
||||
model_path = get_model_path(repo_id, local_model_hub)
|
||||
|
|
@ -928,7 +928,7 @@ def run_transformer_int4_fp16_gpu_win(repo_id,
|
|||
cpu_embedding,
|
||||
batch_size,
|
||||
streaming):
|
||||
from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
|
||||
from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
|
||||
from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer, TextStreamer
|
||||
import intel_extension_for_pytorch as ipex
|
||||
model_path = get_model_path(repo_id, local_model_hub)
|
||||
|
|
@ -1038,7 +1038,7 @@ def run_transformer_int4_loadlowbit_gpu_win(repo_id,
|
|||
cpu_embedding,
|
||||
batch_size,
|
||||
streaming):
|
||||
from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
|
||||
from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
|
||||
from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer, TextStreamer
|
||||
import intel_extension_for_pytorch as ipex
|
||||
model_path = get_model_path(repo_id, local_model_hub)
|
||||
|
|
@ -1140,7 +1140,7 @@ def run_transformer_autocast_bf16( repo_id,
|
|||
num_trials,
|
||||
num_beams,
|
||||
batch_size):
|
||||
from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
|
||||
from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
|
||||
from transformers import AutoTokenizer, LlamaTokenizer
|
||||
|
||||
model_path = get_model_path(repo_id, local_model_hub)
|
||||
|
|
@ -1209,7 +1209,7 @@ def run_bigdl_ipex_bf16(repo_id,
|
|||
num_trials,
|
||||
num_beams,
|
||||
batch_size):
|
||||
from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
|
||||
from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
|
||||
from transformers import AutoTokenizer, LlamaTokenizer
|
||||
|
||||
os.environ["BIGDL_OPT_IPEX"] = "true"
|
||||
|
|
@ -1280,7 +1280,7 @@ def run_bigdl_ipex_int4(repo_id,
|
|||
num_trials,
|
||||
num_beams,
|
||||
batch_size):
|
||||
from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
|
||||
from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
|
||||
from transformers import AutoTokenizer, LlamaTokenizer
|
||||
|
||||
os.environ["BIGDL_OPT_IPEX"] = "true"
|
||||
|
|
@ -1350,7 +1350,7 @@ def run_bigdl_ipex_int8(repo_id,
|
|||
num_trials,
|
||||
num_beams,
|
||||
batch_size):
|
||||
from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
|
||||
from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
|
||||
from transformers import AutoTokenizer, LlamaTokenizer
|
||||
|
||||
os.environ["BIGDL_OPT_IPEX"] = "true"
|
||||
|
|
@ -1434,7 +1434,7 @@ def run_deepspeed_optimize_model_gpu(repo_id,
|
|||
os.environ["MASTER_PORT"] = os.environ.get("MASTER_PORT", "29500")
|
||||
|
||||
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
|
||||
from bigdl.llm import optimize_model
|
||||
from ipex_llm import optimize_model
|
||||
import intel_extension_for_pytorch as ipex
|
||||
import deepspeed
|
||||
from deepspeed.accelerator.cpu_accelerator import CPU_Accelerator
|
||||
|
|
@ -1535,9 +1535,9 @@ def run_speculative_cpu(repo_id,
|
|||
num_trials,
|
||||
num_beams,
|
||||
batch_size):
|
||||
from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
|
||||
from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
|
||||
from transformers import AutoTokenizer, LlamaTokenizer
|
||||
from bigdl.llm.transformers.convert import get_enable_ipex
|
||||
from ipex_llm.transformers.convert import get_enable_ipex
|
||||
|
||||
_enable_ipex = get_enable_ipex()
|
||||
|
||||
|
|
@ -1615,7 +1615,7 @@ def run_speculative_gpu(repo_id,
|
|||
num_trials,
|
||||
num_beams,
|
||||
batch_size):
|
||||
from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
|
||||
from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
|
||||
from transformers import AutoTokenizer, LlamaTokenizer
|
||||
|
||||
model_path = get_model_path(repo_id, local_model_hub)
|
||||
|
|
|
|||
|
|
@ -30,7 +30,7 @@ current_dir = os.path.dirname(os.path.realpath(__file__))
|
|||
def save_model_in_low_bit(repo_id,
|
||||
local_model_hub,
|
||||
low_bit):
|
||||
from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
|
||||
from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
|
||||
from transformers import AutoTokenizer, LlamaTokenizer
|
||||
model_path = get_model_path(repo_id, local_model_hub)
|
||||
# Load model in 4 bit,
|
||||
|
|
|
|||
|
|
@ -21,7 +21,7 @@ import torch
|
|||
import json
|
||||
from tqdm import tqdm
|
||||
|
||||
from bigdl.llm.utils.common.log4Error import invalidInputError
|
||||
from ipex_llm.utils.common.log4Error import invalidInputError
|
||||
from evaluators.qwen import QwenEvaluator
|
||||
from evaluators.llama import LlamaEvaluator
|
||||
from evaluators.chatglm import ChatGLMEvaluator
|
||||
|
|
|
|||
|
|
@ -22,7 +22,7 @@ from thefuzz import process
|
|||
from transformers import AutoTokenizer
|
||||
|
||||
from evaluators.evaluator import Evaluator
|
||||
from bigdl.llm.transformers import AutoModel
|
||||
from ipex_llm.transformers import AutoModel
|
||||
from transformers.generation.utils import LogitsProcessorList
|
||||
from transformers.generation.logits_process import LogitsProcessor
|
||||
|
||||
|
|
|
|||
|
|
@ -22,7 +22,7 @@ import numpy as np
|
|||
import torch
|
||||
from transformers import LlamaTokenizer, GenerationConfig
|
||||
|
||||
from bigdl.llm.transformers import AutoModelForCausalLM
|
||||
from ipex_llm.transformers import AutoModelForCausalLM
|
||||
from evaluators.evaluator import Evaluator
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -22,7 +22,7 @@ from thefuzz import process
|
|||
from transformers import AutoTokenizer
|
||||
from transformers.generation import GenerationConfig
|
||||
|
||||
from bigdl.llm.transformers import AutoModelForCausalLM
|
||||
from ipex_llm.transformers import AutoModelForCausalLM
|
||||
from evaluators.evaluator import Evaluator
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -14,7 +14,7 @@
|
|||
# limitations under the License.
|
||||
#
|
||||
|
||||
from bigdl.llm.transformers import AutoModelForCausalLM
|
||||
from ipex_llm.transformers import AutoModelForCausalLM
|
||||
|
||||
import inspect
|
||||
from lm_eval.models.huggingface import AutoCausalLM
|
||||
|
|
|
|||
|
|
@ -20,7 +20,7 @@ from torch.nn import CrossEntropyLoss
|
|||
from tqdm import tqdm
|
||||
import gc
|
||||
|
||||
from bigdl.llm.transformers import AutoModelForCausalLM, AutoModel
|
||||
from ipex_llm.transformers import AutoModelForCausalLM, AutoModel
|
||||
|
||||
class BigDLPPL:
|
||||
def __init__(self, model_path, device, **model_kwargs) -> None:
|
||||
|
|
|
|||
|
|
@ -21,7 +21,7 @@ from datasets import concatenate_datasets, load_dataset
|
|||
from transformers import AutoTokenizer
|
||||
|
||||
from ppl import BigDLPPL
|
||||
from bigdl.llm.ggml.quantize import ggml_tensor_qtype
|
||||
from ipex_llm.ggml.quantize import ggml_tensor_qtype
|
||||
|
||||
import os
|
||||
import json
|
||||
|
|
|
|||
|
|
@ -15,7 +15,7 @@
|
|||
#
|
||||
|
||||
from datasets import load_dataset
|
||||
from bigdl.llm.transformers import AutoModelForSpeechSeq2Seq
|
||||
from ipex_llm.transformers import AutoModelForSpeechSeq2Seq
|
||||
from transformers import WhisperProcessor
|
||||
import torch
|
||||
from evaluate import load
|
||||
|
|
|
|||
|
|
@ -69,11 +69,11 @@ conda activate autogen
|
|||
cd autogen
|
||||
|
||||
# load the local model with cpu with your downloaded model
|
||||
python -m bigdl.llm.serving.model_worker --model-path ... --device cpu
|
||||
python -m ipex_llm.serving.model_worker --model-path ... --device cpu
|
||||
```
|
||||
|
||||
Change the Model Name:
|
||||
> Assume you use the model `Mistral-7B-Instruct-v0.2` and your model is downloaded to `autogen/model/Mistral-7B-Instruct-v0.2`. You should rename the model to `autogen/model/bigdl` and run `python -m bigdl.llm.serving.model_worker --model-path ... --device cpu`. This ensures the proper usage of the BigDL-adapted FastChat.
|
||||
> Assume you use the model `Mistral-7B-Instruct-v0.2` and your model is downloaded to `autogen/model/Mistral-7B-Instruct-v0.2`. You should rename the model to `autogen/model/bigdl` and run `python -m ipex_llm.serving.model_worker --model-path ... --device cpu`. This ensures the proper usage of the BigDL-adapted FastChat.
|
||||
|
||||
Potential Error Note:
|
||||
> If you get `RuntimeError: Error register to Controller` in the worker terminal, please set `export no_proxy='localhost'` to ensure the registration
|
||||
|
|
|
|||
|
|
@ -19,7 +19,7 @@ import argparse
|
|||
from PIL import Image
|
||||
from transformers import AutoTokenizer, LocalAgent
|
||||
|
||||
from bigdl.llm.transformers import AutoModelForCausalLM
|
||||
from ipex_llm.transformers import AutoModelForCausalLM
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="Run agent using vicuna model")
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@
|
|||
In this example, we apply low-bit optimizations to [Streaming-LLM](https://github.com/mit-han-lab/streaming-llm/tree/main#efficient-streaming-language-models-with-attention-sinks) using BigDL-LLM, which can deploy low-bit(including FP4/INT4/FP8/INT8) LLMs for infinite-length inputs.
|
||||
Only one code change is needed to load the model using bigdl-llm as follows:
|
||||
```python
|
||||
from bigdl.llm.transformers import AutoModelForCausalLM
|
||||
from ipex_llm.transformers import AutoModelForCausalLM
|
||||
model = AutoModelForCausalLM.from_pretrained(model_name_or_path, load_in_4bit=True, trust_remote_code=True, optimize_model=False)
|
||||
```
|
||||
|
||||
|
|
|
|||
|
|
@ -49,7 +49,7 @@ import urllib.request
|
|||
import os
|
||||
import json
|
||||
# code change to import from bigdl-llm API instead of using transformers API
|
||||
from bigdl.llm.transformers import AutoModelForCausalLM
|
||||
from ipex_llm.transformers import AutoModelForCausalLM
|
||||
from transformers import LlamaTokenizer
|
||||
import intel_extension_for_pytorch as ipex
|
||||
|
||||
|
|
|
|||
|
|
@ -39,7 +39,7 @@ Distributed model managed by deepspeed can be further optimized with BigDL low-b
|
|||
|
||||
```python
|
||||
# Apply BigDL-LLM INT4 optimizations on transformers
|
||||
from bigdl.llm import optimize_model
|
||||
from ipex_llm import optimize_model
|
||||
|
||||
model = optimize_model(model.module.to(f'cpu'), low_bit='sym_int4')
|
||||
model = model.to(f'cpu:{local_rank}') # move partial model to local rank
|
||||
|
|
|
|||
|
|
@ -45,7 +45,7 @@ import os
|
|||
import torch
|
||||
from transformers import AutoModelForCausalLM, LlamaTokenizer, AutoTokenizer
|
||||
import deepspeed
|
||||
from bigdl.llm import optimize_model
|
||||
from ipex_llm import optimize_model
|
||||
import torch
|
||||
import intel_extension_for_pytorch as ipex
|
||||
import time
|
||||
|
|
|
|||
|
|
@ -18,7 +18,7 @@ import torch
|
|||
import time
|
||||
import argparse
|
||||
|
||||
from bigdl.llm.transformers import AutoModelForCausalLM
|
||||
from ipex_llm.transformers import AutoModelForCausalLM
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
# you could tune the prompt based on your own model,
|
||||
|
|
|
|||
|
|
@ -18,7 +18,7 @@ import torch
|
|||
import time
|
||||
import argparse
|
||||
|
||||
from bigdl.llm.transformers import AutoModelForCausalLM
|
||||
from ipex_llm.transformers import AutoModelForCausalLM
|
||||
|
||||
# you could tune the prompt based on your own model,
|
||||
# here the prompt tuning refers to https://huggingface.co/georgesung/llama2_7b_chat_uncensored#prompt-style
|
||||
|
|
|
|||
|
|
@ -18,7 +18,7 @@ import torch
|
|||
import time
|
||||
import argparse
|
||||
|
||||
from bigdl.llm.transformers import AutoModelForCausalLM
|
||||
from ipex_llm.transformers import AutoModelForCausalLM
|
||||
from transformers import LlamaTokenizer, GPTQConfig
|
||||
|
||||
# you could tune the prompt based on your own model,
|
||||
|
|
|
|||
|
|
@ -18,7 +18,7 @@ import torch
|
|||
import time
|
||||
import argparse
|
||||
|
||||
from bigdl.llm.transformers import AutoModelForCausalLM
|
||||
from ipex_llm.transformers import AutoModelForCausalLM
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
# you could tune the prompt based on your own model,
|
||||
|
|
|
|||
|
|
@ -18,7 +18,7 @@ import torch
|
|||
import time
|
||||
import argparse
|
||||
|
||||
from bigdl.llm.transformers import AutoModelForCausalLM
|
||||
from ipex_llm.transformers import AutoModelForCausalLM
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
# you could tune the prompt based on your own model,
|
||||
|
|
|
|||
|
|
@ -19,7 +19,7 @@ import time
|
|||
import argparse
|
||||
import numpy as np
|
||||
|
||||
from bigdl.llm.transformers import AutoModelForCausalLM
|
||||
from ipex_llm.transformers import AutoModelForCausalLM
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
# you could tune the prompt based on your own model,
|
||||
|
|
|
|||
|
|
@ -19,7 +19,7 @@ import time
|
|||
import argparse
|
||||
import numpy as np
|
||||
|
||||
from bigdl.llm.transformers import AutoModelForCausalLM
|
||||
from ipex_llm.transformers import AutoModelForCausalLM
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
# prompt format referred from https://github.com/baichuan-inc/Baichuan2/issues/227
|
||||
|
|
|
|||
|
|
@ -19,7 +19,7 @@ import time
|
|||
import argparse
|
||||
import numpy as np
|
||||
|
||||
from bigdl.llm.transformers import AutoModelForCausalLM
|
||||
from ipex_llm.transformers import AutoModelForCausalLM
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
# you could tune the prompt based on your own model,
|
||||
|
|
|
|||
|
|
@ -19,7 +19,7 @@ import time
|
|||
import argparse
|
||||
import numpy as np
|
||||
|
||||
from bigdl.llm.transformers import AutoModel
|
||||
from ipex_llm.transformers import AutoModel
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
# you could tune the prompt based on your own model,
|
||||
|
|
|
|||
|
|
@ -19,7 +19,7 @@ import time
|
|||
import argparse
|
||||
import numpy as np
|
||||
|
||||
from bigdl.llm.transformers import AutoModel
|
||||
from ipex_llm.transformers import AutoModel
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
# you could tune the prompt based on your own model,
|
||||
|
|
|
|||
|
|
@ -19,7 +19,7 @@ import time
|
|||
import argparse
|
||||
import numpy as np
|
||||
|
||||
from bigdl.llm.transformers import AutoModel
|
||||
from ipex_llm.transformers import AutoModel
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -19,7 +19,7 @@ import time
|
|||
import argparse
|
||||
import numpy as np
|
||||
|
||||
from bigdl.llm.transformers import AutoModel
|
||||
from ipex_llm.transformers import AutoModel
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
# you could tune the prompt based on your own model,
|
||||
|
|
|
|||
|
|
@ -19,7 +19,7 @@ import time
|
|||
import argparse
|
||||
import numpy as np
|
||||
|
||||
from bigdl.llm.transformers import AutoModel
|
||||
from ipex_llm.transformers import AutoModel
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -18,7 +18,7 @@ import torch
|
|||
import time
|
||||
import argparse
|
||||
|
||||
from bigdl.llm.transformers import AutoModelForCausalLM
|
||||
from ipex_llm.transformers import AutoModelForCausalLM
|
||||
from transformers import CodeLlamaTokenizer
|
||||
|
||||
# you could tune the prompt based on your own model,
|
||||
|
|
|
|||
|
|
@ -19,7 +19,7 @@ import time
|
|||
import argparse
|
||||
import numpy as np
|
||||
|
||||
from bigdl.llm.transformers import AutoModelForCausalLM
|
||||
from ipex_llm.transformers import AutoModelForCausalLM
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
# you could tune the prompt based on your own model,
|
||||
|
|
|
|||
|
|
@ -19,7 +19,7 @@ import time
|
|||
import argparse
|
||||
|
||||
from transformers import AutoTokenizer
|
||||
from bigdl.llm.transformers import AutoModelForCausalLM
|
||||
from ipex_llm.transformers import AutoModelForCausalLM
|
||||
|
||||
# you could tune the prompt based on your own model,
|
||||
# here the prompt tuning refers to https://huggingface.co/Deci/DeciLM-7B-instruct#prompt-template
|
||||
|
|
|
|||
|
|
@ -36,7 +36,7 @@ if __name__ == '__main__':
|
|||
|
||||
# Load model in 4 bit,
|
||||
# which convert the relevant layers in the model into INT4 format
|
||||
from bigdl.llm.transformers import AutoModelForCausalLM
|
||||
from ipex_llm.transformers import AutoModelForCausalLM
|
||||
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, load_in_4bit=True).eval()
|
||||
model.generation_config = GenerationConfig.from_pretrained(model_path)
|
||||
model.generation_config.pad_token_id = model.generation_config.eos_token_id
|
||||
|
|
|
|||
|
|
@ -18,7 +18,7 @@ import torch
|
|||
import time
|
||||
import argparse
|
||||
|
||||
from bigdl.llm.transformers import AutoModelForCausalLM
|
||||
from ipex_llm.transformers import AutoModelForCausalLM
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
# you could tune the prompt based on your own model,
|
||||
|
|
|
|||
|
|
@ -17,7 +17,7 @@
|
|||
import time
|
||||
import argparse
|
||||
|
||||
from bigdl.llm.transformers import AutoModelForSpeechSeq2Seq
|
||||
from ipex_llm.transformers import AutoModelForSpeechSeq2Seq
|
||||
from datasets import load_dataset
|
||||
from transformers import pipeline
|
||||
from transformers.models.whisper import WhisperFeatureExtractor, WhisperTokenizer
|
||||
|
|
|
|||
|
|
@ -19,7 +19,7 @@ import time
|
|||
import argparse
|
||||
import numpy as np
|
||||
|
||||
from bigdl.llm.transformers import AutoModelForCausalLM
|
||||
from ipex_llm.transformers import AutoModelForCausalLM
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
# you could tune the prompt based on your own model,
|
||||
|
|
|
|||
|
|
@ -19,7 +19,7 @@ import time
|
|||
import argparse
|
||||
import numpy as np
|
||||
|
||||
from bigdl.llm.transformers import AutoModelForCausalLM
|
||||
from ipex_llm.transformers import AutoModelForCausalLM
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
# you could tune the prompt based on your own model,
|
||||
|
|
|
|||
|
|
@ -301,7 +301,7 @@ class Attention(nn.Module):
|
|||
# resize qk to 4D to match apply_rotary_pos_emb_no_cache_xpu's requirements.
|
||||
query_layer = query_layer.reshape(batch_size, self.num_heads, q_length, self.head_dim)
|
||||
key_layer = key_layer.reshape(batch_size, self.num_kv, q_length, self.head_dim)
|
||||
from bigdl.llm.transformers.models.utils import apply_rotary_pos_emb_no_cache_xpu
|
||||
from ipex_llm.transformers.models.utils import apply_rotary_pos_emb_no_cache_xpu
|
||||
query_layer, key_layer = apply_rotary_pos_emb_no_cache_xpu(query_layer,
|
||||
key_layer,
|
||||
position_ids,
|
||||
|
|
|
|||
|
|
@ -18,7 +18,7 @@ import torch
|
|||
import time
|
||||
import argparse
|
||||
|
||||
from bigdl.llm.transformers import AutoModelForCausalLM
|
||||
from ipex_llm.transformers import AutoModelForCausalLM
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
# you could tune the prompt based on your own model,
|
||||
|
|
|
|||
|
|
@ -19,7 +19,7 @@ import time
|
|||
import argparse
|
||||
import numpy as np
|
||||
|
||||
from bigdl.llm.transformers import AutoModelForSeq2SeqLM
|
||||
from ipex_llm.transformers import AutoModelForSeq2SeqLM
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
# you could tune the prompt based on your own model,
|
||||
|
|
|
|||
|
|
@@ -19,7 +19,7 @@ import torch
 import argparse
 import time
 from PIL import Image
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM

 if __name__ == '__main__':
 parser = argparse.ArgumentParser(description='Predict Tokens using `generate()` API for Fuyu model')

@@ -18,7 +18,7 @@ import torch
 import time
 import argparse

-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import AutoTokenizer

 # The instruction-tuned models use a chat template that must be adhered to for conversational use.

@@ -14,14 +14,14 @@
 # limitations under the License.
 #

-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import AutoTokenizer
 from transformers.generation import GenerationConfig
 import torch
 import time
 import os
 import argparse
-from bigdl.llm import optimize_model
+from ipex_llm import optimize_model

 if __name__ == '__main__':
 parser = argparse.ArgumentParser(description='Predict Tokens using `chat()` API for InternLM-XComposer model')
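Where examples use the generic `optimize_model` entry point, only the package prefix changes. A sketch, assuming `optimize_model` wraps an already-loaded Hugging Face model as in these examples (the model path is a placeholder):

```python
# Sketch: applying low-bit optimizations to a model loaded with vanilla transformers.
# '/path/to/model' is a placeholder.
from transformers import AutoModelForCausalLM, AutoTokenizer
from ipex_llm import optimize_model

model = AutoModelForCausalLM.from_pretrained('/path/to/model', trust_remote_code=True)
model = optimize_model(model)   # convert supported layers to a low-bit format

tokenizer = AutoTokenizer.from_pretrained('/path/to/model', trust_remote_code=True)
inputs = tokenizer("Hello", return_tensors="pt")
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=16)[0]))
```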
@@ -19,7 +19,7 @@ import time
 import argparse
 import numpy as np

-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import AutoTokenizer

 # you could tune the prompt based on your own model,

@@ -39,7 +39,7 @@ if __name__ == '__main__':
 model_path = args.repo_id_or_model_path


-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 model = AutoModelForCausalLM.from_pretrained(model_path,
 load_in_4bit=True,
 trust_remote_code=True)

@@ -18,7 +18,7 @@ import torch
 import time
 import argparse

-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import LlamaTokenizer

 # you could tune the prompt based on your own model,
@@ -18,7 +18,7 @@ import torch
 import time
 import argparse

-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import AutoTokenizer

 # you could tune the prompt based on your own model,

@@ -18,7 +18,7 @@ import torch
 import time
 import argparse

-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import AutoTokenizer

 # you could tune the prompt based on your own model,

@@ -18,7 +18,7 @@ import torch
 import time
 import argparse

-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import AutoTokenizer

 # you could tune the prompt based on your own model,

@@ -18,7 +18,7 @@ import torch
 import time
 import argparse

-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import AutoTokenizer, GenerationConfig

 # you could tune the prompt based on your own model,

@@ -19,7 +19,7 @@ import time
 import argparse
 import numpy as np

-from bigdl.llm.transformers import AutoModel,AutoModelForCausalLM
+from ipex_llm.transformers import AutoModel,AutoModelForCausalLM
 from transformers import AutoTokenizer, GenerationConfig

 # you could tune the prompt based on your own model,
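Some chat models are loaded through `AutoModel` rather than `AutoModelForCausalLM`; the rename is identical. A sketch (the model path is a placeholder, and `chat()` assumes the checkpoint ships a ChatGLM-style remote-code API rather than anything defined by ipex_llm):

```python
# Sketch: INT4 chat-style inference through the renamed AutoModel class.
# '/path/to/chatglm' is a placeholder; chat() comes from the checkpoint's remote code.
from ipex_llm.transformers import AutoModel
from transformers import AutoTokenizer

model = AutoModel.from_pretrained('/path/to/chatglm', load_in_4bit=True, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained('/path/to/chatglm', trust_remote_code=True)

response, history = model.chat(tokenizer, "What is AI?", history=[])
print(response)
```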
@@ -19,7 +19,7 @@ import time
 import argparse
 import numpy as np

-from bigdl.llm.transformers import AutoModel,AutoModelForCausalLM
+from ipex_llm.transformers import AutoModel,AutoModelForCausalLM
 from transformers import AutoTokenizer, GenerationConfig

 # you could tune the prompt based on your own model,

@@ -20,7 +20,7 @@ import argparse
 import numpy as np

 from transformers import AutoTokenizer, GenerationConfig
-from bigdl.llm import optimize_model
+from ipex_llm import optimize_model
 # you could tune the prompt based on your own model,
 # here the prompt tuning refers to # TODO: https://huggingface.co/microsoft/phi-1_5/blob/main/modeling_mixformer_sequential.py
 PHI1_5_PROMPT_FORMAT = " Question:{prompt}\n\n Answer:"

@@ -41,7 +41,7 @@ if __name__ == '__main__':

 # Load model in 4 bit,
 # which convert the relevant layers in the model into INT4 format
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 model = AutoModelForCausalLM.from_pretrained(model_path,
 load_in_4bit=True,
 trust_remote_code=True)
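The phi-1_5 example keeps its prompt template; only the import moves. A sketch of the full path from prompt to output (the model path is a placeholder, the template is the one shown in the hunk above):

```python
# Sketch: INT4 generation using the prompt format from the diff above.
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

PHI1_5_PROMPT_FORMAT = " Question:{prompt}\n\n Answer:"
model_path = '/path/to/phi-1_5'   # placeholder

model = AutoModelForCausalLM.from_pretrained(model_path, load_in_4bit=True, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

prompt = PHI1_5_PROMPT_FORMAT.format(prompt="What is AI?")
input_ids = tokenizer.encode(prompt, return_tensors="pt")
output = model.generate(input_ids, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```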
@@ -18,7 +18,7 @@ import torch
 import time
 import argparse

-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import AutoTokenizer

 # you could revise it based on the Phoenix model you choose to use

@@ -14,14 +14,14 @@
 # limitations under the License.
 #

-from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
+from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
 from transformers import AutoTokenizer, LlamaTokenizer
 from transformers.generation import GenerationConfig
 import torch
 import time
 import os
 import argparse
-from bigdl.llm import optimize_model
+from ipex_llm import optimize_model
 torch.manual_seed(1234)

 if __name__ == '__main__':

@@ -19,7 +19,7 @@ import time
 import argparse
 import numpy as np

-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import AutoTokenizer

 # you could tune the prompt based on your own model

@@ -36,7 +36,7 @@ if __name__ == '__main__':
 model_path = args.repo_id_or_model_path


-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 model = AutoModelForCausalLM.from_pretrained(model_path,
 load_in_4bit=True,
 trust_remote_code=True)
@@ -18,7 +18,7 @@ import torch
 import time
 import argparse

-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import AutoTokenizer

 # you could tune the prompt based on your own model,

@@ -18,7 +18,7 @@ import torch
 import time
 import argparse

-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import AutoTokenizer

 # you could tune the prompt based on your own model,

@@ -18,7 +18,7 @@ import torch
 import time
 import argparse

-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import AutoTokenizer

 # you could tune the prompt based on your own model,

@@ -19,7 +19,7 @@ import time
 import argparse
 import numpy as np

-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import AutoTokenizer

 # you could tune the prompt based on your own model,

@@ -18,7 +18,7 @@ import torch
 import time
 import argparse

-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import AutoTokenizer

 # you could tune the prompt based on your own model,

@@ -18,7 +18,7 @@ import torch
 import time
 import argparse

-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import LlamaTokenizer

 # you could tune the prompt based on your own model,
@@ -19,7 +19,7 @@ import librosa
 import argparse

 from transformers import pipeline
-from bigdl.llm.transformers import AutoModelForSpeechSeq2Seq
+from ipex_llm.transformers import AutoModelForSpeechSeq2Seq
 from transformers.models.whisper import WhisperFeatureExtractor, WhisperTokenizer


@@ -18,7 +18,7 @@ import torch
 import time
 import argparse

-from bigdl.llm.transformers import AutoModelForSpeechSeq2Seq
+from ipex_llm.transformers import AutoModelForSpeechSeq2Seq
 from transformers import WhisperProcessor
 from datasets import load_dataset

@@ -18,7 +18,7 @@ import torch
 import time
 import argparse

-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import LlamaTokenizer

 WIZARDCODERPYTHON_PROMPT_FORMAT = """Below is an instruction that describes a task. Write a response that appropriately completes the request.
@@ -18,7 +18,7 @@ import torch
 import time
 import argparse

-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import AutoTokenizer

 # Refer to https://huggingface.co/01-ai/Yi-6B-Chat#31-use-the-chat-model

@@ -18,7 +18,7 @@ import torch, transformers
 import sys, os, time
 import argparse
 from transformers import LlamaTokenizer
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM

 # Refer to https://huggingface.co/IEITYuan/Yuan2-2B-hf#Usage
 YUAN2_PROMPT_FORMAT = """

@@ -39,7 +39,7 @@ if __name__ == '__main__':
 model_path = args.repo_id_or_model_path


-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 # enabling `use_cache=True` allows the model to utilize the previous
 # key/values attentions to speed up decoding;
 # to obtain optimal performance with BigDL-LLM INT4 optimizations,
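The comment in the hunk above explains why these examples pass `use_cache=True`; the migrated call keeps it. A brief sketch (the model path is a placeholder):

```python
# Sketch: passing use_cache=True so decoding reuses previous key/value attentions.
from ipex_llm.transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained('/path/to/model',   # placeholder path
                                             load_in_4bit=True,
                                             trust_remote_code=True,
                                             use_cache=True)
```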
@@ -15,7 +15,7 @@
 #

 import argparse
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import LlamaTokenizer, TextGenerationPipeline

 if __name__ == '__main__':

@@ -38,7 +38,7 @@ if __name__ == '__main__':
 model = AutoModelForCausalLM.load_low_bit(load_path)
 tokenizer = LlamaTokenizer.from_pretrained(load_path)
 else:
-# load_in_low_bit in bigdl.llm.transformers will convert
+# load_in_low_bit in ipex_llm.transformers will convert
 # the relevant layers in the model into corresponding int X format
 model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True)
 tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
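The `load_low_bit` branch above pairs with a one-time conversion step. A sketch, assuming the companion `save_low_bit` method behaves as in these save/load examples (the paths and the `'sym_int4'` format string are placeholders):

```python
# Sketch: convert once with load_in_low_bit, save the converted weights, then reload them later.
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import LlamaTokenizer

model_path = '/path/to/llama-2-7b'            # placeholder
save_path = '/path/to/llama-2-7b-sym_int4'    # placeholder
low_bit = 'sym_int4'                          # placeholder low-bit format

# one-time conversion
model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True)
model.save_low_bit(save_path)   # assumed companion to load_low_bit
LlamaTokenizer.from_pretrained(model_path).save_pretrained(save_path)

# later runs: load the already-converted weights directly
model = AutoModelForCausalLM.load_low_bit(save_path)
tokenizer = LlamaTokenizer.from_pretrained(save_path)
```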
@@ -15,7 +15,7 @@
 #

 import argparse
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import LlamaTokenizer, TextGenerationPipeline

 if __name__ == '__main__':

@@ -38,7 +38,7 @@ if __name__ == '__main__':
 model = AutoModelForCausalLM.load_low_bit(load_path)
 tokenizer = LlamaTokenizer.from_pretrained(load_path)
 else:
-# load_in_low_bit in bigdl.llm.transformers will convert
+# load_in_low_bit in ipex_llm.transformers will convert
 # the relevant layers in the model into corresponding int X format
 model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True)
 tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
@@ -31,8 +31,8 @@ from langchain.chains.question_answering import load_qa_chain
 from langchain.callbacks.manager import CallbackManager
 from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

-from bigdl.llm.langchain.llms import *
-from bigdl.llm.langchain.embeddings import *
+from ipex_llm.langchain.llms import *
+from ipex_llm.langchain.embeddings import *


 def main(args):
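For the LangChain integrations, both the LLM and embedding wrappers move under `ipex_llm.langchain`. A sketch of the renamed imports; the `from_model_id` constructors and their keyword arguments are assumptions based on how these wrapper classes are typically built, and the model path is a placeholder:

```python
# Sketch: renamed LangChain wrappers; constructor signatures are assumed, not verified here.
from ipex_llm.langchain.llms import TransformersLLM
from ipex_llm.langchain.embeddings import TransformersEmbeddings

model_path = '/path/to/model'   # placeholder

llm = TransformersLLM.from_model_id(
    model_id=model_path,
    model_kwargs={"temperature": 0, "max_length": 256, "trust_remote_code": True},
)
embeddings = TransformersEmbeddings.from_model_id(model_id=model_path)
```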
@@ -21,7 +21,7 @@

 import argparse

-from bigdl.llm.langchain.llms import *
+from ipex_llm.langchain.llms import *
 from langchain import PromptTemplate, LLMChain
 from langchain.callbacks.manager import CallbackManager
 from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

@@ -23,7 +23,7 @@


 from langchain import LLMChain, PromptTemplate
-from bigdl.llm.langchain.llms import *
+from ipex_llm.langchain.llms import *
 from langchain.memory import ConversationBufferWindowMemory
 from langchain.callbacks.manager import CallbackManager
 from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

@@ -21,7 +21,7 @@

 import argparse

-from bigdl.llm.langchain.llms import TransformersLLM, TransformersPipelineLLM
+from ipex_llm.langchain.llms import TransformersLLM, TransformersPipelineLLM
 from langchain import PromptTemplate, LLMChain
 from langchain import HuggingFacePipeline
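With the renamed LLM wrapper in place, chaining works through the usual LangChain pieces shown in these hunks. A brief sketch (the prompt and model path are placeholders, and `from_model_id` is assumed as in the note above):

```python
# Sketch: an LLMChain driven by the renamed TransformersLLM wrapper.
from langchain import LLMChain, PromptTemplate
from ipex_llm.langchain.llms import TransformersLLM

template = "Q: {question}\nA:"
prompt = PromptTemplate(template=template, input_variables=["question"])

llm = TransformersLLM.from_model_id(model_id='/path/to/model',   # placeholder path
                                    model_kwargs={"trust_remote_code": True})
chain = LLMChain(prompt=prompt, llm=llm)
print(chain.run("What is IPEX-LLM?"))
```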
@@ -25,7 +25,7 @@
 import argparse

 from langchain.chains import LLMMathChain
-from bigdl.llm.langchain.llms import TransformersLLM, TransformersPipelineLLM
+from ipex_llm.langchain.llms import TransformersLLM, TransformersPipelineLLM


 def main(args):

@@ -30,8 +30,8 @@ from langchain.text_splitter import CharacterTextSplitter
 from langchain.chains.question_answering import load_qa_chain
 from langchain.callbacks.manager import CallbackManager

-from bigdl.llm.langchain.llms import TransformersLLM
-from bigdl.llm.langchain.embeddings import TransformersEmbeddings
+from ipex_llm.langchain.llms import TransformersLLM
+from ipex_llm.langchain.embeddings import TransformersEmbeddings

 text_doc = '''
 BigDL seamlessly scales your data analytics & AI applications from laptop to cloud, with the following libraries:

@@ -23,9 +23,9 @@


 from langchain import LLMChain, PromptTemplate
-from bigdl.llm.langchain.llms import TransformersLLM
+from ipex_llm.langchain.llms import TransformersLLM
 from langchain.memory import ConversationBufferWindowMemory
-from bigdl.llm.transformers import AutoModelForSpeechSeq2Seq
+from ipex_llm.transformers import AutoModelForSpeechSeq2Seq
 from transformers import WhisperProcessor
 import speech_recognition as sr
 import numpy as np
Some files were not shown because too many files have changed in this diff.