diff --git a/README.md b/README.md index efa55382..4c6110fe 100644 --- a/README.md +++ b/README.md @@ -86,7 +86,7 @@ You may apply INT4 optimizations to any Hugging Face *Transformers* models as fo ```python #load Hugging Face Transformers model with INT4 optimizations -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM model = AutoModelForCausalLM.from_pretrained('/path/to/model/', load_in_4bit=True) #run the optimized model on CPU @@ -113,7 +113,7 @@ You may apply INT4 optimizations to any Hugging Face *Transformers* models as fo ```python #load Hugging Face Transformers model with INT4 optimizations -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM model = AutoModelForCausalLM.from_pretrained('/path/to/model/', load_in_4bit=True) #run the optimized model on Intel GPU diff --git a/docker/llm/README.md b/docker/llm/README.md index 2eb47b61..1f418bb9 100644 --- a/docker/llm/README.md +++ b/docker/llm/README.md @@ -223,7 +223,7 @@ This controller manages the distributed workers. ##### Launch the model worker(s) ```bash -python3 -m bigdl.llm.serving.model_worker --model-path lmsys/vicuna-7b-v1.3 --device cpu +python3 -m ipex_llm.serving.model_worker --model-path lmsys/vicuna-7b-v1.3 --device cpu ``` Wait until the process finishes loading the model and you see "Uvicorn running on ...". The model worker will register itself to the controller. @@ -252,7 +252,7 @@ python3 -m fastchat.serve.controller Then, launch the model worker(s): ```bash -python3 -m bigdl.llm.serving.model_worker --model-path lmsys/vicuna-7b-v1.3 --device cpu +python3 -m ipex_llm.serving.model_worker --model-path lmsys/vicuna-7b-v1.3 --device cpu ``` Finally, launch the RESTful API server @@ -319,7 +319,7 @@ This controller manages the distributed workers. ##### Launch the model worker(s) ```bash -python3 -m bigdl.llm.serving.model_worker --model-path lmsys/vicuna-7b-v1.3 --device xpu +python3 -m ipex_llm.serving.model_worker --model-path lmsys/vicuna-7b-v1.3 --device xpu ``` Wait until the process finishes loading the model and you see "Uvicorn running on ...". The model worker will register itself to the controller. @@ -346,7 +346,7 @@ python3 -m fastchat.serve.controller Then, launch the model worker(s): ```bash -python3 -m bigdl.llm.serving.model_worker --model-path lmsys/vicuna-7b-v1.3 --device xpu +python3 -m ipex_llm.serving.model_worker --model-path lmsys/vicuna-7b-v1.3 --device xpu ``` Finally, launch the RESTful API server diff --git a/docker/llm/inference/xpu/docker/chat.py b/docker/llm/inference/xpu/docker/chat.py index b40c5f42..6401a86b 100644 --- a/docker/llm/inference/xpu/docker/chat.py +++ b/docker/llm/inference/xpu/docker/chat.py @@ -23,7 +23,7 @@ from transformers import TextIteratorStreamer from transformers.tools.agents import StopSequenceCriteria from transformers.generation.stopping_criteria import StoppingCriteriaList from colorama import Fore -from bigdl.llm import optimize_model +from ipex_llm import optimize_model SYSTEM_PROMPT = "A chat between a curious human and an artificial intelligence assistant .\ The assistant gives helpful, detailed, and polite answers to the human's questions." 
HUMAN_ID = "" diff --git a/docker/llm/serving/cpu/docker/entrypoint.sh b/docker/llm/serving/cpu/docker/entrypoint.sh index 87a691c7..36217dd2 100644 --- a/docker/llm/serving/cpu/docker/entrypoint.sh +++ b/docker/llm/serving/cpu/docker/entrypoint.sh @@ -135,9 +135,9 @@ else done if [ "$worker_type" == "model_worker" ]; then - worker_type="bigdl.llm.serving.model_worker" + worker_type="ipex_llm.serving.model_worker" elif [ "$worker_type" == "vllm_worker" ]; then - worker_type="bigdl.llm.serving.vllm_worker" + worker_type="ipex_llm.serving.vllm_worker" fi if [[ -n $CONTROLLER_HOST ]]; then @@ -220,9 +220,9 @@ else echo "Worker type: $worker_type" echo "Worker address: $worker_address" echo "Controller address: $controller_address" - if [ "$worker_type" == "bigdl.llm.serving.model_worker" ]; then + if [ "$worker_type" == "ipex_llm.serving.model_worker" ]; then python3 -m "$worker_type" --model-path $model_path --device cpu --host $worker_host --port $worker_port --worker-address $worker_address --controller-address $controller_address --stream-interval $stream_interval - elif [ "$worker_type" == "bigdl.llm.serving.vllm_worker" ]; then + elif [ "$worker_type" == "ipex_llm.serving.vllm_worker" ]; then python3 -m "$worker_type" --model-path $model_path --device cpu --host $worker_host --port $worker_port --worker-address $worker_address --controller-address $controller_address fi fi diff --git a/docker/llm/serving/cpu/docker/model_adapter.py.patch b/docker/llm/serving/cpu/docker/model_adapter.py.patch index b9a68a3a..6bd43d5a 100644 --- a/docker/llm/serving/cpu/docker/model_adapter.py.patch +++ b/docker/llm/serving/cpu/docker/model_adapter.py.patch @@ -9,7 +9,7 @@ generation_config = GenerationConfig.from_pretrained( model_path, trust_remote_code=True ) -+ from bigdl.llm.transformers import AutoModelForCausalLM ++ from ipex_llm.transformers import AutoModelForCausalLM model = AutoModelForCausalLM.from_pretrained( model_path, config=config, diff --git a/docker/llm/serving/xpu/docker/entrypoint.sh b/docker/llm/serving/xpu/docker/entrypoint.sh index 705797f3..c5f0e92c 100644 --- a/docker/llm/serving/xpu/docker/entrypoint.sh +++ b/docker/llm/serving/xpu/docker/entrypoint.sh @@ -66,9 +66,9 @@ else done if [ "$worker_type" == "model_worker" ]; then - worker_type="bigdl.llm.serving.model_worker" + worker_type="ipex_llm.serving.model_worker" elif [ "$worker_type" == "vllm_worker" ]; then - worker_type="bigdl.llm.serving.vllm_worker" + worker_type="ipex_llm.serving.vllm_worker" fi if [[ -n $CONTROLLER_HOST ]]; then @@ -127,9 +127,9 @@ else echo "Worker address: $worker_address" echo "Controller address: $controller_address" - if [ "$worker_type" == "bigdl.llm.serving.model_worker" ]; then + if [ "$worker_type" == "ipex_llm.serving.model_worker" ]; then python3 -m "$worker_type" --model-path $model_path --device xpu --host $worker_host --port $worker_port --worker-address $worker_address --controller-address $controller_address --stream-interval $stream_interval - elif [ "$worker_type" == "bigdl.llm.serving.vllm_worker" ]; then + elif [ "$worker_type" == "ipex_llm.serving.vllm_worker" ]; then python3 -m "$worker_type" --model-path $model_path --device xpu --host $worker_host --port $worker_port --worker-address $worker_address --controller-address $controller_address fi fi diff --git a/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/finetune.md b/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/finetune.md index 70157907..e4cf8700 100644 --- 
a/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/finetune.md +++ b/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/finetune.md @@ -21,7 +21,7 @@ To help you better understand the finetuning process, here we use model [Llama-2 First, load model using `transformers`-style API and **set it to `to('xpu')`**. We specify `load_in_low_bit="nf4"` here to apply 4-bit NormalFloat optimization. According to the [QLoRA paper](https://arxiv.org/pdf/2305.14314.pdf), using `"nf4"` could yield better model quality than `"int4"`. ```python -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", load_in_low_bit="nf4", @@ -33,14 +33,14 @@ model = model.to('xpu') Then, we have to apply some preprocessing to the model to prepare it for training. ```python -from bigdl.llm.transformers.qlora import prepare_model_for_kbit_training +from ipex_llm.transformers.qlora import prepare_model_for_kbit_training model.gradient_checkpointing_enable() model = prepare_model_for_kbit_training(model) ``` Next, we can obtain a Peft model from the optimized model and a configuration object containing the parameters as follows: ```python -from bigdl.llm.transformers.qlora import get_peft_model +from ipex_llm.transformers.qlora import get_peft_model from peft import LoraConfig config = LoraConfig(r=8, lora_alpha=32, @@ -54,7 +54,7 @@ model = get_peft_model(model, config) ```eval_rst .. important:: - Instead of ``from peft import prepare_model_for_kbit_training, get_peft_model`` as we did for regular QLoRA using bitandbytes and cuda, we import them from ``bigdl.llm.transformers.qlora`` here to get a BigDL-LLM compatible Peft model. And the rest is just the same as regular LoRA finetuning process using ``peft``. + Instead of ``from peft import prepare_model_for_kbit_training, get_peft_model`` as we did for regular QLoRA using bitandbytes and cuda, we import them from ``ipex_llm.transformers.qlora`` here to get a BigDL-LLM compatible Peft model. And the rest is just the same as regular LoRA finetuning process using ``peft``. 
``` ```eval_rst diff --git a/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/hugging_face_format.md b/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/hugging_face_format.md index ef3c6238..387d14d0 100644 --- a/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/hugging_face_format.md +++ b/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/hugging_face_format.md @@ -5,7 +5,7 @@ You may apply INT4 optimizations to any Hugging Face *Transformers* models as fo ```python # load Hugging Face Transformers model with INT4 optimizations -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM model = AutoModelForCausalLM.from_pretrained('/path/to/model/', load_in_4bit=True) ``` diff --git a/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/inference_on_gpu.md b/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/inference_on_gpu.md index e76a0f73..332a5c1f 100644 --- a/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/inference_on_gpu.md +++ b/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/inference_on_gpu.md @@ -29,7 +29,7 @@ You could choose to use [PyTorch API](./optimize_model.html) or [`transformers`- # Take Llama-2-7b-chat-hf as an example from transformers import LlamaForCausalLM - from bigdl.llm import optimize_model + from ipex_llm import optimize_model model = LlamaForCausalLM.from_pretrained('meta-llama/Llama-2-7b-chat-hf', torch_dtype='auto', low_cpu_mem_usage=True) model = optimize_model(model) # With only one line to enable BigDL-LLM INT4 optimization @@ -40,14 +40,14 @@ You could choose to use [PyTorch API](./optimize_model.html) or [`transformers`- When running LLMs on Intel iGPUs for Windows users, we recommend setting ``cpu_embedding=True`` in the ``optimize_model`` function. This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. - See the `API doc <../../../PythonAPI/LLM/optimize.html#bigdl.llm.optimize_model>`_ for ``optimize_model`` to find more information. + See the `API doc <../../../PythonAPI/LLM/optimize.html#ipex_llm.optimize_model>`_ for ``optimize_model`` to find more information. Especially, if you have saved the optimized model following setps `here <./optimize_model.html#save>`_, the loading process on Intel GPUs maybe as follows: .. code-block:: python from transformers import LlamaForCausalLM - from bigdl.llm.optimize import low_memory_init, load_low_bit + from ipex_llm.optimize import low_memory_init, load_low_bit saved_dir='./llama-2-bigdl-llm-4-bit' with low_memory_init(): # Fast and low cost by loading model on meta device @@ -65,7 +65,7 @@ You could choose to use [PyTorch API](./optimize_model.html) or [`transformers`- .. code-block:: python # Take Llama-2-7b-chat-hf as an example - from bigdl.llm.transformers import AutoModelForCausalLM + from ipex_llm.transformers import AutoModelForCausalLM # Load model in 4 bit, which convert the relevant layers in the model into INT4 format model = AutoModelForCausalLM.from_pretrained('meta-llama/Llama-2-7b-chat-hf', load_in_4bit=True) @@ -82,7 +82,7 @@ You could choose to use [PyTorch API](./optimize_model.html) or [`transformers`- .. 
code-block:: python - from bigdl.llm.transformers import AutoModelForCausalLM + from ipex_llm.transformers import AutoModelForCausalLM saved_dir='./llama-2-bigdl-llm-4-bit' model = AutoModelForCausalLM.load_low_bit(saved_dir) # Load the optimized model diff --git a/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/langchain_api.md b/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/langchain_api.md index 8ec3f433..962f4b2a 100644 --- a/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/langchain_api.md +++ b/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/langchain_api.md @@ -7,8 +7,8 @@ You may run the models using the LangChain API in `bigdl-llm`. You may run any Hugging Face *Transformers* model (with INT4 optimiztions applied) using the LangChain API as follows: ```python -from bigdl.llm.langchain.llms import TransformersLLM -from bigdl.llm.langchain.embeddings import TransformersEmbeddings +from ipex_llm.langchain.llms import TransformersLLM +from ipex_llm.langchain.embeddings import TransformersEmbeddings from langchain.chains.question_answering import load_qa_chain embeddings = TransformersEmbeddings.from_model_id(model_id=model_path) @@ -37,8 +37,8 @@ You may also convert Hugging Face *Transformers* models into native INT4 format, ``` ```python -from bigdl.llm.langchain.llms import LlamaLLM -from bigdl.llm.langchain.embeddings import LlamaEmbeddings +from ipex_llm.langchain.llms import LlamaLLM +from ipex_llm.langchain.embeddings import LlamaEmbeddings from langchain.chains.question_answering import load_qa_chain # switch to ChatGLMEmbeddings/GptneoxEmbeddings/BloomEmbeddings/StarcoderEmbeddings to load other models diff --git a/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/native_format.md b/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/native_format.md index e66d68fd..49184835 100644 --- a/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/native_format.md +++ b/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/native_format.md @@ -10,13 +10,13 @@ You may also convert Hugging Face *Transformers* models into native INT4 format ```python # convert the model -from bigdl.llm import llm_convert +from ipex_llm import llm_convert bigdl_llm_path = llm_convert(model='/path/to/model/', outfile='/path/to/output/', outtype='int4', model_family="llama") # load the converted model # switch to ChatGLMForCausalLM/GptneoxForCausalLM/BloomForCausalLM/StarcoderForCausalLM to load other models -from bigdl.llm.transformers import LlamaForCausalLM +from ipex_llm.transformers import LlamaForCausalLM llm = LlamaForCausalLM.from_pretrained("/path/to/output/model.bin", native=True, ...) 
# run the converted model diff --git a/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/optimize_model.md b/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/optimize_model.md index e997d32c..9c640e8d 100644 --- a/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/optimize_model.md +++ b/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/optimize_model.md @@ -14,7 +14,7 @@ model = LlamaForCausalLM.from_pretrained('meta-llama/Llama-2-7b-chat-hf', torch_ Then, just need to call `optimize_model` to optimize the loaded model and INT4 optimization is applied on model by default: ```python -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # With only one line to enable BigDL-LLM INT4 optimization model = optimize_model(model) @@ -31,7 +31,7 @@ Currently, ``low_bit`` supports options 'sym_int4', 'asym_int4', 'sym_int5', 'as You may apply symmetric INT8 optimization as follows: ```python -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # Apply symmetric INT8 optimization model = optimize_model(model, low_bit="sym_int8") @@ -51,7 +51,7 @@ model.save_low_bit(saved_dir) We recommend to use the context manager `low_memory_init` to quickly initiate a model instance with low cost, and then use `load_low_bit` to load the optimized low-bit model as follows: ```python -from bigdl.llm.optimize import low_memory_init, load_low_bit +from ipex_llm.optimize import low_memory_init, load_low_bit with low_memory_init(): # Fast and low cost by loading model on meta device model = LlamaForCausalLM.from_pretrained(saved_dir, torch_dtype="auto", diff --git a/docs/readthedocs/source/doc/LLM/Overview/llm.md b/docs/readthedocs/source/doc/LLM/Overview/llm.md index a13605a5..7f7d4194 100644 --- a/docs/readthedocs/source/doc/LLM/Overview/llm.md +++ b/docs/readthedocs/source/doc/LLM/Overview/llm.md @@ -11,7 +11,7 @@ Here, let's take a relatively small LLM model, i.e [open_llama_3b_v2](https://hu Simply use one-line `transformers`-style API in `bigdl-llm` to load `open_llama_3b_v2` with INT4 optimization (by specifying `load_in_4bit=True`) as follows: ```python -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path="openlm-research/open_llama_3b_v2", load_in_4bit=True) diff --git a/docs/readthedocs/source/doc/LLM/Quickstart/install_linux_gpu.md b/docs/readthedocs/source/doc/LLM/Quickstart/install_linux_gpu.md index 96a5b4b3..efdf7d10 100644 --- a/docs/readthedocs/source/doc/LLM/Quickstart/install_linux_gpu.md +++ b/docs/readthedocs/source/doc/LLM/Quickstart/install_linux_gpu.md @@ -112,7 +112,7 @@ Install the Miniconda as follows if you don't have conda installed on your machi python - > from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM + > from ipex_llm.transformers import AutoModel, AutoModelForCausalLM ``` > image-20240221102252562 @@ -170,7 +170,7 @@ Now let's play with a real LLM. 
We'll be using the [phi-1.5](https://huggingface ```python # Copy/Paste the contents to a new file demo.py import torch - from bigdl.llm.transformers import AutoModelForCausalLM + from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer, GenerationConfig generation_config = GenerationConfig(use_cache = True) diff --git a/docs/readthedocs/source/doc/LLM/Quickstart/install_windows_gpu.md b/docs/readthedocs/source/doc/LLM/Quickstart/install_windows_gpu.md index 88baeb9b..370422d1 100644 --- a/docs/readthedocs/source/doc/LLM/Quickstart/install_windows_gpu.md +++ b/docs/readthedocs/source/doc/LLM/Quickstart/install_windows_gpu.md @@ -130,7 +130,7 @@ You can verify if `bigdl-llm` is successfully installed by simply running a few * Step 5: Copy following code to Anaconda prompt **line by line** and press Enter **after copying each line**. ```python import torch - from bigdl.llm.transformers import AutoModel,AutoModelForCausalLM + from ipex_llm.transformers import AutoModel,AutoModelForCausalLM tensor_1 = torch.randn(1, 1, 40, 128).to('xpu') tensor_2 = torch.randn(1, 1, 128, 40).to('xpu') print(torch.matmul(tensor_1, tensor_2).size()) @@ -200,7 +200,7 @@ Now let's play with a real LLM. We'll be using the [Qwen-1.8B-Chat](https://hugg # Copy/Paste the contents to a new file demo.py import torch - from bigdl.llm.transformers import AutoModelForCausalLM + from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer, GenerationConfig generation_config = GenerationConfig(use_cache=True) @@ -260,7 +260,7 @@ Now let's play with a real LLM. We'll be using the [Qwen-1.8B-Chat](https://hugg # Copy/Paste the contents to a new file demo.py import torch - from bigdl.llm.transformers import AutoModelForCausalLM + from ipex_llm.transformers import AutoModelForCausalLM from transformers import GenerationConfig from modelscope import AutoTokenizer generation_config = GenerationConfig(use_cache=True) diff --git a/docs/readthedocs/source/doc/PythonAPI/LLM/langchain.rst b/docs/readthedocs/source/doc/PythonAPI/LLM/langchain.rst index 445e71f8..bf0fa88d 100644 --- a/docs/readthedocs/source/doc/PythonAPI/LLM/langchain.rst +++ b/docs/readthedocs/source/doc/PythonAPI/LLM/langchain.rst @@ -13,7 +13,7 @@ BigDL-LLM provides ``TransformersLLM`` and ``TransformersPipelineLLM``, which im .. tab:: AutoModel - .. automodule:: bigdl.llm.langchain.llms.transformersllm + .. automodule:: ipex_llm.langchain.llms.transformersllm :members: :undoc-members: :show-inheritance: @@ -21,7 +21,7 @@ BigDL-LLM provides ``TransformersLLM`` and ``TransformersPipelineLLM``, which im .. tab:: pipeline - .. automodule:: bigdl.llm.langchain.llms.transformerspipelinellm + .. automodule:: ipex_llm.langchain.llms.transformerspipelinellm :members: :undoc-members: :show-inheritance: @@ -37,7 +37,7 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo .. tab:: Llama - .. autoclass:: bigdl.llm.langchain.llms.bigdlllm.LlamaLLM + .. autoclass:: ipex_llm.langchain.llms.bigdlllm.LlamaLLM :members: :undoc-members: :show-inheritance: @@ -49,7 +49,7 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo .. tab:: ChatGLM - .. autoclass:: bigdl.llm.langchain.llms.bigdlllm.ChatGLMLLM + .. autoclass:: ipex_llm.langchain.llms.bigdlllm.ChatGLMLLM :members: :undoc-members: :show-inheritance: @@ -61,7 +61,7 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo .. tab:: Bloom - .. 
autoclass:: bigdl.llm.langchain.llms.bigdlllm.BloomLLM + .. autoclass:: ipex_llm.langchain.llms.bigdlllm.BloomLLM :members: :undoc-members: :show-inheritance: @@ -73,7 +73,7 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo .. tab:: Gptneox - .. autoclass:: bigdl.llm.langchain.llms.bigdlllm.GptneoxLLM + .. autoclass:: ipex_llm.langchain.llms.bigdlllm.GptneoxLLM :members: :undoc-members: :show-inheritance: @@ -85,7 +85,7 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo .. tab:: Starcoder - .. autoclass:: bigdl.llm.langchain.llms.bigdlllm.StarcoderLLM + .. autoclass:: ipex_llm.langchain.llms.bigdlllm.StarcoderLLM :members: :undoc-members: :show-inheritance: @@ -102,7 +102,7 @@ Embeddings Wrapper of LangChain Hugging Face ``transformers`` AutoModel ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. automodule:: bigdl.llm.langchain.embeddings.transformersembeddings +.. automodule:: ipex_llm.langchain.embeddings.transformersembeddings :members: :undoc-members: :show-inheritance: @@ -117,7 +117,7 @@ For ``llama``/``bloom``/``gptneox``/``starcoder`` model families, you could also .. tab:: Llama - .. autoclass:: bigdl.llm.langchain.embeddings.bigdlllm.LlamaEmbeddings + .. autoclass:: ipex_llm.langchain.embeddings.bigdlllm.LlamaEmbeddings :members: :undoc-members: :show-inheritance: @@ -129,7 +129,7 @@ For ``llama``/``bloom``/``gptneox``/``starcoder`` model families, you could also .. tab:: Bloom - .. autoclass:: bigdl.llm.langchain.embeddings.bigdlllm.BloomEmbeddings + .. autoclass:: ipex_llm.langchain.embeddings.bigdlllm.BloomEmbeddings :members: :undoc-members: :show-inheritance: @@ -141,7 +141,7 @@ For ``llama``/``bloom``/``gptneox``/``starcoder`` model families, you could also .. tab:: Gptneox - .. autoclass:: bigdl.llm.langchain.embeddings.bigdlllm.GptneoxEmbeddings + .. autoclass:: ipex_llm.langchain.embeddings.bigdlllm.GptneoxEmbeddings :members: :undoc-members: :show-inheritance: @@ -153,7 +153,7 @@ For ``llama``/``bloom``/``gptneox``/``starcoder`` model families, you could also .. tab:: Starcoder - .. autoclass:: bigdl.llm.langchain.embeddings.bigdlllm.StarcoderEmbeddings + .. autoclass:: ipex_llm.langchain.embeddings.bigdlllm.StarcoderEmbeddings :members: :undoc-members: :show-inheritance: diff --git a/docs/readthedocs/source/doc/PythonAPI/LLM/optimize.rst b/docs/readthedocs/source/doc/PythonAPI/LLM/optimize.rst index f28211ca..d979376e 100644 --- a/docs/readthedocs/source/doc/PythonAPI/LLM/optimize.rst +++ b/docs/readthedocs/source/doc/PythonAPI/LLM/optimize.rst @@ -6,7 +6,7 @@ Optimize Model You can run any PyTorch model with ``optimize_model`` through only one-line code change to benefit from BigDL-LLM optimization, regardless of the library or API you are using. -.. automodule:: bigdl.llm +.. automodule:: ipex_llm :members: optimize_model :undoc-members: :show-inheritance: @@ -18,7 +18,7 @@ Load Optimized Model To avoid high resource consumption during the loading processes of the original model, we provide save/load API to support the saving of model after low-bit optimization and the loading of the saved low-bit model. Saving and loading operations are platform-independent, regardless of their operating systems. -.. automodule:: bigdl.llm.optimize +.. 
automodule:: ipex_llm.optimize :members: load_low_bit :undoc-members: :show-inheritance: diff --git a/docs/readthedocs/source/doc/PythonAPI/LLM/transformers.rst b/docs/readthedocs/source/doc/PythonAPI/LLM/transformers.rst index 23aa10a3..711f397a 100644 --- a/docs/readthedocs/source/doc/PythonAPI/LLM/transformers.rst +++ b/docs/readthedocs/source/doc/PythonAPI/LLM/transformers.rst @@ -10,7 +10,7 @@ You can apply BigDL-LLM optimizations on any Hugging Face Transformers models by AutoModelForCausalLM ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: bigdl.llm.transformers.AutoModelForCausalLM +.. autoclass:: ipex_llm.transformers.AutoModelForCausalLM :members: :undoc-members: :show-inheritance: @@ -22,7 +22,7 @@ AutoModelForCausalLM AutoModel ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: bigdl.llm.transformers.AutoModel +.. autoclass:: ipex_llm.transformers.AutoModel :members: :undoc-members: :show-inheritance: @@ -34,7 +34,7 @@ AutoModel AutoModelForSpeechSeq2Seq ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: bigdl.llm.transformers.AutoModelForSpeechSeq2Seq +.. autoclass:: ipex_llm.transformers.AutoModelForSpeechSeq2Seq :members: :undoc-members: :show-inheritance: @@ -46,7 +46,7 @@ AutoModelForSpeechSeq2Seq AutoModelForSeq2SeqLM ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: bigdl.llm.transformers.AutoModelForSeq2SeqLM +.. autoclass:: ipex_llm.transformers.AutoModelForSeq2SeqLM :members: :undoc-members: :show-inheritance: @@ -67,7 +67,7 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo .. tab:: Llama - .. autoclass:: bigdl.llm.transformers.LlamaForCausalLM + .. autoclass:: ipex_llm.transformers.LlamaForCausalLM :members: :undoc-members: :show-inheritance: @@ -77,7 +77,7 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo .. tab:: ChatGLM - .. autoclass:: bigdl.llm.transformers.ChatGLMForCausalLM + .. autoclass:: ipex_llm.transformers.ChatGLMForCausalLM :members: :undoc-members: :show-inheritance: @@ -87,7 +87,7 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo .. tab:: Gptneox - .. autoclass:: bigdl.llm.transformers.GptneoxForCausalLM + .. autoclass:: ipex_llm.transformers.GptneoxForCausalLM :members: :undoc-members: :show-inheritance: @@ -96,7 +96,7 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo .. automethod:: from_pretrained .. tab:: Bloom - .. autoclass:: bigdl.llm.transformers.BloomForCausalLM + .. autoclass:: ipex_llm.transformers.BloomForCausalLM :members: :undoc-members: :show-inheritance: @@ -106,7 +106,7 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo .. tab:: Starcoder - .. autoclass:: bigdl.llm.transformers.StarcoderForCausalLM + .. autoclass:: ipex_llm.transformers.StarcoderForCausalLM :members: :undoc-members: :show-inheritance: diff --git a/docs/readthedocs/source/index.rst b/docs/readthedocs/source/index.rst index b1d29875..958cd6d9 100644 --- a/docs/readthedocs/source/index.rst +++ b/docs/readthedocs/source/index.rst @@ -112,7 +112,7 @@ You can then apply INT4 optimizations to any Hugging Face *Transformers* models .. 
code-block:: python #load Hugging Face Transformers model with INT4 optimizations - from bigdl.llm.transformers import AutoModelForCausalLM + from ipex_llm.transformers import AutoModelForCausalLM model = AutoModelForCausalLM.from_pretrained('/path/to/model/', load_in_4bit=True) #run the optimized model on Intel CPU @@ -146,7 +146,7 @@ You can then apply INT4 optimizations to any Hugging Face *Transformers* models .. code-block:: python #load Hugging Face Transformers model with INT4 optimizations - from bigdl.llm.transformers import AutoModelForCausalLM + from ipex_llm.transformers import AutoModelForCausalLM model = AutoModelForCausalLM.from_pretrained('/path/to/model/', load_in_4bit=True) #run the optimized model on Intel GPU diff --git a/python/llm/README.md b/python/llm/README.md index 4ddced57..dc7df9ff 100644 --- a/python/llm/README.md +++ b/python/llm/README.md @@ -146,7 +146,7 @@ You may apply INT4 optimizations to any Hugging Face *Transformers* model on Int ```python #load Hugging Face Transformers model with INT4 optimizations -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM model = AutoModelForCausalLM.from_pretrained('/path/to/model/', load_in_4bit=True) #run the optimized model on Intel CPU @@ -164,7 +164,7 @@ You may apply INT4 optimizations to any Hugging Face *Transformers* model on Int ```python #load Hugging Face Transformers model with INT4 optimizations -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM import intel_extension_for_pytorch model = AutoModelForCausalLM.from_pretrained('/path/to/model/', load_in_4bit=True) @@ -206,13 +206,13 @@ You may also convert Hugging Face *Transformers* models into native INT4 model f ```python #convert the model -from bigdl.llm import llm_convert +from ipex_llm import llm_convert bigdl_llm_path = llm_convert(model='/path/to/model/', outfile='/path/to/output/', outtype='int4', model_family="llama") #load the converted model #switch to ChatGLMForCausalLM/GptneoxForCausalLM/BloomForCausalLM/StarcoderForCausalLM to load other models -from bigdl.llm.transformers import LlamaForCausalLM +from ipex_llm.transformers import LlamaForCausalLM llm = LlamaForCausalLM.from_pretrained("/path/to/output/model.bin", native=True, ...) #run the converted model @@ -231,8 +231,8 @@ You may run the models using the LangChain API in `bigdl-llm`. You may run any Hugging Face *Transformers* model (with INT4 optimiztions applied) using the LangChain API as follows: ```python - from bigdl.llm.langchain.llms import TransformersLLM - from bigdl.llm.langchain.embeddings import TransformersEmbeddings + from ipex_llm.langchain.llms import TransformersLLM + from ipex_llm.langchain.embeddings import TransformersEmbeddings from langchain.chains.question_answering import load_qa_chain embeddings = TransformersEmbeddings.from_model_id(model_id=model_path) @@ -250,8 +250,8 @@ You may run the models using the LangChain API in `bigdl-llm`. >**Notes**:* Currently only llama/bloom/gptneox/starcoder/chatglm model families are supported; for other models, you may use the Hugging Face `transformers` model format as described above). 
```python - from bigdl.llm.langchain.llms import LlamaLLM - from bigdl.llm.langchain.embeddings import LlamaEmbeddings + from ipex_llm.langchain.llms import LlamaLLM + from ipex_llm.langchain.embeddings import LlamaEmbeddings from langchain.chains.question_answering import load_qa_chain #switch to ChatGLMEmbeddings/GptneoxEmbeddings/BloomEmbeddings/StarcoderEmbeddings to load other models diff --git a/python/llm/dev/benchmark/README.md b/python/llm/dev/benchmark/README.md index bd297133..c44a7c7a 100644 --- a/python/llm/dev/benchmark/README.md +++ b/python/llm/dev/benchmark/README.md @@ -7,7 +7,7 @@ Just put this file into your benchmark directory, and then wrap your transformer Take `chatglm-6b` as an example: ```python import torch -from bigdl.llm.transformers import AutoModel +from ipex_llm.transformers import AutoModel from transformers import AutoTokenizer from benchmark_util import BenchmarkWrapper @@ -35,7 +35,7 @@ Take `chatglm-6b` as an example: ```python import torch import intel_extension_for_pytorch as ipex -from bigdl.llm.transformers import AutoModel +from ipex_llm.transformers import AutoModel from transformers import AutoTokenizer from benchmark_util import BenchmarkWrapper diff --git a/python/llm/dev/benchmark/all-in-one/run-stress-test.py b/python/llm/dev/benchmark/all-in-one/run-stress-test.py index 20e16c53..9de9cfaa 100644 --- a/python/llm/dev/benchmark/all-in-one/run-stress-test.py +++ b/python/llm/dev/benchmark/all-in-one/run-stress-test.py @@ -31,7 +31,7 @@ benchmark_util_path = os.path.join(current_dir, '..') import sys sys.path.append(benchmark_util_path) from benchmark_util import BenchmarkWrapper -from bigdl.llm.utils.common.log4Error import invalidInputError +from ipex_llm.utils.common.log4Error import invalidInputError LLAMA_IDS = ['meta-llama/Llama-2-7b-chat-hf','meta-llama/Llama-2-13b-chat-hf', 'meta-llama/Llama-2-70b-chat-hf','decapoda-research/llama-7b-hf', @@ -85,7 +85,7 @@ def run_transformer_int4(repo_id, num_trials, num_beams, low_bit): - from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM + from ipex_llm.transformers import AutoModel, AutoModelForCausalLM from transformers import AutoTokenizer, LlamaTokenizer model_path = get_model_path(repo_id, local_model_hub) @@ -149,7 +149,7 @@ def run_transformer_int4_gpu(repo_id, num_trials, num_beams, low_bit): - from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM + from ipex_llm.transformers import AutoModel, AutoModelForCausalLM from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer import intel_extension_for_pytorch as ipex reserved_mem_list = [] diff --git a/python/llm/dev/benchmark/all-in-one/run.py b/python/llm/dev/benchmark/all-in-one/run.py index 3a9b5347..f5c0ecfa 100644 --- a/python/llm/dev/benchmark/all-in-one/run.py +++ b/python/llm/dev/benchmark/all-in-one/run.py @@ -32,7 +32,7 @@ benchmark_util_path = os.path.join(current_dir, '..') import sys sys.path.append(benchmark_util_path) from benchmark_util import BenchmarkWrapper -from bigdl.llm.utils.common.log4Error import invalidInputError +from ipex_llm.utils.common.log4Error import invalidInputError LLAMA_IDS = ['meta-llama/Llama-2-7b-chat-hf','meta-llama/Llama-2-13b-chat-hf', 'meta-llama/Llama-2-70b-chat-hf','decapoda-research/llama-7b-hf', @@ -143,8 +143,8 @@ def run_native_int4(repo_id, warm_up, num_trials): model_path = get_model_path(repo_id, local_model_hub) - from bigdl.llm.transformers import BigdlNativeForCausalLM - from bigdl.llm import llm_convert + from ipex_llm.transformers import 
BigdlNativeForCausalLM + from ipex_llm import llm_convert if "chatglm" in repo_id.lower(): family = "chatglm" elif "llama" in repo_id.lower(): @@ -184,7 +184,7 @@ def run_transformer_int4(repo_id, num_beams, low_bit, batch_size): - from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM + from ipex_llm.transformers import AutoModel, AutoModelForCausalLM from transformers import AutoTokenizer, LlamaTokenizer model_path = get_model_path(repo_id, local_model_hub) @@ -319,7 +319,7 @@ def run_optimize_model(repo_id, low_bit, batch_size): from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer - from bigdl.llm import optimize_model + from ipex_llm import optimize_model model_path = get_model_path(repo_id, local_model_hub) # Load model in 4 bit, @@ -389,7 +389,7 @@ def run_transformer_int4_gpu(repo_id, num_beams, low_bit, batch_size): - from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM + from ipex_llm.transformers import AutoModel, AutoModelForCausalLM from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer import intel_extension_for_pytorch as ipex model_path = get_model_path(repo_id, local_model_hub) @@ -495,7 +495,7 @@ def run_optimize_model_gpu(repo_id, low_bit, batch_size): from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, GPTJForCausalLM, LlamaTokenizer - from bigdl.llm import optimize_model + from ipex_llm import optimize_model import intel_extension_for_pytorch as ipex model_path = get_model_path(repo_id, local_model_hub) # Load model in 4 bit, @@ -651,7 +651,7 @@ def run_bigdl_fp16_gpu(repo_id, num_trials, num_beams, batch_size): - from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM + from ipex_llm.transformers import AutoModel, AutoModelForCausalLM from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer import intel_extension_for_pytorch as ipex model_path = get_model_path(repo_id, local_model_hub) @@ -731,7 +731,7 @@ def run_deepspeed_transformer_int4_cpu(repo_id, batch_size): from transformers import AutoModelForCausalLM, LlamaTokenizer, AutoTokenizer import deepspeed - from bigdl.llm import optimize_model + from ipex_llm import optimize_model import argparse # parser is for deepspeed subprocesses' inline parameter parser = argparse.ArgumentParser(description='Predict Tokens using `generate()` API for Llama2 model') @@ -822,7 +822,7 @@ def run_transformer_int4_gpu_win(repo_id, cpu_embedding, batch_size, streaming): - from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM + from ipex_llm.transformers import AutoModel, AutoModelForCausalLM from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer, TextStreamer import intel_extension_for_pytorch as ipex model_path = get_model_path(repo_id, local_model_hub) @@ -928,7 +928,7 @@ def run_transformer_int4_fp16_gpu_win(repo_id, cpu_embedding, batch_size, streaming): - from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM + from ipex_llm.transformers import AutoModel, AutoModelForCausalLM from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer, TextStreamer import intel_extension_for_pytorch as ipex model_path = get_model_path(repo_id, local_model_hub) @@ -1038,7 +1038,7 @@ def run_transformer_int4_loadlowbit_gpu_win(repo_id, cpu_embedding, batch_size, streaming): - from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM + from ipex_llm.transformers import AutoModel, AutoModelForCausalLM from transformers import AutoTokenizer, GPTJForCausalLM, 
LlamaTokenizer, TextStreamer import intel_extension_for_pytorch as ipex model_path = get_model_path(repo_id, local_model_hub) @@ -1140,7 +1140,7 @@ def run_transformer_autocast_bf16( repo_id, num_trials, num_beams, batch_size): - from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM + from ipex_llm.transformers import AutoModel, AutoModelForCausalLM from transformers import AutoTokenizer, LlamaTokenizer model_path = get_model_path(repo_id, local_model_hub) @@ -1209,7 +1209,7 @@ def run_bigdl_ipex_bf16(repo_id, num_trials, num_beams, batch_size): - from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM + from ipex_llm.transformers import AutoModel, AutoModelForCausalLM from transformers import AutoTokenizer, LlamaTokenizer os.environ["BIGDL_OPT_IPEX"] = "true" @@ -1280,7 +1280,7 @@ def run_bigdl_ipex_int4(repo_id, num_trials, num_beams, batch_size): - from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM + from ipex_llm.transformers import AutoModel, AutoModelForCausalLM from transformers import AutoTokenizer, LlamaTokenizer os.environ["BIGDL_OPT_IPEX"] = "true" @@ -1350,7 +1350,7 @@ def run_bigdl_ipex_int8(repo_id, num_trials, num_beams, batch_size): - from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM + from ipex_llm.transformers import AutoModel, AutoModelForCausalLM from transformers import AutoTokenizer, LlamaTokenizer os.environ["BIGDL_OPT_IPEX"] = "true" @@ -1434,7 +1434,7 @@ def run_deepspeed_optimize_model_gpu(repo_id, os.environ["MASTER_PORT"] = os.environ.get("MASTER_PORT", "29500") from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, GPTJForCausalLM, LlamaTokenizer - from bigdl.llm import optimize_model + from ipex_llm import optimize_model import intel_extension_for_pytorch as ipex import deepspeed from deepspeed.accelerator.cpu_accelerator import CPU_Accelerator @@ -1535,9 +1535,9 @@ def run_speculative_cpu(repo_id, num_trials, num_beams, batch_size): - from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM + from ipex_llm.transformers import AutoModel, AutoModelForCausalLM from transformers import AutoTokenizer, LlamaTokenizer - from bigdl.llm.transformers.convert import get_enable_ipex + from ipex_llm.transformers.convert import get_enable_ipex _enable_ipex = get_enable_ipex() @@ -1615,7 +1615,7 @@ def run_speculative_gpu(repo_id, num_trials, num_beams, batch_size): - from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM + from ipex_llm.transformers import AutoModel, AutoModelForCausalLM from transformers import AutoTokenizer, LlamaTokenizer model_path = get_model_path(repo_id, local_model_hub) diff --git a/python/llm/dev/benchmark/all-in-one/save.py b/python/llm/dev/benchmark/all-in-one/save.py index ea3ed638..48aa3d98 100644 --- a/python/llm/dev/benchmark/all-in-one/save.py +++ b/python/llm/dev/benchmark/all-in-one/save.py @@ -30,7 +30,7 @@ current_dir = os.path.dirname(os.path.realpath(__file__)) def save_model_in_low_bit(repo_id, local_model_hub, low_bit): - from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM + from ipex_llm.transformers import AutoModel, AutoModelForCausalLM from transformers import AutoTokenizer, LlamaTokenizer model_path = get_model_path(repo_id, local_model_hub) # Load model in 4 bit, diff --git a/python/llm/dev/benchmark/ceval/eval.py b/python/llm/dev/benchmark/ceval/eval.py index e0530d46..8f8637ed 100644 --- a/python/llm/dev/benchmark/ceval/eval.py +++ b/python/llm/dev/benchmark/ceval/eval.py @@ -21,7 +21,7 @@ import torch import json 
from tqdm import tqdm -from bigdl.llm.utils.common.log4Error import invalidInputError +from ipex_llm.utils.common.log4Error import invalidInputError from evaluators.qwen import QwenEvaluator from evaluators.llama import LlamaEvaluator from evaluators.chatglm import ChatGLMEvaluator diff --git a/python/llm/dev/benchmark/ceval/evaluators/chatglm.py b/python/llm/dev/benchmark/ceval/evaluators/chatglm.py index 717f386f..2c0b5ec7 100644 --- a/python/llm/dev/benchmark/ceval/evaluators/chatglm.py +++ b/python/llm/dev/benchmark/ceval/evaluators/chatglm.py @@ -22,7 +22,7 @@ from thefuzz import process from transformers import AutoTokenizer from evaluators.evaluator import Evaluator -from bigdl.llm.transformers import AutoModel +from ipex_llm.transformers import AutoModel from transformers.generation.utils import LogitsProcessorList from transformers.generation.logits_process import LogitsProcessor diff --git a/python/llm/dev/benchmark/ceval/evaluators/llama.py b/python/llm/dev/benchmark/ceval/evaluators/llama.py index ba1dfc3e..c6944f72 100644 --- a/python/llm/dev/benchmark/ceval/evaluators/llama.py +++ b/python/llm/dev/benchmark/ceval/evaluators/llama.py @@ -22,7 +22,7 @@ import numpy as np import torch from transformers import LlamaTokenizer, GenerationConfig -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from evaluators.evaluator import Evaluator diff --git a/python/llm/dev/benchmark/ceval/evaluators/qwen.py b/python/llm/dev/benchmark/ceval/evaluators/qwen.py index 561bb6da..dcb1ee91 100644 --- a/python/llm/dev/benchmark/ceval/evaluators/qwen.py +++ b/python/llm/dev/benchmark/ceval/evaluators/qwen.py @@ -22,7 +22,7 @@ from thefuzz import process from transformers import AutoTokenizer from transformers.generation import GenerationConfig -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from evaluators.evaluator import Evaluator diff --git a/python/llm/dev/benchmark/harness/bigdl_llm.py b/python/llm/dev/benchmark/harness/bigdl_llm.py index b370301e..8626fc1a 100644 --- a/python/llm/dev/benchmark/harness/bigdl_llm.py +++ b/python/llm/dev/benchmark/harness/bigdl_llm.py @@ -14,7 +14,7 @@ # limitations under the License. 
# -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM import inspect from lm_eval.models.huggingface import AutoCausalLM diff --git a/python/llm/dev/benchmark/perplexity/ppl.py b/python/llm/dev/benchmark/perplexity/ppl.py index 672a5c19..1b71d9fe 100644 --- a/python/llm/dev/benchmark/perplexity/ppl.py +++ b/python/llm/dev/benchmark/perplexity/ppl.py @@ -20,7 +20,7 @@ from torch.nn import CrossEntropyLoss from tqdm import tqdm import gc -from bigdl.llm.transformers import AutoModelForCausalLM, AutoModel +from ipex_llm.transformers import AutoModelForCausalLM, AutoModel class BigDLPPL: def __init__(self, model_path, device, **model_kwargs) -> None: diff --git a/python/llm/dev/benchmark/perplexity/run.py b/python/llm/dev/benchmark/perplexity/run.py index 27c22112..d548e984 100644 --- a/python/llm/dev/benchmark/perplexity/run.py +++ b/python/llm/dev/benchmark/perplexity/run.py @@ -21,7 +21,7 @@ from datasets import concatenate_datasets, load_dataset from transformers import AutoTokenizer from ppl import BigDLPPL -from bigdl.llm.ggml.quantize import ggml_tensor_qtype +from ipex_llm.ggml.quantize import ggml_tensor_qtype import os import json diff --git a/python/llm/dev/benchmark/whisper/run_whisper.py b/python/llm/dev/benchmark/whisper/run_whisper.py index 286025d5..97705920 100644 --- a/python/llm/dev/benchmark/whisper/run_whisper.py +++ b/python/llm/dev/benchmark/whisper/run_whisper.py @@ -15,7 +15,7 @@ # from datasets import load_dataset -from bigdl.llm.transformers import AutoModelForSpeechSeq2Seq +from ipex_llm.transformers import AutoModelForSpeechSeq2Seq from transformers import WhisperProcessor import torch from evaluate import load diff --git a/python/llm/example/CPU/Applications/autogen/README.md b/python/llm/example/CPU/Applications/autogen/README.md index 4e112f9c..41e39727 100644 --- a/python/llm/example/CPU/Applications/autogen/README.md +++ b/python/llm/example/CPU/Applications/autogen/README.md @@ -69,11 +69,11 @@ conda activate autogen cd autogen # load the local model with cpu with your downloaded model -python -m bigdl.llm.serving.model_worker --model-path ... --device cpu +python -m ipex_llm.serving.model_worker --model-path ... --device cpu ``` Change the Model Name: -> Assume you use the model `Mistral-7B-Instruct-v0.2` and your model is downloaded to `autogen/model/Mistral-7B-Instruct-v0.2`. You should rename the model to `autogen/model/bigdl` and run `python -m bigdl.llm.serving.model_worker --model-path ... --device cpu`. This ensures the proper usage of the BigDL-adapted FastChat. +> Assume you use the model `Mistral-7B-Instruct-v0.2` and your model is downloaded to `autogen/model/Mistral-7B-Instruct-v0.2`. You should rename the model to `autogen/model/bigdl` and run `python -m ipex_llm.serving.model_worker --model-path ... --device cpu`. This ensures the proper usage of the BigDL-adapted FastChat. 
Potential Error Note: > If you get `RuntimeError: Error register to Controller` in the worker terminal, please set `export no_proxy='localhost'` to ensure the registration diff --git a/python/llm/example/CPU/Applications/hf-agent/run_agent.py b/python/llm/example/CPU/Applications/hf-agent/run_agent.py index a9a57373..8517fc58 100644 --- a/python/llm/example/CPU/Applications/hf-agent/run_agent.py +++ b/python/llm/example/CPU/Applications/hf-agent/run_agent.py @@ -19,7 +19,7 @@ import argparse from PIL import Image from transformers import AutoTokenizer, LocalAgent -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM if __name__ == "__main__": parser = argparse.ArgumentParser(description="Run agent using vicuna model") diff --git a/python/llm/example/CPU/Applications/streaming-llm/README.md b/python/llm/example/CPU/Applications/streaming-llm/README.md index 75c3a202..0bc1a627 100644 --- a/python/llm/example/CPU/Applications/streaming-llm/README.md +++ b/python/llm/example/CPU/Applications/streaming-llm/README.md @@ -3,7 +3,7 @@ In this example, we apply low-bit optimizations to [Streaming-LLM](https://github.com/mit-han-lab/streaming-llm/tree/main#efficient-streaming-language-models-with-attention-sinks) using BigDL-LLM, which can deploy low-bit(including FP4/INT4/FP8/INT8) LLMs for infinite-length inputs. Only one code change is needed to load the model using bigdl-llm as follows: ```python -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM model = AutoModelForCausalLM.from_pretrained(model_name_or_path, load_in_4bit=True, trust_remote_code=True, optimize_model=False) ``` diff --git a/python/llm/example/CPU/Applications/streaming-llm/streaming_llm/utils.py b/python/llm/example/CPU/Applications/streaming-llm/streaming_llm/utils.py index 2aa1ded1..163ccc71 100644 --- a/python/llm/example/CPU/Applications/streaming-llm/streaming_llm/utils.py +++ b/python/llm/example/CPU/Applications/streaming-llm/streaming_llm/utils.py @@ -49,7 +49,7 @@ import urllib.request import os import json # code change to import from bigdl-llm API instead of using transformers API -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import LlamaTokenizer import intel_extension_for_pytorch as ipex diff --git a/python/llm/example/CPU/Deepspeed-AutoTP/README.md b/python/llm/example/CPU/Deepspeed-AutoTP/README.md index fcd9f1be..8cc4d7ab 100644 --- a/python/llm/example/CPU/Deepspeed-AutoTP/README.md +++ b/python/llm/example/CPU/Deepspeed-AutoTP/README.md @@ -39,7 +39,7 @@ Distributed model managed by deepspeed can be further optimized with BigDL low-b ```python # Apply BigDL-LLM INT4 optimizations on transformers -from bigdl.llm import optimize_model +from ipex_llm import optimize_model model = optimize_model(model.module.to(f'cpu'), low_bit='sym_int4') model = model.to(f'cpu:{local_rank}') # move partial model to local rank diff --git a/python/llm/example/CPU/Deepspeed-AutoTP/deepspeed_autotp.py b/python/llm/example/CPU/Deepspeed-AutoTP/deepspeed_autotp.py index 91d11dfa..d42f3887 100644 --- a/python/llm/example/CPU/Deepspeed-AutoTP/deepspeed_autotp.py +++ b/python/llm/example/CPU/Deepspeed-AutoTP/deepspeed_autotp.py @@ -45,7 +45,7 @@ import os import torch from transformers import AutoModelForCausalLM, LlamaTokenizer, AutoTokenizer import deepspeed -from bigdl.llm import optimize_model +from ipex_llm import optimize_model 
import torch import intel_extension_for_pytorch as ipex import time diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/AWQ/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/AWQ/generate.py index 42cb6ed5..37843751 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/AWQ/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/AWQ/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/generate.py index 3842164a..4acad805 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM # you could tune the prompt based on your own model, # here the prompt tuning refers to https://huggingface.co/georgesung/llama2_7b_chat_uncensored#prompt-style diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/GPTQ/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/GPTQ/generate.py index 70ccef6d..1f5852b6 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/GPTQ/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/GPTQ/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import LlamaTokenizer, GPTQConfig # you could tune the prompt based on your own model, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/aquila/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/aquila/generate.py index 4d6aebc3..69d9045f 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/aquila/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/aquila/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/aquila2/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/aquila2/generate.py index 6badf85b..b9bc0ee2 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/aquila2/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/aquila2/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/baichuan/generate.py 
b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/baichuan/generate.py index bdaa7f7a..df64f80e 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/baichuan/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/baichuan/generate.py @@ -19,7 +19,7 @@ import time import argparse import numpy as np -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/baichuan2/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/baichuan2/generate.py index 8d1cce0c..59dccfe8 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/baichuan2/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/baichuan2/generate.py @@ -19,7 +19,7 @@ import time import argparse import numpy as np -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # prompt format referred from https://github.com/baichuan-inc/Baichuan2/issues/227 diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/bluelm/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/bluelm/generate.py index b5812ba5..07a4359e 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/bluelm/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/bluelm/generate.py @@ -19,7 +19,7 @@ import time import argparse import numpy as np -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm/generate.py index b5f5ab6e..e38f56c4 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm/generate.py @@ -19,7 +19,7 @@ import time import argparse import numpy as np -from bigdl.llm.transformers import AutoModel +from ipex_llm.transformers import AutoModel from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm2/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm2/generate.py index 18f6a863..fb1423fa 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm2/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm2/generate.py @@ -19,7 +19,7 @@ import time import argparse import numpy as np -from bigdl.llm.transformers import AutoModel +from ipex_llm.transformers import AutoModel from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm2/streamchat.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm2/streamchat.py index 3bbf5333..5cab690d 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm2/streamchat.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm2/streamchat.py @@ -19,7 +19,7 @@ import time import argparse import numpy as np -from bigdl.llm.transformers import 
AutoModel +from ipex_llm.transformers import AutoModel from transformers import AutoTokenizer diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm3/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm3/generate.py index 9372ed8a..d3d8daae 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm3/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm3/generate.py @@ -19,7 +19,7 @@ import time import argparse import numpy as np -from bigdl.llm.transformers import AutoModel +from ipex_llm.transformers import AutoModel from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm3/streamchat.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm3/streamchat.py index 3006299d..5094a66b 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm3/streamchat.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm3/streamchat.py @@ -19,7 +19,7 @@ import time import argparse import numpy as np -from bigdl.llm.transformers import AutoModel +from ipex_llm.transformers import AutoModel from transformers import AutoTokenizer diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/codellama/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/codellama/generate.py index 3a383de7..b8329d61 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/codellama/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/codellama/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import CodeLlamaTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/codeshell/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/codeshell/generate.py index 50a542f6..adc79339 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/codeshell/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/codeshell/generate.py @@ -19,7 +19,7 @@ import time import argparse import numpy as np -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/deciLM-7b/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/deciLM-7b/generate.py index 6f5ede1f..b82ddc7f 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/deciLM-7b/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/deciLM-7b/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoTokenizer -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM # you could tune the prompt based on your own model, # here the prompt tuning refers to https://huggingface.co/Deci/DeciLM-7B-instruct#prompt-template diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/deepseek-moe/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/deepseek-moe/generate.py index 87cca75f..6bce7e4f 100644 --- 
a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/deepseek-moe/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/deepseek-moe/generate.py @@ -36,7 +36,7 @@ if __name__ == '__main__': # Load model in 4 bit, # which convert the relevant layers in the model into INT4 format - from bigdl.llm.transformers import AutoModelForCausalLM + from ipex_llm.transformers import AutoModelForCausalLM model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, load_in_4bit=True).eval() model.generation_config = GenerationConfig.from_pretrained(model_path) model.generation_config.pad_token_id = model.generation_config.eos_token_id diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/deepseek/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/deepseek/generate.py index 8a14e335..679fe2e6 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/deepseek/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/deepseek/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/distil-whisper/recognize.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/distil-whisper/recognize.py index e954ed78..77088018 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/distil-whisper/recognize.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/distil-whisper/recognize.py @@ -17,7 +17,7 @@ import time import argparse -from bigdl.llm.transformers import AutoModelForSpeechSeq2Seq +from ipex_llm.transformers import AutoModelForSpeechSeq2Seq from datasets import load_dataset from transformers import pipeline from transformers.models.whisper import WhisperFeatureExtractor, WhisperTokenizer diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/dolly_v1/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/dolly_v1/generate.py index a159a98c..ee043e0b 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/dolly_v1/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/dolly_v1/generate.py @@ -19,7 +19,7 @@ import time import argparse import numpy as np -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/dolly_v2/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/dolly_v2/generate.py index 87298307..18867636 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/dolly_v2/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/dolly_v2/generate.py @@ -19,7 +19,7 @@ import time import argparse import numpy as np -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/falcon/falcon-7b-instruct/modelling_RW.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/falcon/falcon-7b-instruct/modelling_RW.py index 1f582bcc..f6776860 100644 --- 
a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/falcon/falcon-7b-instruct/modelling_RW.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/falcon/falcon-7b-instruct/modelling_RW.py @@ -301,7 +301,7 @@ class Attention(nn.Module): # resize qk to 4D to match apply_rotary_pos_emb_no_cache_xpu's requirements. query_layer = query_layer.reshape(batch_size, self.num_heads, q_length, self.head_dim) key_layer = key_layer.reshape(batch_size, self.num_kv, q_length, self.head_dim) - from bigdl.llm.transformers.models.utils import apply_rotary_pos_emb_no_cache_xpu + from ipex_llm.transformers.models.utils import apply_rotary_pos_emb_no_cache_xpu query_layer, key_layer = apply_rotary_pos_emb_no_cache_xpu(query_layer, key_layer, position_ids, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/falcon/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/falcon/generate.py index 1c2cab8c..6419aa5a 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/falcon/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/falcon/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/flan-t5/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/flan-t5/generate.py index 58d5e446..91b8addc 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/flan-t5/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/flan-t5/generate.py @@ -19,7 +19,7 @@ import time import argparse import numpy as np -from bigdl.llm.transformers import AutoModelForSeq2SeqLM +from ipex_llm.transformers import AutoModelForSeq2SeqLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/fuyu/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/fuyu/generate.py index 7fe83502..271b1d4f 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/fuyu/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/fuyu/generate.py @@ -19,7 +19,7 @@ import torch import argparse import time from PIL import Image -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM if __name__ == '__main__': parser = argparse.ArgumentParser(description='Predict Tokens using `generate()` API for Fuyu model') diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/gemma/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/gemma/generate.py index 0c1539ff..4606e2b5 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/gemma/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/gemma/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # The instruction-tuned models use a chat template that must be adhered to for conversational use. 
diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm-xcomposer/chat.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm-xcomposer/chat.py index dd824043..6834f582 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm-xcomposer/chat.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm-xcomposer/chat.py @@ -14,14 +14,14 @@ # limitations under the License. # -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer from transformers.generation import GenerationConfig import torch import time import os import argparse -from bigdl.llm import optimize_model +from ipex_llm import optimize_model if __name__ == '__main__': parser = argparse.ArgumentParser(description='Predict Tokens using `chat()` API for InternLM-XComposer model') diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm/generate.py index 1817079d..1c33a1fe 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm/generate.py @@ -19,7 +19,7 @@ import time import argparse import numpy as np -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm2/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm2/generate.py index 00ffec57..7e05e153 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm2/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm2/generate.py @@ -39,7 +39,7 @@ if __name__ == '__main__': model_path = args.repo_id_or_model_path - from bigdl.llm.transformers import AutoModelForCausalLM + from ipex_llm.transformers import AutoModelForCausalLM model = AutoModelForCausalLM.from_pretrained(model_path, load_in_4bit=True, trust_remote_code=True) diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/llama2/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/llama2/generate.py index 1d6c56e2..ed5c93b0 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/llama2/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/llama2/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import LlamaTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/mistral/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/mistral/generate.py index 72e4e269..94e6ab48 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/mistral/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/mistral/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git 
a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/mixtral/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/mixtral/generate.py index 5e1d4065..cd8b9f60 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/mixtral/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/mixtral/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/moss/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/moss/generate.py index 73ec0837..786f7f0e 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/moss/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/moss/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/mpt/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/mpt/generate.py index eb70ea70..e4caa938 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/mpt/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/mpt/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer, GenerationConfig # you could tune the prompt based on your own model, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phi-1_5/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phi-1_5/generate.py index e934d99b..710d2a39 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phi-1_5/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phi-1_5/generate.py @@ -19,7 +19,7 @@ import time import argparse import numpy as np -from bigdl.llm.transformers import AutoModel,AutoModelForCausalLM +from ipex_llm.transformers import AutoModel,AutoModelForCausalLM from transformers import AutoTokenizer, GenerationConfig # you could tune the prompt based on your own model, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phi-2/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phi-2/generate.py index 53a95623..91930b72 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phi-2/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phi-2/generate.py @@ -19,7 +19,7 @@ import time import argparse import numpy as np -from bigdl.llm.transformers import AutoModel,AutoModelForCausalLM +from ipex_llm.transformers import AutoModel,AutoModelForCausalLM from transformers import AutoTokenizer, GenerationConfig # you could tune the prompt based on your own model, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phixtral/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phixtral/generate.py index 276fa09e..395481ae 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phixtral/generate.py +++ 
b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phixtral/generate.py @@ -20,7 +20,7 @@ import argparse import numpy as np from transformers import AutoTokenizer, GenerationConfig -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, # here the prompt tuning refers to # TODO: https://huggingface.co/microsoft/phi-1_5/blob/main/modeling_mixformer_sequential.py PHI1_5_PROMPT_FORMAT = " Question:{prompt}\n\n Answer:" @@ -41,7 +41,7 @@ if __name__ == '__main__': # Load model in 4 bit, # which convert the relevant layers in the model into INT4 format - from bigdl.llm.transformers import AutoModelForCausalLM + from ipex_llm.transformers import AutoModelForCausalLM model = AutoModelForCausalLM.from_pretrained(model_path, load_in_4bit=True, trust_remote_code=True) diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phoenix/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phoenix/generate.py index e22733e3..264f27e2 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phoenix/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phoenix/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could revise it based on the Phoenix model you choose to use diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen-vl/chat.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen-vl/chat.py index 6c017755..60d71a89 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen-vl/chat.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen-vl/chat.py @@ -14,14 +14,14 @@ # limitations under the License. 
# -from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM +from ipex_llm.transformers import AutoModel, AutoModelForCausalLM from transformers import AutoTokenizer, LlamaTokenizer from transformers.generation import GenerationConfig import torch import time import os import argparse -from bigdl.llm import optimize_model +from ipex_llm import optimize_model torch.manual_seed(1234) if __name__ == '__main__': diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen/generate.py index 8143e34c..4f260181 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen/generate.py @@ -19,7 +19,7 @@ import time import argparse import numpy as np -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen1.5/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen1.5/generate.py index becfb0cc..2b1cebf3 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen1.5/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen1.5/generate.py @@ -36,7 +36,7 @@ if __name__ == '__main__': model_path = args.repo_id_or_model_path - from bigdl.llm.transformers import AutoModelForCausalLM + from ipex_llm.transformers import AutoModelForCausalLM model = AutoModelForCausalLM.from_pretrained(model_path, load_in_4bit=True, trust_remote_code=True) diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/redpajama/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/redpajama/generate.py index 08154485..da5f69ee 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/redpajama/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/redpajama/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/replit/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/replit/generate.py index 014d5f13..0599df2c 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/replit/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/replit/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/skywork/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/skywork/generate.py index 7a0ae024..47bc1c79 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/skywork/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/skywork/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune 
the prompt based on your own model, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/solar/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/solar/generate.py index 91ec5000..b84d7b61 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/solar/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/solar/generate.py @@ -19,7 +19,7 @@ import time import argparse import numpy as np -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/starcoder/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/starcoder/generate.py index d1291d7d..e6a80a71 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/starcoder/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/starcoder/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/vicuna/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/vicuna/generate.py index 2b76f478..118b6084 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/vicuna/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/vicuna/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import LlamaTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/whisper/long-segment-recognize.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/whisper/long-segment-recognize.py index 7e67ae45..942e7f14 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/whisper/long-segment-recognize.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/whisper/long-segment-recognize.py @@ -19,7 +19,7 @@ import librosa import argparse from transformers import pipeline -from bigdl.llm.transformers import AutoModelForSpeechSeq2Seq +from ipex_llm.transformers import AutoModelForSpeechSeq2Seq from transformers.models.whisper import WhisperFeatureExtractor, WhisperTokenizer diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/whisper/recognize.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/whisper/recognize.py index e4a1185d..60de9751 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/whisper/recognize.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/whisper/recognize.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForSpeechSeq2Seq +from ipex_llm.transformers import AutoModelForSpeechSeq2Seq from transformers import WhisperProcessor from datasets import load_dataset diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/wizardcoder-python/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/wizardcoder-python/generate.py index 52f61a5b..72d5dd97 100644 --- 
a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/wizardcoder-python/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/wizardcoder-python/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import LlamaTokenizer WIZARDCODERPYTHON_PROMPT_FORMAT = """Below is an instruction that describes a task. Write a response that appropriately completes the request. diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/yi/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/yi/generate.py index c7024b3c..f809c44b 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/yi/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/yi/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # Refer to https://huggingface.co/01-ai/Yi-6B-Chat#31-use-the-chat-model diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/yuan2/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/yuan2/generate.py index 1c792931..46115bc0 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/yuan2/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/yuan2/generate.py @@ -18,7 +18,7 @@ import torch, transformers import sys, os, time import argparse from transformers import LlamaTokenizer -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM # Refer to https://huggingface.co/IEITYuan/Yuan2-2B-hf#Usage YUAN2_PROMPT_FORMAT = """ diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/ziya/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/ziya/generate.py index ab5708e8..cf4914c2 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/ziya/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/ziya/generate.py @@ -39,7 +39,7 @@ if __name__ == '__main__': model_path = args.repo_id_or_model_path - from bigdl.llm.transformers import AutoModelForCausalLM + from ipex_llm.transformers import AutoModelForCausalLM # enabling `use_cache=True` allows the model to utilize the previous # key/values attentions to speed up decoding; # to obtain optimal performance with BigDL-LLM INT4 optimizations, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/More-Data-Types/transformers_low_bit_pipeline.py b/python/llm/example/CPU/HF-Transformers-AutoModels/More-Data-Types/transformers_low_bit_pipeline.py index 9cf9cffb..02ea399e 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/More-Data-Types/transformers_low_bit_pipeline.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/More-Data-Types/transformers_low_bit_pipeline.py @@ -15,7 +15,7 @@ # import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import LlamaTokenizer, TextGenerationPipeline if __name__ == '__main__': @@ -38,7 +38,7 @@ if __name__ == '__main__': model = AutoModelForCausalLM.load_low_bit(load_path) tokenizer = LlamaTokenizer.from_pretrained(load_path) else: - # load_in_low_bit in bigdl.llm.transformers will convert + # load_in_low_bit in ipex_llm.transformers will convert # the 
relevant layers in the model into corresponding int X format model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True) tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True) diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Save-Load/transformers_low_bit_pipeline.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Save-Load/transformers_low_bit_pipeline.py index 9cf9cffb..02ea399e 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Save-Load/transformers_low_bit_pipeline.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Save-Load/transformers_low_bit_pipeline.py @@ -15,7 +15,7 @@ # import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import LlamaTokenizer, TextGenerationPipeline if __name__ == '__main__': @@ -38,7 +38,7 @@ if __name__ == '__main__': model = AutoModelForCausalLM.load_low_bit(load_path) tokenizer = LlamaTokenizer.from_pretrained(load_path) else: - # load_in_low_bit in bigdl.llm.transformers will convert + # load_in_low_bit in ipex_llm.transformers will convert # the relevant layers in the model into corresponding int X format model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True) tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True) diff --git a/python/llm/example/CPU/LangChain/native_int4/docqa.py b/python/llm/example/CPU/LangChain/native_int4/docqa.py index 8f3b4bdd..ce7cf8f1 100644 --- a/python/llm/example/CPU/LangChain/native_int4/docqa.py +++ b/python/llm/example/CPU/LangChain/native_int4/docqa.py @@ -31,8 +31,8 @@ from langchain.chains.question_answering import load_qa_chain from langchain.callbacks.manager import CallbackManager from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler -from bigdl.llm.langchain.llms import * -from bigdl.llm.langchain.embeddings import * +from ipex_llm.langchain.llms import * +from ipex_llm.langchain.embeddings import * def main(args): diff --git a/python/llm/example/CPU/LangChain/native_int4/streamchat.py b/python/llm/example/CPU/LangChain/native_int4/streamchat.py index baa94db1..a0127dd0 100644 --- a/python/llm/example/CPU/LangChain/native_int4/streamchat.py +++ b/python/llm/example/CPU/LangChain/native_int4/streamchat.py @@ -21,7 +21,7 @@ import argparse -from bigdl.llm.langchain.llms import * +from ipex_llm.langchain.llms import * from langchain import PromptTemplate, LLMChain from langchain.callbacks.manager import CallbackManager from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler diff --git a/python/llm/example/CPU/LangChain/native_int4/voiceassistant.py b/python/llm/example/CPU/LangChain/native_int4/voiceassistant.py index 80718c50..c41666ca 100644 --- a/python/llm/example/CPU/LangChain/native_int4/voiceassistant.py +++ b/python/llm/example/CPU/LangChain/native_int4/voiceassistant.py @@ -23,7 +23,7 @@ from langchain import LLMChain, PromptTemplate -from bigdl.llm.langchain.llms import * +from ipex_llm.langchain.llms import * from langchain.memory import ConversationBufferWindowMemory from langchain.callbacks.manager import CallbackManager from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler diff --git a/python/llm/example/CPU/LangChain/transformers_int4/chat.py b/python/llm/example/CPU/LangChain/transformers_int4/chat.py index e6be1ca1..b3695199 100644 --- 
a/python/llm/example/CPU/LangChain/transformers_int4/chat.py +++ b/python/llm/example/CPU/LangChain/transformers_int4/chat.py @@ -21,7 +21,7 @@ import argparse -from bigdl.llm.langchain.llms import TransformersLLM, TransformersPipelineLLM +from ipex_llm.langchain.llms import TransformersLLM, TransformersPipelineLLM from langchain import PromptTemplate, LLMChain from langchain import HuggingFacePipeline diff --git a/python/llm/example/CPU/LangChain/transformers_int4/llm_math.py b/python/llm/example/CPU/LangChain/transformers_int4/llm_math.py index 456ac567..567b0071 100644 --- a/python/llm/example/CPU/LangChain/transformers_int4/llm_math.py +++ b/python/llm/example/CPU/LangChain/transformers_int4/llm_math.py @@ -25,7 +25,7 @@ import argparse from langchain.chains import LLMMathChain -from bigdl.llm.langchain.llms import TransformersLLM, TransformersPipelineLLM +from ipex_llm.langchain.llms import TransformersLLM, TransformersPipelineLLM def main(args): diff --git a/python/llm/example/CPU/LangChain/transformers_int4/rag.py b/python/llm/example/CPU/LangChain/transformers_int4/rag.py index 7d9ae45f..960d23b6 100644 --- a/python/llm/example/CPU/LangChain/transformers_int4/rag.py +++ b/python/llm/example/CPU/LangChain/transformers_int4/rag.py @@ -30,8 +30,8 @@ from langchain.text_splitter import CharacterTextSplitter from langchain.chains.question_answering import load_qa_chain from langchain.callbacks.manager import CallbackManager -from bigdl.llm.langchain.llms import TransformersLLM -from bigdl.llm.langchain.embeddings import TransformersEmbeddings +from ipex_llm.langchain.llms import TransformersLLM +from ipex_llm.langchain.embeddings import TransformersEmbeddings text_doc = ''' BigDL seamlessly scales your data analytics & AI applications from laptop to cloud, with the following libraries: diff --git a/python/llm/example/CPU/LangChain/transformers_int4/voiceassistant.py b/python/llm/example/CPU/LangChain/transformers_int4/voiceassistant.py index 7c649fdb..279a14e5 100644 --- a/python/llm/example/CPU/LangChain/transformers_int4/voiceassistant.py +++ b/python/llm/example/CPU/LangChain/transformers_int4/voiceassistant.py @@ -23,9 +23,9 @@ from langchain import LLMChain, PromptTemplate -from bigdl.llm.langchain.llms import TransformersLLM +from ipex_llm.langchain.llms import TransformersLLM from langchain.memory import ConversationBufferWindowMemory -from bigdl.llm.transformers import AutoModelForSpeechSeq2Seq +from ipex_llm.transformers import AutoModelForSpeechSeq2Seq from transformers import WhisperProcessor import speech_recognition as sr import numpy as np diff --git a/python/llm/example/CPU/LlamaIndex/rag.py b/python/llm/example/CPU/LlamaIndex/rag.py index 9fd81ca1..c4c4c8f8 100644 --- a/python/llm/example/CPU/LlamaIndex/rag.py +++ b/python/llm/example/CPU/LlamaIndex/rag.py @@ -164,7 +164,7 @@ def main(args): embed_model = HuggingFaceEmbedding(model_name=args.embedding_model_path) # Use custom LLM in BigDL - from bigdl.llm.llamaindex.llms import BigdlLLM + from ipex_llm.llamaindex.llms import BigdlLLM llm = BigdlLLM( model_name=args.model_path, tokenizer_name=args.model_path, diff --git a/python/llm/example/CPU/ModelScope-Models/generate.py b/python/llm/example/CPU/ModelScope-Models/generate.py index 3fef46fd..274566f3 100644 --- a/python/llm/example/CPU/ModelScope-Models/generate.py +++ b/python/llm/example/CPU/ModelScope-Models/generate.py @@ -19,7 +19,7 @@ import time import argparse import numpy as np -from bigdl.llm.transformers import AutoModel +from ipex_llm.transformers import 
AutoModel from modelscope import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/CPU/Native-Models/native_int4_pipeline.py b/python/llm/example/CPU/Native-Models/native_int4_pipeline.py index fe55784e..aa349c29 100644 --- a/python/llm/example/CPU/Native-Models/native_int4_pipeline.py +++ b/python/llm/example/CPU/Native-Models/native_int4_pipeline.py @@ -16,11 +16,11 @@ import time import argparse -from bigdl.llm.transformers import * +from ipex_llm.transformers import * def convert(repo_id_or_model_path, model_family, tmp_path): - from bigdl.llm import llm_convert + from ipex_llm import llm_convert original_llm_path = repo_id_or_model_path bigdl_llm_path = llm_convert( model=original_llm_path, diff --git a/python/llm/example/CPU/PyTorch-Models/Model/aquila2/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/aquila2/generate.py index 6a9c7fde..9b219452 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/aquila2/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/aquila2/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModelForCausalLM, AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, # here the prompt tuning refers to https://huggingface.co/BAAI/AquilaChat2-7B/tree/main/predict.py diff --git a/python/llm/example/CPU/PyTorch-Models/Model/bark/synthesize_speech.py b/python/llm/example/CPU/PyTorch-Models/Model/bark/synthesize_speech.py index bb8a61b8..1811c36b 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/bark/synthesize_speech.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/bark/synthesize_speech.py @@ -20,7 +20,7 @@ import argparse from TTS.tts.configs.bark_config import BarkConfig from TTS.tts.models.bark import Bark -from bigdl.llm import optimize_model +from ipex_llm import optimize_model if __name__ == '__main__': diff --git a/python/llm/example/CPU/PyTorch-Models/Model/bert/extract_feature.py b/python/llm/example/CPU/PyTorch-Models/Model/bert/extract_feature.py index aab47ff7..cd0c73b7 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/bert/extract_feature.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/bert/extract_feature.py @@ -19,7 +19,7 @@ import time import argparse from transformers import BertTokenizer, BertModel -from bigdl.llm import optimize_model +from ipex_llm import optimize_model if __name__ == '__main__': diff --git a/python/llm/example/CPU/PyTorch-Models/Model/bluelm/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/bluelm/generate.py index 97d66281..d16d2331 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/bluelm/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/bluelm/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModelForCausalLM, AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model BLUELM_PROMPT_FORMAT = "[|Human|]:{prompt}[|AI|]:" diff --git a/python/llm/example/CPU/PyTorch-Models/Model/chatglm/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/chatglm/generate.py index d3f4b6cd..89d26761 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/chatglm/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/chatglm/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModel, AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm 
import optimize_model # you could tune the prompt based on your own model, # here the prompt tuning refers to https://huggingface.co/THUDM/chatglm-6b/blob/294cb13118a1e08ad8449ca542624a5c6aecc401/modeling_chatglm.py#L1281 diff --git a/python/llm/example/CPU/PyTorch-Models/Model/chatglm3/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/chatglm3/generate.py index 72a0ab99..22fdeaad 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/chatglm3/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/chatglm3/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModel, AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, # here the prompt tuning refers to https://github.com/THUDM/ChatGLM3/blob/main/PROMPT.md diff --git a/python/llm/example/CPU/PyTorch-Models/Model/codellama/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/codellama/generate.py index f217676a..12266e99 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/codellama/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/codellama/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModelForCausalLM, CodeLlamaTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, # here the prompt tuning refers to https://huggingface.co/docs/transformers/v4.34.1/model_doc/code_llama diff --git a/python/llm/example/CPU/PyTorch-Models/Model/codeshell/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/codeshell/generate.py index cd235025..a0610bc6 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/codeshell/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/codeshell/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoTokenizer, AutoModelForCausalLM -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, # here the prompt tuning refers to https://huggingface.co/WisdomShell/CodeShell-7B diff --git a/python/llm/example/CPU/PyTorch-Models/Model/deciLM-7b/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/deciLM-7b/generate.py index 2f0cdc72..8714b419 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/deciLM-7b/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/deciLM-7b/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModelForCausalLM, AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, # here the prompt tuning refers to https://huggingface.co/Deci/DeciLM-7B-instruct#prompt-template diff --git a/python/llm/example/CPU/PyTorch-Models/Model/deepseek-moe/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/deepseek-moe/generate.py index d0ca949a..af5ec2f0 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/deepseek-moe/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/deepseek-moe/generate.py @@ -36,7 +36,7 @@ if __name__ == '__main__': from transformers import AutoModelForCausalLM - from bigdl.llm import optimize_model + from ipex_llm import optimize_model model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, torch_dtype = torch.bfloat16, device_map = "auto", attn_implementation="eager") model.generation_config = GenerationConfig.from_pretrained(model_path) 
model.generation_config.pad_token_id = model.generation_config.eos_token_id diff --git a/python/llm/example/CPU/PyTorch-Models/Model/deepseek/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/deepseek/generate.py index b64eaa9b..1a2cbaec 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/deepseek/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/deepseek/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModelForCausalLM, AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, # Refer to https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-instruct diff --git a/python/llm/example/CPU/PyTorch-Models/Model/distil-whisper/recognize.py b/python/llm/example/CPU/PyTorch-Models/Model/distil-whisper/recognize.py index 338db3f6..bcd7b852 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/distil-whisper/recognize.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/distil-whisper/recognize.py @@ -17,7 +17,7 @@ import time import argparse -from bigdl.llm import optimize_model +from ipex_llm import optimize_model from datasets import load_dataset from transformers import AutoModelForSpeechSeq2Seq, pipeline from transformers.models.whisper import WhisperFeatureExtractor, WhisperTokenizer diff --git a/python/llm/example/CPU/PyTorch-Models/Model/flan-t5/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/flan-t5/generate.py index 51ba2500..35e1b25d 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/flan-t5/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/flan-t5/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModelForSeq2SeqLM, AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, FLAN_T5_PROMPT_FORMAT = "<|User|>:{prompt}" diff --git a/python/llm/example/CPU/PyTorch-Models/Model/fuyu/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/fuyu/generate.py index 234e4741..8e2397ba 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/fuyu/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/fuyu/generate.py @@ -19,7 +19,7 @@ import torch import argparse import time from PIL import Image -from bigdl.llm import optimize_model +from ipex_llm import optimize_model if __name__ == '__main__': parser = argparse.ArgumentParser(description='Predict Tokens using `generate()` API for Fuyu model') diff --git a/python/llm/example/CPU/PyTorch-Models/Model/internlm-xcomposer/chat.py b/python/llm/example/CPU/PyTorch-Models/Model/internlm-xcomposer/chat.py index 3463eb3a..dc664493 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/internlm-xcomposer/chat.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/internlm-xcomposer/chat.py @@ -20,7 +20,7 @@ import torch import time import os import argparse -from bigdl.llm import optimize_model +from ipex_llm import optimize_model if __name__ == '__main__': parser = argparse.ArgumentParser(description='Predict Tokens using `chat()` API for InternLM-XComposer model') diff --git a/python/llm/example/CPU/PyTorch-Models/Model/internlm2/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/internlm2/generate.py index afca8397..7cfb80c5 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/internlm2/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/internlm2/generate.py @@ -19,7 +19,7 @@ import time import argparse import numpy as np -from 
bigdl.llm import optimize_model +from ipex_llm import optimize_model from transformers import AutoTokenizer # you could tune the prompt based on your own model, @@ -40,7 +40,7 @@ if __name__ == '__main__': model_path = args.repo_id_or_model_path - from bigdl.llm import optimize_model + from ipex_llm import optimize_model from transformers import AutoModelForCausalLM model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True) diff --git a/python/llm/example/CPU/PyTorch-Models/Model/llama2/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/llama2/generate.py index b2c5ca70..6c4ab17a 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/llama2/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/llama2/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModelForCausalLM, LlamaTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, # here the prompt tuning refers to https://huggingface.co/georgesung/llama2_7b_chat_uncensored#prompt-style diff --git a/python/llm/example/CPU/PyTorch-Models/Model/llava/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/llava/generate.py index d5d0d2ce..c27d1a50 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/llava/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/llava/generate.py @@ -56,7 +56,7 @@ from llava.mm_utils import ( KeywordsStoppingCriteria ) -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # Load the pretrained model. # Adapted from llava.model.builder.load_pretrained_model. diff --git a/python/llm/example/CPU/PyTorch-Models/Model/mamba/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/mamba/generate.py index abc050a2..9462474a 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/mamba/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/mamba/generate.py @@ -17,7 +17,7 @@ import argparse import time import torch -from bigdl.llm import optimize_model +from ipex_llm import optimize_model from transformers import AutoTokenizer from model import MambaLMHeadModel diff --git a/python/llm/example/CPU/PyTorch-Models/Model/meta-llama/cpu.patch b/python/llm/example/CPU/PyTorch-Models/Model/meta-llama/cpu.patch index fab864f8..d7b5e9dd 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/meta-llama/cpu.patch +++ b/python/llm/example/CPU/PyTorch-Models/Model/meta-llama/cpu.patch @@ -36,7 +36,7 @@ index acedf44..df4e5d6 100644 from llama import Llama, Dialog --from bigdl.llm.optimize import optimize_model +-from ipex_llm.optimize import optimize_model - def main( @@ -67,7 +67,7 @@ index 1f63bb0..0d60b9c 100755 from llama import Llama from typing import List --from bigdl.llm.optimize import optimize_model +-from ipex_llm.optimize import optimize_model - def main( ckpt_dir: str, diff --git a/python/llm/example/CPU/PyTorch-Models/Model/meta-llama/example_chat_completion.py b/python/llm/example/CPU/PyTorch-Models/Model/meta-llama/example_chat_completion.py index d50f1608..dd4863d2 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/meta-llama/example_chat_completion.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/meta-llama/example_chat_completion.py @@ -24,7 +24,7 @@ import fire from llama import Llama, Dialog -from bigdl.llm.optimize import optimize_model +from ipex_llm.optimize import optimize_model def main( diff --git a/python/llm/example/CPU/PyTorch-Models/Model/meta-llama/example_text_completion.py 
b/python/llm/example/CPU/PyTorch-Models/Model/meta-llama/example_text_completion.py index 9342cfce..3744540b 100755 --- a/python/llm/example/CPU/PyTorch-Models/Model/meta-llama/example_text_completion.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/meta-llama/example_text_completion.py @@ -23,7 +23,7 @@ import fire from llama import Llama from typing import List -from bigdl.llm.optimize import optimize_model +from ipex_llm.optimize import optimize_model def main( ckpt_dir: str, diff --git a/python/llm/example/CPU/PyTorch-Models/Model/mistral/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/mistral/generate.py index 6fa1522a..37958b67 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/mistral/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/mistral/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModelForCausalLM, AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, # here the prompt tuning refers to https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1#instruction-format diff --git a/python/llm/example/CPU/PyTorch-Models/Model/mixtral/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/mixtral/generate.py index d79e8a72..557f54c8 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/mixtral/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/mixtral/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModelForCausalLM, AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, # here the prompt tuning refers to https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1#instruction-format diff --git a/python/llm/example/CPU/PyTorch-Models/Model/openai-whisper/recognize.py b/python/llm/example/CPU/PyTorch-Models/Model/openai-whisper/recognize.py index ca2c5dc4..1b071d57 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/openai-whisper/recognize.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/openai-whisper/recognize.py @@ -19,7 +19,7 @@ import whisper import time import librosa import argparse -from bigdl.llm import optimize_model +from ipex_llm import optimize_model if __name__ == '__main__': diff --git a/python/llm/example/CPU/PyTorch-Models/Model/phi-1_5/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/phi-1_5/generate.py index f819a47f..f70da15d 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/phi-1_5/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/phi-1_5/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM, GenerationConfig -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, # here the prompt tuning refers to https://huggingface.co/microsoft/phi-1_5/blob/main/modeling_mixformer_sequential.py diff --git a/python/llm/example/CPU/PyTorch-Models/Model/phi-2/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/phi-2/generate.py index a4f54355..319c009f 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/phi-2/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/phi-2/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM, GenerationConfig -from bigdl.llm import optimize_model +from ipex_llm import optimize_model 
# you could tune the prompt based on your own model, # here the prompt tuning refers to https://huggingface.co/microsoft/phi-2 diff --git a/python/llm/example/CPU/PyTorch-Models/Model/phixtral/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/phixtral/generate.py index e66863ad..75f4ba6a 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/phixtral/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/phixtral/generate.py @@ -20,7 +20,7 @@ import argparse import numpy as np from transformers import AutoTokenizer, GenerationConfig -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, # here the prompt tuning refers to # TODO: https://huggingface.co/microsoft/phi-1_5/blob/main/modeling_mixformer_sequential.py PHI1_5_PROMPT_FORMAT = " Question:{prompt}\n\n Answer:" diff --git a/python/llm/example/CPU/PyTorch-Models/Model/qwen-vl/chat.py b/python/llm/example/CPU/PyTorch-Models/Model/qwen-vl/chat.py index 5502a697..6ed3adec 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/qwen-vl/chat.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/qwen-vl/chat.py @@ -20,7 +20,7 @@ import torch import time import os import argparse -from bigdl.llm import optimize_model +from ipex_llm import optimize_model torch.manual_seed(1234) if __name__ == '__main__': diff --git a/python/llm/example/CPU/PyTorch-Models/Model/qwen1.5/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/qwen1.5/generate.py index da769fc3..82f97ae2 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/qwen1.5/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/qwen1.5/generate.py @@ -44,7 +44,7 @@ if __name__ == '__main__': # Load tokenizer tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) - from bigdl.llm import optimize_model + from ipex_llm import optimize_model model = optimize_model(model) prompt = args.prompt diff --git a/python/llm/example/CPU/PyTorch-Models/Model/skywork/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/skywork/generate.py index 8528de69..fa52a2e9 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/skywork/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/skywork/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModelForCausalLM, AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, SKYWORK_PROMPT_FORMAT = "{prompt}" diff --git a/python/llm/example/CPU/PyTorch-Models/Model/solar/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/solar/generate.py index 612d9aca..2ddd48af 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/solar/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/solar/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModelForCausalLM, AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, # prompt format is tuned based on the output example in this link: diff --git a/python/llm/example/CPU/PyTorch-Models/Model/wizardcoder-python/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/wizardcoder-python/generate.py index ed9da6ff..832f7623 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/wizardcoder-python/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/wizardcoder-python/generate.py @@ -19,7 +19,7 @@ import time import argparse from 
transformers import AutoModelForCausalLM, LlamaTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model WIZARDCODERPYTHON_PROMPT_FORMAT = """Below is an instruction that describes a task. Write a response that appropriately completes the request. diff --git a/python/llm/example/CPU/PyTorch-Models/Model/yi/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/yi/generate.py index ddfe6d49..bf6af053 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/yi/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/yi/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm import optimize_model +from ipex_llm import optimize_model from transformers import AutoModelForCausalLM, AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/CPU/PyTorch-Models/Model/yuan2/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/yuan2/generate.py index 5ebbe21d..ea71ad76 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/yuan2/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/yuan2/generate.py @@ -18,7 +18,7 @@ import torch, transformers import sys, os, time import argparse from transformers import LlamaTokenizer, AutoModelForCausalLM -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # Refer to https://huggingface.co/IEITYuan/Yuan2-2B-hf#Usage YUAN2_PROMPT_FORMAT = """ diff --git a/python/llm/example/CPU/PyTorch-Models/Model/ziya/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/ziya/generate.py index 16046951..e6f2c02d 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/ziya/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/ziya/generate.py @@ -39,7 +39,7 @@ if __name__ == '__main__': from transformers import AutoModelForCausalLM - from bigdl.llm import optimize_model + from ipex_llm import optimize_model # enabling `use_cache=True` allows the model to utilize the previous # key/values attentions to speed up decoding; # to obtain optimal performance with BigDL-LLM `optimization_model` API optimizations, diff --git a/python/llm/example/CPU/PyTorch-Models/More-Data-Types/generate.py b/python/llm/example/CPU/PyTorch-Models/More-Data-Types/generate.py index 5e1b67cc..59c01f63 100644 --- a/python/llm/example/CPU/PyTorch-Models/More-Data-Types/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/More-Data-Types/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModelForCausalLM, LlamaTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, # here the prompt tuning refers to https://huggingface.co/georgesung/llama2_7b_chat_uncensored#prompt-style diff --git a/python/llm/example/CPU/PyTorch-Models/Save-Load/generate.py b/python/llm/example/CPU/PyTorch-Models/Save-Load/generate.py index ccbee8aa..6d13258d 100644 --- a/python/llm/example/CPU/PyTorch-Models/Save-Load/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Save-Load/generate.py @@ -17,8 +17,8 @@ import torch import time import argparse -from bigdl.llm import optimize_model -from bigdl.llm.optimize import low_memory_init, load_low_bit +from ipex_llm import optimize_model +from ipex_llm.optimize import low_memory_init, load_low_bit from transformers import AutoModelForCausalLM, LlamaTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/CPU/QLoRA-FineTuning/alpaca-qlora/alpaca_qlora_finetuning_cpu.py 
b/python/llm/example/CPU/QLoRA-FineTuning/alpaca-qlora/alpaca_qlora_finetuning_cpu.py index 3e3fd34d..cdf3196c 100644 --- a/python/llm/example/CPU/QLoRA-FineTuning/alpaca-qlora/alpaca_qlora_finetuning_cpu.py +++ b/python/llm/example/CPU/QLoRA-FineTuning/alpaca-qlora/alpaca_qlora_finetuning_cpu.py @@ -47,11 +47,11 @@ from peft import ( from utils.prompter import Prompter from transformers import BitsAndBytesConfig -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM -# import them from bigdl.llm.transformers.qlora to get a BigDL-LLM compatible Peft model -from bigdl.llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training, LoraConfig -from bigdl.llm.utils.isa_checker import ISAChecker +# import them from ipex_llm.transformers.qlora to get a BigDL-LLM compatible Peft model +from ipex_llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training, LoraConfig +from ipex_llm.utils.isa_checker import ISAChecker def get_int_from_env(env_keys, default): """Returns the first positive env value found in the `env_keys` list or the default.""" diff --git a/python/llm/example/CPU/QLoRA-FineTuning/alpaca-qlora/utils/prompter.py b/python/llm/example/CPU/QLoRA-FineTuning/alpaca-qlora/utils/prompter.py index 33355129..86d6422d 100644 --- a/python/llm/example/CPU/QLoRA-FineTuning/alpaca-qlora/utils/prompter.py +++ b/python/llm/example/CPU/QLoRA-FineTuning/alpaca-qlora/utils/prompter.py @@ -34,7 +34,7 @@ import json import os.path as osp from typing import Union -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.common import invalidInputError class Prompter(object): diff --git a/python/llm/example/CPU/QLoRA-FineTuning/qlora_finetuning_cpu.py b/python/llm/example/CPU/QLoRA-FineTuning/qlora_finetuning_cpu.py index 1a8c6054..6b177056 100644 --- a/python/llm/example/CPU/QLoRA-FineTuning/qlora_finetuning_cpu.py +++ b/python/llm/example/CPU/QLoRA-FineTuning/qlora_finetuning_cpu.py @@ -21,11 +21,11 @@ import transformers from transformers import LlamaTokenizer from transformers import BitsAndBytesConfig -from bigdl.llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training, LoraConfig -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training, LoraConfig +from ipex_llm.transformers import AutoModelForCausalLM from datasets import load_dataset import argparse -from bigdl.llm.utils.isa_checker import ISAChecker +from ipex_llm.utils.isa_checker import ISAChecker current_dir = os.path.dirname(os.path.realpath(__file__)) common_util_path = os.path.join(current_dir, '..', '..', 'GPU', 'LLM-Finetuning') diff --git a/python/llm/example/CPU/Speculative-Decoding/baichuan2/speculative.py b/python/llm/example/CPU/Speculative-Decoding/baichuan2/speculative.py index bdc1e533..1010618c 100644 --- a/python/llm/example/CPU/Speculative-Decoding/baichuan2/speculative.py +++ b/python/llm/example/CPU/Speculative-Decoding/baichuan2/speculative.py @@ -15,7 +15,7 @@ # import torch -from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM +from ipex_llm.transformers import AutoModel, AutoModelForCausalLM from transformers import AutoTokenizer import argparse import time diff --git a/python/llm/example/CPU/Speculative-Decoding/chatglm3/speculative.py b/python/llm/example/CPU/Speculative-Decoding/chatglm3/speculative.py index b60b87b7..971e60e6 100644 --- 
a/python/llm/example/CPU/Speculative-Decoding/chatglm3/speculative.py +++ b/python/llm/example/CPU/Speculative-Decoding/chatglm3/speculative.py @@ -15,7 +15,7 @@ # import torch -from bigdl.llm.transformers import AutoModel +from ipex_llm.transformers import AutoModel from transformers import AutoTokenizer import argparse import time diff --git a/python/llm/example/CPU/Speculative-Decoding/llama2/speculative.py b/python/llm/example/CPU/Speculative-Decoding/llama2/speculative.py index 5c827e55..5e3c5f8b 100644 --- a/python/llm/example/CPU/Speculative-Decoding/llama2/speculative.py +++ b/python/llm/example/CPU/Speculative-Decoding/llama2/speculative.py @@ -15,7 +15,7 @@ # import torch -from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM +from ipex_llm.transformers import AutoModel, AutoModelForCausalLM from transformers import LlamaTokenizer, AutoTokenizer import argparse import time diff --git a/python/llm/example/CPU/Speculative-Decoding/mistral/speculative.py b/python/llm/example/CPU/Speculative-Decoding/mistral/speculative.py index ba9d2d10..714eb430 100644 --- a/python/llm/example/CPU/Speculative-Decoding/mistral/speculative.py +++ b/python/llm/example/CPU/Speculative-Decoding/mistral/speculative.py @@ -15,7 +15,7 @@ # import torch -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer import argparse import time diff --git a/python/llm/example/CPU/Speculative-Decoding/qwen/speculative.py b/python/llm/example/CPU/Speculative-Decoding/qwen/speculative.py index d3e71739..c92b8512 100644 --- a/python/llm/example/CPU/Speculative-Decoding/qwen/speculative.py +++ b/python/llm/example/CPU/Speculative-Decoding/qwen/speculative.py @@ -15,7 +15,7 @@ # import torch -from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM +from ipex_llm.transformers import AutoModel, AutoModelForCausalLM from transformers import AutoTokenizer import argparse import time diff --git a/python/llm/example/CPU/Speculative-Decoding/starcoder/speculative.py b/python/llm/example/CPU/Speculative-Decoding/starcoder/speculative.py index 08305401..0bcd026e 100644 --- a/python/llm/example/CPU/Speculative-Decoding/starcoder/speculative.py +++ b/python/llm/example/CPU/Speculative-Decoding/starcoder/speculative.py @@ -15,7 +15,7 @@ # import torch -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer import argparse import time diff --git a/python/llm/example/CPU/Speculative-Decoding/vicuna/speculative.py b/python/llm/example/CPU/Speculative-Decoding/vicuna/speculative.py index 265eadf2..279f3550 100644 --- a/python/llm/example/CPU/Speculative-Decoding/vicuna/speculative.py +++ b/python/llm/example/CPU/Speculative-Decoding/vicuna/speculative.py @@ -15,7 +15,7 @@ # import torch -from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM +from ipex_llm.transformers import AutoModel, AutoModelForCausalLM from transformers import AutoTokenizer import argparse import time diff --git a/python/llm/example/CPU/Speculative-Decoding/ziya/speculative.py b/python/llm/example/CPU/Speculative-Decoding/ziya/speculative.py index 0d35294d..6db383c4 100644 --- a/python/llm/example/CPU/Speculative-Decoding/ziya/speculative.py +++ b/python/llm/example/CPU/Speculative-Decoding/ziya/speculative.py @@ -15,7 +15,7 @@ # import torch -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import 
AutoModelForCausalLM
from transformers import AutoTokenizer
import argparse
import time
diff --git a/python/llm/example/CPU/vLLM-Serving/README.md b/python/llm/example/CPU/vLLM-Serving/README.md
index 8d650d8a..a7e6ec1e 100644
--- a/python/llm/example/CPU/vLLM-Serving/README.md
+++ b/python/llm/example/CPU/vLLM-Serving/README.md
@@ -56,7 +56,7 @@ To fully utilize the continuous batching feature of the `vLLM`, you can send req
#!/bin/bash
# You may also want to adjust the `--max-num-batched-tokens` argument, it indicates the hard limit
# of batched prompt length the server will accept
-numactl -C 48-95 -m 1 python -m bigdl.llm.vllm.entrypoints.openai.api_server \
+numactl -C 48-95 -m 1 python -m ipex_llm.vllm.entrypoints.openai.api_server \
        --model /MODEL_PATH/Llama-2-7b-chat-hf-bigdl/ --port 8000 \
        --load-format 'auto' --device cpu --dtype bfloat16 \
        --load-in-low-bit sym_int4 \
diff --git a/python/llm/example/CPU/vLLM-Serving/offline_inference.py b/python/llm/example/CPU/vLLM-Serving/offline_inference.py
index 84ecb5a1..00fe1f55 100644
--- a/python/llm/example/CPU/vLLM-Serving/offline_inference.py
+++ b/python/llm/example/CPU/vLLM-Serving/offline_inference.py
@@ -31,8 +31,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-from bigdl.llm.vllm.entrypoints.llm import LLM
-from bigdl.llm.vllm.sampling_params import SamplingParams
+from ipex_llm.vllm.entrypoints.llm import LLM
+from ipex_llm.vllm.sampling_params import SamplingParams
# Sample prompts.
prompts = [
diff --git a/python/llm/example/GPU/Applications/autogen/README.md b/python/llm/example/GPU/Applications/autogen/README.md
index f18aa3cb..7ac7c4eb 100644
--- a/python/llm/example/GPU/Applications/autogen/README.md
+++ b/python/llm/example/GPU/Applications/autogen/README.md
@@ -71,11 +71,11 @@ conda activate autogen
cd autogen
# load the local model with xpu with your downloaded model
-python -m bigdl.llm.serving.model_worker --model-path ... --device xpu
+python -m ipex_llm.serving.model_worker --model-path ... --device xpu
```
Model Name Note:
-> Assume you use the model `Mistral-7B-Instruct-v0.2` and your model is downloaded to `autogen/model/Mistral-7B-Instruct-v0.2`. You should rename the model to `autogen/model/bigdl` and run `python -m bigdl.llm.serving.model_worker --model-path ... --device xpu`. This ensures the proper usage of the BigDL-adapted FastChat.
+> Assume you use the model `Mistral-7B-Instruct-v0.2` and your model is downloaded to `autogen/model/Mistral-7B-Instruct-v0.2`. You should rename the model to `autogen/model/bigdl` and run `python -m ipex_llm.serving.model_worker --model-path ... --device xpu`. This ensures the proper usage of the BigDL-adapted FastChat.
Device Note:
> Please set `--device` to `xpu` to enable the Intel GPU usage.
diff --git a/python/llm/example/GPU/Applications/streaming-llm/README.md b/python/llm/example/GPU/Applications/streaming-llm/README.md
index 54aa89e2..c783bc09 100644
--- a/python/llm/example/GPU/Applications/streaming-llm/README.md
+++ b/python/llm/example/GPU/Applications/streaming-llm/README.md
@@ -3,7 +3,7 @@
In this example, we apply low-bit optimizations to [Streaming-LLM](https://github.com/mit-han-lab/streaming-llm/tree/main#efficient-streaming-language-models-with-attention-sinks) using BigDL-LLM, which can deploy low-bit(including FP4/INT4/FP8/INT8) LLMs for infinite-length inputs.
Only one code change is needed to load the model using bigdl-llm as follows:
```python
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(model_name_or_path, load_in_4bit=True, trust_remote_code=True, optimize_model=False)
```
diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py b/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py
index 54994c0f..a64693e8 100644
--- a/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py
+++ b/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py
@@ -34,7 +34,7 @@
os.environ["WORLD_SIZE"] = str(world_size)
os.environ["MASTER_PORT"] = os.environ.get("MASTER_PORT", "29500")
-from bigdl.llm import optimize_model
+from ipex_llm import optimize_model
import torch
import time
diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/AWQ/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/AWQ/generate.py
index 7402a378..c6ff5241 100644
--- a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/AWQ/generate.py
+++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/AWQ/generate.py
@@ -17,7 +17,7 @@
import torch
import time
import argparse
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
# you could tune the prompt based on your own model,
diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF-IQ2/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF-IQ2/generate.py
index f6ca2511..d8342d8e 100644
--- a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF-IQ2/generate.py
+++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF-IQ2/generate.py
@@ -17,7 +17,7 @@
import torch
import time
import argparse
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
import warnings
diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/generate.py
index e8ee9c36..9272b727 100644
--- a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/generate.py
+++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/generate.py
@@ -19,7 +19,7 @@
import time
import argparse
from transformers import LlamaTokenizer
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
# you could tune the prompt based on your own model,
# here the prompt tuning refers to https://huggingface.co/georgesung/llama2_7b_chat_uncensored#prompt-style
diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GPTQ/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GPTQ/generate.py
index 0f595d6e..4317730f 100644
--- a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GPTQ/generate.py
+++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GPTQ/generate.py
@@ -17,7 +17,7 @@
import torch
import time
import argparse
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import
AutoModelForCausalLM from transformers import LlamaTokenizer, GPTQConfig # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/aquila/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/aquila/generate.py index 0f233796..ec729d12 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/aquila/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/aquila/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/aquila2/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/aquila2/generate.py index c423904b..cc75bba7 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/aquila2/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/aquila2/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/baichuan/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/baichuan/generate.py index 18c0e10d..4e34654e 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/baichuan/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/baichuan/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/baichuan2/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/baichuan2/generate.py index ac004a52..37e65743 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/baichuan2/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/baichuan2/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # prompt format referred from https://github.com/baichuan-inc/Baichuan2/issues/227 diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/bluelm/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/bluelm/generate.py index d34ff2ef..2d30cd29 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/bluelm/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/bluelm/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm2/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm2/generate.py index b3f86e90..1da87c11 100644 --- 
a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm2/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm2/generate.py @@ -19,7 +19,7 @@ import time import argparse import numpy as np -from bigdl.llm.transformers import AutoModel +from ipex_llm.transformers import AutoModel from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm2/streamchat.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm2/streamchat.py index 1e7ee00e..36752e87 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm2/streamchat.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm2/streamchat.py @@ -19,7 +19,7 @@ import time import argparse import numpy as np -from bigdl.llm.transformers import AutoModel +from ipex_llm.transformers import AutoModel from transformers import AutoTokenizer diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm3/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm3/generate.py index 35ecfb49..109b40ca 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm3/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm3/generate.py @@ -19,7 +19,7 @@ import time import argparse import numpy as np -from bigdl.llm.transformers import AutoModel +from ipex_llm.transformers import AutoModel from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm3/streamchat.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm3/streamchat.py index 9fbbd16c..3bf7fd7f 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm3/streamchat.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm3/streamchat.py @@ -19,7 +19,7 @@ import time import argparse import numpy as np -from bigdl.llm.transformers import AutoModel +from ipex_llm.transformers import AutoModel from transformers import AutoTokenizer diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chinese-llama2/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chinese-llama2/generate.py index da977404..cf0e554f 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chinese-llama2/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chinese-llama2/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import LlamaTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/codellama/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/codellama/generate.py index 6de2f2d7..8192d78e 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/codellama/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/codellama/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import CodeLlamaTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/codeshell/server.py 
b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/codeshell/server.py index 5a77812c..9aeed3ba 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/codeshell/server.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/codeshell/server.py @@ -28,7 +28,7 @@ from fastapi.middleware.cors import CORSMiddleware from pydantic import BaseModel from transformers import AutoTokenizer # from transformers import AutoModelForCausalLM, AutoModel -from bigdl.llm.transformers import AutoModelForCausalLM, AutoModel +from ipex_llm.transformers import AutoModelForCausalLM, AutoModel from transformers.generation import GenerationConfig, TextIteratorStreamer from transformers import StoppingCriteriaList, StoppingCriteria from sse_starlette.sse import EventSourceResponse diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/deciLM-7b/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/deciLM-7b/generate.py index b9a63832..728ae71f 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/deciLM-7b/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/deciLM-7b/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoTokenizer -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM # you could tune the prompt based on your own model, # here the prompt tuning refers to https://huggingface.co/Deci/DeciLM-7B-instruct#prompt-template diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/deepseek/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/deepseek/generate.py index 7a7d9aaf..802bb284 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/deepseek/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/deepseek/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoTokenizer -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM # you could tune the prompt based on your own model, # here the prompt tuning refers to https://huggingface.co/Deci/DeciLM-7B-instruct#prompt-template diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/distil-whisper/recognize.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/distil-whisper/recognize.py index 67e3cbe4..935216fd 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/distil-whisper/recognize.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/distil-whisper/recognize.py @@ -18,7 +18,7 @@ import time import argparse from transformers import pipeline -from bigdl.llm.transformers import AutoModelForSpeechSeq2Seq +from ipex_llm.transformers import AutoModelForSpeechSeq2Seq from transformers.models.whisper import WhisperFeatureExtractor, WhisperTokenizer from datasets import load_dataset diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/dolly-v1/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/dolly-v1/generate.py index b4a9c439..21d2a43b 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/dolly-v1/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/dolly-v1/generate.py @@ -19,7 +19,7 @@ import time import argparse import numpy as np -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the 
prompt based on your own model, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/dolly-v2/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/dolly-v2/generate.py index 93729f74..b5182e39 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/dolly-v2/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/dolly-v2/generate.py @@ -19,7 +19,7 @@ import time import argparse import numpy as np -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/falcon/falcon-7b-instruct/modelling_RW.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/falcon/falcon-7b-instruct/modelling_RW.py index 1f582bcc..f6776860 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/falcon/falcon-7b-instruct/modelling_RW.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/falcon/falcon-7b-instruct/modelling_RW.py @@ -301,7 +301,7 @@ class Attention(nn.Module): # resize qk to 4D to match apply_rotary_pos_emb_no_cache_xpu's requirements. query_layer = query_layer.reshape(batch_size, self.num_heads, q_length, self.head_dim) key_layer = key_layer.reshape(batch_size, self.num_kv, q_length, self.head_dim) - from bigdl.llm.transformers.models.utils import apply_rotary_pos_emb_no_cache_xpu + from ipex_llm.transformers.models.utils import apply_rotary_pos_emb_no_cache_xpu query_layer, key_layer = apply_rotary_pos_emb_no_cache_xpu(query_layer, key_layer, position_ids, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/falcon/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/falcon/generate.py index 81229c5e..85669e7c 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/falcon/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/falcon/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/flan-t5/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/flan-t5/generate.py index 7ebfc1f2..c0f0773b 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/flan-t5/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/flan-t5/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForSeq2SeqLM +from ipex_llm.transformers import AutoModelForSeq2SeqLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/gemma/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/gemma/generate.py index c8abc40f..5a328377 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/gemma/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/gemma/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # The instruction-tuned models use a chat template that must 
be adhered to for conversational use. diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/gpt-j/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/gpt-j/generate.py index d9da4b5c..87fd5c97 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/gpt-j/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/gpt-j/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/internlm/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/internlm/generate.py index 99e5b52f..ce25713b 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/internlm/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/internlm/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/internlm2/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/internlm2/generate.py index fdbd312c..7a751793 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/internlm2/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/internlm2/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model import intel_extension_for_pytorch as ipex # you could tune the prompt based on your own model, @@ -43,7 +43,7 @@ if __name__ == '__main__': # which convert the relevant layers in the model into INT4 format # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the from_pretrained function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. 
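An aside before the hunk continues: a minimal sketch of the loading pattern the comment above describes, assuming a placeholder model path; `cpu_embedding=True` is only the recommendation for Intel iGPUs on Windows and can be omitted otherwise:

```python
# Minimal sketch; the model path is a placeholder, not a value from the diff.
import intel_extension_for_pytorch as ipex  # imported in the GPU examples so the 'xpu' device is available
from ipex_llm.transformers import AutoModelForCausalLM  # was: from bigdl.llm.transformers import ...

model = AutoModelForCausalLM.from_pretrained('/path/to/model/',
                                             load_in_4bit=True,      # convert the relevant layers to INT4
                                             trust_remote_code=True,
                                             cpu_embedding=True)     # keep the memory-intensive embedding layer on CPU (iGPU case)
model = model.to('xpu')  # run the low-bit layers on the Intel GPU
```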
- from bigdl.llm.transformers import AutoModelForCausalLM + from ipex_llm.transformers import AutoModelForCausalLM model = AutoModelForCausalLM.from_pretrained(model_path, load_in_4bit=True, trust_remote_code=True, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/llama2/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/llama2/generate.py index e9095acc..2fa0c281 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/llama2/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/llama2/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import LlamaTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/mistral/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/mistral/generate.py index faecbcf3..030ea9b2 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/mistral/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/mistral/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/mixtral/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/mixtral/generate.py index 79cc9995..3b52a5d4 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/mixtral/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/mixtral/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/mpt/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/mpt/generate.py index ff9b4b06..20e196a4 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/mpt/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/mpt/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer, GenerationConfig # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/phi-1_5/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/phi-1_5/generate.py index 0437fa5f..5bd9dbd5 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/phi-1_5/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/phi-1_5/generate.py @@ -19,7 +19,7 @@ import time import argparse import numpy as np -from bigdl.llm.transformers import AutoModel,AutoModelForCausalLM +from ipex_llm.transformers import AutoModel,AutoModelForCausalLM from transformers import AutoTokenizer, GenerationConfig # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/phi-2/generate.py 
b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/phi-2/generate.py index d5aa3a74..b199f08d 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/phi-2/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/phi-2/generate.py @@ -19,7 +19,7 @@ import time import argparse import numpy as np -from bigdl.llm.transformers import AutoModel,AutoModelForCausalLM +from ipex_llm.transformers import AutoModel,AutoModelForCausalLM from transformers import AutoTokenizer, GenerationConfig # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/phixtral/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/phixtral/generate.py index e806ba54..cbe01ebe 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/phixtral/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/phixtral/generate.py @@ -45,7 +45,7 @@ if __name__ == '__main__': # which convert the relevant layers in the model into INT4 format # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the from_pretrained function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. - from bigdl.llm.transformers import AutoModel,AutoModelForCausalLM + from ipex_llm.transformers import AutoModel,AutoModelForCausalLM model = AutoModelForCausalLM.from_pretrained(model_path, load_in_4bit=True, trust_remote_code=True) diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen-vl/chat.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen-vl/chat.py index 4701eb5a..03127136 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen-vl/chat.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen-vl/chat.py @@ -21,7 +21,7 @@ import torch from transformers import AutoTokenizer from transformers.generation import GenerationConfig -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM torch.manual_seed(1234) diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen/generate.py index 182a093f..7fb477a2 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen1.5/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen1.5/generate.py index 423f82aa..557b0d55 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen1.5/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen1.5/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model import intel_extension_for_pytorch as ipex import numpy as np @@ -38,7 +38,7 @@ if __name__ == '__main__': model_path = args.repo_id_or_model_path - from bigdl.llm.transformers import AutoModelForCausalLM + from ipex_llm.transformers import AutoModelForCausalLM # Load model in 4 bit, # which convert 
the relevant layers in the model into INT4 format model = AutoModelForCausalLM.from_pretrained(model_path, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/redpajama/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/redpajama/generate.py index f6be1ff9..39d97ee1 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/redpajama/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/redpajama/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/replit/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/replit/generate.py index 5720a3eb..ada97daf 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/replit/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/replit/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/rwkv4/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/rwkv4/generate.py index 2152b38d..3158bc2d 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/rwkv4/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/rwkv4/generate.py @@ -19,7 +19,7 @@ import time import argparse import numpy as np -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/rwkv5/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/rwkv5/generate.py index 7099ab1b..7591abd0 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/rwkv5/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/rwkv5/generate.py @@ -19,7 +19,7 @@ import time import argparse import numpy as np -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/solar/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/solar/generate.py index 7dd54586..ef37b844 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/solar/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/solar/generate.py @@ -19,7 +19,7 @@ import intel_extension_for_pytorch as ipex import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/starcoder/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/starcoder/generate.py index 8be048ff..a29e80ef 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/starcoder/generate.py +++ 
b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/starcoder/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/vicuna/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/vicuna/generate.py index 387d815f..df9686f8 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/vicuna/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/vicuna/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import LlamaTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/voiceassistant/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/voiceassistant/generate.py index 57db00f2..33d3e651 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/voiceassistant/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/voiceassistant/generate.py @@ -24,8 +24,8 @@ import inquirer # For Windows users, please remove `import sounddevice` import sounddevice -from bigdl.llm.transformers import AutoModelForCausalLM -from bigdl.llm.transformers import AutoModelForSpeechSeq2Seq +from ipex_llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForSpeechSeq2Seq from transformers import LlamaTokenizer from transformers import WhisperProcessor from transformers import TextStreamer diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/whisper/recognize.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/whisper/recognize.py index 41729c65..4a0ca795 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/whisper/recognize.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/whisper/recognize.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForSpeechSeq2Seq +from ipex_llm.transformers import AutoModelForSpeechSeq2Seq from transformers import WhisperProcessor from datasets import load_dataset diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/yi/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/yi/generate.py index 4af84098..8bf80fdc 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/yi/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/yi/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # Refer to https://huggingface.co/01-ai/Yi-6B-Chat#31-use-the-chat-model diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/yuan2/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/yuan2/generate.py index c01cf828..e84e0e46 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/yuan2/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/yuan2/generate.py @@ -19,7 +19,7 @@ import sys, os, time import intel_extension_for_pytorch as ipex import argparse from transformers import 
LlamaTokenizer -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM # Refer to https://huggingface.co/IEITYuan/Yuan2-2B-hf#Usage YUAN2_PROMPT_FORMAT = """ diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/More-Data-Types/transformers_low_bit_pipeline.py b/python/llm/example/GPU/HF-Transformers-AutoModels/More-Data-Types/transformers_low_bit_pipeline.py index 7a2f61a5..f43f0c3e 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/More-Data-Types/transformers_low_bit_pipeline.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/More-Data-Types/transformers_low_bit_pipeline.py @@ -16,7 +16,7 @@ import torch import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer, TextGenerationPipeline if __name__ == '__main__': @@ -40,7 +40,7 @@ if __name__ == '__main__': model = model.to('xpu') tokenizer = AutoTokenizer.from_pretrained(load_path) else: - # load_in_low_bit in bigdl.llm.transformers will convert + # load_in_low_bit in ipex_llm.transformers will convert # the relevant layers in the model into corresponding int X format model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True) model = model.to('xpu') diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Save-Load/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Save-Load/generate.py index ee36132a..4a5c4f51 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Save-Load/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Save-Load/generate.py @@ -17,7 +17,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import LlamaTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/LLM-Finetuning/DPO/dpo_finetuning.py b/python/llm/example/GPU/LLM-Finetuning/DPO/dpo_finetuning.py index 3d0708b5..cb4a35f6 100644 --- a/python/llm/example/GPU/LLM-Finetuning/DPO/dpo_finetuning.py +++ b/python/llm/example/GPU/LLM-Finetuning/DPO/dpo_finetuning.py @@ -38,8 +38,8 @@ import transformers from transformers import AutoTokenizer, TrainingArguments, BitsAndBytesConfig from datasets import load_dataset from peft import LoraConfig -from bigdl.llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training +from ipex_llm.transformers import AutoModelForCausalLM from trl import DPOTrainer import argparse diff --git a/python/llm/example/GPU/LLM-Finetuning/HF-PEFT/alpaca-lora/finetune.py b/python/llm/example/GPU/LLM-Finetuning/HF-PEFT/alpaca-lora/finetune.py index d6360fd1..a829cd40 100644 --- a/python/llm/example/GPU/LLM-Finetuning/HF-PEFT/alpaca-lora/finetune.py +++ b/python/llm/example/GPU/LLM-Finetuning/HF-PEFT/alpaca-lora/finetune.py @@ -30,7 +30,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
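Before the fine-tuning hunks continue, a minimal sketch of the `load_in_low_bit` flow referenced in the More-Data-Types pipeline hunk above; `'sym_int4'`, the model path, and the prompt are illustrative placeholders:

```python
# Minimal sketch; assumes a placeholder checkpoint path and the 'sym_int4' low-bit format.
import intel_extension_for_pytorch as ipex  # imported in the GPU examples so the 'xpu' device is available
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

# load_in_low_bit converts the relevant layers into the chosen int X format
model = AutoModelForCausalLM.from_pretrained('/path/to/model/',
                                             load_in_low_bit='sym_int4',
                                             trust_remote_code=True)
model = model.to('xpu')

tokenizer = AutoTokenizer.from_pretrained('/path/to/model/', trust_remote_code=True)
input_ids = tokenizer.encode("What is AI?", return_tensors="pt").to('xpu')
output = model.generate(input_ids, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```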
-from bigdl.llm import llm_patch +from ipex_llm import llm_patch llm_patch(train=True) # The following is the original LLM finetuning code using PEFT (without BigDL-LLM) diff --git a/python/llm/example/GPU/LLM-Finetuning/LoRA/alpaca_lora_finetuning.py b/python/llm/example/GPU/LLM-Finetuning/LoRA/alpaca_lora_finetuning.py index 2f238d2c..4af84ed6 100644 --- a/python/llm/example/GPU/LLM-Finetuning/LoRA/alpaca_lora_finetuning.py +++ b/python/llm/example/GPU/LLM-Finetuning/LoRA/alpaca_lora_finetuning.py @@ -52,11 +52,11 @@ sys.path.append(common_util_path) from common.utils import Prompter, get_int_from_env, wandb_check, get_train_val_data from transformers import BitsAndBytesConfig -from bigdl.llm.transformers import AutoModelForCausalLM -# import them from bigdl.llm.transformers.qlora to get a BigDL-LLM compatible Peft model -from bigdl.llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training,\ +from ipex_llm.transformers import AutoModelForCausalLM +# import them from ipex_llm.transformers.qlora to get a BigDL-LLM compatible Peft model +from ipex_llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training,\ LoraConfig -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.common import invalidInputError local_rank = get_int_from_env(["LOCAL_RANK","MPI_LOCALRANKID"], "0") world_size = get_int_from_env(["WORLD_SIZE","PMI_SIZE"], "1") diff --git a/python/llm/example/GPU/LLM-Finetuning/QA-LoRA/alpaca_qalora_finetuning.py b/python/llm/example/GPU/LLM-Finetuning/QA-LoRA/alpaca_qalora_finetuning.py index 65660115..647cf9e9 100644 --- a/python/llm/example/GPU/LLM-Finetuning/QA-LoRA/alpaca_qalora_finetuning.py +++ b/python/llm/example/GPU/LLM-Finetuning/QA-LoRA/alpaca_qalora_finetuning.py @@ -52,11 +52,11 @@ sys.path.append(common_util_path) from common.utils import Prompter, get_int_from_env, wandb_check, get_train_val_data from transformers import BitsAndBytesConfig -from bigdl.llm.transformers import AutoModelForCausalLM -# import them from bigdl.llm.transformers.qlora to get a BigDL-LLM compatible Peft model -from bigdl.llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training,\ +from ipex_llm.transformers import AutoModelForCausalLM +# import them from ipex_llm.transformers.qlora to get a BigDL-LLM compatible Peft model +from ipex_llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training,\ LoraConfig -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.common import invalidInputError local_rank = get_int_from_env(["LOCAL_RANK","MPI_LOCALRANKID"], "0") world_size = get_int_from_env(["WORLD_SIZE","PMI_SIZE"], "1") diff --git a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/alpaca_qlora_finetuning.py b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/alpaca_qlora_finetuning.py index 41f7dc88..3ffd6727 100644 --- a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/alpaca_qlora_finetuning.py +++ b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/alpaca_qlora_finetuning.py @@ -52,11 +52,11 @@ sys.path.append(common_util_path) from common.utils import Prompter, get_int_from_env, wandb_check, get_train_val_data from transformers import BitsAndBytesConfig -from bigdl.llm.transformers import AutoModelForCausalLM -# import them from bigdl.llm.transformers.qlora to get a BigDL-LLM compatible Peft model -from bigdl.llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training,\ +from ipex_llm.transformers import AutoModelForCausalLM +# 
import them from ipex_llm.transformers.qlora to get a BigDL-LLM compatible Peft model +from ipex_llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training,\ LoraConfig -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.common import invalidInputError local_rank = get_int_from_env(["LOCAL_RANK","MPI_LOCALRANKID"], "0") world_size = get_int_from_env(["WORLD_SIZE","PMI_SIZE"], "1") diff --git a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/save_low_bit_70b_model.py b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/save_low_bit_70b_model.py index b3b044fa..63d4e8b4 100644 --- a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/save_low_bit_70b_model.py +++ b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/save_low_bit_70b_model.py @@ -15,7 +15,7 @@ # from transformers import LlamaTokenizer -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM import torch import argparse diff --git a/python/llm/example/GPU/LLM-Finetuning/QLoRA/simple-example/qlora_finetuning.py b/python/llm/example/GPU/LLM-Finetuning/QLoRA/simple-example/qlora_finetuning.py index 2435e797..b8a1fb8e 100644 --- a/python/llm/example/GPU/LLM-Finetuning/QLoRA/simple-example/qlora_finetuning.py +++ b/python/llm/example/GPU/LLM-Finetuning/QLoRA/simple-example/qlora_finetuning.py @@ -21,8 +21,8 @@ import transformers from transformers import LlamaTokenizer from peft import LoraConfig from transformers import BitsAndBytesConfig -from bigdl.llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training +from ipex_llm.transformers import AutoModelForCausalLM from datasets import load_dataset import argparse diff --git a/python/llm/example/GPU/LLM-Finetuning/QLoRA/trl-example/qlora_finetuning.py b/python/llm/example/GPU/LLM-Finetuning/QLoRA/trl-example/qlora_finetuning.py index db1f0656..dc78ae33 100644 --- a/python/llm/example/GPU/LLM-Finetuning/QLoRA/trl-example/qlora_finetuning.py +++ b/python/llm/example/GPU/LLM-Finetuning/QLoRA/trl-example/qlora_finetuning.py @@ -21,8 +21,8 @@ import transformers from transformers import LlamaTokenizer from peft import LoraConfig from transformers import BitsAndBytesConfig -from bigdl.llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training +from ipex_llm.transformers import AutoModelForCausalLM from datasets import load_dataset from trl import SFTTrainer import argparse diff --git a/python/llm/example/GPU/LLM-Finetuning/ReLora/alpaca_relora_finetuning.py b/python/llm/example/GPU/LLM-Finetuning/ReLora/alpaca_relora_finetuning.py index 7fd2b5fd..2a2ff947 100644 --- a/python/llm/example/GPU/LLM-Finetuning/ReLora/alpaca_relora_finetuning.py +++ b/python/llm/example/GPU/LLM-Finetuning/ReLora/alpaca_relora_finetuning.py @@ -52,12 +52,12 @@ sys.path.append(common_util_path) from common.utils import Prompter, get_int_from_env, wandb_check, get_train_val_data from transformers import BitsAndBytesConfig -from bigdl.llm.transformers import AutoModelForCausalLM -from bigdl.llm.transformers.relora import ReLoRATrainer -# import them from bigdl.llm.transformers.qlora to get a BigDL-LLM compatible Peft model -from bigdl.llm.transformers.qlora import 
get_peft_model, prepare_model_for_kbit_training,\ +from ipex_llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers.relora import ReLoRATrainer +# import them from ipex_llm.transformers.qlora to get a BigDL-LLM compatible Peft model +from ipex_llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training,\ LoraConfig -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.common import invalidInputError local_rank = get_int_from_env(["LOCAL_RANK","MPI_LOCALRANKID"], "0") world_size = get_int_from_env(["WORLD_SIZE","PMI_SIZE"], "1") diff --git a/python/llm/example/GPU/LLM-Finetuning/common/utils/prompter.py b/python/llm/example/GPU/LLM-Finetuning/common/utils/prompter.py index 835b80bc..f8ae79cb 100644 --- a/python/llm/example/GPU/LLM-Finetuning/common/utils/prompter.py +++ b/python/llm/example/GPU/LLM-Finetuning/common/utils/prompter.py @@ -34,7 +34,7 @@ import json import os.path as osp from typing import Union -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.common import invalidInputError class Prompter(object): diff --git a/python/llm/example/GPU/LLM-Finetuning/common/utils/util.py b/python/llm/example/GPU/LLM-Finetuning/common/utils/util.py index e8bd0a2f..76931f2c 100644 --- a/python/llm/example/GPU/LLM-Finetuning/common/utils/util.py +++ b/python/llm/example/GPU/LLM-Finetuning/common/utils/util.py @@ -141,9 +141,9 @@ def get_train_val_data(data, tokenizer, prompter, train_on_inputs, def merge_adapter(base_model, tokenizer, adapter_path, output_path): """Merge the adapter into the original model and save""" import torch - from bigdl.llm.transformers.qlora import PeftModel, LoraConfig - from bigdl.llm.transformers import AutoModelForCausalLM - from bigdl.llm.transformers.low_bit_linear import get_block_size + from ipex_llm.transformers.qlora import PeftModel, LoraConfig + from ipex_llm.transformers import AutoModelForCausalLM + from ipex_llm.transformers.low_bit_linear import get_block_size import tempfile import shutil diff --git a/python/llm/example/GPU/LangChain/transformer_int4_gpu/chat.py b/python/llm/example/GPU/LangChain/transformer_int4_gpu/chat.py index e5ced801..96df015f 100644 --- a/python/llm/example/GPU/LangChain/transformer_int4_gpu/chat.py +++ b/python/llm/example/GPU/LangChain/transformer_int4_gpu/chat.py @@ -21,7 +21,7 @@ import argparse -from bigdl.llm.langchain.llms import TransformersLLM, TransformersPipelineLLM +from ipex_llm.langchain.llms import TransformersLLM, TransformersPipelineLLM from langchain import PromptTemplate, LLMChain from langchain import HuggingFacePipeline diff --git a/python/llm/example/GPU/LangChain/transformer_int4_gpu/rag.py b/python/llm/example/GPU/LangChain/transformer_int4_gpu/rag.py index 1e46d8d4..a1633dd3 100644 --- a/python/llm/example/GPU/LangChain/transformer_int4_gpu/rag.py +++ b/python/llm/example/GPU/LangChain/transformer_int4_gpu/rag.py @@ -31,8 +31,8 @@ from langchain.text_splitter import CharacterTextSplitter from langchain.chains.question_answering import load_qa_chain from langchain.callbacks.manager import CallbackManager -from bigdl.llm.langchain.llms import TransformersLLM -from bigdl.llm.langchain.embeddings import TransformersEmbeddings +from ipex_llm.langchain.llms import TransformersLLM +from ipex_llm.langchain.embeddings import TransformersEmbeddings text_doc = ''' BigDL seamlessly scales your data analytics & AI applications from laptop to cloud, with the following libraries: diff --git a/python/llm/example/GPU/LlamaIndex/rag.py 
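Taken together, the finetuning hunks above only swap the import root from `bigdl.llm` to `ipex_llm`; the QLoRA workflow itself is unchanged. The sketch below spells out the renamed imports end to end. It is a minimal, hypothetical example: the model path and LoRA hyper-parameters are placeholders, and the `load_in_low_bit="nf4"` / `torch_dtype` arguments are assumptions based on the QLoRA simple-example rather than values shown in this patch.

```python
import torch
from transformers import LlamaTokenizer
from ipex_llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training, LoraConfig

base_model = "/path/to/llama-2-7b-hf/"  # placeholder checkpoint path
tokenizer = LlamaTokenizer.from_pretrained(base_model)

# Load the base model in 4-bit NF4 and move it to the Intel GPU (assumed kwargs)
model = AutoModelForCausalLM.from_pretrained(base_model,
                                             load_in_low_bit="nf4",
                                             torch_dtype=torch.float16)
model = model.to("xpu")

# Same PEFT-style flow as upstream peft, using the compatible helpers imported above
model = prepare_model_for_kbit_training(model)
lora_config = LoraConfig(r=8, lora_alpha=32,
                         target_modules=["q_proj", "k_proj", "v_proj"],
                         lora_dropout=0.05, bias="none", task_type="CAUSAL_LM")
model = get_peft_model(model, lora_config)
```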
b/python/llm/example/GPU/LlamaIndex/rag.py index 7fb1146e..97dc3ae7 100644 --- a/python/llm/example/GPU/LlamaIndex/rag.py +++ b/python/llm/example/GPU/LlamaIndex/rag.py @@ -163,7 +163,7 @@ def main(args): embed_model = HuggingFaceEmbedding(model_name=args.embedding_model_path) # Use custom LLM in BigDL - from bigdl.llm.llamaindex.llms import BigdlLLM + from ipex_llm.llamaindex.llms import BigdlLLM llm = BigdlLLM( model_name=args.model_path, tokenizer_name=args.model_path, diff --git a/python/llm/example/GPU/ModelScope-Models/Save-Load/generate.py b/python/llm/example/GPU/ModelScope-Models/Save-Load/generate.py index f3f1414e..8c7070ba 100644 --- a/python/llm/example/GPU/ModelScope-Models/Save-Load/generate.py +++ b/python/llm/example/GPU/ModelScope-Models/Save-Load/generate.py @@ -17,7 +17,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from modelscope import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/ModelScope-Models/generate.py b/python/llm/example/GPU/ModelScope-Models/generate.py index 48bce466..b4fd6637 100644 --- a/python/llm/example/GPU/ModelScope-Models/generate.py +++ b/python/llm/example/GPU/ModelScope-Models/generate.py @@ -19,7 +19,7 @@ import time import argparse import numpy as np -from bigdl.llm.transformers import AutoModel +from ipex_llm.transformers import AutoModel from modelscope import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/Pipeline-Parallel-Inference/generate.py b/python/llm/example/GPU/Pipeline-Parallel-Inference/generate.py index 4247fdcc..cc6c3c48 100644 --- a/python/llm/example/GPU/Pipeline-Parallel-Inference/generate.py +++ b/python/llm/example/GPU/Pipeline-Parallel-Inference/generate.py @@ -20,7 +20,7 @@ import intel_extension_for_pytorch as ipex import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import LlamaTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/PyTorch-Models/Model/aquila2/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/aquila2/generate.py index 5733e2a3..98491948 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/aquila2/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/aquila2/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModelForCausalLM, AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, # here the prompt tuning refers to https://huggingface.co/BAAI/AquilaChat2-7B/tree/main/predict.py diff --git a/python/llm/example/GPU/PyTorch-Models/Model/baichuan/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/baichuan/generate.py index 108ccb00..52f8adf0 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/baichuan/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/baichuan/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModelForCausalLM, AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model BAICHUAN_PROMPT_FORMAT = "{prompt} " diff --git a/python/llm/example/GPU/PyTorch-Models/Model/baichuan2/generate.py 
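The LangChain and LlamaIndex integrations above follow the same rename. As a rough sketch (the prompt template and model path are placeholders, and `from_model_id` plus its `model_kwargs` are assumed to keep their BigDL-LLM signatures), a chain built on the renamed `TransformersLLM` would look like this; the LlamaIndex `BigdlLLM` wrapper moves to `ipex_llm.llamaindex.llms` in the same way.

```python
from langchain import LLMChain, PromptTemplate
from ipex_llm.langchain.llms import TransformersLLM

template = "USER: {question}\nASSISTANT:"  # hypothetical prompt format
prompt = PromptTemplate(template=template, input_variables=["question"])

# Load a local model through the renamed LangChain wrapper
llm = TransformersLLM.from_model_id(
    model_id="/path/to/model/",
    model_kwargs={"temperature": 0, "trust_remote_code": True},
)

chain = LLMChain(prompt=prompt, llm=llm)
print(chain.run("What is IPEX-LLM?"))
```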
b/python/llm/example/GPU/PyTorch-Models/Model/baichuan2/generate.py index 15d3db8a..215370b4 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/baichuan2/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/baichuan2/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModelForCausalLM, AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # prompt format referred from https://github.com/baichuan-inc/Baichuan2/issues/227 # and https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat/blob/main/generation_utils.py#L7-L49 diff --git a/python/llm/example/GPU/PyTorch-Models/Model/bark/synthesize_speech.py b/python/llm/example/GPU/PyTorch-Models/Model/bark/synthesize_speech.py index c7c3a3f0..1e830107 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/bark/synthesize_speech.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/bark/synthesize_speech.py @@ -20,7 +20,7 @@ import scipy import time import argparse -from bigdl.llm import optimize_model +from ipex_llm import optimize_model from transformers import AutoProcessor, BarkModel diff --git a/python/llm/example/GPU/PyTorch-Models/Model/bluelm/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/bluelm/generate.py index d62eea25..ac6e0842 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/bluelm/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/bluelm/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModelForCausalLM, AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model BLUELM_PROMPT_FORMAT = "{prompt} " diff --git a/python/llm/example/GPU/PyTorch-Models/Model/chatglm2/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/chatglm2/generate.py index bd7ac596..71b6ceea 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/chatglm2/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/chatglm2/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModel, AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, # here the prompt tuning refers to https://huggingface.co/THUDM/chatglm2-6b/blob/main/modeling_chatglm.py#L1007 diff --git a/python/llm/example/GPU/PyTorch-Models/Model/chatglm2/streamchat.py b/python/llm/example/GPU/PyTorch-Models/Model/chatglm2/streamchat.py index 78bf75d9..1e860e80 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/chatglm2/streamchat.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/chatglm2/streamchat.py @@ -20,7 +20,7 @@ import argparse import numpy as np from transformers import AutoModel, AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model if __name__ == '__main__': diff --git a/python/llm/example/GPU/PyTorch-Models/Model/chatglm3/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/chatglm3/generate.py index a6432ef3..1568e085 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/chatglm3/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/chatglm3/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModel, AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, # here the prompt tuning refers to https://github.com/THUDM/ChatGLM3/blob/main/PROMPT.md diff --git 
a/python/llm/example/GPU/PyTorch-Models/Model/chatglm3/streamchat.py b/python/llm/example/GPU/PyTorch-Models/Model/chatglm3/streamchat.py index 569f6ec7..20f8b33c 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/chatglm3/streamchat.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/chatglm3/streamchat.py @@ -20,7 +20,7 @@ import argparse import numpy as np from transformers import AutoModel, AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model if __name__ == '__main__': diff --git a/python/llm/example/GPU/PyTorch-Models/Model/codellama/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/codellama/generate.py index c65b3079..9d09c857 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/codellama/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/codellama/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModelForCausalLM, CodeLlamaTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, # here the prompt tuning refers to https://huggingface.co/docs/transformers/v4.34.1/model_doc/code_llama diff --git a/python/llm/example/GPU/PyTorch-Models/Model/deciLM-7b/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/deciLM-7b/generate.py index d2f32ce7..3a4c7e52 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/deciLM-7b/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/deciLM-7b/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModelForCausalLM, AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, # here the prompt tuning refers to https://huggingface.co/Deci/DeciLM-7B-instruct#prompt-template diff --git a/python/llm/example/GPU/PyTorch-Models/Model/deepseek/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/deepseek/generate.py index 70baa8a3..be92fd95 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/deepseek/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/deepseek/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModelForCausalLM, AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, # Refer to https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-instruct diff --git a/python/llm/example/GPU/PyTorch-Models/Model/distil-whisper/recognize.py b/python/llm/example/GPU/PyTorch-Models/Model/distil-whisper/recognize.py index 2c5326b1..313b1a06 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/distil-whisper/recognize.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/distil-whisper/recognize.py @@ -17,7 +17,7 @@ import time import argparse -from bigdl.llm import optimize_model +from ipex_llm import optimize_model from datasets import load_dataset from transformers import AutoModelForSpeechSeq2Seq, pipeline from transformers.models.whisper import WhisperFeatureExtractor, WhisperTokenizer diff --git a/python/llm/example/GPU/PyTorch-Models/Model/dolly-v1/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/dolly-v1/generate.py index 403135de..a084b615 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/dolly-v1/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/dolly-v1/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModelForCausalLM, AutoTokenizer -from 
bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, # here the prompt tuning refers to https://huggingface.co/databricks/dolly-v1-6b#generate-text diff --git a/python/llm/example/GPU/PyTorch-Models/Model/dolly-v2/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/dolly-v2/generate.py index a9f218bd..9445f406 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/dolly-v2/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/dolly-v2/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModelForCausalLM, AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, # here the prompt tuning refers to https://huggingface.co/databricks/dolly-v2-12b/blob/main/instruct_pipeline.py#L15 diff --git a/python/llm/example/GPU/PyTorch-Models/Model/flan-t5/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/flan-t5/generate.py index c9ded902..11eedd25 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/flan-t5/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/flan-t5/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModelForSeq2SeqLM, AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, FLAN_T5_PROMPT_FORMAT = "<|User|>:{prompt}" diff --git a/python/llm/example/GPU/PyTorch-Models/Model/internlm2/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/internlm2/generate.py index 570edfbe..799bb62c 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/internlm2/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/internlm2/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model import intel_extension_for_pytorch as ipex # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/PyTorch-Models/Model/llama2/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/llama2/generate.py index 81042d06..3fe07715 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/llama2/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/llama2/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModelForCausalLM, AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, # here the prompt tuning refers to https://huggingface.co/georgesung/llama2_7b_chat_uncensored#prompt-style diff --git a/python/llm/example/GPU/PyTorch-Models/Model/llama2/low_memory_generate.py b/python/llm/example/GPU/PyTorch-Models/Model/llama2/low_memory_generate.py index 105e1d5f..55c9e70b 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/llama2/low_memory_generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/llama2/low_memory_generate.py @@ -32,8 +32,8 @@ from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer from transformers.generation import GenerationMixin from transformers.modeling_outputs import CausalLMOutputWithPast -from bigdl.llm import optimize_model -from bigdl.llm.transformers.low_bit_linear import FP4Params, LowBitLinear +from ipex_llm import optimize_model +from ipex_llm.transformers.low_bit_linear import FP4Params, LowBitLinear MAX_LENGTH = 4096 # you could tune the prompt based 
on your own model, diff --git a/python/llm/example/GPU/PyTorch-Models/Model/llava/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/llava/generate.py index 0bf3f23d..6a6e5a4a 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/llava/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/llava/generate.py @@ -56,7 +56,7 @@ from llava.mm_utils import ( KeywordsStoppingCriteria ) -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # Load the pretrained model. # Adapted from llava.model.builder.load_pretrained_model. diff --git a/python/llm/example/GPU/PyTorch-Models/Model/mamba/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/mamba/generate.py index 2f4c4879..e1b392b2 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/mamba/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/mamba/generate.py @@ -18,7 +18,7 @@ import argparse import time import torch import intel_extension_for_pytorch as ipex -from bigdl.llm import optimize_model +from ipex_llm import optimize_model from transformers import AutoTokenizer from model import MambaLMHeadModel diff --git a/python/llm/example/GPU/PyTorch-Models/Model/mistral/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/mistral/generate.py index d05ee560..459d23e5 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/mistral/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/mistral/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModelForCausalLM, AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, # here the prompt tuning refers to https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1#instruction-format diff --git a/python/llm/example/GPU/PyTorch-Models/Model/mixtral/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/mixtral/generate.py index 83d74959..ae8f0a97 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/mixtral/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/mixtral/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModelForCausalLM, AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, # here the prompt tuning refers to https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1#instruction-format diff --git a/python/llm/example/GPU/PyTorch-Models/Model/phi-1_5/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/phi-1_5/generate.py index 743192e2..827b59bd 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/phi-1_5/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/phi-1_5/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM, GenerationConfig -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, # here the prompt tuning refers to https://huggingface.co/microsoft/phi-1_5/blob/main/modeling_mixformer_sequential.py diff --git a/python/llm/example/GPU/PyTorch-Models/Model/phi-2/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/phi-2/generate.py index a91b64e7..3c629f84 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/phi-2/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/phi-2/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModel, AutoTokenizer, 
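Every GPU `PyTorch-Models` example above changes only one line: `optimize_model` now comes from `ipex_llm` instead of `bigdl.llm`. The generic pattern those examples share is roughly the following sketch (model path and prompt are placeholders):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from ipex_llm import optimize_model

model_path = "/path/to/model/"  # placeholder
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True,
                                             torch_dtype=torch.float16)

# One call applies the low-bit optimizations, then the model runs on the Intel GPU
model = optimize_model(model)
model = model.to("xpu")

with torch.inference_mode():
    input_ids = tokenizer.encode("Once upon a time", return_tensors="pt").to("xpu")
    output = model.generate(input_ids, max_new_tokens=32)
    print(tokenizer.decode(output[0], skip_special_tokens=True))
```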
AutoModelForCausalLM, GenerationConfig -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, # here the prompt tuning refers to https://huggingface.co/microsoft/phi-2 diff --git a/python/llm/example/GPU/PyTorch-Models/Model/phixtral/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/phixtral/generate.py index 991377fd..8287a37b 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/phixtral/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/phixtral/generate.py @@ -21,7 +21,7 @@ import numpy as np from transformers import AutoTokenizer, GenerationConfig import intel_extension_for_pytorch as ipex -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/PyTorch-Models/Model/qwen-vl/chat.py b/python/llm/example/GPU/PyTorch-Models/Model/qwen-vl/chat.py index 29adf173..839d7772 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/qwen-vl/chat.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/qwen-vl/chat.py @@ -21,7 +21,7 @@ import torch from transformers import AutoModelForCausalLM, AutoTokenizer from transformers.generation import GenerationConfig -from bigdl.llm import optimize_model +from ipex_llm import optimize_model torch.manual_seed(1234) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/qwen1.5/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/qwen1.5/generate.py index 04b4779d..8b81a4f5 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/qwen1.5/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/qwen1.5/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model import intel_extension_for_pytorch as ipex import numpy as np @@ -39,7 +39,7 @@ if __name__ == '__main__': from transformers import AutoModelForCausalLM - from bigdl.llm import optimize_model + from ipex_llm import optimize_model model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, torch_dtype = torch.float16, diff --git a/python/llm/example/GPU/PyTorch-Models/Model/replit/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/replit/generate.py index 57f2c9f6..73eecb01 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/replit/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/replit/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModelForCausalLM, AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, REPLIT_PROMPT_FORMAT = "{prompt}" diff --git a/python/llm/example/GPU/PyTorch-Models/Model/solar/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/solar/generate.py index 930a3881..95388061 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/solar/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/solar/generate.py @@ -20,7 +20,7 @@ import time import argparse from transformers import AutoModelForCausalLM, AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, # prompt format is tuned based on the output example in this link: diff --git a/python/llm/example/GPU/PyTorch-Models/Model/speech-t5/synthesize_speech.py b/python/llm/example/GPU/PyTorch-Models/Model/speech-t5/synthesize_speech.py index 
342d6229..9776a039 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/speech-t5/synthesize_speech.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/speech-t5/synthesize_speech.py @@ -43,7 +43,7 @@ import torch import time import argparse -from bigdl.llm import optimize_model +from ipex_llm import optimize_model from transformers import SpeechT5Processor, SpeechT5HifiGan, SpeechT5ForTextToSpeech from datasets import load_dataset import soundfile as sf diff --git a/python/llm/example/GPU/PyTorch-Models/Model/starcoder/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/starcoder/generate.py index a4e0a08a..380d63c3 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/starcoder/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/starcoder/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModelForCausalLM, AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, STARCODER_PROMPT_FORMAT = "{prompt}" diff --git a/python/llm/example/GPU/PyTorch-Models/Model/yi/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/yi/generate.py index d6649004..d08d6087 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/yi/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/yi/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm import optimize_model +from ipex_llm import optimize_model from transformers import AutoModelForCausalLM, AutoTokenizer # you could tune the prompt based on your own model diff --git a/python/llm/example/GPU/PyTorch-Models/Model/yuan2/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/yuan2/generate.py index 9615f943..31179c8c 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/yuan2/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/yuan2/generate.py @@ -19,7 +19,7 @@ import sys, os, time import intel_extension_for_pytorch as ipex import argparse from transformers import LlamaTokenizer, AutoModelForCausalLM -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # Refer to https://huggingface.co/IEITYuan/Yuan2-2B-hf#Usage YUAN2_PROMPT_FORMAT = """ diff --git a/python/llm/example/GPU/PyTorch-Models/More-Data-Types/generate.py b/python/llm/example/GPU/PyTorch-Models/More-Data-Types/generate.py index ddd85a94..6caec894 100644 --- a/python/llm/example/GPU/PyTorch-Models/More-Data-Types/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/More-Data-Types/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModelForCausalLM, LlamaTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, # here the prompt tuning refers to https://huggingface.co/georgesung/llama2_7b_chat_uncensored#prompt-style diff --git a/python/llm/example/GPU/PyTorch-Models/Save-Load/generate.py b/python/llm/example/GPU/PyTorch-Models/Save-Load/generate.py index b19c3571..9a289568 100644 --- a/python/llm/example/GPU/PyTorch-Models/Save-Load/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Save-Load/generate.py @@ -17,8 +17,8 @@ import torch import time import argparse -from bigdl.llm import optimize_model -from bigdl.llm.optimize import low_memory_init, load_low_bit +from ipex_llm import optimize_model +from ipex_llm.optimize import low_memory_init, load_low_bit from transformers import AutoModelForCausalLM, LlamaTokenizer # you could tune the prompt based on 
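The `Save-Load` example above additionally pulls `low_memory_init` and `load_low_bit` from `ipex_llm.optimize`. A plausible save-then-reload sketch is shown below; it assumes the `save_low_bit` method that BigDL-LLM attached to optimized models is still available after the rename, and all paths are placeholders.

```python
from transformers import AutoModelForCausalLM
from ipex_llm import optimize_model
from ipex_llm.optimize import low_memory_init, load_low_bit

save_dir = "./model-low-bit"  # placeholder output directory

# First run: optimize once, then persist the low-bit weights
model = AutoModelForCausalLM.from_pretrained("/path/to/model/", torch_dtype="auto",
                                             trust_remote_code=True)
model = optimize_model(model)
model.save_low_bit(save_dir)

# Later runs: build an empty skeleton and load the saved low-bit weights directly
with low_memory_init():
    model = AutoModelForCausalLM.from_pretrained(save_dir, torch_dtype="auto",
                                                 trust_remote_code=True)
model = load_low_bit(model, save_dir)
model = model.to("xpu")
```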
your own model, diff --git a/python/llm/example/GPU/Speculative-Decoding/baichuan2/speculative.py b/python/llm/example/GPU/Speculative-Decoding/baichuan2/speculative.py index c09d57a1..af970acd 100644 --- a/python/llm/example/GPU/Speculative-Decoding/baichuan2/speculative.py +++ b/python/llm/example/GPU/Speculative-Decoding/baichuan2/speculative.py @@ -15,7 +15,7 @@ # import torch -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer import argparse import time diff --git a/python/llm/example/GPU/Speculative-Decoding/chatglm3/speculative.py b/python/llm/example/GPU/Speculative-Decoding/chatglm3/speculative.py index 2ceb77bd..e754e7c7 100644 --- a/python/llm/example/GPU/Speculative-Decoding/chatglm3/speculative.py +++ b/python/llm/example/GPU/Speculative-Decoding/chatglm3/speculative.py @@ -15,7 +15,7 @@ # import torch -from bigdl.llm.transformers import AutoModel +from ipex_llm.transformers import AutoModel from transformers import AutoTokenizer import argparse import time diff --git a/python/llm/example/GPU/Speculative-Decoding/gpt-j/speculative.py b/python/llm/example/GPU/Speculative-Decoding/gpt-j/speculative.py index 46ee8399..4ea31b1b 100644 --- a/python/llm/example/GPU/Speculative-Decoding/gpt-j/speculative.py +++ b/python/llm/example/GPU/Speculative-Decoding/gpt-j/speculative.py @@ -15,7 +15,7 @@ # import torch -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer import argparse import time diff --git a/python/llm/example/GPU/Speculative-Decoding/llama2/speculative.py b/python/llm/example/GPU/Speculative-Decoding/llama2/speculative.py index 443e55c9..7a102ab5 100644 --- a/python/llm/example/GPU/Speculative-Decoding/llama2/speculative.py +++ b/python/llm/example/GPU/Speculative-Decoding/llama2/speculative.py @@ -15,7 +15,7 @@ # import torch -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import LlamaTokenizer import argparse import time diff --git a/python/llm/example/GPU/Speculative-Decoding/mistral/speculative.py b/python/llm/example/GPU/Speculative-Decoding/mistral/speculative.py index 9fad5d94..1f6763a3 100644 --- a/python/llm/example/GPU/Speculative-Decoding/mistral/speculative.py +++ b/python/llm/example/GPU/Speculative-Decoding/mistral/speculative.py @@ -15,7 +15,7 @@ # import torch -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer import argparse import time diff --git a/python/llm/example/GPU/Speculative-Decoding/qwen/speculative.py b/python/llm/example/GPU/Speculative-Decoding/qwen/speculative.py index d5ac109d..d1761af2 100644 --- a/python/llm/example/GPU/Speculative-Decoding/qwen/speculative.py +++ b/python/llm/example/GPU/Speculative-Decoding/qwen/speculative.py @@ -15,7 +15,7 @@ # import torch -from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM +from ipex_llm.transformers import AutoModel, AutoModelForCausalLM from transformers import AutoTokenizer import argparse import time diff --git a/python/llm/example/GPU/vLLM-Serving/README.md b/python/llm/example/GPU/vLLM-Serving/README.md index e09206b1..595b0d84 100644 --- a/python/llm/example/GPU/vLLM-Serving/README.md +++ b/python/llm/example/GPU/vLLM-Serving/README.md @@ -73,7 +73,7 @@ To fully utilize the continuous batching feature 
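For the Speculative-Decoding examples above, only the `AutoModelForCausalLM` import moves to `ipex_llm.transformers`. The loading pattern sketched below is an assumption modelled on the BigDL-LLM self-speculative-decoding examples; in particular the `speculative=True` and `optimize_model=True` keyword arguments are not shown in this patch and should be checked against the actual example files.

```python
import torch
from transformers import LlamaTokenizer
from ipex_llm.transformers import AutoModelForCausalLM

model_path = "/path/to/Llama-2-13b-chat-hf/"  # placeholder
tokenizer = LlamaTokenizer.from_pretrained(model_path)

# Assumed keyword arguments; see the note above
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             optimize_model=True,
                                             torch_dtype=torch.float16,
                                             speculative=True,
                                             trust_remote_code=True,
                                             use_cache=True)
model = model.to("xpu")

with torch.inference_mode():
    input_ids = tokenizer("Tell me about Intel GPUs.", return_tensors="pt").input_ids.to("xpu")
    output = model.generate(input_ids, max_new_tokens=128, do_sample=False)
    print(tokenizer.decode(output[0], skip_special_tokens=True))
```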
of the `vLLM`, you can send req #!/bin/bash # You may also want to adjust the `--max-num-batched-tokens` argument, it indicates the hard limit # of batched prompt length the server will accept -python -m bigdl.llm.vllm.entrypoints.openai.api_server \ +python -m ipex_llm.vllm.entrypoints.openai.api_server \ --model /MODEL_PATH/Llama-2-7b-chat-hf/ --port 8000 \ --load-format 'auto' --device xpu --dtype bfloat16 \ --load-in-low-bit sym_int4 \ diff --git a/python/llm/example/GPU/vLLM-Serving/offline_inference.py b/python/llm/example/GPU/vLLM-Serving/offline_inference.py index f74dbcd0..327cfb55 100644 --- a/python/llm/example/GPU/vLLM-Serving/offline_inference.py +++ b/python/llm/example/GPU/vLLM-Serving/offline_inference.py @@ -31,8 +31,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from bigdl.llm.vllm.entrypoints.llm import LLM -from bigdl.llm.vllm.sampling_params import SamplingParams +from ipex_llm.vllm.entrypoints.llm import LLM +from ipex_llm.vllm.sampling_params import SamplingParams # Sample prompts. prompts = [ diff --git a/python/llm/portable-zip/chat-ui.bat b/python/llm/portable-zip/chat-ui.bat index b4748a3d..cb43dc66 100644 --- a/python/llm/portable-zip/chat-ui.bat +++ b/python/llm/portable-zip/chat-ui.bat @@ -11,7 +11,7 @@ if errorlevel 1 ( ) echo [1/3] Controller started successfully -powershell -Command "Start-Process -FilePath PowerShell -ArgumentList '-Command', '& { .\python-embed\python.exe -m bigdl.llm.serving.model_worker --model-path %modelpath% --device cpu > zip_model_worker.log 2>&1 }' -NoNewWindow" +powershell -Command "Start-Process -FilePath PowerShell -ArgumentList '-Command', '& { .\python-embed\python.exe -m ipex_llm.serving.model_worker --model-path %modelpath% --device cpu > zip_model_worker.log 2>&1 }' -NoNewWindow" timeout /t 1 /nobreak >nul 2>&1 :loop2 powershell -Command "$output = Get-Content zip_model_worker.log; if($null -eq $output -or !($output | Select-String -Pattern 'Uvicorn running on')) { exit 1 } else { exit 0 }" diff --git a/python/llm/portable-zip/chat.py b/python/llm/portable-zip/chat.py index 18bd319f..439bdd31 100644 --- a/python/llm/portable-zip/chat.py +++ b/python/llm/portable-zip/chat.py @@ -51,7 +51,7 @@ from transformers.generation.stopping_criteria import StoppingCriteriaList from colorama import Fore -from bigdl.llm import optimize_model +from ipex_llm import optimize_model from kv_cache import StartRecentKVCache HUMAN_ID = "" diff --git a/python/llm/setup.py b/python/llm/setup.py index b1dcca9f..69a5c39b 100644 --- a/python/llm/setup.py +++ b/python/llm/setup.py @@ -311,11 +311,11 @@ def setup_package(): packages=get_llm_packages(), package_dir={"": "src"}, package_data={ - "bigdl.llm": package_data[platform_name] + ["cli/prompts/*.txt"] + ["transformers/gguf/models/model_implement/*/*.json"]}, + "ipex_llm": package_data[platform_name] + ["cli/prompts/*.txt"] + ["transformers/gguf/models/model_implement/*/*.json"]}, include_package_data=True, entry_points={ "console_scripts": [ - 'llm-convert=bigdl.llm.convert_model:main' + 'llm-convert=ipex_llm.convert_model:main' ] }, extras_require={"all": all_requires, diff --git a/python/llm/src/bigdl/__init__.py b/python/llm/src/bigdl/__init__.py deleted file mode 100644 index 30646857..00000000 --- a/python/llm/src/bigdl/__init__.py +++ /dev/null @@ -1,22 +0,0 @@ -# -# Copyright 2016 The BigDL Authors. 
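The vLLM entry points are renamed as well, both for the OpenAI-compatible server shown in the README hunk and for offline use. A minimal offline sketch based on the renamed `offline_inference.py` imports follows; the constructor arguments mirror the server flags above (`device`, `dtype`, `load-in-low-bit`) and are assumptions rather than values copied from the example.

```python
from ipex_llm.vllm.entrypoints.llm import LLM
from ipex_llm.vllm.sampling_params import SamplingParams

# Sample prompts and standard vLLM sampling settings
prompts = ["Hello, my name is", "The capital of France is"]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

llm = LLM(model="/MODEL_PATH/Llama-2-7b-chat-hf/",
          device="xpu",
          dtype="bfloat16",
          load_in_low_bit="sym_int4")

for output in llm.generate(prompts, sampling_params):
    print(output.prompt, "->", output.outputs[0].text)
```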
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# This would makes sure Python is aware there is more than one sub-package within bigdl, -# physically located elsewhere. -# Otherwise there would be module not found error in non-pip's setting as Python would -# only search the first bigdl package and end up finding only one sub-package. -import pkgutil -__path__ = pkgutil.extend_path(__path__, __name__) # type: ignore diff --git a/python/llm/src/bigdl/llm/__init__.py b/python/llm/src/ipex_llm/__init__.py similarity index 100% rename from python/llm/src/bigdl/llm/__init__.py rename to python/llm/src/ipex_llm/__init__.py diff --git a/python/llm/src/bigdl/llm/cli/llm-chat b/python/llm/src/ipex_llm/cli/llm-chat similarity index 96% rename from python/llm/src/bigdl/llm/cli/llm-chat rename to python/llm/src/ipex_llm/cli/llm-chat index 0b84171d..6f480fae 100755 --- a/python/llm/src/bigdl/llm/cli/llm-chat +++ b/python/llm/src/ipex_llm/cli/llm-chat @@ -8,7 +8,7 @@ n_predict=512 EXTRA_ARGS=('--color') -llm_dir="$(dirname "$(python -c "import bigdl.llm;print(bigdl.llm.__file__)")")" +llm_dir="$(dirname "$(python -c "import ipex_llm;print(ipex_llm.__file__)")")" lib_dir="$llm_dir/libs" prompts_dir="$llm_dir/cli/prompts" diff --git a/python/llm/src/bigdl/llm/cli/llm-chat.ps1 b/python/llm/src/ipex_llm/cli/llm-chat.ps1 similarity index 92% rename from python/llm/src/bigdl/llm/cli/llm-chat.ps1 rename to python/llm/src/ipex_llm/cli/llm-chat.ps1 index b3ff0403..6e2a8747 100644 --- a/python/llm/src/bigdl/llm/cli/llm-chat.ps1 +++ b/python/llm/src/ipex_llm/cli/llm-chat.ps1 @@ -1,8 +1,8 @@ -$llm_dir = (Split-Path -Parent (python -c "import bigdl.llm;print(bigdl.llm.__file__)")) +$llm_dir = (Split-Path -Parent (python -c "import ipex_llm;print(ipex_llm.__file__)")) $lib_dir = Join-Path $llm_dir "libs" $prompt_dir = Join-Path $llm_dir "cli/prompts" -$vnni_enable = ((python -c "from bigdl.llm.utils.isa_checker import check_avx_vnni;print(check_avx_vnni())").ToLower() -eq "true") +$vnni_enable = ((python -c "from ipex_llm.utils.isa_checker import check_avx_vnni;print(check_avx_vnni())").ToLower() -eq "true") $model_family = "" $threads = 8 # Number of tokens to predict (made it larger than default because we want a long interaction) diff --git a/python/llm/src/bigdl/llm/cli/llm-cli b/python/llm/src/ipex_llm/cli/llm-cli similarity index 96% rename from python/llm/src/bigdl/llm/cli/llm-cli rename to python/llm/src/ipex_llm/cli/llm-cli index a145c09a..fdb182ea 100755 --- a/python/llm/src/bigdl/llm/cli/llm-cli +++ b/python/llm/src/ipex_llm/cli/llm-cli @@ -6,7 +6,7 @@ threads=8 n_predict=128 -llm_dir="$(dirname "$(python -c "import bigdl.llm;print(bigdl.llm.__file__)")")" +llm_dir="$(dirname "$(python -c "import ipex_llm;print(ipex_llm.__file__)")")" lib_dir="$llm_dir/libs" # Function to display help message diff --git a/python/llm/src/bigdl/llm/cli/llm-cli.ps1 b/python/llm/src/ipex_llm/cli/llm-cli.ps1 similarity index 92% rename from python/llm/src/bigdl/llm/cli/llm-cli.ps1 rename to 
python/llm/src/ipex_llm/cli/llm-cli.ps1 index c30138c2..bebcb044 100755 --- a/python/llm/src/bigdl/llm/cli/llm-cli.ps1 +++ b/python/llm/src/ipex_llm/cli/llm-cli.ps1 @@ -1,8 +1,8 @@ -$llm_dir = (Split-Path -Parent (python -c "import bigdl.llm;print(bigdl.llm.__file__)")) +$llm_dir = (Split-Path -Parent (python -c "import ipex_llm;print(ipex_llm.__file__)")) $lib_dir = Join-Path $llm_dir "libs" -$vnni_enable = ((python -c "from bigdl.llm.utils.isa_checker import check_avx_vnni;print(check_avx_vnni())").ToLower() -eq "true") +$vnni_enable = ((python -c "from ipex_llm.utils.isa_checker import check_avx_vnni;print(check_avx_vnni())").ToLower() -eq "true") $model_family = "" $threads = 8 $n_predict = 128 diff --git a/python/llm/src/bigdl/llm/cli/prompts/chat-with-llm.txt b/python/llm/src/ipex_llm/cli/prompts/chat-with-llm.txt similarity index 100% rename from python/llm/src/bigdl/llm/cli/prompts/chat-with-llm.txt rename to python/llm/src/ipex_llm/cli/prompts/chat-with-llm.txt diff --git a/python/llm/src/bigdl/llm/convert_model.py b/python/llm/src/ipex_llm/convert_model.py similarity index 96% rename from python/llm/src/bigdl/llm/convert_model.py rename to python/llm/src/ipex_llm/convert_model.py index 026dcae6..8d9ce2b4 100644 --- a/python/llm/src/bigdl/llm/convert_model.py +++ b/python/llm/src/ipex_llm/convert_model.py @@ -15,7 +15,7 @@ # -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.common import invalidInputError import argparse import os @@ -75,7 +75,7 @@ def llm_convert(model, :return: the path string to the converted lower precision checkpoint. """ if model_format == "pth": - from bigdl.llm.ggml.convert_model import convert_model as ggml_convert_model + from ipex_llm.ggml.convert_model import convert_model as ggml_convert_model _, _used_args = _special_kwarg_check(kwargs=kwargs, check_args=["tmp_path"]) return ggml_convert_model(input_path=model, @@ -85,7 +85,7 @@ def llm_convert(model, **_used_args, ) elif model_format == "gptq": - from bigdl.llm.gptq.convert.convert_gptq_to_ggml import convert_gptq2ggml + from ipex_llm.gptq.convert.convert_gptq_to_ggml import convert_gptq2ggml invalidInputError(model_family == "llama" and outtype == 'int4', "Convert GPTQ models should always " "specify `--model-family llama --dtype int4` in the command line.") diff --git a/python/llm/src/bigdl/llm/format.sh b/python/llm/src/ipex_llm/format.sh similarity index 100% rename from python/llm/src/bigdl/llm/format.sh rename to python/llm/src/ipex_llm/format.sh diff --git a/python/llm/src/bigdl/llm/ggml/__init__.py b/python/llm/src/ipex_llm/ggml/__init__.py similarity index 85% rename from python/llm/src/bigdl/llm/ggml/__init__.py rename to python/llm/src/ipex_llm/ggml/__init__.py index 21ab9d44..6df2d794 100644 --- a/python/llm/src/bigdl/llm/ggml/__init__.py +++ b/python/llm/src/ipex_llm/ggml/__init__.py @@ -19,13 +19,13 @@ # Otherwise there would be module not found error in non-pip's setting as Python would # only search the first bigdl package and end up finding only one sub-package. -from bigdl.llm.utils.common import LazyImport +from ipex_llm.utils.common import LazyImport import os -convert_model = LazyImport('bigdl.llm.ggml.convert_model.convert_model') +convert_model = LazyImport('ipex_llm.ggml.convert_model.convert_model') # Default is false, set to true to auto importing glibc_checker. 
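`convert_model.py` keeps the `llm_convert` entry point (and the `llm-convert` console script registered in `setup.py` above) under the new package name. A rough usage sketch follows, with placeholder paths; the argument names are assumptions inferred from the `model_format` / `outtype` / `model_family` handling visible in the hunk.

```python
from ipex_llm import llm_convert

# Convert a PyTorch ("pth") checkpoint into a 4-bit native GGML binary
converted_path = llm_convert(model="/path/to/llama-7b-hf/",
                             outfile="/path/to/output/",
                             outtype="int4",
                             model_family="llama",
                             model_format="pth")
print(converted_path)
```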
BIGDL_GLIBC_CHECK = os.getenv("BIGDL_GLIBC_CHECK", 'False').lower() in ('true', '1', 't') if BIGDL_GLIBC_CHECK: - from bigdl.llm.utils.glibc_checker import check_glibc_version + from ipex_llm.utils.glibc_checker import check_glibc_version check_glibc_version() diff --git a/python/llm/src/bigdl/llm/ggml/convert.py b/python/llm/src/ipex_llm/ggml/convert.py similarity index 98% rename from python/llm/src/bigdl/llm/ggml/convert.py rename to python/llm/src/ipex_llm/ggml/convert.py index 0ab511ba..cb4f6efb 100644 --- a/python/llm/src/bigdl/llm/ggml/convert.py +++ b/python/llm/src/ipex_llm/ggml/convert.py @@ -40,8 +40,8 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from bigdl.llm.utils.common import invalidInputError -from bigdl.llm.utils.convert_util import * +from ipex_llm.utils.common import invalidInputError +from ipex_llm.utils.convert_util import * from pathlib import Path import os diff --git a/python/llm/src/bigdl/llm/ggml/convert_model.py b/python/llm/src/ipex_llm/ggml/convert_model.py similarity index 97% rename from python/llm/src/bigdl/llm/ggml/convert_model.py rename to python/llm/src/ipex_llm/ggml/convert_model.py index 5b76dca7..074a7d35 100644 --- a/python/llm/src/bigdl/llm/ggml/convert_model.py +++ b/python/llm/src/ipex_llm/ggml/convert_model.py @@ -16,9 +16,9 @@ import os import time from pathlib import Path -from bigdl.llm.ggml.convert import _convert_to_ggml, _convert_chatglm -from bigdl.llm.ggml.quantize import quantize -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.ggml.convert import _convert_to_ggml, _convert_chatglm +from ipex_llm.ggml.quantize import quantize +from ipex_llm.utils.common import invalidInputError import argparse import tempfile diff --git a/python/llm/src/bigdl/llm/ggml/model/__init__.py b/python/llm/src/ipex_llm/ggml/model/__init__.py similarity index 100% rename from python/llm/src/bigdl/llm/ggml/model/__init__.py rename to python/llm/src/ipex_llm/ggml/model/__init__.py diff --git a/python/llm/src/bigdl/llm/ggml/model/bloom/__init__.py b/python/llm/src/ipex_llm/ggml/model/bloom/__init__.py similarity index 100% rename from python/llm/src/bigdl/llm/ggml/model/bloom/__init__.py rename to python/llm/src/ipex_llm/ggml/model/bloom/__init__.py diff --git a/python/llm/src/bigdl/llm/ggml/model/bloom/bloom.py b/python/llm/src/ipex_llm/ggml/model/bloom/bloom.py similarity index 99% rename from python/llm/src/bigdl/llm/ggml/model/bloom/bloom.py rename to python/llm/src/ipex_llm/ggml/model/bloom/bloom.py index 349cec3a..6299e203 100644 --- a/python/llm/src/bigdl/llm/ggml/model/bloom/bloom.py +++ b/python/llm/src/ipex_llm/ggml/model/bloom/bloom.py @@ -47,8 +47,8 @@ from .bloom_cpp import bloom_load, bloom_free, bloom_run from .bloom_cpp import bloom_tokenize, bloom_detokenize, bloom_forward, bloom_eval, bloom_embed -from bigdl.llm.utils.common import invalidInputError -from bigdl.llm.ggml.model.generation import GenerationMixin +from ipex_llm.utils.common import invalidInputError +from ipex_llm.ggml.model.generation import GenerationMixin from typing import List, Optional, Generator, Sequence, Union import time import uuid diff --git a/python/llm/src/bigdl/llm/ggml/model/bloom/bloom_cpp.py b/python/llm/src/ipex_llm/ggml/model/bloom/bloom_cpp.py similarity index 98% rename from python/llm/src/bigdl/llm/ggml/model/bloom/bloom_cpp.py rename to python/llm/src/ipex_llm/ggml/model/bloom/bloom_cpp.py index 912fa729..a7178a8e 100644 --- a/python/llm/src/bigdl/llm/ggml/model/bloom/bloom_cpp.py +++ 
b/python/llm/src/ipex_llm/ggml/model/bloom/bloom_cpp.py @@ -64,8 +64,8 @@ from ctypes import ( c_size_t, ) import pathlib -from bigdl.llm.utils.utils import get_shared_lib_info -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.utils import get_shared_lib_info +from ipex_llm.utils.common import invalidInputError # Load the library diff --git a/python/llm/src/bigdl/llm/ggml/model/chatglm/__init__.py b/python/llm/src/ipex_llm/ggml/model/chatglm/__init__.py similarity index 100% rename from python/llm/src/bigdl/llm/ggml/model/chatglm/__init__.py rename to python/llm/src/ipex_llm/ggml/model/chatglm/__init__.py diff --git a/python/llm/src/bigdl/llm/ggml/model/chatglm/chatglm.py b/python/llm/src/ipex_llm/ggml/model/chatglm/chatglm.py similarity index 99% rename from python/llm/src/bigdl/llm/ggml/model/chatglm/chatglm.py rename to python/llm/src/ipex_llm/ggml/model/chatglm/chatglm.py index 66341b0e..cd1efba6 100644 --- a/python/llm/src/bigdl/llm/ggml/model/chatglm/chatglm.py +++ b/python/llm/src/ipex_llm/ggml/model/chatglm/chatglm.py @@ -48,8 +48,8 @@ from .chatglm_cpp import chatglm_load, chatglm_tokenize, chatglm_detokenize, \ chatglm_forward, chatglm_eos_token -from bigdl.llm.utils.common import invalidInputError -from bigdl.llm.ggml.model.generation import GenerationMixin +from ipex_llm.utils.common import invalidInputError +from ipex_llm.ggml.model.generation import GenerationMixin from typing import List, Optional, Generator, Sequence, Union import time import uuid diff --git a/python/llm/src/bigdl/llm/ggml/model/chatglm/chatglm_cpp.py b/python/llm/src/ipex_llm/ggml/model/chatglm/chatglm_cpp.py similarity index 97% rename from python/llm/src/bigdl/llm/ggml/model/chatglm/chatglm_cpp.py rename to python/llm/src/ipex_llm/ggml/model/chatglm/chatglm_cpp.py index 47513e54..ce136cdb 100644 --- a/python/llm/src/bigdl/llm/ggml/model/chatglm/chatglm_cpp.py +++ b/python/llm/src/ipex_llm/ggml/model/chatglm/chatglm_cpp.py @@ -23,7 +23,7 @@ from typing import List from pathlib import Path -from bigdl.llm.libs.chatglm_C import Pipeline, GenerationConfig +from ipex_llm.libs.chatglm_C import Pipeline, GenerationConfig class ChatGLMContext: diff --git a/python/llm/src/bigdl/llm/ggml/model/generation/__init__.py b/python/llm/src/ipex_llm/ggml/model/generation/__init__.py similarity index 100% rename from python/llm/src/bigdl/llm/ggml/model/generation/__init__.py rename to python/llm/src/ipex_llm/ggml/model/generation/__init__.py diff --git a/python/llm/src/bigdl/llm/ggml/model/generation/utils.py b/python/llm/src/ipex_llm/ggml/model/generation/utils.py similarity index 99% rename from python/llm/src/bigdl/llm/ggml/model/generation/utils.py rename to python/llm/src/ipex_llm/ggml/model/generation/utils.py index 5e61bdfc..710c729d 100644 --- a/python/llm/src/bigdl/llm/ggml/model/generation/utils.py +++ b/python/llm/src/ipex_llm/ggml/model/generation/utils.py @@ -21,7 +21,7 @@ from typing import Optional, Union, Sequence, List -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.common import invalidInputError import torch diff --git a/python/llm/src/bigdl/llm/ggml/model/gptneox/__init__.py b/python/llm/src/ipex_llm/ggml/model/gptneox/__init__.py similarity index 100% rename from python/llm/src/bigdl/llm/ggml/model/gptneox/__init__.py rename to python/llm/src/ipex_llm/ggml/model/gptneox/__init__.py diff --git a/python/llm/src/bigdl/llm/ggml/model/gptneox/gptneox.py b/python/llm/src/ipex_llm/ggml/model/gptneox/gptneox.py similarity index 99% rename from 
python/llm/src/bigdl/llm/ggml/model/gptneox/gptneox.py rename to python/llm/src/ipex_llm/ggml/model/gptneox/gptneox.py index 6d07e216..41b62413 100644 --- a/python/llm/src/bigdl/llm/ggml/model/gptneox/gptneox.py +++ b/python/llm/src/ipex_llm/ggml/model/gptneox/gptneox.py @@ -54,8 +54,8 @@ import multiprocessing import ctypes from typing import List, Optional, Union, Generator, Sequence, Iterator, Deque, Tuple from collections import deque, OrderedDict -from bigdl.llm.utils.common import invalidInputError -from bigdl.llm.ggml.model.generation import GenerationMixin +from ipex_llm.utils.common import invalidInputError +from ipex_llm.ggml.model.generation import GenerationMixin from . import gptneox_cpp from .gptneox_types import * diff --git a/python/llm/src/bigdl/llm/ggml/model/gptneox/gptneox_cpp.py b/python/llm/src/ipex_llm/ggml/model/gptneox/gptneox_cpp.py similarity index 99% rename from python/llm/src/bigdl/llm/ggml/model/gptneox/gptneox_cpp.py rename to python/llm/src/ipex_llm/ggml/model/gptneox/gptneox_cpp.py index edc5cc81..1de5ee4e 100644 --- a/python/llm/src/bigdl/llm/ggml/model/gptneox/gptneox_cpp.py +++ b/python/llm/src/ipex_llm/ggml/model/gptneox/gptneox_cpp.py @@ -63,8 +63,8 @@ from ctypes import ( c_size_t, ) import pathlib -from bigdl.llm.utils.common import invalidInputError -from bigdl.llm.utils.utils import get_shared_lib_info +from ipex_llm.utils.common import invalidInputError +from ipex_llm.utils.utils import get_shared_lib_info # Load the library diff --git a/python/llm/src/bigdl/llm/ggml/model/gptneox/gptneox_types.py b/python/llm/src/ipex_llm/ggml/model/gptneox/gptneox_types.py similarity index 100% rename from python/llm/src/bigdl/llm/ggml/model/gptneox/gptneox_types.py rename to python/llm/src/ipex_llm/ggml/model/gptneox/gptneox_types.py diff --git a/python/llm/src/bigdl/llm/ggml/model/llama/__init__.py b/python/llm/src/ipex_llm/ggml/model/llama/__init__.py similarity index 100% rename from python/llm/src/bigdl/llm/ggml/model/llama/__init__.py rename to python/llm/src/ipex_llm/ggml/model/llama/__init__.py diff --git a/python/llm/src/bigdl/llm/ggml/model/llama/llama.py b/python/llm/src/ipex_llm/ggml/model/llama/llama.py similarity index 99% rename from python/llm/src/bigdl/llm/ggml/model/llama/llama.py rename to python/llm/src/ipex_llm/ggml/model/llama/llama.py index 9319d390..7d34957d 100644 --- a/python/llm/src/bigdl/llm/ggml/model/llama/llama.py +++ b/python/llm/src/ipex_llm/ggml/model/llama/llama.py @@ -54,8 +54,8 @@ import math import multiprocessing from typing import List, Optional, Union, Generator, Sequence, Iterator, Deque, Tuple from collections import deque, OrderedDict -from bigdl.llm.utils.common import invalidInputError -from bigdl.llm.ggml.model.generation import GenerationMixin +from ipex_llm.utils.common import invalidInputError +from ipex_llm.ggml.model.generation import GenerationMixin from . 
import llama_cpp from .llama_types import * diff --git a/python/llm/src/bigdl/llm/ggml/model/llama/llama_cpp.py b/python/llm/src/ipex_llm/ggml/model/llama/llama_cpp.py similarity index 99% rename from python/llm/src/bigdl/llm/ggml/model/llama/llama_cpp.py rename to python/llm/src/ipex_llm/ggml/model/llama/llama_cpp.py index b7b5d2ed..bea2fef5 100644 --- a/python/llm/src/bigdl/llm/ggml/model/llama/llama_cpp.py +++ b/python/llm/src/ipex_llm/ggml/model/llama/llama_cpp.py @@ -63,8 +63,8 @@ from ctypes import ( c_size_t, ) import pathlib -from bigdl.llm.utils.common import invalidInputError -from bigdl.llm.utils.utils import get_shared_lib_info +from ipex_llm.utils.common import invalidInputError +from ipex_llm.utils.utils import get_shared_lib_info # Load the library diff --git a/python/llm/src/bigdl/llm/ggml/model/llama/llama_types.py b/python/llm/src/ipex_llm/ggml/model/llama/llama_types.py similarity index 100% rename from python/llm/src/bigdl/llm/ggml/model/llama/llama_types.py rename to python/llm/src/ipex_llm/ggml/model/llama/llama_types.py diff --git a/python/llm/src/bigdl/llm/ggml/model/starcoder/__init__.py b/python/llm/src/ipex_llm/ggml/model/starcoder/__init__.py similarity index 100% rename from python/llm/src/bigdl/llm/ggml/model/starcoder/__init__.py rename to python/llm/src/ipex_llm/ggml/model/starcoder/__init__.py diff --git a/python/llm/src/bigdl/llm/ggml/model/starcoder/starcoder.py b/python/llm/src/ipex_llm/ggml/model/starcoder/starcoder.py similarity index 99% rename from python/llm/src/bigdl/llm/ggml/model/starcoder/starcoder.py rename to python/llm/src/ipex_llm/ggml/model/starcoder/starcoder.py index c00935cb..13fb174d 100644 --- a/python/llm/src/bigdl/llm/ggml/model/starcoder/starcoder.py +++ b/python/llm/src/ipex_llm/ggml/model/starcoder/starcoder.py @@ -48,8 +48,8 @@ from .starcoder_cpp import starcoder_load, starcoder_free, starcoder_run from .starcoder_cpp import starcoder_tokenize, starcoder_detokenize from .starcoder_cpp import starcoder_forward, starcoder_eval, starcoder_embed -from bigdl.llm.utils.common import invalidInputError -from bigdl.llm.ggml.model.generation import GenerationMixin +from ipex_llm.utils.common import invalidInputError +from ipex_llm.ggml.model.generation import GenerationMixin from typing import List, Optional, Generator, Sequence, Union import time import uuid diff --git a/python/llm/src/bigdl/llm/ggml/model/starcoder/starcoder_cpp.py b/python/llm/src/ipex_llm/ggml/model/starcoder/starcoder_cpp.py similarity index 98% rename from python/llm/src/bigdl/llm/ggml/model/starcoder/starcoder_cpp.py rename to python/llm/src/ipex_llm/ggml/model/starcoder/starcoder_cpp.py index 2b0d80ee..1346324f 100644 --- a/python/llm/src/bigdl/llm/ggml/model/starcoder/starcoder_cpp.py +++ b/python/llm/src/ipex_llm/ggml/model/starcoder/starcoder_cpp.py @@ -64,8 +64,8 @@ from ctypes import ( c_size_t, ) import pathlib -from bigdl.llm.utils.utils import get_shared_lib_info -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.utils import get_shared_lib_info +from ipex_llm.utils.common import invalidInputError # Load the library diff --git a/python/llm/src/bigdl/llm/ggml/quantize.py b/python/llm/src/ipex_llm/ggml/quantize.py similarity index 99% rename from python/llm/src/bigdl/llm/ggml/quantize.py rename to python/llm/src/ipex_llm/ggml/quantize.py index 382b15eb..15d36202 100644 --- a/python/llm/src/bigdl/llm/ggml/quantize.py +++ b/python/llm/src/ipex_llm/ggml/quantize.py @@ -16,7 +16,7 @@ import os import subprocess -from 
bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.common import invalidInputError import platform from pathlib import Path diff --git a/python/llm/src/bigdl/llm/gptq/__init__.py b/python/llm/src/ipex_llm/gptq/__init__.py similarity index 100% rename from python/llm/src/bigdl/llm/gptq/__init__.py rename to python/llm/src/ipex_llm/gptq/__init__.py diff --git a/python/llm/src/bigdl/llm/gptq/convert/__init__.py b/python/llm/src/ipex_llm/gptq/convert/__init__.py similarity index 100% rename from python/llm/src/bigdl/llm/gptq/convert/__init__.py rename to python/llm/src/ipex_llm/gptq/convert/__init__.py diff --git a/python/llm/src/bigdl/llm/gptq/convert/convert_gptq_to_ggml.py b/python/llm/src/ipex_llm/gptq/convert/convert_gptq_to_ggml.py similarity index 99% rename from python/llm/src/bigdl/llm/gptq/convert/convert_gptq_to_ggml.py rename to python/llm/src/ipex_llm/gptq/convert/convert_gptq_to_ggml.py index 8ae3a0ba..9cd66d89 100644 --- a/python/llm/src/bigdl/llm/gptq/convert/convert_gptq_to_ggml.py +++ b/python/llm/src/ipex_llm/gptq/convert/convert_gptq_to_ggml.py @@ -29,7 +29,7 @@ import numpy as np import torch from sentencepiece import SentencePieceProcessor from pathlib import Path -from bigdl.llm.utils.common.log4Error import invalidInputError +from ipex_llm.utils.common.log4Error import invalidInputError def write_header(fout, shape, dst_name, ftype_cur): diff --git a/python/llm/src/bigdl/llm/langchain/__init__.py b/python/llm/src/ipex_llm/langchain/__init__.py similarity index 100% rename from python/llm/src/bigdl/llm/langchain/__init__.py rename to python/llm/src/ipex_llm/langchain/__init__.py diff --git a/python/llm/src/bigdl/llm/langchain/embeddings/__init__.py b/python/llm/src/ipex_llm/langchain/embeddings/__init__.py similarity index 100% rename from python/llm/src/bigdl/llm/langchain/embeddings/__init__.py rename to python/llm/src/ipex_llm/langchain/embeddings/__init__.py diff --git a/python/llm/src/bigdl/llm/langchain/embeddings/bigdlllm.py b/python/llm/src/ipex_llm/langchain/embeddings/bigdlllm.py similarity index 94% rename from python/llm/src/bigdl/llm/langchain/embeddings/bigdlllm.py rename to python/llm/src/ipex_llm/langchain/embeddings/bigdlllm.py index c4686af3..1947039a 100644 --- a/python/llm/src/bigdl/llm/langchain/embeddings/bigdlllm.py +++ b/python/llm/src/ipex_llm/langchain/embeddings/bigdlllm.py @@ -61,7 +61,7 @@ class BigdlNativeEmbeddings(BaseModel, Embeddings): Example: .. 
code-block:: python - from bigdl.llm.langchain.embeddings import BigdlNativeEmbeddings + from ipex_llm.langchain.embeddings import BigdlNativeEmbeddings llama = BigdlNativeEmbeddings(model_path="/path/to/model.bin") """ @@ -72,10 +72,10 @@ class BigdlNativeEmbeddings(BaseModel, Embeddings): """The model family: currently supports llama, gptneox, bloom and starcoder.""" family_info = { - 'llama': {'module': "bigdl.llm.models", 'class': "Llama"}, - 'bloom': {'module': "bigdl.llm.models", 'class': "Bloom"}, - 'gptneox': {'module': "bigdl.llm.models", 'class': "Gptneox"}, - 'starcoder': {'module':"bigdl.llm.models", 'class': "Starcoder"}, + 'llama': {'module': "ipex_llm.models", 'class': "Llama"}, + 'bloom': {'module': "ipex_llm.models", 'class': "Bloom"}, + 'gptneox': {'module': "ipex_llm.models", 'class': "Gptneox"}, + 'starcoder': {'module':"ipex_llm.models", 'class': "Starcoder"}, } #: :meta private: """Info necessary for different model family initiation and configure.""" @@ -156,7 +156,7 @@ class BigdlNativeEmbeddings(BaseModel, Embeddings): values["client"] = class_(model_path, embedding=True, **model_params) - # from bigdl.llm.ggml.model.llama import Llama + # from ipex_llm.ggml.model.llama import Llama # values["client"] = Llama(model_path, embedding=True, **model_params) @@ -205,14 +205,14 @@ class _BaseEmbeddings(BaseModel, Embeddings): """Wrapper around bigdl-llm embedding models. param model_path: If running with ``native int4``, the path should be converted BigDL-LLM - optimized ggml binary checkpoint, which should be converted by ``bigdl.llm.llm_convert``. + optimized ggml binary checkpoint, which should be converted by ``ipex_llm.llm_convert``. If running with ``transformers int4``, the path should be the huggingface repo id to be downloaded or the huggingface checkpoint folder. Example: .. 
code-block:: python - from bigdl.llm.langchain.embeddings import LlamaEmbeddings + from ipex_llm.langchain.embeddings import LlamaEmbeddings llama = LlamaEmbeddings(model_path="/path/to/model.bin") """ @@ -313,7 +313,7 @@ class _BaseEmbeddings(BaseModel, Embeddings): values["client"] = TransformersEmbeddings.from_model_id(model_path, model_kwargs, **kwargs) - # from bigdl.llm.ggml.model.llama import Llama + # from ipex_llm.ggml.model.llama import Llama # values["client"] = Llama(model_path, embedding=True, **model_params) @@ -366,19 +366,19 @@ class _BaseEmbeddings(BaseModel, Embeddings): class LlamaEmbeddings(_BaseEmbeddings): ggml_model = "Llama" - ggml_module = "bigdl.llm.models" + ggml_module = "ipex_llm.models" class BloomEmbeddings(_BaseEmbeddings): ggml_model = "Bloom" - ggml_module = "bigdl.llm.models" + ggml_module = "ipex_llm.models" class GptneoxEmbeddings(_BaseEmbeddings): ggml_model = "Gptneox" - ggml_module = "bigdl.llm.models" + ggml_module = "ipex_llm.models" class StarcoderEmbeddings(_BaseEmbeddings): ggml_model = "Starcoder" - ggml_module = "bigdl.llm.models" + ggml_module = "ipex_llm.models" diff --git a/python/llm/src/bigdl/llm/langchain/embeddings/transformersembeddings.py b/python/llm/src/ipex_llm/langchain/embeddings/transformersembeddings.py similarity index 98% rename from python/llm/src/bigdl/llm/langchain/embeddings/transformersembeddings.py rename to python/llm/src/ipex_llm/langchain/embeddings/transformersembeddings.py index 9c69f474..dcc1c733 100644 --- a/python/llm/src/bigdl/llm/langchain/embeddings/transformersembeddings.py +++ b/python/llm/src/ipex_llm/langchain/embeddings/transformersembeddings.py @@ -64,7 +64,7 @@ class TransformersEmbeddings(BaseModel, Embeddings): Example: .. code-block:: python - from bigdl.llm.langchain.embeddings import TransformersEmbeddings + from ipex_llm.langchain.embeddings import TransformersEmbeddings embeddings = TransformersEmbeddings.from_model_id(model_id) """ @@ -101,7 +101,7 @@ class TransformersEmbeddings(BaseModel, Embeddings): An object of TransformersEmbeddings. """ try: - from bigdl.llm.transformers import AutoModel + from ipex_llm.transformers import AutoModel from transformers import AutoTokenizer, LlamaTokenizer except ImportError: diff --git a/python/llm/src/bigdl/llm/langchain/llms/__init__.py b/python/llm/src/ipex_llm/langchain/llms/__init__.py similarity index 100% rename from python/llm/src/bigdl/llm/langchain/llms/__init__.py rename to python/llm/src/ipex_llm/langchain/llms/__init__.py diff --git a/python/llm/src/bigdl/llm/langchain/llms/bigdlllm.py b/python/llm/src/ipex_llm/langchain/llms/bigdlllm.py similarity index 96% rename from python/llm/src/bigdl/llm/langchain/llms/bigdlllm.py rename to python/llm/src/ipex_llm/langchain/llms/bigdlllm.py index 3a606a06..afd51780 100644 --- a/python/llm/src/bigdl/llm/langchain/llms/bigdlllm.py +++ b/python/llm/src/ipex_llm/langchain/llms/bigdlllm.py @@ -61,7 +61,7 @@ class BigdlNativeLLM(LLM): Example: .. 
code-block:: python - from bigdl.llm.langchain.llms import BigdlNativeLLM + from ipex_llm.langchain.llms import BigdlNativeLLM llm = BigdlNativeLLM(model_path="/path/to/llama/model") """ @@ -73,11 +73,11 @@ class BigdlNativeLLM(LLM): """The model family: currently supports llama, gptneox, bloom, starcoder and chatglm.""" family_info = { - 'llama': {'module': "bigdl.llm.models" , 'class': "Llama"}, - 'bloom': {'module': "bigdl.llm.models", 'class': "Bloom"}, - 'gptneox': {'module': "bigdl.llm.models", 'class': "Gptneox"}, - 'starcoder': {'module':"bigdl.llm.models", 'class': "Starcoder"}, - 'chatglm': {'module':"bigdl.llm.ggml.model.chatglm", 'class': "ChatGLM"}, + 'llama': {'module': "ipex_llm.models" , 'class': "Llama"}, + 'bloom': {'module': "ipex_llm.models", 'class': "Bloom"}, + 'gptneox': {'module': "ipex_llm.models", 'class': "Gptneox"}, + 'starcoder': {'module':"ipex_llm.models", 'class': "Starcoder"}, + 'chatglm': {'module':"ipex_llm.ggml.model.chatglm", 'class': "ChatGLM"}, } #: :meta private: """Info necessary for different model families initiation and configure.""" @@ -286,7 +286,7 @@ class BigdlNativeLLM(LLM): Example: .. code-block:: python - from bigdl.llm.langchain.llms import BigdlNativeLLM + from ipex_llm.langchain.llms import BigdlNativeLLM llm = BigdlNativeLLM(model_path="/path/to/local/llama/model.bin") llm("This is a prompt.") """ @@ -331,7 +331,7 @@ class BigdlNativeLLM(LLM): Example: .. code-block:: python - from bigdl.llm.langchain.llms import BigdlNativeLLM + from ipex_llm.langchain.llms import BigdlNativeLLM llm = BigdlNativeLLM( model_path="/path/to/local/model.bin", temperature = 0.5 @@ -364,7 +364,7 @@ class _BaseCausalLM(LLM): Example: .. code-block:: python - from bigdl.llm.langchain.llms import LlamaLLM + from ipex_llm.langchain.llms import LlamaLLM llm = LlamaLLM(model_path="/path/to/llama/model") """ @@ -588,7 +588,7 @@ class _BaseCausalLM(LLM): Example: .. code-block:: python - from bigdl.llm.langchain.llms import LlamaLLM + from ipex_llm.langchain.llms import LlamaLLM llm = LlamaLLM(model_path="/path/to/local/llama/model.bin") llm("This is a prompt.") """ @@ -636,7 +636,7 @@ class _BaseCausalLM(LLM): Example: .. 
code-block:: python - from bigdl.llm.langchain.llms import LlamaLLM + from ipex_llm.langchain.llms import LlamaLLM llm = LlamaLLM( model_path="/path/to/local/model.bin", temperature = 0.5 @@ -675,24 +675,24 @@ class _BaseCausalLM(LLM): class LlamaLLM(_BaseCausalLM): ggml_model = "Llama" - ggml_module = "bigdl.llm.models" + ggml_module = "ipex_llm.models" class BloomLLM(_BaseCausalLM): ggml_model = "Bloom" - ggml_module = "bigdl.llm.models" + ggml_module = "ipex_llm.models" class GptneoxLLM(_BaseCausalLM): ggml_model = "Gptneox" - ggml_module = "bigdl.llm.models" + ggml_module = "ipex_llm.models" class ChatGLMLLM(_BaseCausalLM): ggml_model = "ChatGLM" - ggml_module = "bigdl.llm.ggml.model.chatglm" + ggml_module = "ipex_llm.ggml.model.chatglm" class StarcoderLLM(_BaseCausalLM): ggml_model = "Starcoder" - ggml_module = "bigdl.llm.models" + ggml_module = "ipex_llm.models" diff --git a/python/llm/src/bigdl/llm/langchain/llms/transformersllm.py b/python/llm/src/ipex_llm/langchain/llms/transformersllm.py similarity index 98% rename from python/llm/src/bigdl/llm/langchain/llms/transformersllm.py rename to python/llm/src/ipex_llm/langchain/llms/transformersllm.py index 85eebf78..f3498e9d 100644 --- a/python/llm/src/bigdl/llm/langchain/llms/transformersllm.py +++ b/python/llm/src/ipex_llm/langchain/llms/transformersllm.py @@ -64,7 +64,7 @@ class TransformersLLM(LLM): Example: .. code-block:: python - from bigdl.llm.langchain.llms import TransformersLLM + from ipex_llm.langchain.llms import TransformersLLM llm = TransformersLLM.from_model_id(model_id="THUDM/chatglm-6b") """ @@ -106,7 +106,7 @@ class TransformersLLM(LLM): An object of TransformersLLM. """ try: - from bigdl.llm.transformers import ( + from ipex_llm.transformers import ( AutoModel, AutoModelForCausalLM, # AutoModelForSeq2SeqLM, @@ -170,7 +170,7 @@ class TransformersLLM(LLM): An object of TransformersLLM. """ try: - from bigdl.llm.transformers import ( + from ipex_llm.transformers import ( AutoModel, AutoModelForCausalLM, ) diff --git a/python/llm/src/bigdl/llm/langchain/llms/transformerspipelinellm.py b/python/llm/src/ipex_llm/langchain/llms/transformerspipelinellm.py similarity index 98% rename from python/llm/src/bigdl/llm/langchain/llms/transformerspipelinellm.py rename to python/llm/src/ipex_llm/langchain/llms/transformerspipelinellm.py index 0049e841..42bc68ed 100644 --- a/python/llm/src/bigdl/llm/langchain/llms/transformerspipelinellm.py +++ b/python/llm/src/ipex_llm/langchain/llms/transformerspipelinellm.py @@ -66,7 +66,7 @@ class TransformersPipelineLLM(LLM): Example: .. 
code-block:: python - from bigdl.llm.langchain.llms import TransformersPipelineLLM + from ipex_llm.langchain.llms import TransformersPipelineLLM llm = TransformersPipelineLLM.from_model_id(model_id="decapoda-research/llama-7b-hf") """ @@ -94,7 +94,7 @@ class TransformersPipelineLLM(LLM): ) -> LLM: """Construct the pipeline object from model_id and task.""" try: - from bigdl.llm.transformers import ( + from ipex_llm.transformers import ( AutoModel, AutoModelForCausalLM, # AutoModelForSeq2SeqLM, diff --git a/python/llm/src/bigdl/llm/llamaindex/__init__.py b/python/llm/src/ipex_llm/llamaindex/__init__.py similarity index 100% rename from python/llm/src/bigdl/llm/llamaindex/__init__.py rename to python/llm/src/ipex_llm/llamaindex/__init__.py diff --git a/python/llm/src/bigdl/llm/llamaindex/llms/__init__.py b/python/llm/src/ipex_llm/llamaindex/llms/__init__.py similarity index 100% rename from python/llm/src/bigdl/llm/llamaindex/llms/__init__.py rename to python/llm/src/ipex_llm/llamaindex/llms/__init__.py diff --git a/python/llm/src/bigdl/llm/llamaindex/llms/bigdlllm.py b/python/llm/src/ipex_llm/llamaindex/llms/bigdlllm.py similarity index 98% rename from python/llm/src/bigdl/llm/llamaindex/llms/bigdlllm.py rename to python/llm/src/ipex_llm/llamaindex/llms/bigdlllm.py index d6f61854..96550f6a 100644 --- a/python/llm/src/bigdl/llm/llamaindex/llms/bigdlllm.py +++ b/python/llm/src/ipex_llm/llamaindex/llms/bigdlllm.py @@ -91,7 +91,7 @@ class BigdlLLM(CustomLLM): Example: .. code-block:: python - from bigdl.llm.llamaindex.llms import BigdlLLM + from ipex_llm.llamaindex.llms import BigdlLLM llm = BigdlLLM(model_path="/path/to/llama/model") """ @@ -234,7 +234,7 @@ class BigdlLLM(CustomLLM): None. """ model_kwargs = model_kwargs or {} - from bigdl.llm.transformers import AutoModelForCausalLM + from ipex_llm.transformers import AutoModelForCausalLM if model: self._model = model else: @@ -244,7 +244,7 @@ class BigdlLLM(CustomLLM): trust_remote_code=True, **model_kwargs ) except: - from bigdl.llm.transformers import AutoModel + from ipex_llm.transformers import AutoModel self._model = AutoModel.from_pretrained(model_name, load_in_4bit=True, **model_kwargs) diff --git a/python/llm/src/bigdl/llm/llm_patching.py b/python/llm/src/ipex_llm/llm_patching.py similarity index 92% rename from python/llm/src/bigdl/llm/llm_patching.py rename to python/llm/src/ipex_llm/llm_patching.py index d84ca4f1..8c0a94e5 100644 --- a/python/llm/src/bigdl/llm/llm_patching.py +++ b/python/llm/src/ipex_llm/llm_patching.py @@ -17,7 +17,7 @@ import transformers import importlib import sys -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.common import invalidInputError from enum import Enum bigdl_patched = None # None or 'Train' or 'Inference' @@ -43,7 +43,7 @@ def llm_patch(train=False): # Initial version of patch for llm finetuning, inference support TBD if train: - from bigdl.llm.transformers import AutoModelForCausalLM, AutoModel + from ipex_llm.transformers import AutoModelForCausalLM, AutoModel replace_attr(transformers, "AutoModelForCausalLM", AutoModelForCausalLM) replace_attr(transformers, "LlamaForCausalLM", AutoModelForCausalLM) replace_attr(transformers, "AutoModel", AutoModel) @@ -53,7 +53,7 @@ def llm_patch(train=False): invalidInputError(not import_peft_check, 'llm_patch() should be called at the beginning of your code.') import peft - from bigdl.llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training,\ + from ipex_llm.transformers.qlora import get_peft_model, 
prepare_model_for_kbit_training,\ LoraConfig, TrainingArguments replace_attr(transformers, "TrainingArguments", TrainingArguments) get_peft_model_original = getattr(peft, "get_peft_model") diff --git a/python/llm/src/bigdl/llm/models.py b/python/llm/src/ipex_llm/models.py similarity index 79% rename from python/llm/src/bigdl/llm/models.py rename to python/llm/src/ipex_llm/models.py index 0a1a2123..9157af42 100644 --- a/python/llm/src/bigdl/llm/models.py +++ b/python/llm/src/ipex_llm/models.py @@ -19,9 +19,9 @@ # Otherwise there would be module not found error in non-pip's setting as Python would # only search the first bigdl package and end up finding only one sub-package. -from bigdl.llm.ggml.model.llama import Llama -from bigdl.llm.ggml.model.gptneox import Gptneox -from bigdl.llm.ggml.model.bloom import Bloom -from bigdl.llm.ggml.model.starcoder import Starcoder +from ipex_llm.ggml.model.llama import Llama +from ipex_llm.ggml.model.gptneox import Gptneox +from ipex_llm.ggml.model.bloom import Bloom +from ipex_llm.ggml.model.starcoder import Starcoder # temporarily disable until linux binary file for chatglm ready -# from bigdl.llm.ggml.model.chatglm import ChatGLM +# from ipex_llm.ggml.model.chatglm import ChatGLM diff --git a/python/llm/src/bigdl/llm/optimize.py b/python/llm/src/ipex_llm/optimize.py similarity index 96% rename from python/llm/src/bigdl/llm/optimize.py rename to python/llm/src/ipex_llm/optimize.py index 75b1760b..ee1afc4d 100644 --- a/python/llm/src/bigdl/llm/optimize.py +++ b/python/llm/src/ipex_llm/optimize.py @@ -22,9 +22,9 @@ from torch.nn.modules import Module from torch.nn.modules.module import _IncompatibleKeys from accelerate import init_empty_weights from accelerate.utils import set_module_tensor_to_device -from bigdl.llm.ggml.quantize import ggml_tensor_qtype -from bigdl.llm.utils.common import invalidInputError -from bigdl.llm.transformers.utils import extract_local_archive_file, get_local_shard_files +from ipex_llm.ggml.quantize import ggml_tensor_qtype +from ipex_llm.utils.common import invalidInputError +from ipex_llm.transformers.utils import extract_local_archive_file, get_local_shard_files import transformers import warnings from transformers import PreTrainedModel @@ -145,7 +145,7 @@ def load_low_bit(model, model_path): >>> # Example 1: >>> # Take ChatGLM2-6B model as an example >>> # Make sure you have saved the optimized model by calling 'save_low_bit' - >>> from bigdl.llm.optimize import low_memory_init, load_low_bit + >>> from ipex_llm.optimize import low_memory_init, load_low_bit >>> with low_memory_init(): # Fast and low cost by loading model on meta device >>> model = AutoModel.from_pretrained(saved_dir, >>> torch_dtype="auto", @@ -157,7 +157,7 @@ def load_low_bit(model, model_path): >>> # alternatively, you can obtain the model instance through traditional loading method. >>> # Take OpenAI Whisper model as an example >>> # Make sure you have saved the optimized model by calling 'save_low_bit' - >>> from bigdl.llm.optimize import load_low_bit + >>> from ipex_llm.optimize import load_low_bit >>> model = whisper.load_model('tiny') # A model instance through traditional loading method >>> model = load_low_bit(model, saved_dir) # Load the optimized model """ @@ -216,7 +216,7 @@ def optimize_model(model, low_bit='sym_int4', optimize_llm=True, modules_to_not_ :return: The optimized model. 
>>> # Take OpenAI Whisper model as an example - >>> from bigdl.llm import optimize_model + >>> from ipex_llm import optimize_model >>> model = whisper.load_model('tiny') # Load whisper model under pytorch framework >>> model = optimize_model(model) # With only one line code change >>> # Use the optimized model without other API change diff --git a/python/llm/src/bigdl/llm/serving/__init__.py b/python/llm/src/ipex_llm/serving/__init__.py similarity index 100% rename from python/llm/src/bigdl/llm/serving/__init__.py rename to python/llm/src/ipex_llm/serving/__init__.py diff --git a/python/llm/src/bigdl/llm/serving/fastchat/README.md b/python/llm/src/ipex_llm/serving/fastchat/README.md similarity index 87% rename from python/llm/src/bigdl/llm/serving/fastchat/README.md rename to python/llm/src/ipex_llm/serving/fastchat/README.md index c78b1179..20c4893a 100644 --- a/python/llm/src/bigdl/llm/serving/fastchat/README.md +++ b/python/llm/src/ipex_llm/serving/fastchat/README.md @@ -71,10 +71,10 @@ Then we can run model workers ```bash # On CPU -python3 -m bigdl.llm.serving.fastchat.model_worker --model-path PATH/TO/bigdl-7b --device cpu +python3 -m ipex_llm.serving.fastchat.model_worker --model-path PATH/TO/bigdl-7b --device cpu # On GPU -python3 -m bigdl.llm.serving.fastchat.model_worker --model-path PATH/TO/bigdl-7b --device xpu +python3 -m ipex_llm.serving.fastchat.model_worker --model-path PATH/TO/bigdl-7b --device xpu ``` If you run successfully using `BigDL` backend, you can see the output in log like this: @@ -94,14 +94,14 @@ To run the `bigdl_worker` on CPU, using the following code: source bigdl-llm-init -t # Available low_bit format including sym_int4, sym_int8, bf16 etc. -python3 -m bigdl.llm.serving.fastchat.bigdl_worker --model-path lmsys/vicuna-7b-v1.5 --low-bit "sym_int4" --trust-remote-code --device "cpu" +python3 -m ipex_llm.serving.fastchat.bigdl_worker --model-path lmsys/vicuna-7b-v1.5 --low-bit "sym_int4" --trust-remote-code --device "cpu" ``` For GPU example: ```bash # Available low_bit format including sym_int4, sym_int8, fp16 etc. 
-python3 -m bigdl.llm.serving.fastcaht.bigdl_worker --model-path lmsys/vicuna-7b-v1.5 --low-bit "sym_int4" --trust-remote-code --device "xpu" +python3 -m ipex_llm.serving.fastchat.bigdl_worker --model-path lmsys/vicuna-7b-v1.5 --low-bit "sym_int4" --trust-remote-code --device "xpu" ``` For a full list of accepted arguments, you can refer to the main method of the `bigdl_worker.py` @@ -114,10 +114,10 @@ To run using the `vLLM_worker`, we don't need to change model name, just simply ```bash # On CPU -python3 -m bigdl.llm.serving.fastchat.vllm_worker --model-path REPO_ID_OR_YOUR_MODEL_PATH --device cpu +python3 -m ipex_llm.serving.fastchat.vllm_worker --model-path REPO_ID_OR_YOUR_MODEL_PATH --device cpu # On GPU -python3 -m bigdl.llm.serving.fastchat.vllm_worker --model-path REPO_ID_OR_YOUR_MODEL_PATH --device xpu +python3 -m ipex_llm.serving.fastchat.vllm_worker --model-path REPO_ID_OR_YOUR_MODEL_PATH --device xpu ``` ### Launch Gradio web server diff --git a/python/llm/src/bigdl/llm/serving/fastchat/__init__.py b/python/llm/src/ipex_llm/serving/fastchat/__init__.py similarity index 100% rename from python/llm/src/bigdl/llm/serving/fastchat/__init__.py rename to python/llm/src/ipex_llm/serving/fastchat/__init__.py diff --git a/python/llm/src/bigdl/llm/serving/fastchat/bigdl_llm_model.py b/python/llm/src/ipex_llm/serving/fastchat/bigdl_llm_model.py similarity index 97% rename from python/llm/src/bigdl/llm/serving/fastchat/bigdl_llm_model.py rename to python/llm/src/ipex_llm/serving/fastchat/bigdl_llm_model.py index fafb0a36..ba405239 100644 --- a/python/llm/src/bigdl/llm/serving/fastchat/bigdl_llm_model.py +++ b/python/llm/src/ipex_llm/serving/fastchat/bigdl_llm_model.py @@ -33,7 +33,7 @@ from transformers import AutoTokenizer from typing import Dict, List, Optional import math import psutil -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.common import invalidInputError is_fastchat_patched = False _mapping_fastchat = None @@ -63,7 +63,7 @@ def load_model_base(self, model_path: str, from_pretrained_kwargs: dict): use_fast=self.use_fast_tokenizer, revision=revision, ) - from bigdl.llm.transformers import AutoModelForCausalLM + from ipex_llm.transformers import AutoModelForCausalLM model = AutoModelForCausalLM.from_pretrained( model_path, low_cpu_mem_usage=True, **from_pretrained_kwargs ) @@ -76,7 +76,7 @@ def load_model_chatglm(self, model_path: str, from_pretrained_kwargs: dict): tokenizer = AutoTokenizer.from_pretrained( model_path, trust_remote_code=True, revision=revision ) - from bigdl.llm.transformers import AutoModel + from ipex_llm.transformers import AutoModel model = AutoModel.from_pretrained( model_path, trust_remote_code=True, load_in_4bit=True, **from_pretrained_kwargs ) @@ -246,7 +246,7 @@ class BigDLLLMAdapter(BaseModelAdapter): model_path, use_fast=False, revision=revision, trust_remote_code=True ) print("Customized bigdl-llm loader") - from bigdl.llm.transformers import AutoModelForCausalLM + from ipex_llm.transformers import AutoModelForCausalLM model = AutoModelForCausalLM.from_pretrained( model_path, load_in_4bit=True, @@ -268,7 +268,7 @@ class BigDLLMLOWBITAdapter(BaseModelAdapter): model_path, use_fast=False, revision=revision ) print("Customized bigdl-llm loader") - from bigdl.llm.transformers import AutoModelForCausalLM + from ipex_llm.transformers import AutoModelForCausalLM model = AutoModelForCausalLM.load_low_bit(model_path) return model, tokenizer diff --git a/python/llm/src/bigdl/llm/serving/fastchat/bigdl_worker.py
b/python/llm/src/ipex_llm/serving/fastchat/bigdl_worker.py similarity index 99% rename from python/llm/src/bigdl/llm/serving/fastchat/bigdl_worker.py rename to python/llm/src/ipex_llm/serving/fastchat/bigdl_worker.py index 35c009f7..b357c8d3 100644 --- a/python/llm/src/bigdl/llm/serving/fastchat/bigdl_worker.py +++ b/python/llm/src/ipex_llm/serving/fastchat/bigdl_worker.py @@ -43,7 +43,7 @@ from fastchat.serve.base_model_worker import ( ) from fastchat.utils import get_context_length, is_partial_stop -from bigdl.llm.transformers.loader import load_model +from ipex_llm.transformers.loader import load_model from transformers import TextIteratorStreamer app = FastAPI() diff --git a/python/llm/src/bigdl/llm/serving/fastchat/model_worker.py b/python/llm/src/ipex_llm/serving/fastchat/model_worker.py similarity index 99% rename from python/llm/src/bigdl/llm/serving/fastchat/model_worker.py rename to python/llm/src/ipex_llm/serving/fastchat/model_worker.py index 13b98057..c822769f 100644 --- a/python/llm/src/bigdl/llm/serving/fastchat/model_worker.py +++ b/python/llm/src/ipex_llm/serving/fastchat/model_worker.py @@ -28,7 +28,7 @@ import time from typing import List, Optional import threading import uuid -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.common import invalidInputError from fastapi import FastAPI, Request, BackgroundTasks from fastapi.responses import StreamingResponse, JSONResponse diff --git a/python/llm/src/bigdl/llm/serving/fastchat/vllm_worker.py b/python/llm/src/ipex_llm/serving/fastchat/vllm_worker.py similarity index 97% rename from python/llm/src/bigdl/llm/serving/fastchat/vllm_worker.py rename to python/llm/src/ipex_llm/serving/fastchat/vllm_worker.py index 780ca2ae..53f99746 100644 --- a/python/llm/src/bigdl/llm/serving/fastchat/vllm_worker.py +++ b/python/llm/src/ipex_llm/serving/fastchat/vllm_worker.py @@ -35,10 +35,10 @@ import uvicorn # from vllm.engine.arg_utils import AsyncEngineArgs # from vllm.sampling_params import SamplingParams # from vllm.utils import random_uuid -from bigdl.llm.vllm.engine.async_llm_engine import AsyncLLMEngine -from bigdl.llm.vllm.engine.arg_utils import AsyncEngineArgs -from bigdl.llm.vllm.sampling_params import SamplingParams -from bigdl.llm.vllm.utils import random_uuid +from ipex_llm.vllm.engine.async_llm_engine import AsyncLLMEngine +from ipex_llm.vllm.engine.arg_utils import AsyncEngineArgs +from ipex_llm.vllm.sampling_params import SamplingParams +from ipex_llm.vllm.utils import random_uuid import numpy as np diff --git a/python/llm/src/bigdl/llm/transformers/__init__.py b/python/llm/src/ipex_llm/transformers/__init__.py similarity index 100% rename from python/llm/src/bigdl/llm/transformers/__init__.py rename to python/llm/src/ipex_llm/transformers/__init__.py diff --git a/python/llm/src/bigdl/llm/transformers/awq/__init__.py b/python/llm/src/ipex_llm/transformers/awq/__init__.py similarity index 100% rename from python/llm/src/bigdl/llm/transformers/awq/__init__.py rename to python/llm/src/ipex_llm/transformers/awq/__init__.py diff --git a/python/llm/src/bigdl/llm/transformers/awq/act.py b/python/llm/src/ipex_llm/transformers/awq/act.py similarity index 100% rename from python/llm/src/bigdl/llm/transformers/awq/act.py rename to python/llm/src/ipex_llm/transformers/awq/act.py diff --git a/python/llm/src/bigdl/llm/transformers/awq/awq.py b/python/llm/src/ipex_llm/transformers/awq/awq.py similarity index 97% rename from python/llm/src/bigdl/llm/transformers/awq/awq.py rename to 
python/llm/src/ipex_llm/transformers/awq/awq.py index 308a62f1..c4f822e9 100644 --- a/python/llm/src/bigdl/llm/transformers/awq/awq.py +++ b/python/llm/src/ipex_llm/transformers/awq/awq.py @@ -52,9 +52,9 @@ from transformers.models.opt.modeling_opt import OPTForCausalLM from transformers.models.llama.modeling_llama import LlamaForCausalLM from transformers.models.bloom.modeling_bloom import BloomBlock from transformers import AwqConfig, AutoConfig -from bigdl.llm.transformers.awq.linear import WQLinear_GEMM, WQLinear_GEMV +from ipex_llm.transformers.awq.linear import WQLinear_GEMM, WQLinear_GEMV from huggingface_hub import snapshot_download -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.common import invalidInputError layer_type_dict = { @@ -151,7 +151,7 @@ def get_layer_type(config): def scale_activations(module): - from bigdl.llm.transformers.awq.act import ScaledActivation + from ipex_llm.transformers.awq.act import ScaledActivation param = next(module.parameters()) dtype = param.dtype device = param.device diff --git a/python/llm/src/bigdl/llm/transformers/awq/awq_config.py b/python/llm/src/ipex_llm/transformers/awq/awq_config.py similarity index 98% rename from python/llm/src/bigdl/llm/transformers/awq/awq_config.py rename to python/llm/src/ipex_llm/transformers/awq/awq_config.py index 79d011f5..0f6fe60d 100644 --- a/python/llm/src/bigdl/llm/transformers/awq/awq_config.py +++ b/python/llm/src/ipex_llm/transformers/awq/awq_config.py @@ -34,7 +34,7 @@ # from dataclasses import dataclass -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.common import invalidInputError from transformers.utils.quantization_config import QuantizationConfigMixin from transformers.utils.quantization_config import AwqBackendPackingMethod,\ AWQLinearVersion, QuantizationMethod diff --git a/python/llm/src/bigdl/llm/transformers/awq/linear.py b/python/llm/src/ipex_llm/transformers/awq/linear.py similarity index 99% rename from python/llm/src/bigdl/llm/transformers/awq/linear.py rename to python/llm/src/ipex_llm/transformers/awq/linear.py index b2e4958f..750ffc53 100644 --- a/python/llm/src/bigdl/llm/transformers/awq/linear.py +++ b/python/llm/src/ipex_llm/transformers/awq/linear.py @@ -43,7 +43,7 @@ import torch import torch.nn as nn -from bigdl.llm.utils.common import invalidOperationError, invalidInputError +from ipex_llm.utils.common import invalidOperationError, invalidInputError from transformers import AwqConfig from transformers.utils.quantization_config import AwqBackendPackingMethod diff --git a/python/llm/src/bigdl/llm/transformers/bmm.py b/python/llm/src/ipex_llm/transformers/bmm.py similarity index 100% rename from python/llm/src/bigdl/llm/transformers/bmm.py rename to python/llm/src/ipex_llm/transformers/bmm.py diff --git a/python/llm/src/bigdl/llm/transformers/convert.py b/python/llm/src/ipex_llm/transformers/convert.py similarity index 90% rename from python/llm/src/bigdl/llm/transformers/convert.py rename to python/llm/src/ipex_llm/transformers/convert.py index f21b7207..820dbecc 100644 --- a/python/llm/src/bigdl/llm/transformers/convert.py +++ b/python/llm/src/ipex_llm/transformers/convert.py @@ -42,12 +42,12 @@ from accelerate import init_empty_weights import warnings import transformers import importlib.util -from bigdl.llm.ggml.quantize import ggml_tensor_qtype +from ipex_llm.ggml.quantize import ggml_tensor_qtype from .utils import logger, get_cur_qtype_and_imatrix from typing import Union import numpy as np import os -from 
bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.common import invalidInputError from typing import List, Optional, Tuple, Union @@ -76,7 +76,7 @@ if is_auto_gptq_available(): from auto_gptq.utils.peft_utils import QuantLinearCuda, QuantLinearCudaOld if is_auto_awq_available(): - from bigdl.llm.transformers.awq.linear import WQLinear_GEMM + from ipex_llm.transformers.awq.linear import WQLinear_GEMM from transformers.utils.quantization_config import AwqBackendPackingMethod @@ -120,7 +120,7 @@ def is_linear_module(module): def convert_gptq(module, awq=False, llm_awq=False): - from bigdl.llm.transformers.low_bit_linear import get_block_size + from ipex_llm.transformers.low_bit_linear import get_block_size Q4_1 = get_block_size("asym_int4") scales = module.scales @@ -194,9 +194,9 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None, imatrix_data=None, embedding_qtype=None, model_type=None, torch_dtype=torch.float32, enable_xetla=False): - from bigdl.llm.transformers.low_bit_linear import LowBitLinear, FP4Params, \ + from ipex_llm.transformers.low_bit_linear import LowBitLinear, FP4Params, \ FP16Linear, BF16Linear - from bigdl.llm.transformers.embedding import LLMEmbedding, LowBitEmbedding + from ipex_llm.transformers.embedding import LLMEmbedding, LowBitEmbedding has_been_replaced = False for name, module in model.named_children(): @@ -288,7 +288,7 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None, optimize_lm_head=optimize_lm_head ) device = module.weight.data.device - from bigdl.llm.transformers.utils import get_ipex_version + from ipex_llm.transformers.utils import get_ipex_version if get_ipex_version() < "2.1.10+xpu": new_linear._parameters['weight'] = nn.Parameter(module.weight) else: @@ -389,7 +389,7 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None, def replace_with_low_bit_linear_for_module(model, qtype, module_name=None, modules_to_not_convert=None, current_key_name=None, convert_shape_only=False, torch_dtype="auto"): - from bigdl.llm.transformers.low_bit_linear import LowBitLinear, FP4Params, \ + from ipex_llm.transformers.low_bit_linear import LowBitLinear, FP4Params, \ FP16Linear, BF16Linear has_been_replaced = False @@ -479,7 +479,7 @@ def replace_with_low_bit_linear_for_module(model, qtype, module_name=None, mp_group=mp_group, ) device = module.weight.data.device - from bigdl.llm.transformers.utils import get_ipex_version + from ipex_llm.transformers.utils import get_ipex_version if get_ipex_version() < "2.1.10+xpu": new_linear._parameters['weight'] = nn.Parameter(module.weight) else: @@ -592,7 +592,7 @@ def _optimize_pre(model): not model.config.is_decoder and model.config.position_embedding_type == "absolute" ): - from bigdl.llm.transformers.models.bert import merge_qkv + from ipex_llm.transformers.models.bert import merge_qkv model.apply(merge_qkv) if model.config.model_type == "qwen": position_ids = torch.arange(0, model.config.max_position_embeddings) @@ -695,8 +695,8 @@ def ggml_convert_low_bit(model, qtype, optimize_model=True, # currently put interpolation execution into cpu visual_module_name = model.transformer.visual.__class__.__module__ visual_module = importlib.import_module(visual_module_name) - from bigdl.llm.transformers.models.qwen_vl import qwen_vl_vision_transformer_forward - from bigdl.llm.transformers.models.qwen_vl import qwen_vl_resampler_forward + from ipex_llm.transformers.models.qwen_vl import qwen_vl_vision_transformer_forward + from 
ipex_llm.transformers.models.qwen_vl import qwen_vl_resampler_forward convert_forward(model, visual_module.VisionTransformer, qwen_vl_vision_transformer_forward @@ -710,7 +710,7 @@ def ggml_convert_low_bit(model, qtype, optimize_model=True, def convert_bigdl_other_module(model, dtype): # Convert modules outside of bigdl linear to corresponding dtype - from bigdl.llm.transformers.low_bit_linear import LowBitLinear, \ + from ipex_llm.transformers.low_bit_linear import LowBitLinear, \ FP16Linear, BF16Linear for module in model.modules(): if list(module.children()) == []: @@ -745,7 +745,7 @@ def _optimize_ipex(model, qtype=ggml_tensor_qtype["bf16"]): import intel_extension_for_pytorch as ipex from intel_extension_for_pytorch.transformers.optimize import model_convert_reference from transformers.modeling_attn_mask_utils import AttentionMaskConverter - from bigdl.llm.transformers.convert_ipex import ( + from ipex_llm.transformers.convert_ipex import ( _ipex_optimize_model, _ipex_jit, _make_causal_mask, _llama_model_forward_4_35, convert_function, GLM_get_masks, ) @@ -780,13 +780,13 @@ def _optimize_ipex(model, qtype=ggml_tensor_qtype["bf16"]): def _optimize_post(model, lightweight_bmm=False): from packaging import version - from bigdl.llm.transformers.models.llama import llama_attention_forward_4_31 - from bigdl.llm.transformers.models.llama import llama_attention_selective_batching_forward_4_31 - from bigdl.llm.transformers.models.llama import llama_model_selective_batching_forward_4_31 - from bigdl.llm.transformers.models.llama import llama_rms_norm_forward - from bigdl.llm.transformers.models.llama import llama_mlp_forward - from bigdl.llm.transformers.models.llama import llama_decoder_forward - from bigdl.llm.transformers.models.llama import llama_model_forward + from ipex_llm.transformers.models.llama import llama_attention_forward_4_31 + from ipex_llm.transformers.models.llama import llama_attention_selective_batching_forward_4_31 + from ipex_llm.transformers.models.llama import llama_model_selective_batching_forward_4_31 + from ipex_llm.transformers.models.llama import llama_rms_norm_forward + from ipex_llm.transformers.models.llama import llama_mlp_forward + from ipex_llm.transformers.models.llama import llama_decoder_forward + from ipex_llm.transformers.models.llama import llama_model_forward from transformers.modeling_utils import PreTrainedModel # All huggingface format models are inherited from `PreTrainedModel` @@ -813,8 +813,8 @@ def _optimize_post(model, lightweight_bmm=False): llama_decoder_forward) if version.parse(trans_version) >= version.parse("4.36.0"): # transformers version >= 4.36.0 - from bigdl.llm.transformers.models.llama import llama_attention_forward_4_36 - from bigdl.llm.transformers.models.llama import llama_model_forward_4_36 + from ipex_llm.transformers.models.llama import llama_attention_forward_4_36 + from ipex_llm.transformers.models.llama import llama_model_forward_4_36 convert_forward( model, transformers.models.llama.modeling_llama.LlamaAttention, @@ -850,7 +850,7 @@ def _optimize_post(model, lightweight_bmm=False): pass # convert all nn.LayerNorm - from bigdl.llm.transformers.models.bloom import bloom_layer_norm_forward + from ipex_llm.transformers.models.bloom import bloom_layer_norm_forward convert_forward(model, nn.LayerNorm, bloom_layer_norm_forward) @@ -861,7 +861,7 @@ def _optimize_post(model, lightweight_bmm=False): # chatglm2-6b-32k modeling_module_name = model.__class__.__module__ module = importlib.import_module(modeling_module_name) - from 
bigdl.llm.transformers.models.chatglm2_32k import chatglm2_32k_attention_forward + from ipex_llm.transformers.models.chatglm2_32k import chatglm2_32k_attention_forward convert_forward(model, module.SelfAttention, chatglm2_32k_attention_forward) @@ -870,9 +870,9 @@ def _optimize_post(model, lightweight_bmm=False): # chatglm2-6b modeling_module_name = model.__class__.__module__ module = importlib.import_module(modeling_module_name) - from bigdl.llm.transformers.models.chatglm2 import chatglm2_attention_forward - from bigdl.llm.transformers.models.chatglm2 import chatglm_rms_norm_forward - from bigdl.llm.transformers.models.chatglm2 import chatglm2_model_forward + from ipex_llm.transformers.models.chatglm2 import chatglm2_attention_forward + from ipex_llm.transformers.models.chatglm2 import chatglm_rms_norm_forward + from ipex_llm.transformers.models.chatglm2 import chatglm2_model_forward convert_forward(model, module.SelfAttention, chatglm2_attention_forward) @@ -886,7 +886,7 @@ def _optimize_post(model, lightweight_bmm=False): # chatglm-6b modeling_module_name = model.__class__.__module__ module = importlib.import_module(modeling_module_name) - from bigdl.llm.transformers.models.chatglm import chatglm_attention_forward + from ipex_llm.transformers.models.chatglm import chatglm_attention_forward convert_forward(model, module.SelfAttention, chatglm_attention_forward @@ -896,7 +896,7 @@ def _optimize_post(model, lightweight_bmm=False): modeling_module_name = model.__class__.__module__ attention_module_name = '.'.join(modeling_module_name.split('.')[:-1]) + ".attention" module = importlib.import_module(attention_module_name) - from bigdl.llm.transformers.models.mpt import mpt_multihead_attention_forward + from ipex_llm.transformers.models.mpt import mpt_multihead_attention_forward convert_forward(model, module.MultiheadAttention, mpt_multihead_attention_forward @@ -905,7 +905,7 @@ def _optimize_post(model, lightweight_bmm=False): # dolly-v1-6b modeling_module_name = model.__class__.__module__ module = importlib.import_module(modeling_module_name) - from bigdl.llm.transformers.models.gptj import gptj_attention_forward, gptj_model_forward,\ + from ipex_llm.transformers.models.gptj import gptj_attention_forward, gptj_model_forward,\ gptj_block_forward convert_forward(model, module.GPTJAttention, @@ -919,7 +919,7 @@ def _optimize_post(model, lightweight_bmm=False): elif "bloom" in model.config.model_type: modeling_module_name = model.__class__.__module__ module = importlib.import_module(modeling_module_name) - from bigdl.llm.transformers.models.bloom import bloom_attention_forward + from ipex_llm.transformers.models.bloom import bloom_attention_forward convert_forward(model, module.BloomAttention, bloom_attention_forward @@ -931,7 +931,7 @@ def _optimize_post(model, lightweight_bmm=False): if "RWForCausalLM" in model.config.architectures: if model.config.hidden_size == 4544: # falcon-7b need to check performance drop after kv cache support. 
- # from bigdl.llm.transformers.models.falcon import rw_attention_forward_7b + # from ipex_llm.transformers.models.falcon import rw_attention_forward_7b # convert_forward(model, # module.Attention, # rw_attention_forward_7b @@ -939,7 +939,7 @@ def _optimize_post(model, lightweight_bmm=False): pass else: # falcon-40b - from bigdl.llm.transformers.models.falcon import rw_attention_forward_40b + from ipex_llm.transformers.models.falcon import rw_attention_forward_40b convert_forward(model, module.Attention, rw_attention_forward_40b @@ -949,7 +949,7 @@ def _optimize_post(model, lightweight_bmm=False): # falcon-180b and new falcon-40b if version.parse(trans_version) >= version.parse("4.36.0"): # transformers version >= 4.36.0 - from bigdl.llm.transformers.models.falcon import \ + from ipex_llm.transformers.models.falcon import \ falcon_attention_forward_4_36 convert_forward(model, @@ -957,7 +957,7 @@ def _optimize_post(model, lightweight_bmm=False): falcon_attention_forward_4_36 ) else: - from bigdl.llm.transformers.models.falcon import falcon_attention_forward + from ipex_llm.transformers.models.falcon import falcon_attention_forward convert_forward(model, module.FalconAttention, falcon_attention_forward @@ -969,8 +969,8 @@ def _optimize_post(model, lightweight_bmm=False): # baichuan2-7B modeling_module_name = model.__class__.__module__ module = importlib.import_module(modeling_module_name) - from bigdl.llm.transformers.models.baichuan2 import baichuan_attention_forward_7b - from bigdl.llm.transformers.models.baichuan2 import baichuan_mlp_forward + from ipex_llm.transformers.models.baichuan2 import baichuan_attention_forward_7b + from ipex_llm.transformers.models.baichuan2 import baichuan_mlp_forward convert_forward(model, module.Attention, baichuan_attention_forward_7b @@ -985,10 +985,10 @@ def _optimize_post(model, lightweight_bmm=False): # baichuan2-13B modeling_module_name = model.__class__.__module__ module = importlib.import_module(modeling_module_name) - from bigdl.llm.transformers.models.baichuan2 import baichuan_attention_forward_13b - from bigdl.llm.transformers.models.baichuan2 import baichuan_13b_rms_norm_forward - from bigdl.llm.transformers.models.baichuan2 import baichuan_mlp_forward - from bigdl.llm.transformers.models.baichuan2 import baichuan_13b_get_alibi_mask + from ipex_llm.transformers.models.baichuan2 import baichuan_attention_forward_13b + from ipex_llm.transformers.models.baichuan2 import baichuan_13b_rms_norm_forward + from ipex_llm.transformers.models.baichuan2 import baichuan_mlp_forward + from ipex_llm.transformers.models.baichuan2 import baichuan_13b_get_alibi_mask convert_forward(model, module.BaichuanAttention, baichuan_attention_forward_13b @@ -1010,7 +1010,7 @@ def _optimize_post(model, lightweight_bmm=False): # baichuan-7B modeling_module_name = model.__class__.__module__ module = importlib.import_module(modeling_module_name) - from bigdl.llm.transformers.models.baichuan import baichuan_attention_forward_7b + from ipex_llm.transformers.models.baichuan import baichuan_attention_forward_7b convert_forward(model, module.Attention, baichuan_attention_forward_7b @@ -1022,8 +1022,8 @@ def _optimize_post(model, lightweight_bmm=False): # baichuan-13B modeling_module_name = model.__class__.__module__ module = importlib.import_module(modeling_module_name) - from bigdl.llm.transformers.models.baichuan import baichuan_attention_forward_13b - from bigdl.llm.transformers.models.baichuan2 import baichuan_13b_rms_norm_forward + from ipex_llm.transformers.models.baichuan 
import baichuan_attention_forward_13b + from ipex_llm.transformers.models.baichuan2 import baichuan_13b_rms_norm_forward convert_forward(model, module.BaichuanAttention, baichuan_attention_forward_13b @@ -1033,7 +1033,7 @@ def _optimize_post(model, lightweight_bmm=False): module.RMSNorm, baichuan_13b_rms_norm_forward) elif model.config.model_type == "gpt_neox": - from bigdl.llm.transformers.models.gptneox import gptneox_attention_forward + from ipex_llm.transformers.models.gptneox import gptneox_attention_forward convert_forward(model, transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXAttention, gptneox_attention_forward @@ -1041,8 +1041,8 @@ def _optimize_post(model, lightweight_bmm=False): elif model.config.model_type == "internlm": modeling_module_name = model.__class__.__module__ module = importlib.import_module(modeling_module_name) - from bigdl.llm.transformers.models.internlm import internlm_attention_forward - from bigdl.llm.transformers.models.internlm import internlm2_attention_forward + from ipex_llm.transformers.models.internlm import internlm_attention_forward + from ipex_llm.transformers.models.internlm import internlm2_attention_forward try: convert_forward(model, module.InternLM2Attention, @@ -1068,7 +1068,7 @@ def _optimize_post(model, lightweight_bmm=False): # for Qwen-VL-Chat modeling_module_name = model.__class__.__module__ module = importlib.import_module(modeling_module_name) - from bigdl.llm.transformers.models.qwen_vl import qwen_attention_forward_vl + from ipex_llm.transformers.models.qwen_vl import qwen_attention_forward_vl convert_forward(model, module.QWenAttention, qwen_attention_forward_vl @@ -1077,10 +1077,10 @@ def _optimize_post(model, lightweight_bmm=False): # for Qwen-7B and Qwen-14B modeling_module_name = model.__class__.__module__ module = importlib.import_module(modeling_module_name) - from bigdl.llm.transformers.models.qwen import qwen_attention_forward - from bigdl.llm.transformers.models.qwen import qwen_mlp_forward - from bigdl.llm.transformers.models.chatglm2 import chatglm_rms_norm_forward - from bigdl.llm.transformers.models.qwen import qwen_model_forward + from ipex_llm.transformers.models.qwen import qwen_attention_forward + from ipex_llm.transformers.models.qwen import qwen_mlp_forward + from ipex_llm.transformers.models.chatglm2 import chatglm_rms_norm_forward + from ipex_llm.transformers.models.qwen import qwen_model_forward convert_forward(model, module.QWenAttention, qwen_attention_forward @@ -1098,8 +1098,8 @@ def _optimize_post(model, lightweight_bmm=False): # for Qwen1.5-7B modeling_module_name = model.__class__.__module__ module = importlib.import_module(modeling_module_name) - from bigdl.llm.transformers.models.qwen2 import qwen2_model_forward - from bigdl.llm.transformers.models.qwen2 import qwen2_attention_forward + from ipex_llm.transformers.models.qwen2 import qwen2_model_forward + from ipex_llm.transformers.models.qwen2 import qwen2_attention_forward convert_forward(model, module.Qwen2Model, qwen2_model_forward) @@ -1115,7 +1115,7 @@ def _optimize_post(model, lightweight_bmm=False): elif model.config.model_type == "aquila": modeling_module_name = model.__class__.__module__ module = importlib.import_module(modeling_module_name) - from bigdl.llm.transformers.models.aquila import aquila_attention_forward + from ipex_llm.transformers.models.aquila import aquila_attention_forward convert_forward(model, module.AquilaAttention, aquila_attention_forward @@ -1130,7 +1130,7 @@ def _optimize_post(model, lightweight_bmm=False): "to run 
Mixtral models.") modeling_module_name = model.__class__.__module__ module = importlib.import_module(modeling_module_name) - from bigdl.llm.transformers.models.mixtral import mixtral_moeblock_forward, \ + from ipex_llm.transformers.models.mixtral import mixtral_moeblock_forward, \ mixtral_attention_forward, mixtral_mlp_forward, mixtral_model_forward convert_forward(model, module.MixtralAttention, @@ -1153,7 +1153,7 @@ def _optimize_post(model, lightweight_bmm=False): # For phixtral, limit the condition to avoid applying on phi-2 hosted by ModelScope modeling_module_name = model.__class__.__module__ module = importlib.import_module(modeling_module_name) - from bigdl.llm.transformers.models.phixtral import phixtral_moeblock_forward, \ + from ipex_llm.transformers.models.phixtral import phixtral_moeblock_forward, \ phixtral_mlp_forward convert_forward(model, module.MoE, @@ -1177,8 +1177,8 @@ def _optimize_post(model, lightweight_bmm=False): if version.parse(trans_version) >= version.parse("4.36.0"): modeling_module_name = model.__class__.__module__ module = importlib.import_module(modeling_module_name) - from bigdl.llm.transformers.models.mistral import mistral_attention_forward_4_36 - from bigdl.llm.transformers.models.mistral import mistral_model_forward_4_36 + from ipex_llm.transformers.models.mistral import mistral_attention_forward_4_36 + from ipex_llm.transformers.models.mistral import mistral_model_forward_4_36 convert_forward(model, module.MistralAttention, mistral_attention_forward_4_36 @@ -1196,7 +1196,7 @@ def _optimize_post(model, lightweight_bmm=False): else: modeling_module_name = model.__class__.__module__ module = importlib.import_module(modeling_module_name) - from bigdl.llm.transformers.models.mistral import mistral_attention_forward + from ipex_llm.transformers.models.mistral import mistral_attention_forward convert_forward(model, module.MistralAttention, mistral_attention_forward @@ -1210,9 +1210,9 @@ def _optimize_post(model, lightweight_bmm=False): elif model.config.model_type == "gemma": modeling_module_name = model.__class__.__module__ module = importlib.import_module(modeling_module_name) - from bigdl.llm.transformers.models.gemma import gemma_attention_forward - from bigdl.llm.transformers.models.gemma import gemma_rms_norm_forward - from bigdl.llm.transformers.models.gemma import gemma_mlp_forward + from ipex_llm.transformers.models.gemma import gemma_attention_forward + from ipex_llm.transformers.models.gemma import gemma_rms_norm_forward + from ipex_llm.transformers.models.gemma import gemma_mlp_forward convert_forward(model, module.GemmaAttention, gemma_attention_forward, @@ -1231,7 +1231,7 @@ def _optimize_post(model, lightweight_bmm=False): llama_rms_norm_forward) elif model.config.model_type == "whisper" and lightweight_bmm: if platform.system().lower() == 'windows': - from bigdl.llm.transformers.bmm import SafeBMM + from ipex_llm.transformers.bmm import SafeBMM modeling_module_name = model.__class__.__module__ module = importlib.import_module(modeling_module_name) old_fwd = module.WhisperAttention.forward @@ -1247,8 +1247,8 @@ def _optimize_post(model, lightweight_bmm=False): # rwkv v4 modeling_module_name = model.__class__.__module__ module = importlib.import_module(modeling_module_name) - from bigdl.llm.transformers.models.rwkv4 import rwkv_attention_forward - from bigdl.llm.transformers.models.rwkv4 import rwkv_ffn_forward + from ipex_llm.transformers.models.rwkv4 import rwkv_attention_forward + from ipex_llm.transformers.models.rwkv4 import 
rwkv_ffn_forward convert_forward(model, module.RwkvSelfAttention, rwkv_attention_forward) @@ -1259,9 +1259,9 @@ def _optimize_post(model, lightweight_bmm=False): # rwkv v5 modeling_module_name = model.__class__.__module__ module = importlib.import_module(modeling_module_name) - from bigdl.llm.transformers.models.rwkv5 import rwkv_attention_forward - from bigdl.llm.transformers.models.rwkv5 import rwkv_ffn_forward_wrapper - from bigdl.llm.transformers.models.rwkv5 import rwkv_model_forward_wrapper + from ipex_llm.transformers.models.rwkv5 import rwkv_attention_forward + from ipex_llm.transformers.models.rwkv5 import rwkv_ffn_forward_wrapper + from ipex_llm.transformers.models.rwkv5 import rwkv_model_forward_wrapper convert_forward(model, module.RwkvSelfAttention, rwkv_attention_forward) @@ -1276,7 +1276,7 @@ def _optimize_post(model, lightweight_bmm=False): elif model.config.model_type == "deci": modeling_module_name = model.__class__.__module__ module = importlib.import_module(modeling_module_name) - from bigdl.llm.transformers.models.decilm import decilm_attention_forward_4_35_2 + from ipex_llm.transformers.models.decilm import decilm_attention_forward_4_35_2 convert_forward(model, module.LlamaRMSNorm, llama_rms_norm_forward) @@ -1290,8 +1290,8 @@ def _optimize_post(model, lightweight_bmm=False): # starcoder modeling_module_name = model.__class__.__module__ module = importlib.import_module(modeling_module_name) - from bigdl.llm.transformers.models.gptbigcode import _attn_wrapper - from bigdl.llm.transformers.models.gptbigcode import gptbigcode_attention_forward + from ipex_llm.transformers.models.gptbigcode import _attn_wrapper + from ipex_llm.transformers.models.gptbigcode import gptbigcode_attention_forward convert_forward(model, module.GPTBigCodeAttention, gptbigcode_attention_forward) @@ -1303,8 +1303,8 @@ def _optimize_post(model, lightweight_bmm=False): elif model.config.model_type == 'yuan': modeling_module_name = model.__class__.__module__ module = importlib.import_module(modeling_module_name) - from bigdl.llm.transformers.models.yuan import yuan_attention_forward - # from bigdl.llm.transformers.models.yuan import yuan_mlp_forward + from ipex_llm.transformers.models.yuan import yuan_attention_forward + # from ipex_llm.transformers.models.yuan import yuan_mlp_forward convert_forward(model, module.YuanAttention, yuan_attention_forward @@ -1320,8 +1320,8 @@ def _optimize_post(model, lightweight_bmm=False): ): modeling_module_name = model.__class__.__module__ module = importlib.import_module(modeling_module_name) - from bigdl.llm.transformers.models.bert import self_attention_forward - from bigdl.llm.transformers.models.bert import encoder_forward + from ipex_llm.transformers.models.bert import self_attention_forward + from ipex_llm.transformers.models.bert import encoder_forward convert_forward(model, module.BertSelfAttention, self_attention_forward) diff --git a/python/llm/src/bigdl/llm/transformers/convert_ipex.py b/python/llm/src/ipex_llm/transformers/convert_ipex.py similarity index 98% rename from python/llm/src/bigdl/llm/transformers/convert_ipex.py rename to python/llm/src/ipex_llm/transformers/convert_ipex.py index b50f6c04..4d6764bf 100644 --- a/python/llm/src/bigdl/llm/transformers/convert_ipex.py +++ b/python/llm/src/ipex_llm/transformers/convert_ipex.py @@ -35,7 +35,7 @@ # limitations under the License. 
import torch -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.common import invalidInputError from typing import List, Optional, Tuple, Union from intel_extension_for_pytorch.transformers.optimize import ( lowering_class_cpu, @@ -46,8 +46,8 @@ from intel_extension_for_pytorch.cpu._auto_kernel_selection import ( _using_tpp, _disable_tpp ) -from bigdl.llm.ggml.quantize import ggml_tensor_qtype -from bigdl.llm.transformers.convert import get_enable_ipex +from ipex_llm.ggml.quantize import ggml_tensor_qtype +from ipex_llm.transformers.convert import get_enable_ipex import os diff --git a/python/llm/src/bigdl/llm/transformers/embedding.py b/python/llm/src/ipex_llm/transformers/embedding.py similarity index 97% rename from python/llm/src/bigdl/llm/transformers/embedding.py rename to python/llm/src/ipex_llm/transformers/embedding.py index 4054418f..8031e020 100644 --- a/python/llm/src/bigdl/llm/transformers/embedding.py +++ b/python/llm/src/ipex_llm/transformers/embedding.py @@ -20,8 +20,8 @@ from torch import Tensor from torch.nn import functional as F from torch.nn import Parameter from typing import Optional -from bigdl.llm.transformers.low_bit_linear import FP4Params -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.transformers.low_bit_linear import FP4Params +from ipex_llm.utils.common import invalidInputError # To prevent insufficient available memory when moving embedding from XPU back to CPU, diff --git a/python/llm/src/bigdl/llm/transformers/gguf/__init__.py b/python/llm/src/ipex_llm/transformers/gguf/__init__.py similarity index 100% rename from python/llm/src/bigdl/llm/transformers/gguf/__init__.py rename to python/llm/src/ipex_llm/transformers/gguf/__init__.py diff --git a/python/llm/src/bigdl/llm/transformers/gguf/api.py b/python/llm/src/ipex_llm/transformers/gguf/api.py similarity index 98% rename from python/llm/src/bigdl/llm/transformers/gguf/api.py rename to python/llm/src/ipex_llm/transformers/gguf/api.py index 020a91ba..05203fe0 100644 --- a/python/llm/src/bigdl/llm/transformers/gguf/api.py +++ b/python/llm/src/ipex_llm/transformers/gguf/api.py @@ -15,7 +15,7 @@ # import torch -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.common import invalidInputError qtype_map = { diff --git a/python/llm/src/bigdl/llm/transformers/gguf/gguf.py b/python/llm/src/ipex_llm/transformers/gguf/gguf.py similarity index 99% rename from python/llm/src/bigdl/llm/transformers/gguf/gguf.py rename to python/llm/src/ipex_llm/transformers/gguf/gguf.py index 8d80e5f8..199299f2 100644 --- a/python/llm/src/bigdl/llm/transformers/gguf/gguf.py +++ b/python/llm/src/ipex_llm/transformers/gguf/gguf.py @@ -25,7 +25,7 @@ import numpy from io import BufferedReader from tqdm import tqdm -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.common import invalidInputError class GGUFReader: diff --git a/python/llm/src/bigdl/llm/transformers/gguf/models/__init__.py b/python/llm/src/ipex_llm/transformers/gguf/models/__init__.py similarity index 100% rename from python/llm/src/bigdl/llm/transformers/gguf/models/__init__.py rename to python/llm/src/ipex_llm/transformers/gguf/models/__init__.py diff --git a/python/llm/src/bigdl/llm/transformers/gguf/models/baichuan.py b/python/llm/src/ipex_llm/transformers/gguf/models/baichuan.py similarity index 97% rename from python/llm/src/bigdl/llm/transformers/gguf/models/baichuan.py rename to python/llm/src/ipex_llm/transformers/gguf/models/baichuan.py index 49e81d48..effea840 100644 --- 
a/python/llm/src/bigdl/llm/transformers/gguf/models/baichuan.py +++ b/python/llm/src/ipex_llm/transformers/gguf/models/baichuan.py @@ -24,8 +24,8 @@ from .model_implement.baichuan.modeling_baichuan import BaiChuanForCausalLM from .model_implement.baichuan.tokenization_baichuan import BaiChuanTokenizer from ..gguf import GGUFFileLoader -from bigdl.llm.ggml.quantize import ggml_tensor_qtype -from bigdl.llm.transformers.convert import replace_with_low_bit_linear_for_module +from ipex_llm.ggml.quantize import ggml_tensor_qtype +from ipex_llm.transformers.convert import replace_with_low_bit_linear_for_module def load_gguf_baichuan(loader: GGUFFileLoader, dtype: torch.dtype = torch.float, diff --git a/python/llm/src/bigdl/llm/transformers/gguf/models/bloom.py b/python/llm/src/ipex_llm/transformers/gguf/models/bloom.py similarity index 100% rename from python/llm/src/bigdl/llm/transformers/gguf/models/bloom.py rename to python/llm/src/ipex_llm/transformers/gguf/models/bloom.py diff --git a/python/llm/src/bigdl/llm/transformers/gguf/models/falcon.py b/python/llm/src/ipex_llm/transformers/gguf/models/falcon.py similarity index 100% rename from python/llm/src/bigdl/llm/transformers/gguf/models/falcon.py rename to python/llm/src/ipex_llm/transformers/gguf/models/falcon.py diff --git a/python/llm/src/bigdl/llm/transformers/gguf/models/llama.py b/python/llm/src/ipex_llm/transformers/gguf/models/llama.py similarity index 97% rename from python/llm/src/bigdl/llm/transformers/gguf/models/llama.py rename to python/llm/src/ipex_llm/transformers/gguf/models/llama.py index f40eeab3..86cdadfb 100644 --- a/python/llm/src/bigdl/llm/transformers/gguf/models/llama.py +++ b/python/llm/src/ipex_llm/transformers/gguf/models/llama.py @@ -22,8 +22,8 @@ from tempfile import NamedTemporaryFile from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer from ..gguf import GGUFFileLoader -from bigdl.llm.ggml.quantize import ggml_tensor_qtype -from bigdl.llm.transformers.convert import replace_with_low_bit_linear_for_module +from ipex_llm.ggml.quantize import ggml_tensor_qtype +from ipex_llm.transformers.convert import replace_with_low_bit_linear_for_module def load_gguf_llama(loader: GGUFFileLoader, dtype: torch.dtype = torch.float, diff --git a/python/llm/src/bigdl/llm/transformers/gguf/models/mistral.py b/python/llm/src/ipex_llm/transformers/gguf/models/mistral.py similarity index 97% rename from python/llm/src/bigdl/llm/transformers/gguf/models/mistral.py rename to python/llm/src/ipex_llm/transformers/gguf/models/mistral.py index ba1feae4..d67ec3c8 100644 --- a/python/llm/src/bigdl/llm/transformers/gguf/models/mistral.py +++ b/python/llm/src/ipex_llm/transformers/gguf/models/mistral.py @@ -22,8 +22,8 @@ from tempfile import NamedTemporaryFile from transformers import MistralConfig, MistralForCausalLM, LlamaTokenizer from ..gguf import GGUFFileLoader -from bigdl.llm.ggml.quantize import ggml_tensor_qtype -from bigdl.llm.transformers.convert import replace_with_low_bit_linear_for_module +from ipex_llm.ggml.quantize import ggml_tensor_qtype +from ipex_llm.transformers.convert import replace_with_low_bit_linear_for_module def load_gguf_mistral(loader: GGUFFileLoader, dtype: torch.dtype = torch.float, diff --git a/python/llm/src/bigdl/llm/transformers/gguf/models/mixtral.py b/python/llm/src/ipex_llm/transformers/gguf/models/mixtral.py similarity index 97% rename from python/llm/src/bigdl/llm/transformers/gguf/models/mixtral.py rename to python/llm/src/ipex_llm/transformers/gguf/models/mixtral.py index 
23fc70a3..62a24b3f 100644 --- a/python/llm/src/bigdl/llm/transformers/gguf/models/mixtral.py +++ b/python/llm/src/ipex_llm/transformers/gguf/models/mixtral.py @@ -22,8 +22,8 @@ from tempfile import NamedTemporaryFile from transformers import MixtralConfig, MixtralForCausalLM, LlamaTokenizer from ..gguf import GGUFFileLoader -from bigdl.llm.ggml.quantize import ggml_tensor_qtype -from bigdl.llm.transformers.convert import replace_with_low_bit_linear_for_module +from ipex_llm.ggml.quantize import ggml_tensor_qtype +from ipex_llm.transformers.convert import replace_with_low_bit_linear_for_module def load_gguf_mixtral(loader: GGUFFileLoader, dtype: torch.dtype = torch.float, diff --git a/python/llm/src/bigdl/llm/transformers/gguf/models/model_implement/baichuan/__init__.py b/python/llm/src/ipex_llm/transformers/gguf/models/model_implement/baichuan/__init__.py similarity index 100% rename from python/llm/src/bigdl/llm/transformers/gguf/models/model_implement/baichuan/__init__.py rename to python/llm/src/ipex_llm/transformers/gguf/models/model_implement/baichuan/__init__.py diff --git a/python/llm/src/bigdl/llm/transformers/gguf/models/model_implement/baichuan/configuration_baichuan.py b/python/llm/src/ipex_llm/transformers/gguf/models/model_implement/baichuan/configuration_baichuan.py similarity index 100% rename from python/llm/src/bigdl/llm/transformers/gguf/models/model_implement/baichuan/configuration_baichuan.py rename to python/llm/src/ipex_llm/transformers/gguf/models/model_implement/baichuan/configuration_baichuan.py diff --git a/python/llm/src/bigdl/llm/transformers/gguf/models/model_implement/baichuan/modeling_baichuan.py b/python/llm/src/ipex_llm/transformers/gguf/models/model_implement/baichuan/modeling_baichuan.py similarity index 100% rename from python/llm/src/bigdl/llm/transformers/gguf/models/model_implement/baichuan/modeling_baichuan.py rename to python/llm/src/ipex_llm/transformers/gguf/models/model_implement/baichuan/modeling_baichuan.py diff --git a/python/llm/src/bigdl/llm/transformers/gguf/models/model_implement/baichuan/tokenization_baichuan.py b/python/llm/src/ipex_llm/transformers/gguf/models/model_implement/baichuan/tokenization_baichuan.py similarity index 100% rename from python/llm/src/bigdl/llm/transformers/gguf/models/model_implement/baichuan/tokenization_baichuan.py rename to python/llm/src/ipex_llm/transformers/gguf/models/model_implement/baichuan/tokenization_baichuan.py diff --git a/python/llm/src/bigdl/llm/transformers/gguf/models/model_implement/bloom/tokenizer.json b/python/llm/src/ipex_llm/transformers/gguf/models/model_implement/bloom/tokenizer.json similarity index 100% rename from python/llm/src/bigdl/llm/transformers/gguf/models/model_implement/bloom/tokenizer.json rename to python/llm/src/ipex_llm/transformers/gguf/models/model_implement/bloom/tokenizer.json diff --git a/python/llm/src/bigdl/llm/transformers/gguf/models/model_implement/falcon/tokenizer.json b/python/llm/src/ipex_llm/transformers/gguf/models/model_implement/falcon/tokenizer.json similarity index 100% rename from python/llm/src/bigdl/llm/transformers/gguf/models/model_implement/falcon/tokenizer.json rename to python/llm/src/ipex_llm/transformers/gguf/models/model_implement/falcon/tokenizer.json diff --git a/python/llm/src/bigdl/llm/transformers/gguf/models/model_implement/mpt/tokenizer.json b/python/llm/src/ipex_llm/transformers/gguf/models/model_implement/mpt/tokenizer.json similarity index 100% rename from 
python/llm/src/bigdl/llm/transformers/gguf/models/model_implement/mpt/tokenizer.json rename to python/llm/src/ipex_llm/transformers/gguf/models/model_implement/mpt/tokenizer.json diff --git a/python/llm/src/bigdl/llm/transformers/gguf/models/model_implement/yuan2/__init__.py b/python/llm/src/ipex_llm/transformers/gguf/models/model_implement/yuan2/__init__.py similarity index 100% rename from python/llm/src/bigdl/llm/transformers/gguf/models/model_implement/yuan2/__init__.py rename to python/llm/src/ipex_llm/transformers/gguf/models/model_implement/yuan2/__init__.py diff --git a/python/llm/src/bigdl/llm/transformers/gguf/models/model_implement/yuan2/configuration_yuan.py b/python/llm/src/ipex_llm/transformers/gguf/models/model_implement/yuan2/configuration_yuan.py similarity index 100% rename from python/llm/src/bigdl/llm/transformers/gguf/models/model_implement/yuan2/configuration_yuan.py rename to python/llm/src/ipex_llm/transformers/gguf/models/model_implement/yuan2/configuration_yuan.py diff --git a/python/llm/src/bigdl/llm/transformers/gguf/models/model_implement/yuan2/yuan_hf_model.py b/python/llm/src/ipex_llm/transformers/gguf/models/model_implement/yuan2/yuan_hf_model.py similarity index 100% rename from python/llm/src/bigdl/llm/transformers/gguf/models/model_implement/yuan2/yuan_hf_model.py rename to python/llm/src/ipex_llm/transformers/gguf/models/model_implement/yuan2/yuan_hf_model.py diff --git a/python/llm/src/bigdl/llm/transformers/gguf/models/mpt.py b/python/llm/src/ipex_llm/transformers/gguf/models/mpt.py similarity index 100% rename from python/llm/src/bigdl/llm/transformers/gguf/models/mpt.py rename to python/llm/src/ipex_llm/transformers/gguf/models/mpt.py diff --git a/python/llm/src/bigdl/llm/transformers/gguf/models/yuan2.py b/python/llm/src/ipex_llm/transformers/gguf/models/yuan2.py similarity index 100% rename from python/llm/src/bigdl/llm/transformers/gguf/models/yuan2.py rename to python/llm/src/ipex_llm/transformers/gguf/models/yuan2.py diff --git a/python/llm/src/bigdl/llm/transformers/kv.py b/python/llm/src/ipex_llm/transformers/kv.py similarity index 100% rename from python/llm/src/bigdl/llm/transformers/kv.py rename to python/llm/src/ipex_llm/transformers/kv.py diff --git a/python/llm/src/bigdl/llm/transformers/layers/rope_embedding.py b/python/llm/src/ipex_llm/transformers/layers/rope_embedding.py similarity index 93% rename from python/llm/src/bigdl/llm/transformers/layers/rope_embedding.py rename to python/llm/src/ipex_llm/transformers/layers/rope_embedding.py index b3af61dd..0c6c3714 100644 --- a/python/llm/src/bigdl/llm/transformers/layers/rope_embedding.py +++ b/python/llm/src/ipex_llm/transformers/layers/rope_embedding.py @@ -16,10 +16,10 @@ import torch import logging -from bigdl.llm.transformers.xpu_customize_fwd import custom_fwd, custom_bwd -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.transformers.xpu_customize_fwd import custom_fwd, custom_bwd +from ipex_llm.utils.common import invalidInputError -LOG = logging.getLogger("bigdl.llm.rope_embedding") +LOG = logging.getLogger("ipex_llm.rope_embedding") # Fast RoPE for finetuning, split the q and k diff --git a/python/llm/src/bigdl/llm/transformers/load_config.yaml b/python/llm/src/ipex_llm/transformers/load_config.yaml similarity index 100% rename from python/llm/src/bigdl/llm/transformers/load_config.yaml rename to python/llm/src/ipex_llm/transformers/load_config.yaml diff --git a/python/llm/src/bigdl/llm/transformers/loader.py b/python/llm/src/ipex_llm/transformers/loader.py 
similarity index 97% rename from python/llm/src/bigdl/llm/transformers/loader.py rename to python/llm/src/ipex_llm/transformers/loader.py index 4c3250f8..2876f896 100644 --- a/python/llm/src/bigdl/llm/transformers/loader.py +++ b/python/llm/src/ipex_llm/transformers/loader.py @@ -17,11 +17,11 @@ import torch -from bigdl.llm.transformers import AutoModelForCausalLM, AutoModel, get_enable_ipex +from ipex_llm.transformers import AutoModelForCausalLM, AutoModel, get_enable_ipex import time from datetime import date import argparse -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.common import invalidInputError from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer LLAMA_IDS = ['llama', 'vicuna', 'merged-baize'] diff --git a/python/llm/src/bigdl/llm/transformers/low_bit_linear.py b/python/llm/src/ipex_llm/transformers/low_bit_linear.py similarity index 98% rename from python/llm/src/bigdl/llm/transformers/low_bit_linear.py rename to python/llm/src/ipex_llm/transformers/low_bit_linear.py index 702e7da5..f59724ba 100644 --- a/python/llm/src/bigdl/llm/transformers/low_bit_linear.py +++ b/python/llm/src/ipex_llm/transformers/low_bit_linear.py @@ -42,22 +42,22 @@ from typing import Optional, TypeVar, Union, overload -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.common import invalidInputError import os import torch import torch.nn.functional as F from torch import Tensor, device, dtype, nn from operator import mul from functools import reduce -from bigdl.llm.transformers.xpu_customize_fwd import custom_fwd, custom_bwd -from bigdl.llm.transformers.utils import get_autocast_dtype, get_xpu_device_type, \ +from ipex_llm.transformers.xpu_customize_fwd import custom_fwd, custom_bwd +from ipex_llm.transformers.utils import get_autocast_dtype, get_xpu_device_type, \ get_ipex_version T = TypeVar("T", bound="torch.nn.Module") -import bigdl.llm.ggml.model.llama.llama_cpp as ggml +import ipex_llm.ggml.model.llama.llama_cpp as ggml import ctypes -from bigdl.llm.ggml.quantize import ggml_tensor_qtype +from ipex_llm.ggml.quantize import ggml_tensor_qtype TORCH_LINEAR_THRESHOLD = int(os.getenv("BIGDL_LLM_LINEAR_THRESHOLD", "512")) SYM_INT4 = ggml_tensor_qtype["sym_int4"] @@ -88,7 +88,7 @@ Q2_K = ggml_tensor_qtype["q2_k"] # Note this format cannot be used directly in IPEX's mm_int4, which expects # row major but packing two consecutive columns. 
def q4_0_xpu_transpose(ggml_weight, weight_shape): - from bigdl.llm.transformers.low_bit_linear import get_block_size + from ipex_llm.transformers.low_bit_linear import get_block_size Q4_0 = get_block_size("sym_int4") n, k = weight_shape @@ -586,7 +586,7 @@ class LowBitLinear(nn.Linear): try: import intel_extension_for_pytorch import linear_q4_0 - from bigdl.llm.transformers.models.utils import use_xmx + from ipex_llm.transformers.models.utils import use_xmx except ModuleNotFoundError: invalidInputError(False, "Please `pip install bigdl_core_xe` first.") @@ -646,7 +646,7 @@ class LowBitLinear(nn.Linear): if self.training and x.requires_grad: result = MatMulLowBitCPU.apply(x, self.weight) else: - from bigdl.llm.utils.isa_checker import is_server, is_spr + from ipex_llm.utils.isa_checker import is_server, is_spr # convert if necessary, and compute a linear result if is_server() and (not is_spr()) and \ diff --git a/python/llm/src/bigdl/llm/transformers/model.py b/python/llm/src/ipex_llm/transformers/model.py similarity index 98% rename from python/llm/src/bigdl/llm/transformers/model.py rename to python/llm/src/ipex_llm/transformers/model.py index 374d7887..f4c5bd25 100644 --- a/python/llm/src/bigdl/llm/transformers/model.py +++ b/python/llm/src/ipex_llm/transformers/model.py @@ -42,9 +42,9 @@ from transformers.configuration_utils import PretrainedConfig from .utils import extract_local_archive_file, \ load_state_dict, \ get_local_shard_files, load_imatrix_data -from bigdl.llm.ggml.quantize import ggml_tensor_qtype -from bigdl.llm.utils.common import invalidInputError -from bigdl.llm.transformers.gguf.api import load_gguf_model +from ipex_llm.ggml.quantize import ggml_tensor_qtype +from ipex_llm.utils.common import invalidInputError +from ipex_llm.transformers.gguf.api import load_gguf_model import torch import warnings import copy @@ -94,7 +94,7 @@ def save_low_bit(self, *args, **kwargs): def _load_pre(): from transformers import GPTJModel - from bigdl.llm.transformers.models.gptj import gptj_model_new_init + from ipex_llm.transformers.models.gptj import gptj_model_new_init GPTJModel.__init__ = gptj_model_new_init @@ -218,7 +218,7 @@ class _BaseAutoModelClass: kwargs["modules_to_not_convert"] = ["lm_head"] load_in_8bit = kwargs.pop("load_in_8bit", False) - from bigdl.llm.llm_patching import bigdl_patched + from ipex_llm.llm_patching import bigdl_patched if bigdl_patched == 'Train': global patched_training_mode if load_in_low_bit == "nf4" or load_in_low_bit == "sym_int4" or load_in_4bit: @@ -234,7 +234,7 @@ class _BaseAutoModelClass: if load_in_4bit or load_in_low_bit: if config_dict.get("quantization_config", None) is not None: - from bigdl.llm.transformers.low_bit_linear import get_block_size + from ipex_llm.transformers.low_bit_linear import get_block_size q_config = config_dict["quantization_config"] if q_config["quant_method"] == "gptq": invalidInputError(q_config["bits"] == 4, @@ -260,7 +260,7 @@ class _BaseAutoModelClass: user_quantization_config = GPTQConfig(bits=4, use_exllama=False) kwargs["quantization_config"] = user_quantization_config elif q_config["quant_method"] == "awq": - from bigdl.llm.transformers.awq.awq_config import AwqConfig + from ipex_llm.transformers.awq.awq_config import AwqConfig awq_config = AwqConfig.from_dict(q_config) invalidInputError(awq_config.bits == 4, "Only 4-bit awq is supported in bigdl-llm.") @@ -347,7 +347,7 @@ class _BaseAutoModelClass: :return: An optimized bigdl-llm model and a huggingface tokenizer """ - from bigdl.llm.optimize import 
optimize_model as optimize_model_fn + from ipex_llm.optimize import optimize_model as optimize_model_fn model, tokenizer = load_gguf_model(fpath, dtype=torch.half, low_bit=low_bit) model = optimize_model_fn(model, low_bit=low_bit, optimize_llm=optimize_model, @@ -390,7 +390,7 @@ class _BaseAutoModelClass: # https://github.com/casper-hansen/AutoAWQ/blob/main/awq/models/base.py#L147 from accelerate import init_empty_weights, infer_auto_device_map, \ load_checkpoint_in_model - from bigdl.llm.transformers.awq.awq import _replace_with_awq_layers, \ + from ipex_llm.transformers.awq.awq import _replace_with_awq_layers, \ get_layer_type, _load_config awq_config = quant_config model_weights_path, config = _load_config(args[0], '', max_new_tokens=None, diff --git a/python/llm/src/bigdl/llm/transformers/modelling_bigdl.py b/python/llm/src/ipex_llm/transformers/modelling_bigdl.py similarity index 91% rename from python/llm/src/bigdl/llm/transformers/modelling_bigdl.py rename to python/llm/src/ipex_llm/transformers/modelling_bigdl.py index 4c7ba671..e81b6fdc 100644 --- a/python/llm/src/bigdl/llm/transformers/modelling_bigdl.py +++ b/python/llm/src/ipex_llm/transformers/modelling_bigdl.py @@ -22,7 +22,7 @@ import importlib import logging -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.common import invalidInputError from .model import * @@ -40,7 +40,7 @@ class BigdlNativeForCausalLM: **kwargs): """ :param pretrained_model_name_or_path: Path for converted BigDL-LLM optimized ggml - binary checkpoint. The checkpoint should be converted by ``bigdl.llm.llm_convert``. + binary checkpoint. The checkpoint should be converted by ``ipex_llm.llm_convert``. :param model_family: The model family of the pretrained checkpoint. Currently we support ``"llama"``, ``"bloom"``, ``"gptneox"``, ``"starcoder"`` and ``"chatglm"``. @@ -67,19 +67,19 @@ class BigdlNativeForCausalLM: ggml_model_path = pretrained_model_name_or_path if model_family == 'llama': - from bigdl.llm.ggml.model.llama import Llama + from ipex_llm.ggml.model.llama import Llama return Llama(model_path=ggml_model_path, **kwargs) elif model_family == 'gptneox': - from bigdl.llm.ggml.model.gptneox import Gptneox + from ipex_llm.ggml.model.gptneox import Gptneox return Gptneox(model_path=ggml_model_path, **kwargs) elif model_family == 'bloom': - from bigdl.llm.ggml.model.bloom import Bloom + from ipex_llm.ggml.model.bloom import Bloom return Bloom(model_path=ggml_model_path, **kwargs) elif model_family == 'starcoder': - from bigdl.llm.ggml.model.starcoder import Starcoder + from ipex_llm.ggml.model.starcoder import Starcoder return Starcoder(model_path=ggml_model_path, **kwargs) elif model_family == 'chatglm': - from bigdl.llm.ggml.model.chatglm import ChatGLM + from ipex_llm.ggml.model.chatglm import ChatGLM return ChatGLM(model_path=ggml_model_path, **kwargs) @@ -98,7 +98,7 @@ class _BaseGGMLClass: """ :param pretrained_model_name_or_path: Path for model checkpoint. If running with ``native int4``, the path should be converted BigDL-LLM optimized - ggml binary checkpoint, which should be converted by ``bigdl.llm.llm_convert``. + ggml binary checkpoint, which should be converted by ``ipex_llm.llm_convert``. If running with ``transformers int4``, the path should be the huggingface repo id to be downloaded or the huggingface checkpoint folder. :param native: Load model to either BigDL-LLM optimized Transformer or Native (ggml) int4. 
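The hunks above only retarget import paths from bigdl.llm to ipex_llm; the public entry points in model.py and modelling_bigdl.py keep their existing signatures. As a quick sanity check of the renamed package, a minimal sketch is shown below. It is not part of this patch: the checkpoint path, the prompt, and the low_bit value are placeholders, and the from_gguf signature is assumed from the load_gguf_model / optimize_model calls visible in the model.py hunk.

```python
# Illustrative sketch only -- not part of this patch. Assumes the renamed
# ipex_llm package exposes the same public API shown in the hunks above.
import torch
from ipex_llm.transformers import AutoModelForCausalLM

# from_gguf() wraps load_gguf_model() + optimize_model(), as in the model.py hunk;
# "/path/to/model.gguf" and low_bit="sym_int4" are placeholder arguments.
model, tokenizer = AutoModelForCausalLM.from_gguf("/path/to/model.gguf",
                                                  low_bit="sym_int4")

with torch.inference_mode():
    inputs = tokenizer("What is AI?", return_tensors="pt")
    output = model.generate(**inputs, max_new_tokens=32)
    print(tokenizer.decode(output[0], skip_special_tokens=True))
```

Code that previously imported these entry points from bigdl.llm.transformers should only need the module path updated; the keyword arguments are unchanged by this patch.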
@@ -132,30 +132,30 @@ class _BaseGGMLClass: class LlamaForCausalLM(_BaseGGMLClass): - GGML_Module = "bigdl.llm.models" + GGML_Module = "ipex_llm.models" GGML_Model = "Llama" HF_Class = AutoModelForCausalLM class ChatGLMForCausalLM(_BaseGGMLClass): - GGML_Module = "bigdl.llm.ggml.model.chatglm" + GGML_Module = "ipex_llm.ggml.model.chatglm" GGML_Model = "ChatGLM" HF_Class = AutoModel class GptneoxForCausalLM(_BaseGGMLClass): - GGML_Module = "bigdl.llm.models" + GGML_Module = "ipex_llm.models" GGML_Model = "Gptneox" HF_Class = AutoModelForCausalLM class BloomForCausalLM(_BaseGGMLClass): - GGML_Module = "bigdl.llm.models" + GGML_Module = "ipex_llm.models" GGML_Model = "Bloom" HF_Class = AutoModelForCausalLM class StarcoderForCausalLM(_BaseGGMLClass): - GGML_Module = "bigdl.llm.models" + GGML_Module = "ipex_llm.models" GGML_Model = "Starcoder" HF_Class = AutoModelForCausalLM diff --git a/python/llm/src/bigdl/llm/transformers/models/__init__.py b/python/llm/src/ipex_llm/transformers/models/__init__.py similarity index 100% rename from python/llm/src/bigdl/llm/transformers/models/__init__.py rename to python/llm/src/ipex_llm/transformers/models/__init__.py diff --git a/python/llm/src/bigdl/llm/transformers/models/aquila.py b/python/llm/src/ipex_llm/transformers/models/aquila.py similarity index 96% rename from python/llm/src/bigdl/llm/transformers/models/aquila.py rename to python/llm/src/ipex_llm/transformers/models/aquila.py index 68ca7a01..1b1d252a 100644 --- a/python/llm/src/bigdl/llm/transformers/models/aquila.py +++ b/python/llm/src/ipex_llm/transformers/models/aquila.py @@ -42,11 +42,11 @@ import torch import torch.utils.checkpoint from torch import nn -from bigdl.llm.transformers.models.utils import extend_kv_cache, init_kv_cache, \ +from ipex_llm.transformers.models.utils import extend_kv_cache, init_kv_cache, \ append_kv_cache, is_enough_kv_cache_room_4_31 -from bigdl.llm.transformers.models.utils import apply_rotary_pos_emb -from bigdl.llm.transformers.models.utils import apply_rotary_pos_emb_no_cache_xpu -from bigdl.llm.utils.common import log4Error +from ipex_llm.transformers.models.utils import apply_rotary_pos_emb +from ipex_llm.transformers.models.utils import apply_rotary_pos_emb_no_cache_xpu +from ipex_llm.utils.common import log4Error KV_CACHE_ALLOC_BLOCK_LENGTH = 256 diff --git a/python/llm/src/bigdl/llm/transformers/models/baichuan.py b/python/llm/src/ipex_llm/transformers/models/baichuan.py similarity index 98% rename from python/llm/src/bigdl/llm/transformers/models/baichuan.py rename to python/llm/src/ipex_llm/transformers/models/baichuan.py index 29e7968f..0c9e8216 100644 --- a/python/llm/src/bigdl/llm/transformers/models/baichuan.py +++ b/python/llm/src/ipex_llm/transformers/models/baichuan.py @@ -26,14 +26,14 @@ import torch.utils.checkpoint from torch import nn import torch.nn.functional as F from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss -from bigdl.llm.utils.common import invalidInputError -from bigdl.llm.transformers.models.utils import use_flash_attention, use_esimd_sdp -from bigdl.llm.transformers.models.utils import init_kv_cache, extend_kv_cache, \ +from ipex_llm.utils.common import invalidInputError +from ipex_llm.transformers.models.utils import use_flash_attention, use_esimd_sdp +from ipex_llm.transformers.models.utils import init_kv_cache, extend_kv_cache, \ append_kv_cache, is_enough_kv_cache_room_4_31 -from bigdl.llm.transformers.models.utils import init_fp8_kv_cache, append_fp8_kv_cache, \ +from ipex_llm.transformers.models.utils import 
init_fp8_kv_cache, append_fp8_kv_cache, \ restore_fp8_kv_cache, use_quantize_kv_cache -from bigdl.llm.transformers.models.utils import rotate_half, apply_rotary_pos_emb -from bigdl.llm.transformers.models.utils import apply_rotary_pos_emb_no_cache_xpu +from ipex_llm.transformers.models.utils import rotate_half, apply_rotary_pos_emb +from ipex_llm.transformers.models.utils import apply_rotary_pos_emb_no_cache_xpu KV_CACHE_ALLOC_BLOCK_LENGTH = 256 diff --git a/python/llm/src/bigdl/llm/transformers/models/baichuan2.py b/python/llm/src/ipex_llm/transformers/models/baichuan2.py similarity index 98% rename from python/llm/src/bigdl/llm/transformers/models/baichuan2.py rename to python/llm/src/ipex_llm/transformers/models/baichuan2.py index 1cb4b117..38a47592 100644 --- a/python/llm/src/bigdl/llm/transformers/models/baichuan2.py +++ b/python/llm/src/ipex_llm/transformers/models/baichuan2.py @@ -23,15 +23,15 @@ from typing import Optional, Tuple import torch import torch.utils.checkpoint from torch.nn import functional as F -from bigdl.llm.ggml.quantize import ggml_tensor_qtype -from bigdl.llm.transformers.models.utils import init_fp8_kv_cache, append_fp8_kv_cache, \ +from ipex_llm.ggml.quantize import ggml_tensor_qtype +from ipex_llm.transformers.models.utils import init_fp8_kv_cache, append_fp8_kv_cache, \ restore_fp8_kv_cache, use_quantize_kv_cache -from bigdl.llm.transformers.models.utils import init_kv_cache, extend_kv_cache, \ +from ipex_llm.transformers.models.utils import init_kv_cache, extend_kv_cache, \ append_kv_cache, is_enough_kv_cache_room_4_31 -from bigdl.llm.transformers.models.utils import use_flash_attention, use_esimd_sdp -from bigdl.llm.transformers.models.utils import apply_rotary_pos_emb, SILU -from bigdl.llm.transformers.models.utils import apply_rotary_pos_emb_no_cache_xpu -from bigdl.llm.transformers.models.utils import mlp_fusion_check +from ipex_llm.transformers.models.utils import use_flash_attention, use_esimd_sdp +from ipex_llm.transformers.models.utils import apply_rotary_pos_emb, SILU +from ipex_llm.transformers.models.utils import apply_rotary_pos_emb_no_cache_xpu +from ipex_llm.transformers.models.utils import mlp_fusion_check from transformers.utils import logging logger = logging.get_logger(__name__) diff --git a/python/llm/src/bigdl/llm/transformers/models/bert.py b/python/llm/src/ipex_llm/transformers/models/bert.py similarity index 99% rename from python/llm/src/bigdl/llm/transformers/models/bert.py rename to python/llm/src/ipex_llm/transformers/models/bert.py index 27abd988..4c83ba6c 100644 --- a/python/llm/src/bigdl/llm/transformers/models/bert.py +++ b/python/llm/src/ipex_llm/transformers/models/bert.py @@ -36,7 +36,7 @@ import math import torch from typing import Optional, Tuple from transformers.models.bert.modeling_bert import BertSelfAttention, BertEncoder -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.common import invalidInputError def merge_qkv(module: torch.nn.Module): diff --git a/python/llm/src/bigdl/llm/transformers/models/bloom.py b/python/llm/src/ipex_llm/transformers/models/bloom.py similarity index 98% rename from python/llm/src/bigdl/llm/transformers/models/bloom.py rename to python/llm/src/ipex_llm/transformers/models/bloom.py index 4438270f..46489e8b 100644 --- a/python/llm/src/bigdl/llm/transformers/models/bloom.py +++ b/python/llm/src/ipex_llm/transformers/models/bloom.py @@ -37,8 +37,8 @@ from typing import Optional, Tuple import torch import torch.utils.checkpoint from torch.nn import functional as F -from 
bigdl.llm.transformers.models.utils import use_fused_layer_norm -from bigdl.llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache +from ipex_llm.transformers.models.utils import use_fused_layer_norm +from ipex_llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache KV_CACHE_ALLOC_BLOCK_LENGTH = 256 diff --git a/python/llm/src/bigdl/llm/transformers/models/chatglm.py b/python/llm/src/ipex_llm/transformers/models/chatglm.py similarity index 99% rename from python/llm/src/bigdl/llm/transformers/models/chatglm.py rename to python/llm/src/ipex_llm/transformers/models/chatglm.py index 4adcc722..ac9a98a1 100644 --- a/python/llm/src/bigdl/llm/transformers/models/chatglm.py +++ b/python/llm/src/ipex_llm/transformers/models/chatglm.py @@ -22,7 +22,7 @@ import torch import torch.utils.checkpoint import torch.nn.functional as F from typing import Optional, Tuple -from bigdl.llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache +from ipex_llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache def rotate_half(x): diff --git a/python/llm/src/bigdl/llm/transformers/models/chatglm2.py b/python/llm/src/ipex_llm/transformers/models/chatglm2.py similarity index 99% rename from python/llm/src/bigdl/llm/transformers/models/chatglm2.py rename to python/llm/src/ipex_llm/transformers/models/chatglm2.py index 1db8424a..c6dae7a9 100644 --- a/python/llm/src/bigdl/llm/transformers/models/chatglm2.py +++ b/python/llm/src/ipex_llm/transformers/models/chatglm2.py @@ -22,10 +22,10 @@ import torch from typing import Optional, Tuple, List import torch.nn.functional as F from transformers.modeling_outputs import BaseModelOutputWithPast -from bigdl.llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache -from bigdl.llm.transformers.models.utils import init_fp8_kv_cache, append_fp8_kv_cache, \ +from ipex_llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache +from ipex_llm.transformers.models.utils import init_fp8_kv_cache, append_fp8_kv_cache, \ restore_fp8_kv_cache, use_quantize_kv_cache -from bigdl.llm.transformers.models.utils import use_esimd_sdp +from ipex_llm.transformers.models.utils import use_esimd_sdp KV_CACHE_ALLOC_BLOCK_LENGTH = 256 diff --git a/python/llm/src/bigdl/llm/transformers/models/chatglm2_32k.py b/python/llm/src/ipex_llm/transformers/models/chatglm2_32k.py similarity index 98% rename from python/llm/src/bigdl/llm/transformers/models/chatglm2_32k.py rename to python/llm/src/ipex_llm/transformers/models/chatglm2_32k.py index d85861f2..94856152 100644 --- a/python/llm/src/bigdl/llm/transformers/models/chatglm2_32k.py +++ b/python/llm/src/ipex_llm/transformers/models/chatglm2_32k.py @@ -20,7 +20,7 @@ import torch from typing import Optional, Tuple, Union, List, Callable, Dict, Any import torch.nn.functional as F -from bigdl.llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache +from ipex_llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache KV_CACHE_ALLOC_BLOCK_LENGTH = 256 diff --git a/python/llm/src/bigdl/llm/transformers/models/decilm.py b/python/llm/src/ipex_llm/transformers/models/decilm.py similarity index 95% rename from python/llm/src/bigdl/llm/transformers/models/decilm.py rename to python/llm/src/ipex_llm/transformers/models/decilm.py index 788f4bab..67bc5e49 100644 --- a/python/llm/src/bigdl/llm/transformers/models/decilm.py +++ 
b/python/llm/src/ipex_llm/transformers/models/decilm.py @@ -34,12 +34,12 @@ import torch from typing import Optional, Tuple import torch.nn.functional as F -from bigdl.llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache -from bigdl.llm.transformers.models.utils import is_enough_kv_cache_room_4_31, \ +from ipex_llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache +from ipex_llm.transformers.models.utils import is_enough_kv_cache_room_4_31, \ apply_rotary_pos_emb -from bigdl.llm.transformers.models.utils import apply_rotary_pos_emb_no_cache_xpu -from bigdl.llm.transformers.models.llama import should_use_fuse_rope, repeat_kv -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.transformers.models.utils import apply_rotary_pos_emb_no_cache_xpu +from ipex_llm.transformers.models.llama import should_use_fuse_rope, repeat_kv +from ipex_llm.utils.common import invalidInputError KV_CACHE_ALLOC_BLOCK_LENGTH = 256 diff --git a/python/llm/src/bigdl/llm/transformers/models/falcon.py b/python/llm/src/ipex_llm/transformers/models/falcon.py similarity index 99% rename from python/llm/src/bigdl/llm/transformers/models/falcon.py rename to python/llm/src/ipex_llm/transformers/models/falcon.py index aa4abc14..4932aeab 100644 --- a/python/llm/src/bigdl/llm/transformers/models/falcon.py +++ b/python/llm/src/ipex_llm/transformers/models/falcon.py @@ -37,8 +37,8 @@ from typing import Optional, Tuple import torch from torch.nn import functional as F -from bigdl.llm.utils.common import invalidInputError -from bigdl.llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache +from ipex_llm.utils.common import invalidInputError +from ipex_llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache import warnings diff --git a/python/llm/src/bigdl/llm/transformers/models/gemma.py b/python/llm/src/ipex_llm/transformers/models/gemma.py similarity index 96% rename from python/llm/src/bigdl/llm/transformers/models/gemma.py rename to python/llm/src/ipex_llm/transformers/models/gemma.py index 9400c034..26934f03 100644 --- a/python/llm/src/bigdl/llm/transformers/models/gemma.py +++ b/python/llm/src/ipex_llm/transformers/models/gemma.py @@ -35,12 +35,12 @@ from typing import Optional, Tuple import torch from torch import nn -from bigdl.llm.utils.common import invalidInputError -from bigdl.llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache -from bigdl.llm.transformers.models.utils import apply_rotary_pos_emb_cache_freq_xpu -from bigdl.llm.transformers.models.utils import mlp_fusion_check, GELU -from bigdl.llm.transformers.models.utils import is_enough_kv_cache_room_4_36, rotate_half -from bigdl.llm.transformers.low_bit_linear import SYM_INT4, FP8E5 +from ipex_llm.utils.common import invalidInputError +from ipex_llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache +from ipex_llm.transformers.models.utils import apply_rotary_pos_emb_cache_freq_xpu +from ipex_llm.transformers.models.utils import mlp_fusion_check, GELU +from ipex_llm.transformers.models.utils import is_enough_kv_cache_room_4_36, rotate_half +from ipex_llm.transformers.low_bit_linear import SYM_INT4, FP8E5 KV_CACHE_ALLOC_BLOCK_LENGTH = 256 diff --git a/python/llm/src/bigdl/llm/transformers/models/gptbigcode.py b/python/llm/src/ipex_llm/transformers/models/gptbigcode.py similarity index 98% rename from python/llm/src/bigdl/llm/transformers/models/gptbigcode.py rename to 
python/llm/src/ipex_llm/transformers/models/gptbigcode.py index 8a38f22e..611b9fba 100644 --- a/python/llm/src/bigdl/llm/transformers/models/gptbigcode.py +++ b/python/llm/src/ipex_llm/transformers/models/gptbigcode.py @@ -46,7 +46,7 @@ def gptbigcode_attention_forward( if encoder_hidden_states is not None: if not hasattr(self, "q_attn") or not self.is_cross_attention: - from bigdl.llm.utils.common import invalidInputError + from ipex_llm.utils.common import invalidInputError invalidInputError( False, "If class is used as cross attention," + diff --git a/python/llm/src/bigdl/llm/transformers/models/gptj.py b/python/llm/src/ipex_llm/transformers/models/gptj.py similarity index 99% rename from python/llm/src/bigdl/llm/transformers/models/gptj.py rename to python/llm/src/ipex_llm/transformers/models/gptj.py index 794cf291..38df3cb1 100644 --- a/python/llm/src/bigdl/llm/transformers/models/gptj.py +++ b/python/llm/src/ipex_llm/transformers/models/gptj.py @@ -19,12 +19,12 @@ import torch from typing import Optional, Tuple, Union -from bigdl.llm.transformers.models.utils import init_kv_cache, extend_kv_cache, \ +from ipex_llm.transformers.models.utils import init_kv_cache, extend_kv_cache, \ apply_rotary_pos_emb, append_kv_cache, apply_ipex_rotate_every_two from transformers.utils.import_utils import is_torch_fx_proxy from transformers.modeling_outputs import BaseModelOutputWithPast from transformers.models.gptj.modeling_gptj import GPTJModel -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.common import invalidInputError KV_CACHE_ALLOC_BLOCK_LENGTH = 256 diff --git a/python/llm/src/bigdl/llm/transformers/models/gptneox.py b/python/llm/src/ipex_llm/transformers/models/gptneox.py similarity index 96% rename from python/llm/src/bigdl/llm/transformers/models/gptneox.py rename to python/llm/src/ipex_llm/transformers/models/gptneox.py index ca29845a..52466042 100644 --- a/python/llm/src/bigdl/llm/transformers/models/gptneox.py +++ b/python/llm/src/ipex_llm/transformers/models/gptneox.py @@ -33,10 +33,10 @@ import torch from typing import Optional, Tuple -from bigdl.llm.transformers.models.utils import apply_rotary_pos_emb -from bigdl.llm.transformers.models.utils import init_kv_cache, extend_kv_cache, \ +from ipex_llm.transformers.models.utils import apply_rotary_pos_emb +from ipex_llm.transformers.models.utils import init_kv_cache, extend_kv_cache, \ append_kv_cache, is_enough_kv_cache_room_4_31 -from bigdl.llm.transformers.models.utils import apply_rotary_pos_emb_no_cache_xpu +from ipex_llm.transformers.models.utils import apply_rotary_pos_emb_no_cache_xpu KV_CACHE_ALLOC_BLOCK_LENGTH = 256 diff --git a/python/llm/src/bigdl/llm/transformers/models/internlm.py b/python/llm/src/ipex_llm/transformers/models/internlm.py similarity index 97% rename from python/llm/src/bigdl/llm/transformers/models/internlm.py rename to python/llm/src/ipex_llm/transformers/models/internlm.py index e75e67d4..038a63d8 100644 --- a/python/llm/src/bigdl/llm/transformers/models/internlm.py +++ b/python/llm/src/ipex_llm/transformers/models/internlm.py @@ -42,11 +42,11 @@ from typing import Optional, Tuple import torch import torch.utils.checkpoint from torch import nn -from bigdl.llm.utils.common import invalidInputError -from bigdl.llm.transformers.models.utils import init_kv_cache, extend_kv_cache, \ +from ipex_llm.utils.common import invalidInputError +from ipex_llm.transformers.models.utils import init_kv_cache, extend_kv_cache, \ append_kv_cache, is_enough_kv_cache_room_4_31 -from 
bigdl.llm.transformers.models.utils import apply_rotary_pos_emb -from bigdl.llm.transformers.models.utils import apply_rotary_pos_emb_no_cache_xpu +from ipex_llm.transformers.models.utils import apply_rotary_pos_emb +from ipex_llm.transformers.models.utils import apply_rotary_pos_emb_no_cache_xpu KV_CACHE_ALLOC_BLOCK_LENGTH = 256 diff --git a/python/llm/src/bigdl/llm/transformers/models/llama.py b/python/llm/src/ipex_llm/transformers/models/llama.py similarity index 99% rename from python/llm/src/bigdl/llm/transformers/models/llama.py rename to python/llm/src/ipex_llm/transformers/models/llama.py index 3646253c..45d944c5 100644 --- a/python/llm/src/bigdl/llm/transformers/models/llama.py +++ b/python/llm/src/ipex_llm/transformers/models/llama.py @@ -39,20 +39,20 @@ from typing import Optional, Tuple, Union, List import math import os import torch.nn.functional as F -from bigdl.llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache -from bigdl.llm.transformers.models.utils import SILU -from bigdl.llm.transformers.models.utils import init_fp8_kv_cache, append_fp8_kv_cache, \ +from ipex_llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache +from ipex_llm.transformers.models.utils import SILU +from ipex_llm.transformers.models.utils import init_fp8_kv_cache, append_fp8_kv_cache, \ restore_fp8_kv_cache, use_quantize_kv_cache -from bigdl.llm.transformers.models.utils import is_enough_kv_cache_room_4_31, \ +from ipex_llm.transformers.models.utils import is_enough_kv_cache_room_4_31, \ apply_rotary_pos_emb, is_enough_kv_cache_room_4_36 -from bigdl.llm.transformers.models.utils import apply_rotary_pos_emb_no_cache_xpu -from bigdl.llm.transformers.models.utils import use_flash_attention, use_esimd_sdp -from bigdl.llm.transformers.models.utils import mlp_fusion_check, fp16_fusion_check +from ipex_llm.transformers.models.utils import apply_rotary_pos_emb_no_cache_xpu +from ipex_llm.transformers.models.utils import use_flash_attention, use_esimd_sdp +from ipex_llm.transformers.models.utils import mlp_fusion_check, fp16_fusion_check from transformers.modeling_outputs import BaseModelOutputWithPast from transformers.models.llama.modeling_llama import LlamaModel -from bigdl.llm.transformers.low_bit_linear import SYM_INT4, FP8E5, IQ2_XXS -from bigdl.llm.ggml.quantize import ggml_tensor_qtype -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.transformers.low_bit_linear import SYM_INT4, FP8E5, IQ2_XXS +from ipex_llm.ggml.quantize import ggml_tensor_qtype +from ipex_llm.utils.common import invalidInputError try: from transformers.cache_utils import Cache, DynamicCache @@ -106,7 +106,7 @@ def llama_model_forward_4_36( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, BaseModelOutputWithPast]: - from bigdl.llm.transformers.kv import DynamicFp8Cache + from ipex_llm.transformers.kv import DynamicFp8Cache use_cache = use_cache if use_cache is not None else self.config.use_cache if use_cache and use_quantize_kv_cache(self.layers[0].mlp.up_proj, input_ids): if not isinstance(past_key_values, DynamicFp8Cache): @@ -1558,7 +1558,7 @@ def llama_attention_fast_forward( kv_seq_len += past_key_value[0].shape[-2] if use_fast_rope: - from bigdl.llm.transformers.layers.rope_embedding import apply_fast_rope_embedding + from ipex_llm.transformers.layers.rope_embedding import apply_fast_rope_embedding query_states, key_states = apply_fast_rope_embedding(query_states, key_states, position_ids, diff --git 
a/python/llm/src/bigdl/llm/transformers/models/mistral.py b/python/llm/src/ipex_llm/transformers/models/mistral.py similarity index 98% rename from python/llm/src/bigdl/llm/transformers/models/mistral.py rename to python/llm/src/ipex_llm/transformers/models/mistral.py index 3db06cfe..5c7a6343 100644 --- a/python/llm/src/bigdl/llm/transformers/models/mistral.py +++ b/python/llm/src/ipex_llm/transformers/models/mistral.py @@ -43,16 +43,16 @@ from torch import nn import torch.nn.functional as F from transformers.modeling_outputs import BaseModelOutputWithPast from transformers.models.mistral.modeling_mistral import MistralModel -from bigdl.llm.utils.common import invalidInputError -from bigdl.llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache -from bigdl.llm.transformers.models.utils import init_fp8_kv_cache, append_fp8_kv_cache, \ +from ipex_llm.utils.common import invalidInputError +from ipex_llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache +from ipex_llm.transformers.models.utils import init_fp8_kv_cache, append_fp8_kv_cache, \ restore_fp8_kv_cache, use_quantize_kv_cache -from bigdl.llm.transformers.models.utils import apply_rotary_pos_emb, \ +from ipex_llm.transformers.models.utils import apply_rotary_pos_emb, \ apply_rotary_pos_emb_no_cache_xpu -from bigdl.llm.transformers.models.utils import is_enough_kv_cache_room_4_31, \ +from ipex_llm.transformers.models.utils import is_enough_kv_cache_room_4_31, \ is_enough_kv_cache_room_4_36 -from bigdl.llm.transformers.low_bit_linear import SYM_INT4, FP8E5, IQ2_XXS -from bigdl.llm.transformers.models.utils import use_flash_attention, use_esimd_sdp +from ipex_llm.transformers.low_bit_linear import SYM_INT4, FP8E5, IQ2_XXS +from ipex_llm.transformers.models.utils import use_flash_attention, use_esimd_sdp try: from transformers.cache_utils import Cache except ImportError: @@ -138,7 +138,7 @@ def mistral_model_forward_4_36( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, BaseModelOutputWithPast]: - from bigdl.llm.transformers.kv import DynamicFp8Cache + from ipex_llm.transformers.kv import DynamicFp8Cache use_cache = use_cache if use_cache is not None else self.config.use_cache if use_cache and use_quantize_kv_cache(self.layers[0].mlp.up_proj, input_ids): if not isinstance(past_key_values, DynamicFp8Cache): diff --git a/python/llm/src/bigdl/llm/transformers/models/mixtral.py b/python/llm/src/ipex_llm/transformers/models/mixtral.py similarity index 97% rename from python/llm/src/bigdl/llm/transformers/models/mixtral.py rename to python/llm/src/ipex_llm/transformers/models/mixtral.py index 0e31e238..f5c836ac 100644 --- a/python/llm/src/bigdl/llm/transformers/models/mixtral.py +++ b/python/llm/src/ipex_llm/transformers/models/mixtral.py @@ -48,15 +48,15 @@ from transformers.modeling_attn_mask_utils import ( import torch from torch import nn import torch.nn.functional as F -from bigdl.llm.ggml.quantize import ggml_tensor_qtype -from bigdl.llm.utils.common import invalidInputError -from bigdl.llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache -from bigdl.llm.transformers.models.utils import apply_rotary_pos_emb,\ +from ipex_llm.ggml.quantize import ggml_tensor_qtype +from ipex_llm.utils.common import invalidInputError +from ipex_llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache +from ipex_llm.transformers.models.utils import apply_rotary_pos_emb,\ 
apply_rotary_pos_emb_cache_freq_xpu, is_enough_kv_cache_room_4_36 -from bigdl.llm.transformers.models.mistral import should_use_fuse_rope, use_decoding_fast_path -from bigdl.llm.transformers.models.utils import use_flash_attention, use_esimd_sdp -from bigdl.llm.transformers.models.utils import mlp_fusion_check, SILU -from bigdl.llm.transformers.low_bit_linear import IQ2_XXS +from ipex_llm.transformers.models.mistral import should_use_fuse_rope, use_decoding_fast_path +from ipex_llm.transformers.models.utils import use_flash_attention, use_esimd_sdp +from ipex_llm.transformers.models.utils import mlp_fusion_check, SILU +from ipex_llm.transformers.low_bit_linear import IQ2_XXS KV_CACHE_ALLOC_BLOCK_LENGTH = 256 diff --git a/python/llm/src/bigdl/llm/transformers/models/mpt.py b/python/llm/src/ipex_llm/transformers/models/mpt.py similarity index 98% rename from python/llm/src/bigdl/llm/transformers/models/mpt.py rename to python/llm/src/ipex_llm/transformers/models/mpt.py index a09ef771..4d4a191a 100644 --- a/python/llm/src/bigdl/llm/transformers/models/mpt.py +++ b/python/llm/src/ipex_llm/transformers/models/mpt.py @@ -22,8 +22,8 @@ import torch from einops import rearrange import math import torch.nn.functional as F -from bigdl.llm.utils.common import invalidInputError -from bigdl.llm.transformers.models.utils import extend_kv_cache, init_kv_cache, append_kv_cache +from ipex_llm.utils.common import invalidInputError +from ipex_llm.transformers.models.utils import extend_kv_cache, init_kv_cache, append_kv_cache KV_CACHE_ALLOC_BLOCK_LENGTH = 256 diff --git a/python/llm/src/bigdl/llm/transformers/models/phixtral.py b/python/llm/src/ipex_llm/transformers/models/phixtral.py similarity index 91% rename from python/llm/src/bigdl/llm/transformers/models/phixtral.py rename to python/llm/src/ipex_llm/transformers/models/phixtral.py index 272ab53b..66595d5c 100644 --- a/python/llm/src/bigdl/llm/transformers/models/phixtral.py +++ b/python/llm/src/ipex_llm/transformers/models/phixtral.py @@ -43,14 +43,14 @@ from typing import Optional, Tuple import torch from torch import nn import torch.nn.functional as F -from bigdl.llm.ggml.quantize import ggml_tensor_qtype -from bigdl.llm.utils.common import invalidInputError -from bigdl.llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache -from bigdl.llm.transformers.models.utils import apply_rotary_pos_emb,\ +from ipex_llm.ggml.quantize import ggml_tensor_qtype +from ipex_llm.utils.common import invalidInputError +from ipex_llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache +from ipex_llm.transformers.models.utils import apply_rotary_pos_emb,\ apply_rotary_pos_emb_no_cache_xpu, is_enough_kv_cache_room_4_36 -from bigdl.llm.transformers.models.mistral import should_use_fuse_rope, use_decoding_fast_path -from bigdl.llm.transformers.models.utils import use_flash_attention -from bigdl.llm.transformers.models.utils import mlp_fusion_check +from ipex_llm.transformers.models.mistral import should_use_fuse_rope, use_decoding_fast_path +from ipex_llm.transformers.models.utils import use_flash_attention +from ipex_llm.transformers.models.utils import mlp_fusion_check KV_CACHE_ALLOC_BLOCK_LENGTH = 256 diff --git a/python/llm/src/bigdl/llm/transformers/models/qwen.py b/python/llm/src/ipex_llm/transformers/models/qwen.py similarity index 98% rename from python/llm/src/bigdl/llm/transformers/models/qwen.py rename to python/llm/src/ipex_llm/transformers/models/qwen.py index 321a307e..833ff866 100644 --- 
a/python/llm/src/bigdl/llm/transformers/models/qwen.py +++ b/python/llm/src/ipex_llm/transformers/models/qwen.py @@ -36,15 +36,15 @@ try: except ImportError: rearrange = None -from bigdl.llm.transformers.models.utils import extend_kv_cache, init_kv_cache, append_kv_cache -from bigdl.llm.transformers.models.utils import init_fp8_kv_cache, append_fp8_kv_cache, \ +from ipex_llm.transformers.models.utils import extend_kv_cache, init_kv_cache, append_kv_cache +from ipex_llm.transformers.models.utils import init_fp8_kv_cache, append_fp8_kv_cache, \ restore_fp8_kv_cache, use_quantize_kv_cache -from bigdl.llm.transformers.models.utils import rotate_half, SILU -from bigdl.llm.transformers.models.utils import mlp_fusion_check -from bigdl.llm.transformers.models.utils import apply_rotary_pos_emb_cache_freq_xpu -from bigdl.llm.transformers.models.utils import use_flash_attention, use_esimd_sdp -from bigdl.llm.utils.common import invalidInputError, invalidOperationError -from bigdl.llm.ggml.quantize import ggml_tensor_qtype +from ipex_llm.transformers.models.utils import rotate_half, SILU +from ipex_llm.transformers.models.utils import mlp_fusion_check +from ipex_llm.transformers.models.utils import apply_rotary_pos_emb_cache_freq_xpu +from ipex_llm.transformers.models.utils import use_flash_attention, use_esimd_sdp +from ipex_llm.utils.common import invalidInputError, invalidOperationError +from ipex_llm.ggml.quantize import ggml_tensor_qtype from transformers.modeling_outputs import BaseModelOutputWithPast apply_rotary_emb_func = None diff --git a/python/llm/src/bigdl/llm/transformers/models/qwen2.py b/python/llm/src/ipex_llm/transformers/models/qwen2.py similarity index 98% rename from python/llm/src/bigdl/llm/transformers/models/qwen2.py rename to python/llm/src/ipex_llm/transformers/models/qwen2.py index 81864db0..faf14a87 100644 --- a/python/llm/src/bigdl/llm/transformers/models/qwen2.py +++ b/python/llm/src/ipex_llm/transformers/models/qwen2.py @@ -45,14 +45,14 @@ import torch import torch.nn as nn import torch.nn.functional as F -from bigdl.llm.transformers.models.llama import repeat_kv -from bigdl.llm.transformers.models.utils import extend_kv_cache, append_kv_cache -from bigdl.llm.transformers.models.utils import use_quantize_kv_cache, restore_fp8_kv_cache -from bigdl.llm.transformers.models.utils import is_enough_kv_cache_room_4_36 -from bigdl.llm.transformers.models.utils import apply_rotary_pos_emb_cache_freq_xpu -from bigdl.llm.transformers.kv import DynamicFp8Cache -from bigdl.llm.utils.common import invalidInputError -from bigdl.llm.transformers.models.utils import use_flash_attention, use_esimd_sdp +from ipex_llm.transformers.models.llama import repeat_kv +from ipex_llm.transformers.models.utils import extend_kv_cache, append_kv_cache +from ipex_llm.transformers.models.utils import use_quantize_kv_cache, restore_fp8_kv_cache +from ipex_llm.transformers.models.utils import is_enough_kv_cache_room_4_36 +from ipex_llm.transformers.models.utils import apply_rotary_pos_emb_cache_freq_xpu +from ipex_llm.transformers.kv import DynamicFp8Cache +from ipex_llm.utils.common import invalidInputError +from ipex_llm.transformers.models.utils import use_flash_attention, use_esimd_sdp from transformers.models.qwen2.modeling_qwen2 import Qwen2Model, apply_rotary_pos_emb from transformers.models.qwen2.modeling_qwen2 import _prepare_4d_causal_attention_mask_for_sdpa from transformers.models.qwen2.modeling_qwen2 import _prepare_4d_causal_attention_mask @@ -404,7 +404,7 @@ def 
qwen2_attention_forward_quantized( attn_weights = None return attn_output, attn_weights, past_key_value -from bigdl.llm.ggml.quantize import ggml_tensor_qtype +from ipex_llm.ggml.quantize import ggml_tensor_qtype SYM_INT4 = ggml_tensor_qtype["sym_int4"] FP8E5 = ggml_tensor_qtype["fp8_e5m2"] diff --git a/python/llm/src/bigdl/llm/transformers/models/qwen_vl.py b/python/llm/src/ipex_llm/transformers/models/qwen_vl.py similarity index 98% rename from python/llm/src/bigdl/llm/transformers/models/qwen_vl.py rename to python/llm/src/ipex_llm/transformers/models/qwen_vl.py index 4094ae14..7c66f9ea 100644 --- a/python/llm/src/bigdl/llm/transformers/models/qwen_vl.py +++ b/python/llm/src/ipex_llm/transformers/models/qwen_vl.py @@ -30,8 +30,8 @@ import torch import torch.nn.functional as F import torch.utils.checkpoint from transformers.utils import logging -from bigdl.llm.transformers.models.utils import extend_kv_cache, init_kv_cache, append_kv_cache -from bigdl.llm.transformers.models.utils import rotate_half +from ipex_llm.transformers.models.utils import extend_kv_cache, init_kv_cache, append_kv_cache +from ipex_llm.transformers.models.utils import rotate_half KV_CACHE_ALLOC_BLOCK_LENGTH = 256 diff --git a/python/llm/src/bigdl/llm/transformers/models/rwkv4.py b/python/llm/src/ipex_llm/transformers/models/rwkv4.py similarity index 100% rename from python/llm/src/bigdl/llm/transformers/models/rwkv4.py rename to python/llm/src/ipex_llm/transformers/models/rwkv4.py diff --git a/python/llm/src/bigdl/llm/transformers/models/rwkv5.py b/python/llm/src/ipex_llm/transformers/models/rwkv5.py similarity index 100% rename from python/llm/src/bigdl/llm/transformers/models/rwkv5.py rename to python/llm/src/ipex_llm/transformers/models/rwkv5.py diff --git a/python/llm/src/bigdl/llm/transformers/models/utils.py b/python/llm/src/ipex_llm/transformers/models/utils.py similarity index 98% rename from python/llm/src/bigdl/llm/transformers/models/utils.py rename to python/llm/src/ipex_llm/transformers/models/utils.py index 40230638..1a4e1f0b 100644 --- a/python/llm/src/bigdl/llm/transformers/models/utils.py +++ b/python/llm/src/ipex_llm/transformers/models/utils.py @@ -16,9 +16,9 @@ import os import torch -from bigdl.llm.utils.common import invalidInputError -from bigdl.llm.ggml.quantize import ggml_tensor_qtype -from bigdl.llm.transformers.utils import get_ipex_version, get_xpu_device_type +from ipex_llm.utils.common import invalidInputError +from ipex_llm.ggml.quantize import ggml_tensor_qtype +from ipex_llm.transformers.utils import get_ipex_version, get_xpu_device_type FP8_KV_ALLOC_LENGTH = 512 SYM_INT4 = ggml_tensor_qtype["sym_int4"] @@ -177,7 +177,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, model_family): def apply_ipex_rotate_every_two(q, k, cos, sin): # ipex's apply_rotary_embedding_two_qk can change the origin storage, # so q/k will get the result directly. 
- from bigdl.llm.transformers.utils import get_ipex_version + from ipex_llm.transformers.utils import get_ipex_version if get_ipex_version() >= "2.1.10+xpu": torch.ops.torch_ipex.apply_rotary_embedding_two_qk( q, k, sin, cos, q, k diff --git a/python/llm/src/bigdl/llm/transformers/models/yuan.py b/python/llm/src/ipex_llm/transformers/models/yuan.py similarity index 97% rename from python/llm/src/bigdl/llm/transformers/models/yuan.py rename to python/llm/src/ipex_llm/transformers/models/yuan.py index 015835d7..f17b0ec7 100644 --- a/python/llm/src/bigdl/llm/transformers/models/yuan.py +++ b/python/llm/src/ipex_llm/transformers/models/yuan.py @@ -28,14 +28,14 @@ from typing import Optional, Tuple import torch import torch.nn as nn -from bigdl.llm.utils.common import invalidInputError -from bigdl.llm.transformers.models.utils import apply_rotary_pos_emb, \ +from ipex_llm.utils.common import invalidInputError +from ipex_llm.transformers.models.utils import apply_rotary_pos_emb, \ apply_rotary_pos_emb_cache_freq_xpu, mlp_fusion_check, fp16_fusion_check -from bigdl.llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache -from bigdl.llm.transformers.models.utils import init_fp8_kv_cache, append_fp8_kv_cache, \ +from ipex_llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache +from ipex_llm.transformers.models.utils import init_fp8_kv_cache, append_fp8_kv_cache, \ restore_fp8_kv_cache, use_quantize_kv_cache -from bigdl.llm.transformers.models.utils import is_enough_kv_cache_room_4_31, SILU -from bigdl.llm.transformers.low_bit_linear import SYM_INT4, FP8E5 +from ipex_llm.transformers.models.utils import is_enough_kv_cache_room_4_31, SILU +from ipex_llm.transformers.low_bit_linear import SYM_INT4, FP8E5 KV_CACHE_ALLOC_BLOCK_LENGTH = 256 diff --git a/python/llm/src/bigdl/llm/transformers/qlora.py b/python/llm/src/ipex_llm/transformers/qlora.py similarity index 95% rename from python/llm/src/bigdl/llm/transformers/qlora.py rename to python/llm/src/ipex_llm/transformers/qlora.py index 852f59e5..5c9f3b54 100644 --- a/python/llm/src/bigdl/llm/transformers/qlora.py +++ b/python/llm/src/ipex_llm/transformers/qlora.py @@ -51,15 +51,15 @@ import torch import logging from torch.nn import Linear, Embedding -from bigdl.llm.transformers.low_bit_linear import LowBitLinear, BF16Linear, get_qk_size +from ipex_llm.transformers.low_bit_linear import LowBitLinear, BF16Linear, get_qk_size from peft.tuners.lora import LoraLayer -from bigdl.llm.utils.common import invalidInputError -from bigdl.llm.transformers.utils import get_autocast_dtype -from bigdl.llm.ggml.quantize import ggml_tensor_qtype +from ipex_llm.utils.common import invalidInputError +from ipex_llm.transformers.utils import get_autocast_dtype +from ipex_llm.ggml.quantize import ggml_tensor_qtype import functools -from bigdl.llm.transformers import training_patch +from ipex_llm.transformers import training_patch -LOG = logging.getLogger("bigdl.llm.qlora") +LOG = logging.getLogger("ipex_llm.qlora") class LoraLowBitLinear(LowBitLinear, LoraLayer): @@ -246,7 +246,7 @@ class LoraConfig(LoraConfigBase): def __init__(self, *args, **kwargs): self.training_mode = kwargs.pop("training_mode", "qlora") super().__init__(*args, **kwargs) - from bigdl.llm.llm_patching import bigdl_patched + from ipex_llm.llm_patching import bigdl_patched if bigdl_patched == 'Train': from .model import patched_training_mode self.training_mode = patched_training_mode @@ -274,7 +274,7 @@ def get_peft_model(*args, **kwargs): 
old_create_new_module)) try: - from bigdl.llm.llm_patching import bigdl_patched + from ipex_llm.llm_patching import bigdl_patched if bigdl_patched == 'Train': from peft import get_peft_model_original else: @@ -384,8 +384,8 @@ def cast_lora_weight(model, dtype=torch.bfloat16): def _optimize_post(model): import transformers from packaging import version - from bigdl.llm.transformers.convert import convert_forward - from bigdl.llm.transformers.models.llama import llama_attention_fast_forward + from ipex_llm.transformers.convert import convert_forward + from ipex_llm.transformers.models.llama import llama_attention_fast_forward trans_version = transformers.__version__ if version.parse(trans_version) >= version.parse("4.31.0"): diff --git a/python/llm/src/bigdl/llm/transformers/relora.py b/python/llm/src/ipex_llm/transformers/relora.py similarity index 98% rename from python/llm/src/bigdl/llm/transformers/relora.py rename to python/llm/src/ipex_llm/transformers/relora.py index e1c37a0c..37676f70 100644 --- a/python/llm/src/bigdl/llm/transformers/relora.py +++ b/python/llm/src/ipex_llm/transformers/relora.py @@ -54,11 +54,11 @@ from transformers import ( ) from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR import torch.distributed as dist -from bigdl.llm.transformers.qlora import LoraLowBitLinear -from bigdl.llm.transformers.low_bit_linear import FP4Params -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.transformers.qlora import LoraLowBitLinear +from ipex_llm.transformers.low_bit_linear import FP4Params +from ipex_llm.utils.common import invalidInputError -LOG = logging.getLogger("bigdl.llm.relora") +LOG = logging.getLogger("ipex_llm.relora") class ReLoRATrainer(Trainer): diff --git a/python/llm/src/bigdl/llm/transformers/speculative.py b/python/llm/src/ipex_llm/transformers/speculative.py similarity index 99% rename from python/llm/src/bigdl/llm/transformers/speculative.py rename to python/llm/src/ipex_llm/transformers/speculative.py index 323a1d9c..74866fee 100644 --- a/python/llm/src/bigdl/llm/transformers/speculative.py +++ b/python/llm/src/ipex_llm/transformers/speculative.py @@ -29,14 +29,14 @@ from packaging import version from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union from transformers import top_k_top_p_filtering, GenerationConfig, \ LogitsProcessorList, StoppingCriteriaList -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.common import invalidInputError from transformers.modeling_outputs import CausalLMOutputWithPast # patch GenerationMixin.generate from transformers import GenerationMixin original_generate = GenerationMixin.generate query_group_size = 16 -logger = logging.getLogger("bigdl.llm.speculative") +logger = logging.getLogger("ipex_llm.speculative") @torch.no_grad() @@ -370,7 +370,7 @@ def _update_past_key_values_storage_cpu(self, past_key_values, past_key_values_s def _check_and_extend_kv_cache(past_key_values, max_step_draft, kv_alloc_block_len=256, model_type="llama"): - from bigdl.llm.transformers.models.utils import is_enough_kv_cache_room_4_31, \ + from ipex_llm.transformers.models.utils import is_enough_kv_cache_room_4_31, \ extend_kv_cache enough_kv_room = True if model_type not in ["chatglm", "qwen", "baichuan", "llama", "mistral", @@ -534,7 +534,7 @@ def speculative_generate(self, past_key_values = None past_key_values_storage = [] - from bigdl.llm.transformers.convert import get_enable_ipex + from ipex_llm.transformers.convert import get_enable_ipex _enable_ipex = 
get_enable_ipex() if _enable_ipex: diff --git a/python/llm/src/bigdl/llm/transformers/training_patch.py b/python/llm/src/ipex_llm/transformers/training_patch.py similarity index 100% rename from python/llm/src/bigdl/llm/transformers/training_patch.py rename to python/llm/src/ipex_llm/transformers/training_patch.py diff --git a/python/llm/src/bigdl/llm/transformers/utils.py b/python/llm/src/ipex_llm/transformers/utils.py similarity index 99% rename from python/llm/src/bigdl/llm/transformers/utils.py rename to python/llm/src/ipex_llm/transformers/utils.py index ba6fba9b..39211fd4 100644 --- a/python/llm/src/bigdl/llm/transformers/utils.py +++ b/python/llm/src/ipex_llm/transformers/utils.py @@ -41,7 +41,7 @@ # SOFTWARE. import os from transformers.modeling_utils import _add_variant -from bigdl.llm.ggml.quantize import ggml_tensor_qtype +from ipex_llm.ggml.quantize import ggml_tensor_qtype from ..utils.common import invalidInputError from typing import Union, Optional import torch diff --git a/python/llm/src/bigdl/llm/transformers/xpu_customize_fwd.py b/python/llm/src/ipex_llm/transformers/xpu_customize_fwd.py similarity index 100% rename from python/llm/src/bigdl/llm/transformers/xpu_customize_fwd.py rename to python/llm/src/ipex_llm/transformers/xpu_customize_fwd.py diff --git a/python/llm/src/bigdl/llm/utils/__init__.py b/python/llm/src/ipex_llm/utils/__init__.py similarity index 100% rename from python/llm/src/bigdl/llm/utils/__init__.py rename to python/llm/src/ipex_llm/utils/__init__.py diff --git a/python/llm/src/bigdl/llm/utils/common/__init__.py b/python/llm/src/ipex_llm/utils/common/__init__.py similarity index 100% rename from python/llm/src/bigdl/llm/utils/common/__init__.py rename to python/llm/src/ipex_llm/utils/common/__init__.py diff --git a/python/llm/src/bigdl/llm/utils/common/lazyimport.py b/python/llm/src/ipex_llm/utils/common/lazyimport.py similarity index 95% rename from python/llm/src/bigdl/llm/utils/common/lazyimport.py rename to python/llm/src/ipex_llm/utils/common/lazyimport.py index 6380831d..ac76dc01 100644 --- a/python/llm/src/bigdl/llm/utils/common/lazyimport.py +++ b/python/llm/src/ipex_llm/utils/common/lazyimport.py @@ -25,8 +25,8 @@ class LazyImport: Lazy import python module until use. 
Example: - >>> from bigdl.llm.utils.common import LazyImport - >>> _convert_to_ggml = LazyImport('bigdl.llm.ggml.convert._convert_to_ggml') + >>> from ipex_llm.utils.common import LazyImport + >>> _convert_to_ggml = LazyImport('ipex_llm.ggml.convert._convert_to_ggml') >>> _convert_to_ggml(model_path, outfile_dir) """ def __init__(self, module_name: str): diff --git a/python/llm/src/bigdl/llm/utils/common/log4Error.py b/python/llm/src/ipex_llm/utils/common/log4Error.py similarity index 100% rename from python/llm/src/bigdl/llm/utils/common/log4Error.py rename to python/llm/src/ipex_llm/utils/common/log4Error.py diff --git a/python/llm/src/bigdl/llm/utils/convert_chatglm.py b/python/llm/src/ipex_llm/utils/convert_chatglm.py similarity index 99% rename from python/llm/src/bigdl/llm/utils/convert_chatglm.py rename to python/llm/src/ipex_llm/utils/convert_chatglm.py index 466287a3..d25e569c 100644 --- a/python/llm/src/bigdl/llm/utils/convert_chatglm.py +++ b/python/llm/src/ipex_llm/utils/convert_chatglm.py @@ -55,7 +55,7 @@ import torch from tabulate import tabulate from tqdm import tqdm from transformers import AutoModel, AutoTokenizer -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.common import invalidInputError GGML_QK8_0 = 64 GGML_QK4_0 = 64 diff --git a/python/llm/src/bigdl/llm/utils/convert_util.py b/python/llm/src/ipex_llm/utils/convert_util.py similarity index 99% rename from python/llm/src/bigdl/llm/utils/convert_util.py rename to python/llm/src/ipex_llm/utils/convert_util.py index 8a9e5059..5570f3c9 100644 --- a/python/llm/src/bigdl/llm/utils/convert_util.py +++ b/python/llm/src/ipex_llm/utils/convert_util.py @@ -65,7 +65,7 @@ from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, TypeVar, Union) import numpy as np from sentencepiece import SentencePieceProcessor -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.common import invalidInputError import os from pathlib import Path @@ -1782,7 +1782,7 @@ def _convert_chatglm_hf_to_ggml(model_path, outfile_dir, outtype): invalidInputError(outtype in ["q4_0", "q4_1"], "For now we only support quantization type 'q4_0' and 'q4_1' " "in chatglm family.") - from bigdl.llm.utils.convert_chatglm import _convert_chatglm_hf_to_ggml_ + from ipex_llm.utils.convert_chatglm import _convert_chatglm_hf_to_ggml_ return _convert_chatglm_hf_to_ggml_(model_path, outfile, outtype) diff --git a/python/llm/src/bigdl/llm/utils/glibc_checker.py b/python/llm/src/ipex_llm/utils/glibc_checker.py similarity index 97% rename from python/llm/src/bigdl/llm/utils/glibc_checker.py rename to python/llm/src/ipex_llm/utils/glibc_checker.py index 4368ee1b..21d9edc4 100644 --- a/python/llm/src/bigdl/llm/utils/glibc_checker.py +++ b/python/llm/src/ipex_llm/utils/glibc_checker.py @@ -18,7 +18,7 @@ import os import platform from packaging import version from importlib.metadata import distribution, PackageNotFoundError -from bigdl.llm.utils.common import log4Error +from ipex_llm.utils.common import log4Error class GlibcChecker: diff --git a/python/llm/src/bigdl/llm/utils/ipex_importer.py b/python/llm/src/ipex_llm/utils/ipex_importer.py similarity index 100% rename from python/llm/src/bigdl/llm/utils/ipex_importer.py rename to python/llm/src/ipex_llm/utils/ipex_importer.py diff --git a/python/llm/src/bigdl/llm/utils/isa_checker.py b/python/llm/src/ipex_llm/utils/isa_checker.py similarity index 100% rename from python/llm/src/bigdl/llm/utils/isa_checker.py rename to 
python/llm/src/ipex_llm/utils/isa_checker.py diff --git a/python/llm/src/bigdl/llm/utils/lazy_load_torch.py b/python/llm/src/ipex_llm/utils/lazy_load_torch.py similarity index 100% rename from python/llm/src/bigdl/llm/utils/lazy_load_torch.py rename to python/llm/src/ipex_llm/utils/lazy_load_torch.py diff --git a/python/llm/src/bigdl/llm/utils/utils.py b/python/llm/src/ipex_llm/utils/utils.py similarity index 95% rename from python/llm/src/bigdl/llm/utils/utils.py rename to python/llm/src/ipex_llm/utils/utils.py index 974bf4a1..5b906632 100644 --- a/python/llm/src/bigdl/llm/utils/utils.py +++ b/python/llm/src/ipex_llm/utils/utils.py @@ -16,7 +16,7 @@ import sys import pathlib -from bigdl.llm.utils.common import invalidInputError, invalidOperationError +from ipex_llm.utils.common import invalidInputError, invalidOperationError def get_shared_lib_info(lib_base_name: str): diff --git a/python/llm/src/bigdl/llm/vllm/config.py b/python/llm/src/ipex_llm/vllm/config.py similarity index 99% rename from python/llm/src/bigdl/llm/vllm/config.py rename to python/llm/src/ipex_llm/vllm/config.py index 11c999c8..c386a90e 100644 --- a/python/llm/src/bigdl/llm/vllm/config.py +++ b/python/llm/src/ipex_llm/vllm/config.py @@ -34,8 +34,8 @@ from typing import Optional import torch from transformers import AutoConfig, PretrainedConfig -from bigdl.llm.vllm.logger import init_logger -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.vllm.logger import init_logger +from ipex_llm.utils.common import invalidInputError logger = init_logger(__name__) diff --git a/python/llm/src/bigdl/llm/vllm/core/policy.py b/python/llm/src/ipex_llm/vllm/core/policy.py similarity index 95% rename from python/llm/src/bigdl/llm/vllm/core/policy.py rename to python/llm/src/ipex_llm/vllm/core/policy.py index 9a468a6d..2ec56693 100644 --- a/python/llm/src/bigdl/llm/vllm/core/policy.py +++ b/python/llm/src/ipex_llm/vllm/core/policy.py @@ -33,8 +33,8 @@ from typing import List -from bigdl.llm.vllm.sequence import SequenceGroup -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.vllm.sequence import SequenceGroup +from ipex_llm.utils.common import invalidInputError class Policy: diff --git a/python/llm/src/bigdl/llm/vllm/core/scheduler.py b/python/llm/src/ipex_llm/vllm/core/scheduler.py similarity index 98% rename from python/llm/src/bigdl/llm/vllm/core/scheduler.py rename to python/llm/src/ipex_llm/vllm/core/scheduler.py index b41ea166..0667d4e9 100644 --- a/python/llm/src/bigdl/llm/vllm/core/scheduler.py +++ b/python/llm/src/ipex_llm/vllm/core/scheduler.py @@ -38,13 +38,13 @@ import enum import time from typing import Dict, Iterable, List, Optional, Tuple, Union -from bigdl.llm.vllm.config import SchedulerConfig -from bigdl.llm.vllm.core.policy import PolicyFactory -from bigdl.llm.vllm.logger import init_logger -from bigdl.llm.vllm.sequence import SequenceData, SequenceStatus -from bigdl.llm.vllm.sequence import (Sequence, SequenceGroup, +from ipex_llm.vllm.config import SchedulerConfig +from ipex_llm.vllm.core.policy import PolicyFactory +from ipex_llm.vllm.logger import init_logger +from ipex_llm.vllm.sequence import SequenceData, SequenceStatus +from ipex_llm.vllm.sequence import (Sequence, SequenceGroup, SequenceGroupMetadata) -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.common import invalidInputError logger = init_logger(__name__) diff --git a/python/llm/src/bigdl/llm/vllm/engine/__init__.py b/python/llm/src/ipex_llm/vllm/engine/__init__.py similarity index 100% rename 
from python/llm/src/bigdl/llm/vllm/engine/__init__.py rename to python/llm/src/ipex_llm/vllm/engine/__init__.py diff --git a/python/llm/src/bigdl/llm/vllm/engine/arg_utils.py b/python/llm/src/ipex_llm/vllm/engine/arg_utils.py similarity index 99% rename from python/llm/src/bigdl/llm/vllm/engine/arg_utils.py rename to python/llm/src/ipex_llm/vllm/engine/arg_utils.py index dc9857b1..5e6d4357 100644 --- a/python/llm/src/bigdl/llm/vllm/engine/arg_utils.py +++ b/python/llm/src/ipex_llm/vllm/engine/arg_utils.py @@ -38,7 +38,7 @@ import argparse import dataclasses from dataclasses import dataclass from typing import Optional, Tuple -from bigdl.llm.vllm.config import ModelConfig, SchedulerConfig +from ipex_llm.vllm.config import ModelConfig, SchedulerConfig @dataclass diff --git a/python/llm/src/bigdl/llm/vllm/engine/async_llm_engine.py b/python/llm/src/ipex_llm/vllm/engine/async_llm_engine.py similarity index 98% rename from python/llm/src/bigdl/llm/vllm/engine/async_llm_engine.py rename to python/llm/src/ipex_llm/vllm/engine/async_llm_engine.py index 46eaf741..119725cf 100644 --- a/python/llm/src/bigdl/llm/vllm/engine/async_llm_engine.py +++ b/python/llm/src/ipex_llm/vllm/engine/async_llm_engine.py @@ -39,13 +39,13 @@ import time from functools import partial from typing import (Any, Dict, Iterable, List, Optional, Set, Tuple, Type, Union) -from bigdl.llm.vllm.config import ModelConfig -from bigdl.llm.vllm.engine.arg_utils import AsyncEngineArgs -from bigdl.llm.vllm.engine.llm_engine import LLMEngine -from bigdl.llm.vllm.logger import init_logger -from bigdl.llm.vllm.outputs import RequestOutput -from bigdl.llm.vllm.sampling_params import SamplingParams -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.vllm.config import ModelConfig +from ipex_llm.vllm.engine.arg_utils import AsyncEngineArgs +from ipex_llm.vllm.engine.llm_engine import LLMEngine +from ipex_llm.vllm.logger import init_logger +from ipex_llm.vllm.outputs import RequestOutput +from ipex_llm.vllm.sampling_params import SamplingParams +from ipex_llm.utils.common import invalidInputError logger = init_logger(__name__) diff --git a/python/llm/src/bigdl/llm/vllm/engine/llm_engine.py b/python/llm/src/ipex_llm/vllm/engine/llm_engine.py similarity index 97% rename from python/llm/src/bigdl/llm/vllm/engine/llm_engine.py rename to python/llm/src/ipex_llm/vllm/engine/llm_engine.py index d56ed482..b8575e72 100644 --- a/python/llm/src/bigdl/llm/vllm/engine/llm_engine.py +++ b/python/llm/src/ipex_llm/vllm/engine/llm_engine.py @@ -38,13 +38,13 @@ import time from typing import TYPE_CHECKING, Any, Iterable, List, Optional, Tuple, Union, Dict -from bigdl.llm.vllm.config import ModelConfig, SchedulerConfig -from bigdl.llm.vllm.core.scheduler import SchedulerOutputs, FixedWindowScheduler -from bigdl.llm.vllm.engine.arg_utils import EngineArgs -from bigdl.llm.vllm.logger import init_logger -from bigdl.llm.vllm.outputs import RequestOutput -from bigdl.llm.vllm.sampling_params import SamplingParams -from bigdl.llm.vllm.sequence import ( +from ipex_llm.vllm.config import ModelConfig, SchedulerConfig +from ipex_llm.vllm.core.scheduler import SchedulerOutputs, FixedWindowScheduler +from ipex_llm.vllm.engine.arg_utils import EngineArgs +from ipex_llm.vllm.logger import init_logger +from ipex_llm.vllm.outputs import RequestOutput +from ipex_llm.vllm.sampling_params import SamplingParams +from ipex_llm.vllm.sequence import ( SamplerOutput, Sequence, SequenceGroup, @@ -52,11 +52,11 @@ from bigdl.llm.vllm.sequence import ( SequenceStatus, 
SequenceOutputs, ) -from bigdl.llm.vllm.transformers_utils.tokenizer import get_tokenizer, detokenize_incrementally -from bigdl.llm.vllm.utils import ( +from ipex_llm.vllm.transformers_utils.tokenizer import get_tokenizer, detokenize_incrementally +from ipex_llm.vllm.utils import ( Counter, ) -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.common import invalidInputError logger = init_logger(__name__) @@ -158,7 +158,7 @@ class LLMEngine: def _init_workers(self): # Lazy import the Worker to avoid importing torch.cuda/xformers # before CUDA_VISIBLE_DEVICES is set in the Worker - from bigdl.llm.vllm.worker.worker import ( + from ipex_llm.vllm.worker.worker import ( Worker, ) # pylint: disable=import-outside-toplevel diff --git a/python/llm/src/bigdl/llm/vllm/entrypoints/__init__.py b/python/llm/src/ipex_llm/vllm/entrypoints/__init__.py similarity index 100% rename from python/llm/src/bigdl/llm/vllm/entrypoints/__init__.py rename to python/llm/src/ipex_llm/vllm/entrypoints/__init__.py diff --git a/python/llm/src/bigdl/llm/vllm/entrypoints/api_server.py b/python/llm/src/ipex_llm/vllm/entrypoints/api_server.py similarity index 94% rename from python/llm/src/bigdl/llm/vllm/entrypoints/api_server.py rename to python/llm/src/ipex_llm/vllm/entrypoints/api_server.py index 6dcdf7ec..e636435e 100644 --- a/python/llm/src/bigdl/llm/vllm/entrypoints/api_server.py +++ b/python/llm/src/ipex_llm/vllm/entrypoints/api_server.py @@ -39,10 +39,10 @@ from fastapi import FastAPI, Request from fastapi.responses import JSONResponse, Response, StreamingResponse import uvicorn -from bigdl.llm.vllm.engine.arg_utils import AsyncEngineArgs -from bigdl.llm.vllm.engine.async_llm_engine import AsyncLLMEngine -from bigdl.llm.vllm.sampling_params import SamplingParams -from bigdl.llm.vllm.utils import random_uuid +from ipex_llm.vllm.engine.arg_utils import AsyncEngineArgs +from ipex_llm.vllm.engine.async_llm_engine import AsyncLLMEngine +from ipex_llm.vllm.sampling_params import SamplingParams +from ipex_llm.vllm.utils import random_uuid TIMEOUT_KEEP_ALIVE = 5 # seconds. TIMEOUT_TO_PREVENT_DEADLOCK = 1 # seconds. 
diff --git a/python/llm/src/bigdl/llm/vllm/entrypoints/llm.py b/python/llm/src/ipex_llm/vllm/entrypoints/llm.py similarity index 97% rename from python/llm/src/bigdl/llm/vllm/entrypoints/llm.py rename to python/llm/src/ipex_llm/vllm/entrypoints/llm.py index 8ed3c790..66942f02 100644 --- a/python/llm/src/bigdl/llm/vllm/entrypoints/llm.py +++ b/python/llm/src/ipex_llm/vllm/entrypoints/llm.py @@ -38,12 +38,12 @@ from typing import List, Optional, Union from tqdm import tqdm from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast -from bigdl.llm.vllm.engine.arg_utils import EngineArgs -from bigdl.llm.vllm.engine.llm_engine import LLMEngine +from ipex_llm.vllm.engine.arg_utils import EngineArgs +from ipex_llm.vllm.engine.llm_engine import LLMEngine -from bigdl.llm.vllm.outputs import RequestOutput -from bigdl.llm.vllm.sampling_params import SamplingParams -from bigdl.llm.vllm.utils import Counter +from ipex_llm.vllm.outputs import RequestOutput +from ipex_llm.vllm.sampling_params import SamplingParams +from ipex_llm.vllm.utils import Counter class LLM: diff --git a/python/llm/src/bigdl/llm/vllm/entrypoints/openai/__init__.py b/python/llm/src/ipex_llm/vllm/entrypoints/openai/__init__.py similarity index 100% rename from python/llm/src/bigdl/llm/vllm/entrypoints/openai/__init__.py rename to python/llm/src/ipex_llm/vllm/entrypoints/openai/__init__.py diff --git a/python/llm/src/bigdl/llm/vllm/entrypoints/openai/api_server.py b/python/llm/src/ipex_llm/vllm/entrypoints/openai/api_server.py similarity index 98% rename from python/llm/src/bigdl/llm/vllm/entrypoints/openai/api_server.py rename to python/llm/src/ipex_llm/vllm/entrypoints/openai/api_server.py index 27d904fc..9e1c2011 100644 --- a/python/llm/src/bigdl/llm/vllm/entrypoints/openai/api_server.py +++ b/python/llm/src/ipex_llm/vllm/entrypoints/openai/api_server.py @@ -52,24 +52,24 @@ from fastapi.responses import JSONResponse, StreamingResponse from packaging import version import numpy as np -from bigdl.llm.vllm.engine.arg_utils import AsyncEngineArgs -from bigdl.llm.vllm.engine.async_llm_engine import AsyncLLMEngine -from bigdl.llm.vllm.entrypoints.openai.protocol import ( +from ipex_llm.vllm.engine.arg_utils import AsyncEngineArgs +from ipex_llm.vllm.engine.async_llm_engine import AsyncLLMEngine +from ipex_llm.vllm.entrypoints.openai.protocol import ( CompletionResponse, CompletionResponseChoice, CompletionResponseStreamChoice, CompletionStreamResponse, ChatCompletionResponse, ChatCompletionResponseChoice, ChatMessage, DeltaMessage, ErrorResponse, LogProbs, ModelCard, ModelPermission, UsageInfo) -from bigdl.llm.vllm.entrypoints.openai.openai_protocol import ( +from ipex_llm.vllm.entrypoints.openai.openai_protocol import ( CompletionRequest, ChatCompletionRequest, ChatCompletionResponseStreamChoice, ChatCompletionStreamResponse, ModelList) -from bigdl.llm.vllm.logger import init_logger -from bigdl.llm.vllm.outputs import RequestOutput -from bigdl.llm.vllm.sampling_params import SamplingParams -from bigdl.llm.vllm.transformers_utils.tokenizer import get_tokenizer +from ipex_llm.vllm.logger import init_logger +from ipex_llm.vllm.outputs import RequestOutput +from ipex_llm.vllm.sampling_params import SamplingParams +from ipex_llm.vllm.transformers_utils.tokenizer import get_tokenizer import uuid -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.common import invalidInputError try: import fastchat diff --git a/python/llm/src/bigdl/llm/vllm/entrypoints/openai/openai_protocol.py 
b/python/llm/src/ipex_llm/vllm/entrypoints/openai/openai_protocol.py similarity index 99% rename from python/llm/src/bigdl/llm/vllm/entrypoints/openai/openai_protocol.py rename to python/llm/src/ipex_llm/vllm/entrypoints/openai/openai_protocol.py index 5297d55b..f7e46aee 100644 --- a/python/llm/src/bigdl/llm/vllm/entrypoints/openai/openai_protocol.py +++ b/python/llm/src/ipex_llm/vllm/entrypoints/openai/openai_protocol.py @@ -43,7 +43,7 @@ from typing import Dict, List, Literal, Optional, Union from pydantic import BaseModel, Field -from bigdl.llm.vllm.utils import random_uuid +from ipex_llm.vllm.utils import random_uuid # bigdl-llm change start # summary: add token time recording logic diff --git a/python/llm/src/bigdl/llm/vllm/entrypoints/openai/protocol.py b/python/llm/src/ipex_llm/vllm/entrypoints/openai/protocol.py similarity index 97% rename from python/llm/src/bigdl/llm/vllm/entrypoints/openai/protocol.py rename to python/llm/src/ipex_llm/vllm/entrypoints/openai/protocol.py index 5c1218f8..24186f55 100644 --- a/python/llm/src/bigdl/llm/vllm/entrypoints/openai/protocol.py +++ b/python/llm/src/ipex_llm/vllm/entrypoints/openai/protocol.py @@ -41,8 +41,8 @@ import time from typing import Dict, List, Literal, Optional, Union from pydantic import BaseModel, Field -from bigdl.llm.vllm.utils import random_uuid -from bigdl.llm.vllm.entrypoints.openai.openai_protocol import ( +from ipex_llm.vllm.utils import random_uuid +from ipex_llm.vllm.entrypoints.openai.openai_protocol import ( ErrorResponse, ModelPermission, ModelCard, UsageInfo, LogProbs, ChatMessage, DeltaMessage ) diff --git a/python/llm/src/bigdl/llm/vllm/logger.py b/python/llm/src/ipex_llm/vllm/logger.py similarity index 100% rename from python/llm/src/bigdl/llm/vllm/logger.py rename to python/llm/src/ipex_llm/vllm/logger.py diff --git a/python/llm/src/bigdl/llm/vllm/model_executor/__init__.py b/python/llm/src/ipex_llm/vllm/model_executor/__init__.py similarity index 90% rename from python/llm/src/bigdl/llm/vllm/model_executor/__init__.py rename to python/llm/src/ipex_llm/vllm/model_executor/__init__.py index 54804458..d920a7a2 100644 --- a/python/llm/src/bigdl/llm/vllm/model_executor/__init__.py +++ b/python/llm/src/ipex_llm/vllm/model_executor/__init__.py @@ -14,7 +14,7 @@ # limitations under the License. 
# -from bigdl.llm.vllm.model_executor.model_loader import get_model +from ipex_llm.vllm.model_executor.model_loader import get_model __all__ = [ "get_model", diff --git a/python/llm/src/bigdl/llm/vllm/model_executor/input_metadata.py b/python/llm/src/ipex_llm/vllm/model_executor/input_metadata.py similarity index 97% rename from python/llm/src/bigdl/llm/vllm/model_executor/input_metadata.py rename to python/llm/src/ipex_llm/vllm/model_executor/input_metadata.py index 0a7a24e5..7547078c 100644 --- a/python/llm/src/bigdl/llm/vllm/model_executor/input_metadata.py +++ b/python/llm/src/ipex_llm/vllm/model_executor/input_metadata.py @@ -34,8 +34,8 @@ from typing import Dict, List, Optional, Tuple import torch # from xformers.ops import AttentionBias -from bigdl.llm.vllm.sequence import SequenceData -from bigdl.llm.vllm.sampling_params import SamplingParams +from ipex_llm.vllm.sequence import SequenceData +from ipex_llm.vllm.sampling_params import SamplingParams class InputMetadata: diff --git a/python/llm/src/bigdl/llm/vllm/model_executor/layers/bigdl_sampler.py b/python/llm/src/ipex_llm/vllm/model_executor/layers/bigdl_sampler.py similarity index 99% rename from python/llm/src/bigdl/llm/vllm/model_executor/layers/bigdl_sampler.py rename to python/llm/src/ipex_llm/vllm/model_executor/layers/bigdl_sampler.py index a355ef19..a3675ae6 100644 --- a/python/llm/src/bigdl/llm/vllm/model_executor/layers/bigdl_sampler.py +++ b/python/llm/src/ipex_llm/vllm/model_executor/layers/bigdl_sampler.py @@ -39,10 +39,10 @@ from typing import Dict, List, Optional, Tuple import torch import torch.nn as nn -from bigdl.llm.vllm.model_executor.input_metadata import InputMetadata +from ipex_llm.vllm.model_executor.input_metadata import InputMetadata -from bigdl.llm.vllm.sampling_params import SamplingParams, SamplingType -from bigdl.llm.vllm.sequence import (SamplerOutput, SequenceGroupMetadata, +from ipex_llm.vllm.sampling_params import SamplingParams, SamplingType +from ipex_llm.vllm.sequence import (SamplerOutput, SequenceGroupMetadata, SequenceData, SequenceOutputs) import time diff --git a/python/llm/src/bigdl/llm/vllm/model_executor/model_loader.py b/python/llm/src/ipex_llm/vllm/model_executor/model_loader.py similarity index 90% rename from python/llm/src/bigdl/llm/vllm/model_executor/model_loader.py rename to python/llm/src/ipex_llm/vllm/model_executor/model_loader.py index f4a449f0..ed3bfb18 100644 --- a/python/llm/src/bigdl/llm/vllm/model_executor/model_loader.py +++ b/python/llm/src/ipex_llm/vllm/model_executor/model_loader.py @@ -38,14 +38,14 @@ import torch import torch.nn as nn from transformers import PretrainedConfig -from bigdl.llm.vllm.config import ModelConfig -from bigdl.llm.vllm.model_executor.models.bigdl_llama import BigDLLlamaForCausalLM -from bigdl.llm.vllm.model_executor.models.bigdl_mixtral import BigDLMixtralForCausalLM -from bigdl.llm.vllm.model_executor.models.bigdl_mistral import BigDLMistralForCausalLM -from bigdl.llm.vllm.model_executor.models.bigdl_chatglm import BigDLChatGLMForCausalLM -from bigdl.llm.vllm.model_executor.models.bigdl_baichuan import BigDLBaichuanForCausalLM +from ipex_llm.vllm.config import ModelConfig +from ipex_llm.vllm.model_executor.models.bigdl_llama import BigDLLlamaForCausalLM +from ipex_llm.vllm.model_executor.models.bigdl_mixtral import BigDLMixtralForCausalLM +from ipex_llm.vllm.model_executor.models.bigdl_mistral import BigDLMistralForCausalLM +from ipex_llm.vllm.model_executor.models.bigdl_chatglm import BigDLChatGLMForCausalLM +from 
ipex_llm.vllm.model_executor.models.bigdl_baichuan import BigDLBaichuanForCausalLM -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.common import invalidInputError # bigdl-llm Intel specified code change # bigdl-llm change start diff --git a/python/llm/src/bigdl/llm/vllm/model_executor/models/bigdl_baichuan.py b/python/llm/src/ipex_llm/vllm/model_executor/models/bigdl_baichuan.py similarity index 95% rename from python/llm/src/bigdl/llm/vllm/model_executor/models/bigdl_baichuan.py rename to python/llm/src/ipex_llm/vllm/model_executor/models/bigdl_baichuan.py index 0369f9f2..3e39edf1 100644 --- a/python/llm/src/bigdl/llm/vllm/model_executor/models/bigdl_baichuan.py +++ b/python/llm/src/ipex_llm/vllm/model_executor/models/bigdl_baichuan.py @@ -20,13 +20,13 @@ from torch import nn from transformers import AutoTokenizer, PreTrainedTokenizerBase, LlamaConfig from typing import Optional, Tuple, List, Type, Dict -from bigdl.llm.vllm.sequence import SequenceOutputs, SequenceGroupMetadata -from bigdl.llm.vllm.model_executor.layers.bigdl_sampler import BigDLSampler -from bigdl.llm.vllm.model_executor.models.bigdl_model import BigDLModelForCausalLM -from bigdl.llm.vllm.logger import init_logger +from ipex_llm.vllm.sequence import SequenceOutputs, SequenceGroupMetadata +from ipex_llm.vllm.model_executor.layers.bigdl_sampler import BigDLSampler +from ipex_llm.vllm.model_executor.models.bigdl_model import BigDLModelForCausalLM +from ipex_llm.vllm.logger import init_logger import math import time -from bigdl.llm.vllm.model_executor.input_metadata import InputMetadata +from ipex_llm.vllm.model_executor.input_metadata import InputMetadata import os from transformers.generation.logits_process import ( LogitsProcessorList, @@ -68,9 +68,9 @@ class BigDLBaichuanForCausalLM(BigDLModelForCausalLM): super().__init__(config, device, max_model_len) self.config = config # Always enable bigdl-llm model - from bigdl.llm.transformers import AutoModelForCausalLM + from ipex_llm.transformers import AutoModelForCausalLM # TODO: we will need to pass the argument through command line argument - # from bigdl.llm import optimize_model + # from ipex_llm import optimize_model torch_dtype = 'auto' if load_in_low_bit == 'bf16': diff --git a/python/llm/src/bigdl/llm/vllm/model_executor/models/bigdl_chatglm.py b/python/llm/src/ipex_llm/vllm/model_executor/models/bigdl_chatglm.py similarity index 95% rename from python/llm/src/bigdl/llm/vllm/model_executor/models/bigdl_chatglm.py rename to python/llm/src/ipex_llm/vllm/model_executor/models/bigdl_chatglm.py index c20e57e5..5a95c63d 100644 --- a/python/llm/src/bigdl/llm/vllm/model_executor/models/bigdl_chatglm.py +++ b/python/llm/src/ipex_llm/vllm/model_executor/models/bigdl_chatglm.py @@ -20,13 +20,13 @@ from torch import nn from transformers import AutoTokenizer, PreTrainedTokenizerBase, LlamaConfig from typing import Optional, Tuple, List, Type, Dict -from bigdl.llm.vllm.sequence import SequenceOutputs, SequenceGroupMetadata -from bigdl.llm.vllm.model_executor.layers.bigdl_sampler import BigDLSampler -from bigdl.llm.vllm.model_executor.models.bigdl_model import BigDLModelForCausalLM -from bigdl.llm.vllm.logger import init_logger +from ipex_llm.vllm.sequence import SequenceOutputs, SequenceGroupMetadata +from ipex_llm.vllm.model_executor.layers.bigdl_sampler import BigDLSampler +from ipex_llm.vllm.model_executor.models.bigdl_model import BigDLModelForCausalLM +from ipex_llm.vllm.logger import init_logger import math import time -from 
bigdl.llm.vllm.model_executor.input_metadata import InputMetadata +from ipex_llm.vllm.model_executor.input_metadata import InputMetadata from transformers.generation.logits_process import ( LogitsProcessorList, RepetitionPenaltyLogitsProcessor, @@ -63,7 +63,7 @@ class BigDLChatGLMForCausalLM(BigDLModelForCausalLM): super().__init__(config, device, max_model_len) self.config = config # TODO(gc): later change this to a switch? - from bigdl.llm.transformers import AutoModelForCausalLM + from ipex_llm.transformers import AutoModelForCausalLM torch_dtype = 'auto' if load_in_low_bit == 'bf16': diff --git a/python/llm/src/bigdl/llm/vllm/model_executor/models/bigdl_llama.py b/python/llm/src/ipex_llm/vllm/model_executor/models/bigdl_llama.py similarity index 95% rename from python/llm/src/bigdl/llm/vllm/model_executor/models/bigdl_llama.py rename to python/llm/src/ipex_llm/vllm/model_executor/models/bigdl_llama.py index e5d66f37..1ab8695b 100644 --- a/python/llm/src/bigdl/llm/vllm/model_executor/models/bigdl_llama.py +++ b/python/llm/src/ipex_llm/vllm/model_executor/models/bigdl_llama.py @@ -20,13 +20,13 @@ from torch import nn from transformers import AutoTokenizer, PreTrainedTokenizerBase, LlamaConfig from typing import Optional, Tuple, List, Type, Dict -from bigdl.llm.vllm.sequence import SequenceOutputs, SequenceGroupMetadata -from bigdl.llm.vllm.model_executor.layers.bigdl_sampler import BigDLSampler -from bigdl.llm.vllm.model_executor.models.bigdl_model import BigDLModelForCausalLM -from bigdl.llm.vllm.logger import init_logger +from ipex_llm.vllm.sequence import SequenceOutputs, SequenceGroupMetadata +from ipex_llm.vllm.model_executor.layers.bigdl_sampler import BigDLSampler +from ipex_llm.vllm.model_executor.models.bigdl_model import BigDLModelForCausalLM +from ipex_llm.vllm.logger import init_logger import math import time -from bigdl.llm.vllm.model_executor.input_metadata import InputMetadata +from ipex_llm.vllm.model_executor.input_metadata import InputMetadata import os from transformers.generation.logits_process import ( LogitsProcessorList, @@ -68,9 +68,9 @@ class BigDLLlamaForCausalLM(BigDLModelForCausalLM): super().__init__(config, device, max_model_len) self.config = config # Always enable bigdl-llm model - from bigdl.llm.transformers import AutoModelForCausalLM + from ipex_llm.transformers import AutoModelForCausalLM # TODO: we will need to pass the argument through command line argument - # from bigdl.llm import optimize_model + # from ipex_llm import optimize_model torch_dtype = 'auto' if load_in_low_bit == 'bf16': diff --git a/python/llm/src/bigdl/llm/vllm/model_executor/models/bigdl_mistral.py b/python/llm/src/ipex_llm/vllm/model_executor/models/bigdl_mistral.py similarity index 94% rename from python/llm/src/bigdl/llm/vllm/model_executor/models/bigdl_mistral.py rename to python/llm/src/ipex_llm/vllm/model_executor/models/bigdl_mistral.py index 9bdf572a..9640c282 100644 --- a/python/llm/src/bigdl/llm/vllm/model_executor/models/bigdl_mistral.py +++ b/python/llm/src/ipex_llm/vllm/model_executor/models/bigdl_mistral.py @@ -20,13 +20,13 @@ from torch import nn from transformers import AutoTokenizer, PreTrainedTokenizerBase, LlamaConfig from typing import Optional, Tuple, List, Type, Dict -from bigdl.llm.vllm.sequence import SequenceOutputs, SequenceGroupMetadata -from bigdl.llm.vllm.model_executor.layers.bigdl_sampler import BigDLSampler -from bigdl.llm.vllm.model_executor.models.bigdl_model import BigDLModelForCausalLM -from bigdl.llm.vllm.logger import init_logger +from 
ipex_llm.vllm.sequence import SequenceOutputs, SequenceGroupMetadata +from ipex_llm.vllm.model_executor.layers.bigdl_sampler import BigDLSampler +from ipex_llm.vllm.model_executor.models.bigdl_model import BigDLModelForCausalLM +from ipex_llm.vllm.logger import init_logger import math import time -from bigdl.llm.vllm.model_executor.input_metadata import InputMetadata +from ipex_llm.vllm.model_executor.input_metadata import InputMetadata from transformers.generation.logits_process import ( LogitsProcessorList, RepetitionPenaltyLogitsProcessor, @@ -63,8 +63,8 @@ class BigDLMistralForCausalLM(BigDLModelForCausalLM): super().__init__(config, device, max_model_len) self.config = config # TODO(gc): later change this to a switch? - from bigdl.llm.transformers import AutoModelForCausalLM - # from bigdl.llm import optimize_model + from ipex_llm.transformers import AutoModelForCausalLM + # from ipex_llm import optimize_model torch_dtype = 'auto' diff --git a/python/llm/src/bigdl/llm/vllm/model_executor/models/bigdl_mixtral.py b/python/llm/src/ipex_llm/vllm/model_executor/models/bigdl_mixtral.py similarity index 95% rename from python/llm/src/bigdl/llm/vllm/model_executor/models/bigdl_mixtral.py rename to python/llm/src/ipex_llm/vllm/model_executor/models/bigdl_mixtral.py index c5be91c3..8946910a 100644 --- a/python/llm/src/bigdl/llm/vllm/model_executor/models/bigdl_mixtral.py +++ b/python/llm/src/ipex_llm/vllm/model_executor/models/bigdl_mixtral.py @@ -20,13 +20,13 @@ from torch import nn from transformers import AutoTokenizer, PreTrainedTokenizerBase, LlamaConfig from typing import Optional, Tuple, List, Type, Dict -from bigdl.llm.vllm.sequence import SequenceOutputs, SequenceGroupMetadata -from bigdl.llm.vllm.model_executor.layers.bigdl_sampler import BigDLSampler -from bigdl.llm.vllm.model_executor.models.bigdl_model import BigDLModelForCausalLM -from bigdl.llm.vllm.logger import init_logger +from ipex_llm.vllm.sequence import SequenceOutputs, SequenceGroupMetadata +from ipex_llm.vllm.model_executor.layers.bigdl_sampler import BigDLSampler +from ipex_llm.vllm.model_executor.models.bigdl_model import BigDLModelForCausalLM +from ipex_llm.vllm.logger import init_logger import math import time -from bigdl.llm.vllm.model_executor.input_metadata import InputMetadata +from ipex_llm.vllm.model_executor.input_metadata import InputMetadata from transformers.generation.logits_process import ( LogitsProcessorList, RepetitionPenaltyLogitsProcessor, @@ -63,7 +63,7 @@ class BigDLMixtralForCausalLM(BigDLModelForCausalLM): super().__init__(config, device, max_model_len) self.config = config # TODO(gc): later change this to a switch? 
- from bigdl.llm.transformers import AutoModelForCausalLM + from ipex_llm.transformers import AutoModelForCausalLM torch_dtype = 'auto' diff --git a/python/llm/src/bigdl/llm/vllm/model_executor/models/bigdl_model.py b/python/llm/src/ipex_llm/vllm/model_executor/models/bigdl_model.py similarity index 97% rename from python/llm/src/bigdl/llm/vllm/model_executor/models/bigdl_model.py rename to python/llm/src/ipex_llm/vllm/model_executor/models/bigdl_model.py index a81993dc..5d7ecf28 100644 --- a/python/llm/src/bigdl/llm/vllm/model_executor/models/bigdl_model.py +++ b/python/llm/src/ipex_llm/vllm/model_executor/models/bigdl_model.py @@ -19,9 +19,9 @@ from torch import nn from typing import Optional, Tuple, List, Type, Dict from transformers import LlamaConfig -from bigdl.llm.vllm.sequence import SequenceOutputs, SequenceGroupMetadata -from bigdl.llm.transformers.models.utils import extend_kv_cache -from bigdl.llm.vllm.logger import init_logger +from ipex_llm.vllm.sequence import SequenceOutputs, SequenceGroupMetadata +from ipex_llm.transformers.models.utils import extend_kv_cache +from ipex_llm.vllm.logger import init_logger logger = init_logger(__name__) diff --git a/python/llm/src/bigdl/llm/vllm/model_executor/utils.py b/python/llm/src/ipex_llm/vllm/model_executor/utils.py similarity index 100% rename from python/llm/src/bigdl/llm/vllm/model_executor/utils.py rename to python/llm/src/ipex_llm/vllm/model_executor/utils.py diff --git a/python/llm/src/bigdl/llm/vllm/outputs.py b/python/llm/src/ipex_llm/vllm/outputs.py similarity index 98% rename from python/llm/src/bigdl/llm/vllm/outputs.py rename to python/llm/src/ipex_llm/vllm/outputs.py index 15b5ca7b..ab608631 100644 --- a/python/llm/src/bigdl/llm/vllm/outputs.py +++ b/python/llm/src/ipex_llm/vllm/outputs.py @@ -35,7 +35,7 @@ from typing import Dict, List, Optional -from bigdl.llm.vllm.sequence import SequenceGroup, SequenceStatus +from ipex_llm.vllm.sequence import SequenceGroup, SequenceStatus class CompletionOutput: diff --git a/python/llm/src/bigdl/llm/vllm/sampling_params.py b/python/llm/src/ipex_llm/vllm/sampling_params.py similarity index 99% rename from python/llm/src/bigdl/llm/vllm/sampling_params.py rename to python/llm/src/ipex_llm/vllm/sampling_params.py index 3d0bfb0e..af7091c1 100644 --- a/python/llm/src/bigdl/llm/vllm/sampling_params.py +++ b/python/llm/src/ipex_llm/vllm/sampling_params.py @@ -35,7 +35,7 @@ from enum import IntEnum from functools import cached_property from typing import List, Optional, Union -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.common import invalidInputError _SAMPLING_EPS = 1e-5 diff --git a/python/llm/src/bigdl/llm/vllm/sequence.py b/python/llm/src/ipex_llm/vllm/sequence.py similarity index 99% rename from python/llm/src/bigdl/llm/vllm/sequence.py rename to python/llm/src/ipex_llm/vllm/sequence.py index 987c8a1b..3594e90b 100644 --- a/python/llm/src/bigdl/llm/vllm/sequence.py +++ b/python/llm/src/ipex_llm/vllm/sequence.py @@ -35,8 +35,8 @@ import copy import enum import time from typing import Dict, List, Optional, Union -from bigdl.llm.vllm.sampling_params import SamplingParams -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.vllm.sampling_params import SamplingParams +from ipex_llm.utils.common import invalidInputError class SequenceStatus(enum.Enum): diff --git a/python/llm/src/bigdl/llm/vllm/transformers_utils/__init__.py b/python/llm/src/ipex_llm/vllm/transformers_utils/__init__.py similarity index 100% rename from 
python/llm/src/bigdl/llm/vllm/transformers_utils/__init__.py rename to python/llm/src/ipex_llm/vllm/transformers_utils/__init__.py diff --git a/python/llm/src/bigdl/llm/vllm/transformers_utils/tokenizer.py b/python/llm/src/ipex_llm/vllm/transformers_utils/tokenizer.py similarity index 98% rename from python/llm/src/bigdl/llm/vllm/transformers_utils/tokenizer.py rename to python/llm/src/ipex_llm/vllm/transformers_utils/tokenizer.py index 727763d6..589496db 100644 --- a/python/llm/src/bigdl/llm/vllm/transformers_utils/tokenizer.py +++ b/python/llm/src/ipex_llm/vllm/transformers_utils/tokenizer.py @@ -35,8 +35,8 @@ from typing import List, Optional, Tuple, Union from transformers import (AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast) -from bigdl.llm.vllm.logger import init_logger -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.vllm.logger import init_logger +from ipex_llm.utils.common import invalidInputError logger = init_logger(__name__) diff --git a/python/llm/src/bigdl/llm/vllm/utils.py b/python/llm/src/ipex_llm/vllm/utils.py similarity index 94% rename from python/llm/src/bigdl/llm/vllm/utils.py rename to python/llm/src/ipex_llm/vllm/utils.py index b1520671..821d47c9 100644 --- a/python/llm/src/bigdl/llm/vllm/utils.py +++ b/python/llm/src/ipex_llm/vllm/utils.py @@ -37,8 +37,8 @@ from typing import List, Optional, Tuple, Union from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast -from bigdl.llm.vllm.logger import init_logger -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.vllm.logger import init_logger +from ipex_llm.utils.common import invalidInputError logger = init_logger(__name__) diff --git a/python/llm/src/bigdl/llm/vllm/worker/worker.py b/python/llm/src/ipex_llm/vllm/worker/worker.py similarity index 96% rename from python/llm/src/bigdl/llm/vllm/worker/worker.py rename to python/llm/src/ipex_llm/vllm/worker/worker.py index b0005e79..f5805131 100644 --- a/python/llm/src/bigdl/llm/vllm/worker/worker.py +++ b/python/llm/src/ipex_llm/vllm/worker/worker.py @@ -40,13 +40,13 @@ import warnings import numpy as np import random -from bigdl.llm.vllm.config import ModelConfig, SchedulerConfig -from bigdl.llm.vllm.model_executor.model_loader import get_model -from bigdl.llm.vllm.model_executor.input_metadata import InputMetadata -from bigdl.llm.vllm.sampling_params import SamplingParams -from bigdl.llm.vllm.sequence import SequenceData, SamplerOutput, SequenceGroupMetadata -from bigdl.llm.utils.common import invalidInputError -from bigdl.llm.vllm.model_executor.utils import set_random_seed +from ipex_llm.vllm.config import ModelConfig, SchedulerConfig +from ipex_llm.vllm.model_executor.model_loader import get_model +from ipex_llm.vllm.model_executor.input_metadata import InputMetadata +from ipex_llm.vllm.sampling_params import SamplingParams +from ipex_llm.vllm.sequence import SequenceData, SamplerOutput, SequenceGroupMetadata +from ipex_llm.utils.common import invalidInputError +from ipex_llm.vllm.model_executor.utils import set_random_seed class Worker: diff --git a/python/llm/test/convert/test_convert_model.py b/python/llm/test/convert/test_convert_model.py index 1a0495d6..450e97fb 100644 --- a/python/llm/test/convert/test_convert_model.py +++ b/python/llm/test/convert/test_convert_model.py @@ -20,9 +20,9 @@ import tempfile from unittest import TestCase import shutil -from bigdl.llm import llm_convert -from bigdl.llm.transformers import AutoModelForCausalLM -from bigdl.llm.optimize import optimize_model, 
load_low_bit, low_memory_init +from ipex_llm import llm_convert +from ipex_llm.transformers import AutoModelForCausalLM +from ipex_llm.optimize import optimize_model, load_low_bit, low_memory_init llama_model_path = os.environ.get('LLAMA_ORIGIN_PATH') diff --git a/python/llm/test/inference/test_call_models.py b/python/llm/test/inference/test_call_models.py index 1f888da5..e9b7175d 100644 --- a/python/llm/test/inference/test_call_models.py +++ b/python/llm/test/inference/test_call_models.py @@ -15,8 +15,8 @@ # -from bigdl.llm.models import Llama, Bloom, Gptneox, Starcoder -from bigdl.llm.transformers import LlamaForCausalLM, BloomForCausalLM, \ +from ipex_llm.models import Llama, Bloom, Gptneox, Starcoder +from ipex_llm.transformers import LlamaForCausalLM, BloomForCausalLM, \ GptneoxForCausalLM, StarcoderForCausalLM import pytest from unittest import TestCase diff --git a/python/llm/test/inference/test_optimize_model_api.py b/python/llm/test/inference/test_optimize_model_api.py index ff1269e5..99e3be5e 100644 --- a/python/llm/test/inference/test_optimize_model_api.py +++ b/python/llm/test/inference/test_optimize_model_api.py @@ -19,7 +19,7 @@ import os import pytest import time import torch -from bigdl.llm import optimize_model +from ipex_llm import optimize_model class TestOptimizeAPI(unittest.TestCase): diff --git a/python/llm/test/inference/test_transformers_api.py b/python/llm/test/inference/test_transformers_api.py index f69abea0..1a72801c 100644 --- a/python/llm/test/inference/test_transformers_api.py +++ b/python/llm/test/inference/test_transformers_api.py @@ -22,7 +22,7 @@ import time import torch import pytest -from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM, AutoModelForSpeechSeq2Seq +from ipex_llm.transformers import AutoModel, AutoModelForCausalLM, AutoModelForSpeechSeq2Seq from transformers import AutoTokenizer, LlamaTokenizer class TestTransformersAPI(unittest.TestCase): @@ -93,7 +93,7 @@ class TestTransformersAPI(unittest.TestCase): self.assertTrue(res) def test_transformers_chatglm_for_causallm(self): - from bigdl.llm.transformers import ChatGLMForCausalLM + from ipex_llm.transformers import ChatGLMForCausalLM model_path = os.environ.get('ORIGINAL_CHATGLM2_6B_PATH') model = ChatGLMForCausalLM.from_pretrained(model_path, native=False, trust_remote_code=True, load_in_4bit=True) tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) diff --git a/python/llm/test/inference/test_transformesr_api_434.py b/python/llm/test/inference/test_transformesr_api_434.py index 67cfcd63..4de49e66 100644 --- a/python/llm/test/inference/test_transformesr_api_434.py +++ b/python/llm/test/inference/test_transformesr_api_434.py @@ -19,7 +19,7 @@ import pytest import tempfile import torch -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer diff --git a/python/llm/test/inference_gpu/test_layer_fast_rope.py b/python/llm/test/inference_gpu/test_layer_fast_rope.py index 9861c913..79e6f5a0 100644 --- a/python/llm/test/inference_gpu/test_layer_fast_rope.py +++ b/python/llm/test/inference_gpu/test_layer_fast_rope.py @@ -30,7 +30,7 @@ from transformers.models.llama.modeling_llama import LlamaRotaryEmbedding from transformers.models.llama.modeling_llama import ( apply_rotary_pos_emb as apply_rotary_pos_emb_llama, ) -from bigdl.llm.transformers.layers.rope_embedding import apply_fast_rope_embedding +from ipex_llm.transformers.layers.rope_embedding import 
apply_fast_rope_embedding device = os.environ['DEVICE'] print(f'Running on {device}') diff --git a/python/llm/test/inference_gpu/test_transformers_api.py b/python/llm/test/inference_gpu/test_transformers_api.py index 9a9bb8a3..ae9c6b9b 100644 --- a/python/llm/test/inference_gpu/test_transformers_api.py +++ b/python/llm/test/inference_gpu/test_transformers_api.py @@ -20,7 +20,7 @@ import pytest import tempfile import torch -from bigdl.llm.transformers import AutoModelForCausalLM, AutoModel, AutoModelForSpeechSeq2Seq +from ipex_llm.transformers import AutoModelForCausalLM, AutoModel, AutoModelForSpeechSeq2Seq from transformers import LlamaTokenizer, AutoTokenizer device = os.environ['DEVICE'] diff --git a/python/llm/test/inference_gpu/test_transformers_api_RMSNorm.py b/python/llm/test/inference_gpu/test_transformers_api_RMSNorm.py index 6876ca75..db3aa485 100644 --- a/python/llm/test/inference_gpu/test_transformers_api_RMSNorm.py +++ b/python/llm/test/inference_gpu/test_transformers_api_RMSNorm.py @@ -20,7 +20,7 @@ import gc import pytest import torch -from bigdl.llm.transformers import AutoModelForCausalLM, AutoModel +from ipex_llm.transformers import AutoModelForCausalLM, AutoModel from transformers import LlamaTokenizer, AutoTokenizer device = os.environ['DEVICE'] diff --git a/python/llm/test/inference_gpu/test_transformers_api_attention.py b/python/llm/test/inference_gpu/test_transformers_api_attention.py index 0a7a6ac2..bf9df673 100644 --- a/python/llm/test/inference_gpu/test_transformers_api_attention.py +++ b/python/llm/test/inference_gpu/test_transformers_api_attention.py @@ -20,7 +20,7 @@ import gc import pytest import torch -from bigdl.llm.transformers import AutoModelForCausalLM, AutoModel +from ipex_llm.transformers import AutoModelForCausalLM, AutoModel from transformers import LlamaTokenizer, AutoTokenizer device = os.environ['DEVICE'] diff --git a/python/llm/test/inference_gpu/test_transformers_api_final_logits.py b/python/llm/test/inference_gpu/test_transformers_api_final_logits.py index 02e3cc27..1aff7719 100644 --- a/python/llm/test/inference_gpu/test_transformers_api_final_logits.py +++ b/python/llm/test/inference_gpu/test_transformers_api_final_logits.py @@ -20,7 +20,7 @@ import gc import pytest import torch -from bigdl.llm.transformers import AutoModelForCausalLM, AutoModel +from ipex_llm.transformers import AutoModelForCausalLM, AutoModel from transformers import LlamaTokenizer, AutoTokenizer device = os.environ['DEVICE'] diff --git a/python/llm/test/inference_gpu/test_transformers_api_layernorm.py b/python/llm/test/inference_gpu/test_transformers_api_layernorm.py index 0dbb4fe8..68a15d8a 100644 --- a/python/llm/test/inference_gpu/test_transformers_api_layernorm.py +++ b/python/llm/test/inference_gpu/test_transformers_api_layernorm.py @@ -19,7 +19,7 @@ import pytest import gc import torch -from bigdl.llm.transformers import AutoModelForCausalLM, AutoModel +from ipex_llm.transformers import AutoModelForCausalLM, AutoModel from transformers import LlamaTokenizer, AutoTokenizer device = os.environ['DEVICE'] diff --git a/python/llm/test/inference_gpu/test_transformers_api_mlp.py b/python/llm/test/inference_gpu/test_transformers_api_mlp.py index 1c01b259..e614e561 100644 --- a/python/llm/test/inference_gpu/test_transformers_api_mlp.py +++ b/python/llm/test/inference_gpu/test_transformers_api_mlp.py @@ -19,7 +19,7 @@ import gc import pytest import torch -from bigdl.llm.transformers import AutoModelForCausalLM, AutoModel +from ipex_llm.transformers import AutoModelForCausalLM, 
AutoModel from transformers import LlamaTokenizer, AutoTokenizer device = os.environ['DEVICE'] diff --git a/python/llm/test/install/test_install.py b/python/llm/test/install/test_install.py index 11a35ed0..74c12dee 100644 --- a/python/llm/test/install/test_install.py +++ b/python/llm/test/install/test_install.py @@ -15,7 +15,7 @@ # -import bigdl.llm +import ipex_llm import pytest from unittest import TestCase @@ -23,8 +23,8 @@ from unittest import TestCase class Test_LLM_Basics(TestCase): def test_naive(self): - from bigdl.llm.ggml import quantize - from bigdl.llm.utils.common import invalidInputError + from ipex_llm.ggml import quantize + from ipex_llm.utils.common import invalidInputError pass diff --git a/python/llm/test/langchain/test_langchain.py b/python/llm/test/langchain/test_langchain.py index 782744b9..33a60d20 100644 --- a/python/llm/test/langchain/test_langchain.py +++ b/python/llm/test/langchain/test_langchain.py @@ -14,8 +14,8 @@ # limitations under the License. # -from bigdl.llm.langchain.embeddings import * -from bigdl.llm.langchain.llms import * +from ipex_llm.langchain.embeddings import * +from ipex_llm.langchain.llms import * import pytest from unittest import TestCase import os diff --git a/python/llm/test/langchain/test_transformers_api.py b/python/llm/test/langchain/test_transformers_api.py index c8e2ac53..cbaaa1e0 100644 --- a/python/llm/test/langchain/test_transformers_api.py +++ b/python/llm/test/langchain/test_transformers_api.py @@ -14,9 +14,9 @@ # limitations under the License. # -from bigdl.llm.langchain.llms import TransformersLLM, TransformersPipelineLLM, \ +from ipex_llm.langchain.llms import TransformersLLM, TransformersPipelineLLM, \ LlamaLLM, BloomLLM -from bigdl.llm.langchain.embeddings import TransformersEmbeddings, LlamaEmbeddings, \ +from ipex_llm.langchain.embeddings import TransformersEmbeddings, LlamaEmbeddings, \ BloomEmbeddings diff --git a/python/llm/test/langchain_gpu/test_transformers_api.py b/python/llm/test/langchain_gpu/test_transformers_api.py index a983cb7f..b4e714bd 100644 --- a/python/llm/test/langchain_gpu/test_transformers_api.py +++ b/python/llm/test/langchain_gpu/test_transformers_api.py @@ -14,9 +14,9 @@ # limitations under the License. 
# -from bigdl.llm.langchain.llms import TransformersLLM, TransformersPipelineLLM, \ +from ipex_llm.langchain.llms import TransformersLLM, TransformersPipelineLLM, \ LlamaLLM, BloomLLM -from bigdl.llm.langchain.embeddings import TransformersEmbeddings, LlamaEmbeddings, \ +from ipex_llm.langchain.embeddings import TransformersEmbeddings, LlamaEmbeddings, \ BloomEmbeddings import pytest diff --git a/python/llm/test/llamaindex/test_llamaindex.py b/python/llm/test/llamaindex/test_llamaindex.py index c0ebf4c3..1e7c8cfe 100644 --- a/python/llm/test/llamaindex/test_llamaindex.py +++ b/python/llm/test/llamaindex/test_llamaindex.py @@ -17,7 +17,7 @@ import pytest from unittest import TestCase import os -from bigdl.llm.llamaindex.llms import BigdlLLM +from ipex_llm.llamaindex.llms import BigdlLLM class Test_LlamaIndex_Transformers_API(TestCase): def setUp(self): diff --git a/python/llm/test/llamaindex_gpu/test_llamaindex.py b/python/llm/test/llamaindex_gpu/test_llamaindex.py index 920ce534..b894a37f 100644 --- a/python/llm/test/llamaindex_gpu/test_llamaindex.py +++ b/python/llm/test/llamaindex_gpu/test_llamaindex.py @@ -18,7 +18,7 @@ import torch import pytest from unittest import TestCase import os -from bigdl.llm.llamaindex.llms import BigdlLLM +from ipex_llm.llamaindex.llms import BigdlLLM class Test_LlamaIndex_Transformers_API(TestCase): def setUp(self):
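The hunks above apply one mechanical namespace migration — module paths under `bigdl.llm` become `ipex_llm` — while file contents, public class names such as `BigdlLLM`, and existing "bigdl-llm" comments are left untouched. The snippet below is only a minimal sketch of what the rename means for downstream imports; the toy `torch.nn.Linear` module and the `low_bit` argument are illustrative assumptions, not taken from this patch.

```python
import torch

# Pre-rename namespace (shown for comparison only):
#   from bigdl.llm import optimize_model
# Post-rename namespace used throughout this patch:
from ipex_llm import optimize_model

# Hypothetical toy module; real usage would pass a loaded Hugging Face model instead.
model = torch.nn.Linear(8, 8)
# optimize_model is assumed to keep its pre-rename behaviour of returning a
# low-bit-optimized copy of the module it is given.
model = optimize_model(model, low_bit="sym_int4")
```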