diff --git a/.github/workflows/llm_unit_tests.yml b/.github/workflows/llm_unit_tests.yml
index 17e8f1fc..fc37e464 100644
--- a/.github/workflows/llm_unit_tests.yml
+++ b/.github/workflows/llm_unit_tests.yml
@@ -346,3 +346,17 @@ jobs:
source /home/arda/intel/oneapi/setvars.sh
fi
bash python/llm/test/run-llm-example-tests-gpu.sh
+
+ - name: Run LLM langchain test
+ shell: bash
+ run: |
+ pip install -U langchain==0.0.184
+ pip install -U chromadb==0.3.25
+ pip install -U pandas==2.0.3
+ # Specific oneAPI location on the Arc UT test machines
+ if [[ '${{ matrix.pytorch-version }}' == '2.1' ]]; then
+ source /opt/intel/oneapi/setvars.sh
+ elif [[ '${{ matrix.pytorch-version }}' == '2.0' ]]; then
+ source /home/arda/intel/oneapi/setvars.sh
+ fi
+ bash python/llm/test/run-llm-langchain-tests-gpu.sh
diff --git a/python/llm/example/GPU/LangChain/transformer_int4_gpu/README.md b/python/llm/example/GPU/LangChain/transformer_int4_gpu/README.md
new file mode 100644
index 00000000..73b384ed
--- /dev/null
+++ b/python/llm/example/GPU/LangChain/transformer_int4_gpu/README.md
@@ -0,0 +1,93 @@
+# LangChain examples
+
+The examples in this folder show how to use [LangChain](https://www.langchain.com/) with `bigdl-llm` on Intel GPUs.
+
+### 1. Install bigdl-llm
+Follow the instructions in [GPU Install Guide](https://bigdl.readthedocs.io/en/latest/doc/LLM/Overview/install_gpu.html) to install `bigdl-llm`.
+
+### 2. Install Required Dependencies for LangChain Examples
+
+```bash
+pip install langchain==0.0.184
+pip install -U chromadb==0.3.25
+pip install -U pandas==2.0.3
+```
+
+### 3. Configure oneAPI Environment Variables
+#### 3.1 Configurations for Linux
+```bash
+source /opt/intel/oneapi/setvars.sh
+```
+#### 3.2 Configurations for Windows
+```cmd
+call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat"
+```
+> Note: Please make sure you are using **CMD** (**Anaconda Prompt** if using conda) to run the command, as PowerShell is not supported.
+### 4. Runtime Configurations
+For optimal performance, it is recommended to set several environment variables. Please check out the suggestions based on your device.
+#### 4.1 Configurations for Linux
+
+
+For Intel Arc™ A-Series Graphics and Intel Data Center GPU Flex Series
+
+```bash
+export USE_XETLA=OFF
+export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
+```
+
+For Intel Data Center GPU Max Series
+
+```bash
+export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so
+export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
+export ENABLE_SDP_FUSION=1
+```
+> Note: `libtcmalloc.so` can be installed with `conda install -c conda-forge -y gperftools=2.10`.
+
+
+#### 4.2 Configurations for Windows
+
+
+For Intel iGPU
+
+```cmd
+set SYCL_CACHE_PERSISTENT=1
+set BIGDL_LLM_XMX_DISABLED=1
+```
+
+For Intel Arc™ A300-Series or Pro A60
+
+```cmd
+set SYCL_CACHE_PERSISTENT=1
+```
+
+For other Intel dGPU Series
+
+There is no need to set further environment variables.
+
+> Note: For the first time that each model runs on Intel iGPU/Intel Arc™ A300-Series or Pro A60, it may take several minutes to compile.
+
+### 5. Run the examples
+
+#### 5.1 Streaming Chat
+
+```bash
+python chat.py -m MODEL_PATH -q QUESTION
+```
+arguments info:
+- `-m MODEL_PATH`: **required**, path to the model
+- `-q QUESTION`: question to ask. Default is `What is AI?`.
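+
+For example, assuming a Llama 2 chat model has been downloaded to a local folder (the path below is only a placeholder; substitute your own model directory), you could run:
+
+```bash
+python chat.py -m /path/to/llama-2-7b-chat-hf -q "What is AI?"
+```
+
+Under the hood, `chat.py` loads the model with bigdl-llm 4-bit optimizations, moves it to the Intel GPU via `device_map='xpu'`, and wires it into a LangChain `LLMChain`; a minimal sketch of the same pattern (the model path is again a placeholder) looks like this:
+
+```python
+from bigdl.llm.langchain.llms import TransformersLLM
+from langchain import PromptTemplate, LLMChain
+
+# load the model in 4-bit with bigdl-llm and move it to the Intel GPU
+llm = TransformersLLM.from_model_id(
+    model_id="/path/to/llama-2-7b-chat-hf",  # placeholder: any local transformers model directory
+    model_kwargs={"temperature": 0, "max_length": 64, "trust_remote_code": True},
+    device_map='xpu',
+)
+
+prompt = PromptTemplate(template="{question}", input_variables=["question"])
+llm_chain = LLMChain(prompt=prompt, llm=llm)
+print(llm_chain.run("What is AI?"))
+```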
diff --git a/python/llm/example/GPU/LangChain/transformer_int4_gpu/chat.py b/python/llm/example/GPU/LangChain/transformer_int4_gpu/chat.py
new file mode 100644
index 00000000..e5ced801
--- /dev/null
+++ b/python/llm/example/GPU/LangChain/transformer_int4_gpu/chat.py
@@ -0,0 +1,65 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# This makes sure Python is aware that there is more than one sub-package within bigdl,
+# physically located elsewhere.
+# Otherwise there would be a module-not-found error in a non-pip setting, as Python would
+# only search the first bigdl package and end up finding only one sub-package.
+
+import argparse
+
+from bigdl.llm.langchain.llms import TransformersLLM, TransformersPipelineLLM
+from langchain import PromptTemplate, LLMChain
+
+
+def main(args):
+
+ question = args.question
+ model_path = args.model_path
+ template = """{question}"""
+
+ prompt = PromptTemplate(template=template, input_variables=["question"])
+
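+ # Alternatively, TransformersPipelineLLM (commented out below) runs the model through a transformers text-generation pipeline: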
+ # llm = TransformersPipelineLLM.from_model_id(
+ # model_id=model_path,
+ # task="text-generation",
+ # model_kwargs={"temperature": 0, "max_length": 64, "trust_remote_code": True},
+ # device_map='xpu'
+ # )
+
+ llm = TransformersLLM.from_model_id(
+ model_id=model_path,
+ model_kwargs={"temperature": 0, "max_length": 64, "trust_remote_code": True},
+ device_map='xpu'
+ )
+
+ llm_chain = LLMChain(prompt=prompt, llm=llm)
+
+ output = llm_chain.run(question)
+ print("====output=====")
+ print(output)
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser(description='TransformersLLM Langchain Chat Example')
+ parser.add_argument('-m', '--model-path', type=str, required=True,
+ help='the path to the transformers model')
+ parser.add_argument('-q', '--question', type=str, default='What is AI?',
+ help='question you want to ask.')
+ args = parser.parse_args()
+
+ main(args)
diff --git a/python/llm/example/GPU/README.md b/python/llm/example/GPU/README.md
index 5e18af8a..ae41c717 100644
--- a/python/llm/example/GPU/README.md
+++ b/python/llm/example/GPU/README.md
@@ -7,6 +7,7 @@ This folder contains examples of running BigDL-LLM on Intel GPU:
- [LLM-Finetuning](LLM-Finetuning): running ***finetuning*** (such as LoRA, QLoRA, QA-LoRA, etc) using BigDL-LLM on Intel GPUs
- [vLLM-Serving](vLLM-Serving): running ***vLLM*** serving framework on intel GPUs (with BigDL-LLM low-bit optimized models)
- [Deepspeed-AutoTP](Deepspeed-AutoTP): running distributed inference using ***DeepSpeed AutoTP*** (with BigDL-LLM low-bit optimized models) on Intel GPUs
+- [LangChain](LangChain): running ***LangChain*** applications with BigDL-LLM on Intel GPUs
- [PyTorch-Models](PyTorch-Models): running any PyTorch model on BigDL-LLM (with "one-line code change")
- [Speculative-Decoding](Speculative-Decoding): running any ***Hugging Face Transformers*** model with ***self-speculative decoding*** on Intel GPUs
- [ModelScope-Models](ModelScope-Models): running ***ModelScope*** model with BigDL-LLM on Intel GPUs
diff --git a/python/llm/src/bigdl/llm/langchain/llms/transformersllm.py b/python/llm/src/bigdl/llm/langchain/llms/transformersllm.py
index 34dc4e03..2fb683a1 100644
--- a/python/llm/src/bigdl/llm/langchain/llms/transformersllm.py
+++ b/python/llm/src/bigdl/llm/langchain/llms/transformersllm.py
@@ -89,6 +89,7 @@ class TransformersLLM(LLM):
cls,
model_id: str,
model_kwargs: Optional[dict] = None,
+ device_map: str = 'cpu',
**kwargs: Any,
) -> LLM:
"""
@@ -131,6 +132,10 @@ class TransformersLLM(LLM):
except:
model = AutoModel.from_pretrained(model_id, load_in_4bit=True, **_model_kwargs)
+ # TODO: may refactor this code in the future
+ if 'xpu' in device_map:
+ model = model.to(device_map)
+
if "trust_remote_code" in _model_kwargs:
_model_kwargs = {
k: v for k, v in _model_kwargs.items() if k != "trust_remote_code"
@@ -149,6 +154,7 @@ class TransformersLLM(LLM):
cls,
model_id: str,
model_kwargs: Optional[dict] = None,
+ device_map: str = 'cpu',
**kwargs: Any,
) -> LLM:
"""
@@ -188,6 +194,10 @@ class TransformersLLM(LLM):
model = AutoModelForCausalLM.load_low_bit(model_id, **_model_kwargs)
except:
model = AutoModel.load_low_bit(model_id, **_model_kwargs)
+
+ # TODO: may refactor this code in the future
+ if 'xpu' in device_map:
+ model = model.to(device_map)
if "trust_remote_code" in _model_kwargs:
_model_kwargs = {
@@ -224,6 +234,7 @@ class TransformersLLM(LLM):
if self.streaming:
from transformers import TextStreamer
input_ids = self.tokenizer.encode(prompt, return_tensors="pt")
+ input_ids = input_ids.to(self.model.device)
streamer = TextStreamer(self.tokenizer, skip_prompt=True, skip_special_tokens=True)
if stop is not None:
from transformers.generation.stopping_criteria import StoppingCriteriaList
@@ -240,6 +251,7 @@ class TransformersLLM(LLM):
return text
else:
input_ids = self.tokenizer.encode(prompt, return_tensors="pt")
+ input_ids = input_ids.to(self.model.device)
if stop is not None:
from transformers.generation.stopping_criteria import StoppingCriteriaList
from transformers.tools.agents import StopSequenceCriteria
diff --git a/python/llm/test/langchain_gpu/test_transformers_api.py b/python/llm/test/langchain_gpu/test_transformers_api.py
new file mode 100644
index 00000000..a983cb7f
--- /dev/null
+++ b/python/llm/test/langchain_gpu/test_transformers_api.py
@@ -0,0 +1,48 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from bigdl.llm.langchain.llms import TransformersLLM, TransformersPipelineLLM, \
+ LlamaLLM, BloomLLM
+from bigdl.llm.langchain.embeddings import TransformersEmbeddings, LlamaEmbeddings, \
+ BloomEmbeddings
+
+import pytest
+from unittest import TestCase
+import os
+
+device = os.environ['DEVICE']
+print(f'Running on {device}')
+
+class Test_Langchain_Transformers_API(TestCase):
+ def setUp(self):
+ self.llama_model_path = os.environ.get('LLAMA2_7B_ORIGIN_PATH')
+ thread_num = os.environ.get('THREAD_NUM')
+ if thread_num is not None:
+ self.n_threads = int(thread_num)
+ else:
+ self.n_threads = 2
+
+
+ def test_bigdl_llm(self):
+ texts = 'What is the capital of France?\n\n'
+ bigdl_llm = TransformersLLM.from_model_id(model_id=self.llama_model_path, model_kwargs={'trust_remote_code': True}, device_map=device)
+
+ output = bigdl_llm(texts)
+ res = "Paris" in output
+ self.assertTrue(res)
+
+if __name__ == '__main__':
+ pytest.main([__file__])
diff --git a/python/llm/test/run-llm-langchain-tests-gpu.sh b/python/llm/test/run-llm-langchain-tests-gpu.sh
new file mode 100644
index 00000000..bff28198
--- /dev/null
+++ b/python/llm/test/run-llm-langchain-tests-gpu.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+export ANALYTICS_ZOO_ROOT=${ANALYTICS_ZOO_ROOT}
+export LLM_HOME=${ANALYTICS_ZOO_ROOT}/python/llm/src
+export LLM_INFERENCE_TEST_DIR=${ANALYTICS_ZOO_ROOT}/python/llm/test/langchain_gpu
+
+export USE_XETLA=OFF
+export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
+export DEVICE='xpu'
+
+set -e
+
+echo "# Start testing inference"
+start=$(date "+%s")
+
+python -m pytest -s ${LLM_INFERENCE_TEST_DIR}
+
+now=$(date "+%s")
+time=$((now-start))
+
+echo "Bigdl-llm langchain gpu tests finished"
+echo "Time used:$time seconds"
\ No newline at end of file