From be29c75c18201cbba7c9af690bb3b63c8b66ca16 Mon Sep 17 00:00:00 2001 From: binbin Deng <108676127+plusbang@users.noreply.github.com> Date: Wed, 13 Sep 2023 14:47:47 +0800 Subject: [PATCH] LLM: refactor gpu examples (#8963) * restructure * change to hf-transformers-models/ --- python/llm/example/gpu/README.md | 19 ++++++++- .../gpu/hf-transformers-models/README.md | 40 +++++++++++++++++++ .../baichuan/README.md | 0 .../baichuan/generate.py | 0 .../chatglm2/README.md | 0 .../chatglm2/generate.py | 0 .../chatglm2/streamchat.py | 0 .../chinese-llama2/README.md | 0 .../chinese-llama2/generate.py | 0 .../falcon/README.md | 0 .../falcon/falcon-7b-instruct/modelling_RW.py | 0 .../falcon/generate.py | 0 .../gpt-j/generate.py | 0 .../gpt-j/readme.md | 0 .../internlm/README.md | 0 .../internlm/generate.py | 0 .../llama2/README.md | 0 .../llama2/generate.py | 0 .../mpt/README.md | 0 .../mpt/generate.py | 0 .../qwen/README.md | 0 .../qwen/generate.py | 0 .../starcoder/generate.py | 0 .../starcoder/readme.md | 0 .../voiceassistant/README.md | 0 .../voiceassistant/generate.py | 0 .../whisper/readme.md | 0 .../whisper/recognize.py | 0 .../llm/example/gpu/pytorch-models/README.md | 25 ++++++++++++ 29 files changed, 82 insertions(+), 2 deletions(-) create mode 100644 python/llm/example/gpu/hf-transformers-models/README.md rename python/llm/example/gpu/{ => hf-transformers-models}/baichuan/README.md (100%) rename python/llm/example/gpu/{ => hf-transformers-models}/baichuan/generate.py (100%) rename python/llm/example/gpu/{ => hf-transformers-models}/chatglm2/README.md (100%) rename python/llm/example/gpu/{ => hf-transformers-models}/chatglm2/generate.py (100%) rename python/llm/example/gpu/{ => hf-transformers-models}/chatglm2/streamchat.py (100%) rename python/llm/example/gpu/{ => hf-transformers-models}/chinese-llama2/README.md (100%) rename python/llm/example/gpu/{ => hf-transformers-models}/chinese-llama2/generate.py (100%) rename python/llm/example/gpu/{ => 
hf-transformers-models}/falcon/README.md (100%) rename python/llm/example/gpu/{ => hf-transformers-models}/falcon/falcon-7b-instruct/modelling_RW.py (100%) rename python/llm/example/gpu/{ => hf-transformers-models}/falcon/generate.py (100%) rename python/llm/example/gpu/{ => hf-transformers-models}/gpt-j/generate.py (100%) rename python/llm/example/gpu/{ => hf-transformers-models}/gpt-j/readme.md (100%) rename python/llm/example/gpu/{ => hf-transformers-models}/internlm/README.md (100%) rename python/llm/example/gpu/{ => hf-transformers-models}/internlm/generate.py (100%) rename python/llm/example/gpu/{ => hf-transformers-models}/llama2/README.md (100%) rename python/llm/example/gpu/{ => hf-transformers-models}/llama2/generate.py (100%) rename python/llm/example/gpu/{ => hf-transformers-models}/mpt/README.md (100%) rename python/llm/example/gpu/{ => hf-transformers-models}/mpt/generate.py (100%) rename python/llm/example/gpu/{ => hf-transformers-models}/qwen/README.md (100%) rename python/llm/example/gpu/{ => hf-transformers-models}/qwen/generate.py (100%) rename python/llm/example/gpu/{ => hf-transformers-models}/starcoder/generate.py (100%) rename python/llm/example/gpu/{ => hf-transformers-models}/starcoder/readme.md (100%) rename python/llm/example/gpu/{ => hf-transformers-models}/voiceassistant/README.md (100%) rename python/llm/example/gpu/{ => hf-transformers-models}/voiceassistant/generate.py (100%) rename python/llm/example/gpu/{ => hf-transformers-models}/whisper/readme.md (100%) rename python/llm/example/gpu/{ => hf-transformers-models}/whisper/recognize.py (100%) create mode 100644 python/llm/example/gpu/pytorch-models/README.md diff --git a/python/llm/example/gpu/README.md b/python/llm/example/gpu/README.md index ade44908..c729b4d2 100644 --- a/python/llm/example/gpu/README.md +++ b/python/llm/example/gpu/README.md @@ -1,5 +1,20 @@ -# BigDL-LLM Transformers INT4 Optimization for Large Language Model on Intel GPUs -You can use BigDL-LLM to run almost 
every Huggingface Transformer models with INT4 optimizations on your laptops with Intel GPUs. This directory contains example scripts to help you quickly get started using BigDL-LLM to run some popular open-source models in the community. Each model has its own dedicated folder, where you can find detailed instructions on how to install and run it. +# BigDL-LLM INT4 Optimization for Large Language Model on Intel GPUs +You can use BigDL-LLM to run almost every Hugging Face Transformers model with INT4 optimizations on your laptops with Intel GPUs. Moreover, you can also use the `optimize_model` API to accelerate general PyTorch models on Intel GPUs. + +## Verified models +| Model | Example | +|------------|----------------------------------------------------------| +| Baichuan | [link](hf-transformers-models/baichuan) | +| ChatGLM2 | [link](hf-transformers-models/chatglm2) | +| Chinese Llama2 | [link](hf-transformers-models/chinese-llama2)| +| Falcon | [link](hf-transformers-models/falcon) | +| GPT-J | [link](hf-transformers-models/gpt-j) | +| InternLM | [link](hf-transformers-models/internlm) | +| LLaMA 2 | [link](hf-transformers-models/llama2) | +| MPT | [link](hf-transformers-models/mpt) | +| Qwen | [link](hf-transformers-models/qwen) | +| StarCoder | [link](hf-transformers-models/starcoder) | +| Whisper | [link](hf-transformers-models/whisper) | ## Verified Hardware Platforms diff --git a/python/llm/example/gpu/hf-transformers-models/README.md b/python/llm/example/gpu/hf-transformers-models/README.md new file mode 100644 index 00000000..2b8ecc8b --- /dev/null +++ b/python/llm/example/gpu/hf-transformers-models/README.md @@ -0,0 +1,40 @@ +# BigDL-LLM Transformers INT4 Optimization for Large Language Model on Intel GPUs +You can use BigDL-LLM to run almost every Hugging Face Transformers model with INT4 optimizations on your laptops with Intel GPUs.
This directory contains example scripts to help you quickly get started using BigDL-LLM to run some popular open-source models in the community. Each model has its own dedicated folder, where you can find detailed instructions on how to install and run it. + +## Verified models +| Model | Example | +|------------|----------------------------------------------------------| +| Baichuan | [link](baichuan) | +| ChatGLM2 | [link](chatglm2) | +| Chinese Llama2 | [link](chinese-llama2)| +| Falcon | [link](falcon) | +| GPT-J | [link](gpt-j) | +| InternLM | [link](internlm) | +| LLaMA 2 | [link](llama2) | +| MPT | [link](mpt) | +| Qwen | [link](qwen) | +| StarCoder | [link](starcoder) | +| Whisper | [link](whisper) | + +## Verified Hardware Platforms + +- Intel Arc™ A-Series Graphics +- Intel Data Center GPU Flex Series + +## Recommended Requirements +To apply Intel GPU acceleration, there are several steps for tool installation and environment preparation. + +Step 1, only Linux is supported for now; Ubuntu 22.04 is preferred. + +Step 2, please refer to our [driver installation](https://dgpu-docs.intel.com/driver/installation.html) for general purpose GPU capabilities. +> **Note**: IPEX 2.0.110+xpu requires Intel GPU driver version [Stable 647.21](https://dgpu-docs.intel.com/releases/stable_647_21_20230714.html). + +Step 3, you also need to download and install [Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit-download.html). oneMKL and the DPC++ compiler are needed; the other components are optional. +> **Note**: IPEX 2.0.110+xpu requires Intel® oneAPI Base Toolkit version >= 2023.2.0.
+ +## Best Known Configuration on Linux +For better performance, it is recommended to set environment variables on Linux: +```bash +export USE_XETLA=OFF +export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 +``` diff --git a/python/llm/example/gpu/baichuan/README.md b/python/llm/example/gpu/hf-transformers-models/baichuan/README.md similarity index 100% rename from python/llm/example/gpu/baichuan/README.md rename to python/llm/example/gpu/hf-transformers-models/baichuan/README.md diff --git a/python/llm/example/gpu/baichuan/generate.py b/python/llm/example/gpu/hf-transformers-models/baichuan/generate.py similarity index 100% rename from python/llm/example/gpu/baichuan/generate.py rename to python/llm/example/gpu/hf-transformers-models/baichuan/generate.py diff --git a/python/llm/example/gpu/chatglm2/README.md b/python/llm/example/gpu/hf-transformers-models/chatglm2/README.md similarity index 100% rename from python/llm/example/gpu/chatglm2/README.md rename to python/llm/example/gpu/hf-transformers-models/chatglm2/README.md diff --git a/python/llm/example/gpu/chatglm2/generate.py b/python/llm/example/gpu/hf-transformers-models/chatglm2/generate.py similarity index 100% rename from python/llm/example/gpu/chatglm2/generate.py rename to python/llm/example/gpu/hf-transformers-models/chatglm2/generate.py diff --git a/python/llm/example/gpu/chatglm2/streamchat.py b/python/llm/example/gpu/hf-transformers-models/chatglm2/streamchat.py similarity index 100% rename from python/llm/example/gpu/chatglm2/streamchat.py rename to python/llm/example/gpu/hf-transformers-models/chatglm2/streamchat.py diff --git a/python/llm/example/gpu/chinese-llama2/README.md b/python/llm/example/gpu/hf-transformers-models/chinese-llama2/README.md similarity index 100% rename from python/llm/example/gpu/chinese-llama2/README.md rename to python/llm/example/gpu/hf-transformers-models/chinese-llama2/README.md diff --git a/python/llm/example/gpu/chinese-llama2/generate.py 
b/python/llm/example/gpu/hf-transformers-models/chinese-llama2/generate.py similarity index 100% rename from python/llm/example/gpu/chinese-llama2/generate.py rename to python/llm/example/gpu/hf-transformers-models/chinese-llama2/generate.py diff --git a/python/llm/example/gpu/falcon/README.md b/python/llm/example/gpu/hf-transformers-models/falcon/README.md similarity index 100% rename from python/llm/example/gpu/falcon/README.md rename to python/llm/example/gpu/hf-transformers-models/falcon/README.md diff --git a/python/llm/example/gpu/falcon/falcon-7b-instruct/modelling_RW.py b/python/llm/example/gpu/hf-transformers-models/falcon/falcon-7b-instruct/modelling_RW.py similarity index 100% rename from python/llm/example/gpu/falcon/falcon-7b-instruct/modelling_RW.py rename to python/llm/example/gpu/hf-transformers-models/falcon/falcon-7b-instruct/modelling_RW.py diff --git a/python/llm/example/gpu/falcon/generate.py b/python/llm/example/gpu/hf-transformers-models/falcon/generate.py similarity index 100% rename from python/llm/example/gpu/falcon/generate.py rename to python/llm/example/gpu/hf-transformers-models/falcon/generate.py diff --git a/python/llm/example/gpu/gpt-j/generate.py b/python/llm/example/gpu/hf-transformers-models/gpt-j/generate.py similarity index 100% rename from python/llm/example/gpu/gpt-j/generate.py rename to python/llm/example/gpu/hf-transformers-models/gpt-j/generate.py diff --git a/python/llm/example/gpu/gpt-j/readme.md b/python/llm/example/gpu/hf-transformers-models/gpt-j/readme.md similarity index 100% rename from python/llm/example/gpu/gpt-j/readme.md rename to python/llm/example/gpu/hf-transformers-models/gpt-j/readme.md diff --git a/python/llm/example/gpu/internlm/README.md b/python/llm/example/gpu/hf-transformers-models/internlm/README.md similarity index 100% rename from python/llm/example/gpu/internlm/README.md rename to python/llm/example/gpu/hf-transformers-models/internlm/README.md diff --git 
a/python/llm/example/gpu/internlm/generate.py b/python/llm/example/gpu/hf-transformers-models/internlm/generate.py similarity index 100% rename from python/llm/example/gpu/internlm/generate.py rename to python/llm/example/gpu/hf-transformers-models/internlm/generate.py diff --git a/python/llm/example/gpu/llama2/README.md b/python/llm/example/gpu/hf-transformers-models/llama2/README.md similarity index 100% rename from python/llm/example/gpu/llama2/README.md rename to python/llm/example/gpu/hf-transformers-models/llama2/README.md diff --git a/python/llm/example/gpu/llama2/generate.py b/python/llm/example/gpu/hf-transformers-models/llama2/generate.py similarity index 100% rename from python/llm/example/gpu/llama2/generate.py rename to python/llm/example/gpu/hf-transformers-models/llama2/generate.py diff --git a/python/llm/example/gpu/mpt/README.md b/python/llm/example/gpu/hf-transformers-models/mpt/README.md similarity index 100% rename from python/llm/example/gpu/mpt/README.md rename to python/llm/example/gpu/hf-transformers-models/mpt/README.md diff --git a/python/llm/example/gpu/mpt/generate.py b/python/llm/example/gpu/hf-transformers-models/mpt/generate.py similarity index 100% rename from python/llm/example/gpu/mpt/generate.py rename to python/llm/example/gpu/hf-transformers-models/mpt/generate.py diff --git a/python/llm/example/gpu/qwen/README.md b/python/llm/example/gpu/hf-transformers-models/qwen/README.md similarity index 100% rename from python/llm/example/gpu/qwen/README.md rename to python/llm/example/gpu/hf-transformers-models/qwen/README.md diff --git a/python/llm/example/gpu/qwen/generate.py b/python/llm/example/gpu/hf-transformers-models/qwen/generate.py similarity index 100% rename from python/llm/example/gpu/qwen/generate.py rename to python/llm/example/gpu/hf-transformers-models/qwen/generate.py diff --git a/python/llm/example/gpu/starcoder/generate.py b/python/llm/example/gpu/hf-transformers-models/starcoder/generate.py similarity index 100% 
rename from python/llm/example/gpu/starcoder/generate.py rename to python/llm/example/gpu/hf-transformers-models/starcoder/generate.py diff --git a/python/llm/example/gpu/starcoder/readme.md b/python/llm/example/gpu/hf-transformers-models/starcoder/readme.md similarity index 100% rename from python/llm/example/gpu/starcoder/readme.md rename to python/llm/example/gpu/hf-transformers-models/starcoder/readme.md diff --git a/python/llm/example/gpu/voiceassistant/README.md b/python/llm/example/gpu/hf-transformers-models/voiceassistant/README.md similarity index 100% rename from python/llm/example/gpu/voiceassistant/README.md rename to python/llm/example/gpu/hf-transformers-models/voiceassistant/README.md diff --git a/python/llm/example/gpu/voiceassistant/generate.py b/python/llm/example/gpu/hf-transformers-models/voiceassistant/generate.py similarity index 100% rename from python/llm/example/gpu/voiceassistant/generate.py rename to python/llm/example/gpu/hf-transformers-models/voiceassistant/generate.py diff --git a/python/llm/example/gpu/whisper/readme.md b/python/llm/example/gpu/hf-transformers-models/whisper/readme.md similarity index 100% rename from python/llm/example/gpu/whisper/readme.md rename to python/llm/example/gpu/hf-transformers-models/whisper/readme.md diff --git a/python/llm/example/gpu/whisper/recognize.py b/python/llm/example/gpu/hf-transformers-models/whisper/recognize.py similarity index 100% rename from python/llm/example/gpu/whisper/recognize.py rename to python/llm/example/gpu/hf-transformers-models/whisper/recognize.py diff --git a/python/llm/example/gpu/pytorch-models/README.md b/python/llm/example/gpu/pytorch-models/README.md new file mode 100644 index 00000000..6c958e7a --- /dev/null +++ b/python/llm/example/gpu/pytorch-models/README.md @@ -0,0 +1,25 @@ +# BigDL-LLM INT4 Optimization for Large Language Model on Intel GPUs +You can use the `optimize_model` API to accelerate general PyTorch models on Intel GPUs.
This directory contains example scripts to help you quickly get started using BigDL-LLM to run some popular open-source models in the community. Each model has its own dedicated folder, where you can find detailed instructions on how to install and run it. + +## Verified Hardware Platforms + +- Intel Arc™ A-Series Graphics +- Intel Data Center GPU Flex Series + +## Recommended Requirements +To apply Intel GPU acceleration, there are several steps for tool installation and environment preparation. + +Step 1, only Linux is supported for now; Ubuntu 22.04 is preferred. + +Step 2, please refer to our [driver installation](https://dgpu-docs.intel.com/driver/installation.html) for general purpose GPU capabilities. +> **Note**: IPEX 2.0.110+xpu requires Intel GPU driver version [Stable 647.21](https://dgpu-docs.intel.com/releases/stable_647_21_20230714.html). + +Step 3, you also need to download and install [Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit-download.html). oneMKL and the DPC++ compiler are needed; the other components are optional. +> **Note**: IPEX 2.0.110+xpu requires Intel® oneAPI Base Toolkit version >= 2023.2.0. + +## Best Known Configuration on Linux +For better performance, it is recommended to set environment variables on Linux: +```bash +export USE_XETLA=OFF +export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 +```
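
For context beyond this patch, the two usage styles the new READMEs distinguish (Hugging Face Transformers-style INT4 loading under `hf-transformers-models/`, and `optimize_model` under `pytorch-models/`) can be sketched roughly as follows. This is a non-authoritative sketch, not code from the PR: it assumes `bigdl-llm` with XPU support is installed and an Intel GPU is available, and the model path, prompt, and generation parameters are placeholders.

```python
# Sketch of the two BigDL-LLM usage styles these example directories cover.
# Assumptions (not part of the patch): bigdl-llm with XPU support and an
# Intel GPU runtime are available; model path and prompt are placeholders.
import torch
from transformers import AutoTokenizer

# Style 1 (hf-transformers-models/): Transformers-style loading with INT4.
from bigdl.llm.transformers import AutoModelForCausalLM

model_path = "meta-llama/Llama-2-7b-chat-hf"  # placeholder model
model = AutoModelForCausalLM.from_pretrained(model_path, load_in_4bit=True)
model = model.to("xpu")  # move the quantized model to the Intel GPU

tokenizer = AutoTokenizer.from_pretrained(model_path)
input_ids = tokenizer("What is AI?", return_tensors="pt").input_ids.to("xpu")
with torch.inference_mode():
    output = model.generate(input_ids, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))

# Style 2 (pytorch-models/): `optimize_model` on a general PyTorch model.
from bigdl.llm import optimize_model
# native_model = SomePyTorchModel(...)          # any PyTorch model (placeholder)
# native_model = optimize_model(native_model)   # apply low-bit optimizations
# native_model = native_model.to("xpu")
```

Note that Style 1 requires no changes to an existing Transformers script beyond the import and the `load_in_4bit=True` flag, which is why the Transformers-style examples and the general-PyTorch examples are kept in separate directories.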