From 714884414ee6765e5379c4290eff94dc02b737a8 Mon Sep 17 00:00:00 2001 From: Yina Chen <33650826+cyita@users.noreply.github.com> Date: Thu, 21 Sep 2023 16:42:11 +0800 Subject: [PATCH 01/10] fix error (#9025) --- python/llm/src/bigdl/llm/transformers/model.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/llm/src/bigdl/llm/transformers/model.py b/python/llm/src/bigdl/llm/transformers/model.py index d0c4aa0f..8f926517 100644 --- a/python/llm/src/bigdl/llm/transformers/model.py +++ b/python/llm/src/bigdl/llm/transformers/model.py @@ -30,6 +30,7 @@ def save_low_bit(self, *args, **kwargs): invalidInputError(self.config.to_dict().get("bigdl_transformers_low_bit", False), f"Detected this model is not a low-bit model, please use from_pretrained's" f" load_in_4bit or load_in_low_bit parameter to load a 4-bit model first.") + self.to('cpu') self.save_pretrained(*args, **kwargs) import json import os From bf51ec40b22f379fe34821c1b1563f2bdab58462 Mon Sep 17 00:00:00 2001 From: Ruonan Wang <105281011+rnwang04@users.noreply.github.com> Date: Thu, 21 Sep 2023 17:16:07 +0800 Subject: [PATCH 02/10] LLM: Fix empty cache (#9024) * fix * fix * update example --- .../example/gpu/hf-transformers-models/baichuan/generate.py | 1 - .../example/gpu/hf-transformers-models/baichuan2/generate.py | 1 - python/llm/src/bigdl/llm/transformers/models/baichuan.py | 4 ++++ python/llm/src/bigdl/llm/transformers/models/baichuan2.py | 4 ++++ python/llm/src/bigdl/llm/transformers/models/bloom.py | 2 ++ python/llm/src/bigdl/llm/transformers/models/chatglm.py | 2 ++ python/llm/src/bigdl/llm/transformers/models/chatglm2.py | 2 ++ python/llm/src/bigdl/llm/transformers/models/falcon.py | 2 ++ python/llm/src/bigdl/llm/transformers/models/gptj.py | 2 ++ python/llm/src/bigdl/llm/transformers/models/gptneox.py | 2 ++ python/llm/src/bigdl/llm/transformers/models/llama.py | 2 ++ python/llm/src/bigdl/llm/transformers/models/utils.py | 2 -- 12 files changed, 22 insertions(+), 4 deletions(-) diff --git a/python/llm/example/gpu/hf-transformers-models/baichuan/generate.py b/python/llm/example/gpu/hf-transformers-models/baichuan/generate.py index e5c099ba..7e1e2d0d 100644 --- a/python/llm/example/gpu/hf-transformers-models/baichuan/generate.py +++ b/python/llm/example/gpu/hf-transformers-models/baichuan/generate.py @@ -42,7 +42,6 @@ if __name__ == '__main__': # which convert the relevant layers in the model into INT4 format model = AutoModelForCausalLM.from_pretrained(model_path, load_in_4bit=True, - optimize_model=False, trust_remote_code=True, use_cache=True) model = model.to('xpu') diff --git a/python/llm/example/gpu/hf-transformers-models/baichuan2/generate.py b/python/llm/example/gpu/hf-transformers-models/baichuan2/generate.py index 16a7b9d8..ebb87ad9 100644 --- a/python/llm/example/gpu/hf-transformers-models/baichuan2/generate.py +++ b/python/llm/example/gpu/hf-transformers-models/baichuan2/generate.py @@ -46,7 +46,6 @@ if __name__ == '__main__': # to obtain optimal performance with BigDL-LLM INT4 optimizations model = AutoModelForCausalLM.from_pretrained(model_path, load_in_4bit=True, - optimize_model=False, trust_remote_code=True, use_cache=True) model = model.to('xpu') diff --git a/python/llm/src/bigdl/llm/transformers/models/baichuan.py b/python/llm/src/bigdl/llm/transformers/models/baichuan.py index 5d2d735c..71a4e9de 100644 --- a/python/llm/src/bigdl/llm/transformers/models/baichuan.py +++ b/python/llm/src/bigdl/llm/transformers/models/baichuan.py @@ -70,6 +70,8 @@ def baichuan_attention_forward_7b( cache_k = 
past_key_value[0] cache_v = past_key_value[1] if cache_k.stride()[1] <= cache_k.size(2) * cache_k.size(3): + if device.type == 'xpu': + torch.xpu.empty_cache() # allocate new new_cache_k, new_cache_v = create_kv_cache(bsz, self.num_heads, @@ -168,6 +170,8 @@ def baichuan_attention_forward_13b( cache_k = past_key_value[0] cache_v = past_key_value[1] if cache_k.stride()[1] <= cache_k.size(2) * cache_k.size(3): + if device.type == 'xpu': + torch.xpu.empty_cache() # allocate new new_cache_k, new_cache_v = create_kv_cache(bsz, self.num_heads, diff --git a/python/llm/src/bigdl/llm/transformers/models/baichuan2.py b/python/llm/src/bigdl/llm/transformers/models/baichuan2.py index b1179c55..64dc2532 100644 --- a/python/llm/src/bigdl/llm/transformers/models/baichuan2.py +++ b/python/llm/src/bigdl/llm/transformers/models/baichuan2.py @@ -82,6 +82,8 @@ def baichuan_attention_forward_7b( cache_k = past_key_value[0] cache_v = past_key_value[1] if cache_k.stride()[1] <= cache_k.size(2) * cache_k.size(3): + if device.type == 'xpu': + torch.xpu.empty_cache() # allocate new new_cache_k, new_cache_v = create_kv_cache(bsz, self.num_heads, @@ -177,6 +179,8 @@ def baichuan_attention_forward_13b( cache_k = past_key_value[0] cache_v = past_key_value[1] if cache_k.stride()[1] <= cache_k.size(2) * cache_k.size(3): + if device.type == 'xpu': + torch.xpu.empty_cache() # allocate new new_cache_k, new_cache_v = create_kv_cache(bsz, self.num_heads, diff --git a/python/llm/src/bigdl/llm/transformers/models/bloom.py b/python/llm/src/bigdl/llm/transformers/models/bloom.py index a6d42920..f3e08cba 100644 --- a/python/llm/src/bigdl/llm/transformers/models/bloom.py +++ b/python/llm/src/bigdl/llm/transformers/models/bloom.py @@ -105,6 +105,8 @@ def bloom_attention_forward( cache_k = layer_past[0].transpose(1, 2).view(batch_size, self.num_heads, -1, self.head_dim) cache_v = layer_past[1].view(batch_size, self.num_heads, -1, self.head_dim) if cache_k.stride()[1] <= cache_k.size(2) * cache_k.size(3): + if device.type == 'xpu': + torch.xpu.empty_cache() # allocate new new_cache_k, new_cache_v = create_kv_cache( batch_size, diff --git a/python/llm/src/bigdl/llm/transformers/models/chatglm.py b/python/llm/src/bigdl/llm/transformers/models/chatglm.py index 6c1a0a8a..89525697 100644 --- a/python/llm/src/bigdl/llm/transformers/models/chatglm.py +++ b/python/llm/src/bigdl/llm/transformers/models/chatglm.py @@ -67,6 +67,8 @@ def attention_fn( cache_v = cache_v.permute(1, 2, 0, 3) past_length = cache_k.size(2) if cache_k.stride()[1] <= cache_k.size(2) * cache_k.size(3): + if device.type == 'xpu': + torch.xpu.empty_cache() max_cache_length = past_length + cur_length + KV_CACHE_ALLOC_BLOCK_LENGTH new_cache_k, new_cache_v = create_kv_cache(batch_size, self.num_attention_heads_per_partition, diff --git a/python/llm/src/bigdl/llm/transformers/models/chatglm2.py b/python/llm/src/bigdl/llm/transformers/models/chatglm2.py index d43452cb..5de558e9 100644 --- a/python/llm/src/bigdl/llm/transformers/models/chatglm2.py +++ b/python/llm/src/bigdl/llm/transformers/models/chatglm2.py @@ -151,6 +151,8 @@ def chatglm2_attention_forward_8eb45c( past_length = cache_k.size(2) if cache_k.stride()[1] <= cache_k.size(2) * cache_k.size(3): + if device.type == 'xpu': + torch.xpu.empty_cache() max_cache_length = past_length + cur_length + KV_CACHE_ALLOC_BLOCK_LENGTH new_cache_k, new_cache_v = create_kv_cache(batch_size, self.num_attention_heads_per_partition, diff --git a/python/llm/src/bigdl/llm/transformers/models/falcon.py 
b/python/llm/src/bigdl/llm/transformers/models/falcon.py index c8b1dcf1..0b8ef9c4 100644 --- a/python/llm/src/bigdl/llm/transformers/models/falcon.py +++ b/python/llm/src/bigdl/llm/transformers/models/falcon.py @@ -97,6 +97,8 @@ def rw_attention_forward_7b( cache_k = layer_past[0].view(batch_size, self.num_kv, -1, self.head_dim) cache_v = layer_past[1].view(batch_size, self.num_kv, -1, self.head_dim) if cache_k.stride()[1] <= cache_k.size(2) * cache_k.size(3): + if device.type == 'xpu': + torch.xpu.empty_cache() # allocate new new_cache_k, new_cache_v = create_kv_cache( batch_size, diff --git a/python/llm/src/bigdl/llm/transformers/models/gptj.py b/python/llm/src/bigdl/llm/transformers/models/gptj.py index 65674360..8e390fca 100644 --- a/python/llm/src/bigdl/llm/transformers/models/gptj.py +++ b/python/llm/src/bigdl/llm/transformers/models/gptj.py @@ -144,6 +144,8 @@ def gptj_attention_forward( past_length = cache_k.size(2) if cache_k.stride()[1] <= cache_k.size(2) * cache_k.size(3): + if device.type == 'xpu': + torch.xpu.empty_cache() new_cache_k, new_cache_v = create_kv_cache(batch_size, self.num_attention_heads, self.head_dim, diff --git a/python/llm/src/bigdl/llm/transformers/models/gptneox.py b/python/llm/src/bigdl/llm/transformers/models/gptneox.py index a0e3edde..0d0c16c6 100644 --- a/python/llm/src/bigdl/llm/transformers/models/gptneox.py +++ b/python/llm/src/bigdl/llm/transformers/models/gptneox.py @@ -90,6 +90,8 @@ def gptneox_attention_forward( past_key = layer_past[0] past_value = layer_past[1] if past_key.stride()[1] <= past_key.size(2) * past_key.size(3): + if device.type == 'xpu': + torch.xpu.empty_cache() # allocate new new_past_key, new_past_value = create_kv_cache(bsz, self.num_attention_heads, diff --git a/python/llm/src/bigdl/llm/transformers/models/llama.py b/python/llm/src/bigdl/llm/transformers/models/llama.py index 212abc2a..c8b07f63 100644 --- a/python/llm/src/bigdl/llm/transformers/models/llama.py +++ b/python/llm/src/bigdl/llm/transformers/models/llama.py @@ -112,6 +112,8 @@ def llama_attention_forward_4_31( cache_k = past_key_value[0] cache_v = past_key_value[1] if cache_k.stride()[1] <= cache_k.size(2) * cache_k.size(3): + if device.type == 'xpu': + torch.xpu.empty_cache() # allocate new new_cache_k, new_cache_v = create_kv_cache(bsz, self.num_key_value_heads, # Support GQA diff --git a/python/llm/src/bigdl/llm/transformers/models/utils.py b/python/llm/src/bigdl/llm/transformers/models/utils.py index 8890de1a..8d85db74 100644 --- a/python/llm/src/bigdl/llm/transformers/models/utils.py +++ b/python/llm/src/bigdl/llm/transformers/models/utils.py @@ -19,8 +19,6 @@ from bigdl.llm.utils.common import invalidInputError def create_kv_cache(batch_size, num_heads, head_dim, current_length, max_length, dtype, device): - if device.type == 'xpu': - torch.xpu.empty_cache() key_cache_storage = torch.empty(batch_size, num_heads, max_length, head_dim, dtype=dtype, device=device) From 868511cf02d39c00a9160d397dd387f4c93c56c1 Mon Sep 17 00:00:00 2001 From: Cengguang Zhang Date: Thu, 21 Sep 2023 18:12:20 +0800 Subject: [PATCH 03/10] LLM: fix kv cache issue of bloom and falcon. 
(#9029) --- .../llm/src/bigdl/llm/transformers/convert.py | 17 +++++++++-------- .../src/bigdl/llm/transformers/models/bloom.py | 2 ++ .../src/bigdl/llm/transformers/models/falcon.py | 8 ++++++-- 3 files changed, 17 insertions(+), 10 deletions(-) diff --git a/python/llm/src/bigdl/llm/transformers/convert.py b/python/llm/src/bigdl/llm/transformers/convert.py index ba9474e7..6f05cb59 100644 --- a/python/llm/src/bigdl/llm/transformers/convert.py +++ b/python/llm/src/bigdl/llm/transformers/convert.py @@ -181,7 +181,7 @@ def optimize(model): convert_forward(model, module.GPTJAttention, gptj_attention_forward) - elif "bloom" in model.config._name_or_path: + elif "bloom" in model.config.model_type: modeling_module_name = model.__class__.__module__ module = importlib.import_module(modeling_module_name) from bigdl.llm.transformers.models.bloom import bloom_attention_forward @@ -189,17 +189,18 @@ def optimize(model): module.BloomAttention, bloom_attention_forward ) - elif "falcon" in model.config._name_or_path: + elif "falcon" in model.config.model_type or "RefinedWeb" in model.config.model_type: modeling_module_name = model.__class__.__module__ module = importlib.import_module(modeling_module_name) if "RWForCausalLM" in model.config.architectures: if hasattr(model.config, "multi_query"): - # falcon-7b - from bigdl.llm.transformers.models.falcon import rw_attention_forward_7b - convert_forward(model, - module.Attention, - rw_attention_forward_7b - ) + # falcon-7b need to check performance drop after kv cache support. + # from bigdl.llm.transformers.models.falcon import rw_attention_forward_7b + # convert_forward(model, + # module.Attention, + # rw_attention_forward_7b + # ) + pass else: # falcon-40b from bigdl.llm.transformers.models.falcon import rw_attention_forward_40b diff --git a/python/llm/src/bigdl/llm/transformers/models/bloom.py b/python/llm/src/bigdl/llm/transformers/models/bloom.py index f3e08cba..d06f784a 100644 --- a/python/llm/src/bigdl/llm/transformers/models/bloom.py +++ b/python/llm/src/bigdl/llm/transformers/models/bloom.py @@ -96,6 +96,8 @@ def bloom_attention_forward( self.head_dim ) _, _, kv_length = key_layer.shape + if layer_past is not None: + kv_length += layer_past[0].shape[-1] query_layer = query_layer.view(batch_size, self.num_heads, q_length, self.head_dim) key_layer = key_layer.transpose(1, 2).view(batch_size, self.num_heads, q_length, self.head_dim) value_layer = value_layer.view(batch_size, self.num_heads, q_length, self.head_dim) diff --git a/python/llm/src/bigdl/llm/transformers/models/falcon.py b/python/llm/src/bigdl/llm/transformers/models/falcon.py index 0b8ef9c4..dc66fed3 100644 --- a/python/llm/src/bigdl/llm/transformers/models/falcon.py +++ b/python/llm/src/bigdl/llm/transformers/models/falcon.py @@ -86,7 +86,8 @@ def rw_attention_forward_7b( query_layer, key_layer = self.maybe_rotary(query_layer, key_layer, seq_len) _, kv_length, _ = key_layer.shape - + if layer_past is not None: + kv_length += layer_past[0].shape[-2] query_layer = query_layer.view(batch_size, self.num_heads, q_length, self.head_dim) key_layer = key_layer.view(batch_size, self.num_kv, q_length, self.head_dim) value_layer = value_layer.view(batch_size, self.num_kv, q_length, self.head_dim) @@ -266,6 +267,8 @@ def rw_attention_forward_40b( query_layer, key_layer = self.maybe_rotary(query_layer, key_layer, seq_len) _, kv_length, _ = key_layer.shape + if layer_past is not None: + kv_length += layer_past[0].shape[-2] query_layer = query_layer.view(batch_size, self.num_heads, q_length, 
self.head_dim) key_layer = key_layer.view(batch_size, self.num_heads, q_length, self.head_dim) value_layer = value_layer.view(batch_size, self.num_heads, q_length, self.head_dim) @@ -439,7 +442,8 @@ def falcon_attention_forward( query_layer, key_layer = self.maybe_rotary(query_layer, key_layer, past_kv_length) _, kv_length, _ = key_layer.shape - + if layer_past is not None: + kv_length += layer_past[0].shape[-2] query_layer = query_layer.view(batch_size, self.num_heads, query_length, self.head_dim) key_layer = key_layer.view(batch_size, self.num_heads, query_length, self.head_dim) value_layer = value_layer.view(batch_size, self.num_heads, query_length, self.head_dim) From b943d73844537c89fab63a14d707c8ca02492a30 Mon Sep 17 00:00:00 2001 From: Ruonan Wang <105281011+rnwang04@users.noreply.github.com> Date: Thu, 21 Sep 2023 21:28:03 +0800 Subject: [PATCH 04/10] LLM: refactor kv cache (#9030) * refactor utils * meet code review; update all models * small fix --- .../hf-transformers-models/falcon/generate.py | 1 - .../hf-transformers-models/gpt-j/generate.py | 1 - .../bigdl/llm/transformers/models/baichuan.py | 38 +++++++++---------- .../llm/transformers/models/baichuan2.py | 36 +++++++++--------- .../bigdl/llm/transformers/models/bloom.py | 8 ++-- .../bigdl/llm/transformers/models/chatglm.py | 14 +++---- .../bigdl/llm/transformers/models/chatglm2.py | 14 +++---- .../bigdl/llm/transformers/models/falcon.py | 16 ++++---- .../src/bigdl/llm/transformers/models/gptj.py | 22 +++++------ .../bigdl/llm/transformers/models/gptneox.py | 20 +++++----- .../bigdl/llm/transformers/models/llama.py | 20 +++++----- .../bigdl/llm/transformers/models/utils.py | 11 +++++- 12 files changed, 93 insertions(+), 108 deletions(-) diff --git a/python/llm/example/gpu/hf-transformers-models/falcon/generate.py b/python/llm/example/gpu/hf-transformers-models/falcon/generate.py index 41113d46..0edeb47c 100644 --- a/python/llm/example/gpu/hf-transformers-models/falcon/generate.py +++ b/python/llm/example/gpu/hf-transformers-models/falcon/generate.py @@ -44,7 +44,6 @@ if __name__ == '__main__': # which convert the relevant layers in the model into INT4 format model = AutoModelForCausalLM.from_pretrained(model_path, load_in_4bit=True, - optimize_model=False, trust_remote_code=True, use_cache=True) model = model.to('xpu') diff --git a/python/llm/example/gpu/hf-transformers-models/gpt-j/generate.py b/python/llm/example/gpu/hf-transformers-models/gpt-j/generate.py index 28c385dd..fb937216 100644 --- a/python/llm/example/gpu/hf-transformers-models/gpt-j/generate.py +++ b/python/llm/example/gpu/hf-transformers-models/gpt-j/generate.py @@ -42,7 +42,6 @@ if __name__ == '__main__': # which convert the relevant layers in the model into INT4 format model = AutoModelForCausalLM.from_pretrained(model_path, load_in_4bit=True, - optimize_model=False, trust_remote_code=True, use_cache=True) model = model.to('xpu') diff --git a/python/llm/src/bigdl/llm/transformers/models/baichuan.py b/python/llm/src/bigdl/llm/transformers/models/baichuan.py index 71a4e9de..298654f2 100644 --- a/python/llm/src/bigdl/llm/transformers/models/baichuan.py +++ b/python/llm/src/bigdl/llm/transformers/models/baichuan.py @@ -26,7 +26,7 @@ import torch.utils.checkpoint from torch import nn from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from bigdl.llm.utils.common import invalidInputError -from bigdl.llm.transformers.models.utils import create_kv_cache, append_kv_cache +from bigdl.llm.transformers.models.utils import init_kv_cache, extend_kv_cache, 
append_kv_cache from bigdl.llm.transformers.models.utils import rotate_half, apply_rotary_pos_emb KV_CACHE_ALLOC_BLOCK_LENGTH = 256 @@ -70,10 +70,8 @@ def baichuan_attention_forward_7b( cache_k = past_key_value[0] cache_v = past_key_value[1] if cache_k.stride()[1] <= cache_k.size(2) * cache_k.size(3): - if device.type == 'xpu': - torch.xpu.empty_cache() # allocate new - new_cache_k, new_cache_v = create_kv_cache(bsz, + new_cache_k, new_cache_v = extend_kv_cache(bsz, self.num_heads, self.head_dim, cache_k.size(2), @@ -89,13 +87,13 @@ def baichuan_attention_forward_7b( elif use_cache: max_cache_length = kv_seq_len + KV_CACHE_ALLOC_BLOCK_LENGTH - new_key_states, new_value_states = create_kv_cache(bsz, - self.num_heads, - self.head_dim, - kv_seq_len, - max_cache_length, - dtype=key_states.dtype, - device=device) + new_key_states, new_value_states = init_kv_cache(bsz, + self.num_heads, + self.head_dim, + kv_seq_len, + max_cache_length, + dtype=key_states.dtype, + device=device) new_key_states[:] = key_states new_value_states[:] = value_states key_states = new_key_states @@ -170,10 +168,8 @@ def baichuan_attention_forward_13b( cache_k = past_key_value[0] cache_v = past_key_value[1] if cache_k.stride()[1] <= cache_k.size(2) * cache_k.size(3): - if device.type == 'xpu': - torch.xpu.empty_cache() # allocate new - new_cache_k, new_cache_v = create_kv_cache(bsz, + new_cache_k, new_cache_v = extend_kv_cache(bsz, self.num_heads, self.head_dim, cache_k.size(2), @@ -189,13 +185,13 @@ def baichuan_attention_forward_13b( elif use_cache: max_cache_length = kv_seq_len + KV_CACHE_ALLOC_BLOCK_LENGTH - new_key_states, new_value_states = create_kv_cache(bsz, - self.num_heads, - self.head_dim, - kv_seq_len, - max_cache_length, - dtype=key_states.dtype, - device=device) + new_key_states, new_value_states = init_kv_cache(bsz, + self.num_heads, + self.head_dim, + kv_seq_len, + max_cache_length, + dtype=key_states.dtype, + device=device) new_key_states[:] = key_states new_value_states[:] = value_states key_states = new_key_states diff --git a/python/llm/src/bigdl/llm/transformers/models/baichuan2.py b/python/llm/src/bigdl/llm/transformers/models/baichuan2.py index 64dc2532..08d392e8 100644 --- a/python/llm/src/bigdl/llm/transformers/models/baichuan2.py +++ b/python/llm/src/bigdl/llm/transformers/models/baichuan2.py @@ -26,7 +26,7 @@ from torch import nn from torch.nn import functional as F from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from bigdl.llm.utils.common import invalidInputError -from bigdl.llm.transformers.models.utils import create_kv_cache, append_kv_cache +from bigdl.llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache from bigdl.llm.transformers.models.utils import rotate_half, apply_rotary_pos_emb from transformers.utils import logging, ContextManagers logger = logging.get_logger(__name__) @@ -82,10 +82,8 @@ def baichuan_attention_forward_7b( cache_k = past_key_value[0] cache_v = past_key_value[1] if cache_k.stride()[1] <= cache_k.size(2) * cache_k.size(3): - if device.type == 'xpu': - torch.xpu.empty_cache() # allocate new - new_cache_k, new_cache_v = create_kv_cache(bsz, + new_cache_k, new_cache_v = extend_kv_cache(bsz, self.num_heads, self.head_dim, cache_k.size(2), @@ -101,13 +99,13 @@ def baichuan_attention_forward_7b( elif use_cache: max_cache_length = kv_seq_len + KV_CACHE_ALLOC_BLOCK_LENGTH - new_key_states, new_value_states = create_kv_cache(bsz, - self.num_heads, - self.head_dim, - kv_seq_len, - max_cache_length, - dtype=key_states.dtype, - 
device=device) + new_key_states, new_value_states = init_kv_cache(bsz, + self.num_heads, + self.head_dim, + kv_seq_len, + max_cache_length, + dtype=key_states.dtype, + device=device) new_key_states[:] = key_states new_value_states[:] = value_states key_states = new_key_states @@ -182,7 +180,7 @@ def baichuan_attention_forward_13b( if device.type == 'xpu': torch.xpu.empty_cache() # allocate new - new_cache_k, new_cache_v = create_kv_cache(bsz, + new_cache_k, new_cache_v = extend_kv_cache(bsz, self.num_heads, self.head_dim, cache_k.size(2), @@ -198,13 +196,13 @@ def baichuan_attention_forward_13b( elif use_cache: max_cache_length = kv_seq_len + KV_CACHE_ALLOC_BLOCK_LENGTH - new_key_states, new_value_states = create_kv_cache(bsz, - self.num_heads, - self.head_dim, - kv_seq_len, - max_cache_length, - dtype=key_states.dtype, - device=device) + new_key_states, new_value_states = init_kv_cache(bsz, + self.num_heads, + self.head_dim, + kv_seq_len, + max_cache_length, + dtype=key_states.dtype, + device=device) new_key_states[:] = key_states new_value_states[:] = value_states key_states = new_key_states diff --git a/python/llm/src/bigdl/llm/transformers/models/bloom.py b/python/llm/src/bigdl/llm/transformers/models/bloom.py index d06f784a..e44a26c8 100644 --- a/python/llm/src/bigdl/llm/transformers/models/bloom.py +++ b/python/llm/src/bigdl/llm/transformers/models/bloom.py @@ -37,7 +37,7 @@ from typing import Optional, Tuple import torch import torch.utils.checkpoint from torch.nn import functional as F -from bigdl.llm.transformers.models.utils import create_kv_cache, append_kv_cache +from bigdl.llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache KV_CACHE_ALLOC_BLOCK_LENGTH = 256 @@ -107,10 +107,8 @@ def bloom_attention_forward( cache_k = layer_past[0].transpose(1, 2).view(batch_size, self.num_heads, -1, self.head_dim) cache_v = layer_past[1].view(batch_size, self.num_heads, -1, self.head_dim) if cache_k.stride()[1] <= cache_k.size(2) * cache_k.size(3): - if device.type == 'xpu': - torch.xpu.empty_cache() # allocate new - new_cache_k, new_cache_v = create_kv_cache( + new_cache_k, new_cache_v = extend_kv_cache( batch_size, self.num_heads, self.head_dim, @@ -128,7 +126,7 @@ def bloom_attention_forward( elif use_cache: max_cache_length = kv_length + KV_CACHE_ALLOC_BLOCK_LENGTH - new_key_states, new_value_states = create_kv_cache( + new_key_states, new_value_states = init_kv_cache( batch_size, self.num_heads, self.head_dim, diff --git a/python/llm/src/bigdl/llm/transformers/models/chatglm.py b/python/llm/src/bigdl/llm/transformers/models/chatglm.py index 89525697..4f773772 100644 --- a/python/llm/src/bigdl/llm/transformers/models/chatglm.py +++ b/python/llm/src/bigdl/llm/transformers/models/chatglm.py @@ -22,7 +22,7 @@ import torch import torch.utils.checkpoint import torch.nn.functional as F from typing import Optional, Tuple -from bigdl.llm.transformers.models.utils import create_kv_cache, append_kv_cache +from bigdl.llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache def rotate_half(x): @@ -67,10 +67,8 @@ def attention_fn( cache_v = cache_v.permute(1, 2, 0, 3) past_length = cache_k.size(2) if cache_k.stride()[1] <= cache_k.size(2) * cache_k.size(3): - if device.type == 'xpu': - torch.xpu.empty_cache() max_cache_length = past_length + cur_length + KV_CACHE_ALLOC_BLOCK_LENGTH - new_cache_k, new_cache_v = create_kv_cache(batch_size, + new_cache_k, new_cache_v = extend_kv_cache(batch_size, self.num_attention_heads_per_partition, 
self.hidden_size_per_attention_head, past_length, @@ -84,10 +82,10 @@ def attention_fn( elif use_cache: max_cache_length = max(KV_CACHE_ALLOC_MIN_LENGTH, cur_length) \ + KV_CACHE_ALLOC_BLOCK_LENGTH - key_cache, value_cache = create_kv_cache(batch_size, self.num_attention_heads_per_partition, - self.hidden_size_per_attention_head, cur_length, - max_cache_length, - dtype=query_layer.dtype, device=device) + key_cache, value_cache = init_kv_cache(batch_size, self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head, cur_length, + max_cache_length, + dtype=query_layer.dtype, device=device) key_cache[:] = key_layer value_cache[:] = value_layer key_layer = key_cache diff --git a/python/llm/src/bigdl/llm/transformers/models/chatglm2.py b/python/llm/src/bigdl/llm/transformers/models/chatglm2.py index 5de558e9..7dc90f86 100644 --- a/python/llm/src/bigdl/llm/transformers/models/chatglm2.py +++ b/python/llm/src/bigdl/llm/transformers/models/chatglm2.py @@ -20,7 +20,7 @@ import torch from typing import Optional, Tuple, Union, List, Callable, Dict, Any import torch.nn.functional as F -from bigdl.llm.transformers.models.utils import create_kv_cache, append_kv_cache +from bigdl.llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache KV_CACHE_ALLOC_BLOCK_LENGTH = 256 @@ -151,10 +151,8 @@ def chatglm2_attention_forward_8eb45c( past_length = cache_k.size(2) if cache_k.stride()[1] <= cache_k.size(2) * cache_k.size(3): - if device.type == 'xpu': - torch.xpu.empty_cache() max_cache_length = past_length + cur_length + KV_CACHE_ALLOC_BLOCK_LENGTH - new_cache_k, new_cache_v = create_kv_cache(batch_size, + new_cache_k, new_cache_v = extend_kv_cache(batch_size, self.num_attention_heads_per_partition, self.hidden_size_per_attention_head, past_length, @@ -172,10 +170,10 @@ def chatglm2_attention_forward_8eb45c( max_cache_length = max(KV_CACHE_ALLOC_MIN_LENGTH, cur_length) \ + KV_CACHE_ALLOC_BLOCK_LENGTH - key_cache, value_cache = create_kv_cache(batch_size, self.num_attention_heads_per_partition, - self.hidden_size_per_attention_head, cur_length, - max_cache_length, - dtype=query_layer.dtype, device=device) + key_cache, value_cache = init_kv_cache(batch_size, self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head, cur_length, + max_cache_length, + dtype=query_layer.dtype, device=device) key_cache[:] = key_layer value_cache[:] = value_layer key_layer = key_cache diff --git a/python/llm/src/bigdl/llm/transformers/models/falcon.py b/python/llm/src/bigdl/llm/transformers/models/falcon.py index dc66fed3..3a2c565d 100644 --- a/python/llm/src/bigdl/llm/transformers/models/falcon.py +++ b/python/llm/src/bigdl/llm/transformers/models/falcon.py @@ -38,7 +38,7 @@ from typing import Optional, Tuple import torch from torch.nn import functional as F from bigdl.llm.utils.common import invalidInputError -from bigdl.llm.transformers.models.utils import create_kv_cache, append_kv_cache +from bigdl.llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache KV_CACHE_ALLOC_BLOCK_LENGTH = 256 @@ -98,10 +98,8 @@ def rw_attention_forward_7b( cache_k = layer_past[0].view(batch_size, self.num_kv, -1, self.head_dim) cache_v = layer_past[1].view(batch_size, self.num_kv, -1, self.head_dim) if cache_k.stride()[1] <= cache_k.size(2) * cache_k.size(3): - if device.type == 'xpu': - torch.xpu.empty_cache() # allocate new - new_cache_k, new_cache_v = create_kv_cache( + new_cache_k, new_cache_v = extend_kv_cache( batch_size, self.num_kv, self.head_dim, @@ 
-119,7 +117,7 @@ def rw_attention_forward_7b( elif use_cache: max_cache_length = kv_length + KV_CACHE_ALLOC_BLOCK_LENGTH - new_key_states, new_value_states = create_kv_cache( + new_key_states, new_value_states = init_kv_cache( batch_size, self.num_kv, self.head_dim, @@ -280,7 +278,7 @@ def rw_attention_forward_40b( cache_v = layer_past[1].view(batch_size, self.num_heads, -1, self.head_dim) if cache_k.stride()[1] <= cache_k.size(2) * cache_k.size(3): # allocate new - new_cache_k, new_cache_v = create_kv_cache( + new_cache_k, new_cache_v = extend_kv_cache( batch_size, self.num_heads, self.head_dim, @@ -298,7 +296,7 @@ def rw_attention_forward_40b( elif use_cache: max_cache_length = kv_length + KV_CACHE_ALLOC_BLOCK_LENGTH - new_key_states, new_value_states = create_kv_cache( + new_key_states, new_value_states = init_kv_cache( batch_size, self.num_heads, self.head_dim, @@ -454,7 +452,7 @@ def falcon_attention_forward( cache_v = layer_past[1].view(batch_size, self.num_heads, -1, self.head_dim) if cache_k.stride()[1] <= cache_k.size(2) * cache_k.size(3): # allocate new - new_cache_k, new_cache_v = create_kv_cache( + new_cache_k, new_cache_v = extend_kv_cache( batch_size, self.num_heads, self.head_dim, @@ -472,7 +470,7 @@ def falcon_attention_forward( elif use_cache: max_cache_length = kv_length + KV_CACHE_ALLOC_BLOCK_LENGTH - new_key_states, new_value_states = create_kv_cache( + new_key_states, new_value_states = init_kv_cache( batch_size, self.num_heads, self.head_dim, diff --git a/python/llm/src/bigdl/llm/transformers/models/gptj.py b/python/llm/src/bigdl/llm/transformers/models/gptj.py index 8e390fca..e904a520 100644 --- a/python/llm/src/bigdl/llm/transformers/models/gptj.py +++ b/python/llm/src/bigdl/llm/transformers/models/gptj.py @@ -19,8 +19,8 @@ import torch from typing import Optional, Tuple, Union -from bigdl.llm.transformers.models.utils import create_kv_cache, append_kv_cache, \ - apply_rotary_pos_emb +from bigdl.llm.transformers.models.utils import init_kv_cache, extend_kv_cache, \ + apply_rotary_pos_emb, append_kv_cache from transformers.utils.import_utils import is_torch_fx_proxy @@ -144,9 +144,7 @@ def gptj_attention_forward( past_length = cache_k.size(2) if cache_k.stride()[1] <= cache_k.size(2) * cache_k.size(3): - if device.type == 'xpu': - torch.xpu.empty_cache() - new_cache_k, new_cache_v = create_kv_cache(batch_size, + new_cache_k, new_cache_v = extend_kv_cache(batch_size, self.num_attention_heads, self.head_dim, past_length, @@ -160,13 +158,13 @@ def gptj_attention_forward( key, value = append_kv_cache(cache_k, cache_v, key, value) elif use_cache: - key_cache, value_cache = create_kv_cache(batch_size, - self.num_attention_heads, - self.head_dim, - kv_seq_len, - kv_seq_len + KV_CACHE_ALLOC_BLOCK_LENGTH, - dtype=key.dtype, - device=device) + key_cache, value_cache = init_kv_cache(batch_size, + self.num_attention_heads, + self.head_dim, + kv_seq_len, + kv_seq_len + KV_CACHE_ALLOC_BLOCK_LENGTH, + dtype=key.dtype, + device=device) key_cache[:] = key value_cache[:] = value key = key_cache diff --git a/python/llm/src/bigdl/llm/transformers/models/gptneox.py b/python/llm/src/bigdl/llm/transformers/models/gptneox.py index 0d0c16c6..8e31a14a 100644 --- a/python/llm/src/bigdl/llm/transformers/models/gptneox.py +++ b/python/llm/src/bigdl/llm/transformers/models/gptneox.py @@ -34,7 +34,7 @@ import torch from typing import Optional, Tuple from bigdl.llm.transformers.models.utils import apply_rotary_pos_emb -from bigdl.llm.transformers.models.utils import create_kv_cache, append_kv_cache 
+from bigdl.llm.transformers.models.utils import init_kv_cache, extend_kv_cache KV_CACHE_ALLOC_BLOCK_LENGTH = 256 @@ -90,10 +90,8 @@ def gptneox_attention_forward( past_key = layer_past[0] past_value = layer_past[1] if past_key.stride()[1] <= past_key.size(2) * past_key.size(3): - if device.type == 'xpu': - torch.xpu.empty_cache() # allocate new - new_past_key, new_past_value = create_kv_cache(bsz, + new_past_key, new_past_value = extend_kv_cache(bsz, self.num_attention_heads, self.head_size, past_key.size(2), @@ -108,13 +106,13 @@ def gptneox_attention_forward( key, value = append_kv_cache(past_key, past_value, key, value) elif use_cache: max_cache_length = seq_len + KV_CACHE_ALLOC_BLOCK_LENGTH - new_key, new_value = create_kv_cache(bsz, - self.num_attention_heads, - self.head_size, - seq_len, - max_cache_length, - dtype=key.dtype, - device=device) + new_key, new_value = init_kv_cache(bsz, + self.num_attention_heads, + self.head_size, + seq_len, + max_cache_length, + dtype=key.dtype, + device=device) new_key[:] = key new_value[:] = value key = new_key diff --git a/python/llm/src/bigdl/llm/transformers/models/llama.py b/python/llm/src/bigdl/llm/transformers/models/llama.py index c8b07f63..51ddb2ee 100644 --- a/python/llm/src/bigdl/llm/transformers/models/llama.py +++ b/python/llm/src/bigdl/llm/transformers/models/llama.py @@ -37,7 +37,7 @@ from typing import Optional, Tuple import math import torch.nn.functional as F from bigdl.llm.utils.common import invalidInputError -from bigdl.llm.transformers.models.utils import create_kv_cache, append_kv_cache +from bigdl.llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache from bigdl.llm.transformers.models.utils import rotate_half, apply_rotary_pos_emb @@ -112,10 +112,8 @@ def llama_attention_forward_4_31( cache_k = past_key_value[0] cache_v = past_key_value[1] if cache_k.stride()[1] <= cache_k.size(2) * cache_k.size(3): - if device.type == 'xpu': - torch.xpu.empty_cache() # allocate new - new_cache_k, new_cache_v = create_kv_cache(bsz, + new_cache_k, new_cache_v = extend_kv_cache(bsz, self.num_key_value_heads, # Support GQA self.head_dim, cache_k.size(2), @@ -131,13 +129,13 @@ def llama_attention_forward_4_31( elif use_cache: max_cache_length = kv_seq_len + KV_CACHE_ALLOC_BLOCK_LENGTH - new_key_states, new_value_states = create_kv_cache(bsz, - self.num_key_value_heads, - self.head_dim, - kv_seq_len, - max_cache_length, - dtype=key_states.dtype, - device=device) + new_key_states, new_value_states = init_kv_cache(bsz, + self.num_key_value_heads, + self.head_dim, + kv_seq_len, + max_cache_length, + dtype=key_states.dtype, + device=device) new_key_states[:] = key_states new_value_states[:] = value_states key_states = new_key_states diff --git a/python/llm/src/bigdl/llm/transformers/models/utils.py b/python/llm/src/bigdl/llm/transformers/models/utils.py index 8d85db74..b47ad8e7 100644 --- a/python/llm/src/bigdl/llm/transformers/models/utils.py +++ b/python/llm/src/bigdl/llm/transformers/models/utils.py @@ -18,7 +18,7 @@ import torch from bigdl.llm.utils.common import invalidInputError -def create_kv_cache(batch_size, num_heads, head_dim, current_length, max_length, dtype, device): +def init_kv_cache(batch_size, num_heads, head_dim, current_length, max_length, dtype, device): key_cache_storage = torch.empty(batch_size, num_heads, max_length, head_dim, dtype=dtype, device=device) @@ -27,7 +27,7 @@ def create_kv_cache(batch_size, num_heads, head_dim, current_length, max_length, dtype=dtype, device=device) key_cache = 
key_cache_storage.as_strided((batch_size, num_heads, - current_length, head_dim), + current_length, head_dim), key_cache_storage.stride(), storage_offset=0) value_cache = value_cache_storage.as_strided((batch_size, num_heads, @@ -37,6 +37,13 @@ def create_kv_cache(batch_size, num_heads, head_dim, current_length, max_length, return key_cache, value_cache +def extend_kv_cache(batch_size, num_heads, head_dim, current_length, max_length, dtype, device): + # empty cache to reduce gpu memory + if device.type == 'xpu': + torch.xpu.empty_cache() + return init_kv_cache(batch_size, num_heads, head_dim, current_length, max_length, dtype, device) + + def append_kv_cache(cache_k, cache_v, key_states, value_states): new_size = (cache_k.size(0), cache_k.size(1), From 9126abdf9bb0a38bb35de69558064e907123d046 Mon Sep 17 00:00:00 2001 From: Lilac09 <74996885+Zhengjin-Wang@users.noreply.github.com> Date: Fri, 22 Sep 2023 09:03:57 +0800 Subject: [PATCH 05/10] add README.md for bigdl-llm-cpu image (#9026) * modify Dockerfile * add README.md * add README.md --- docker/llm/inference/cpu/docker/Dockerfile | 3 +- docker/llm/inference/cpu/docker/README.md | 34 ++++++++++++++++++++++ 2 files changed, 36 insertions(+), 1 deletion(-) create mode 100644 docker/llm/inference/cpu/docker/README.md diff --git a/docker/llm/inference/cpu/docker/Dockerfile b/docker/llm/inference/cpu/docker/Dockerfile index 97bb2a0d..1e5143f9 100644 --- a/docker/llm/inference/cpu/docker/Dockerfile +++ b/docker/llm/inference/cpu/docker/Dockerfile @@ -13,6 +13,7 @@ RUN env DEBIAN_FRONTEND=noninteractive apt-get update && \ ln -s /usr/bin/python3 /usr/bin/python && \ apt-get install -y python3-pip python3.9-dev python3-wheel python3.9-distutils && \ pip3 install --no-cache --upgrade requests argparse urllib3 && \ - pip3 install --pre --upgrade bigdl-llm[all] + pip3 install --pre --upgrade bigdl-llm[all] && \ + pip3 install --pre --upgrade bigdl-nano ENTRYPOINT ["/bin/bash"] \ No newline at end of file diff --git a/docker/llm/inference/cpu/docker/README.md b/docker/llm/inference/cpu/docker/README.md new file mode 100644 index 00000000..805cfc07 --- /dev/null +++ b/docker/llm/inference/cpu/docker/README.md @@ -0,0 +1,34 @@ +## Build/Use BigDL-LLM cpu image + +### Build Image +```bash +docker build \ + --build-arg http_proxy=.. \ + --build-arg https_proxy=.. \ + --build-arg no_proxy=.. \ + --rm --no-cache -t intelanalytics/bigdl-llm-cpu:2.4.0-SNAPSHOT . +``` + + +### Use the image for doing cpu inference + + +An example could be: +```bash +#/bin/bash +export DOCKER_IMAGE=intelanalytics/bigdl-llm-cpu:2.4.0-SNAPSHOT + +sudo docker run -itd \ + --net=host \ + --cpuset-cpus="0-47" \ + --cpuset-mems="0" \ + --memory="32G" \ + --name=CONTAINER_NAME \ + --shm-size="16g" \ + $DOCKER_IMAGE +``` + + +After the container is booted, you could get into the container through `docker exec`. + +To run inference using `BigDL-LLM` using cpu, you could refer to this [documentation](https://github.com/intel-analytics/BigDL/tree/main/python/llm#cpu-int4). 
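A minimal sketch of the CPU INT4 flow that the linked documentation covers, assuming the container's Python environment and a hypothetical local model folder `/llm/models/Llama-2-7b-chat-hf`; the `load_in_4bit=True` loading style mirrors the `AutoModelForCausalLM.from_pretrained(...)` usage in the GPU examples earlier in this series:

```python
# Illustrative BigDL-LLM CPU INT4 inference sketch; the model path is an assumption.
import torch
from transformers import AutoTokenizer
from bigdl.llm.transformers import AutoModelForCausalLM

model_path = "/llm/models/Llama-2-7b-chat-hf"  # hypothetical local model folder

# load_in_4bit=True converts the relevant layers in the model into INT4 format at load time
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             load_in_4bit=True,
                                             trust_remote_code=True,
                                             use_cache=True)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

prompt = "What is AI?"
with torch.inference_mode():
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids
    output = model.generate(input_ids, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```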
From 028a6d9383959d051f39db887ae1ff1f376137d2 Mon Sep 17 00:00:00 2001 From: Jiao Wang Date: Thu, 21 Sep 2023 21:27:23 -0700 Subject: [PATCH 06/10] MPT model optimize for long sequence (#9020) * mpt_long_seq * update * update * update * style * style2 * update --- .../llm/src/bigdl/llm/transformers/convert.py | 10 +- .../src/bigdl/llm/transformers/models/mpt.py | 149 ++++++++++++++++++ 2 files changed, 158 insertions(+), 1 deletion(-) create mode 100644 python/llm/src/bigdl/llm/transformers/models/mpt.py diff --git a/python/llm/src/bigdl/llm/transformers/convert.py b/python/llm/src/bigdl/llm/transformers/convert.py index 6f05cb59..518c8a5d 100644 --- a/python/llm/src/bigdl/llm/transformers/convert.py +++ b/python/llm/src/bigdl/llm/transformers/convert.py @@ -173,6 +173,15 @@ def optimize(model): module.SelfAttention, chatglm_attention_forward ) + elif "mpt" in model.config._name_or_path: + modeling_module_name = model.__class__.__module__ + attention_module_name = '.'.join(modeling_module_name.split('.')[:-1]) + ".attention" + module = importlib.import_module(attention_module_name) + from bigdl.llm.transformers.models.mpt import mpt_multihead_attention_forward + convert_forward(model, + module.MultiheadAttention, + mpt_multihead_attention_forward + ) elif "gptj" in model.config.model_type: # dolly-v1-6b modeling_module_name = model.__class__.__module__ @@ -263,5 +272,4 @@ def optimize(model): transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXAttention, gptneox_attention_forward ) - return model diff --git a/python/llm/src/bigdl/llm/transformers/models/mpt.py b/python/llm/src/bigdl/llm/transformers/models/mpt.py new file mode 100644 index 00000000..fd8e28b7 --- /dev/null +++ b/python/llm/src/bigdl/llm/transformers/models/mpt.py @@ -0,0 +1,149 @@ +# +# Copyright 2016 The BigDL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# Some parts of this file is adapted from +# https://huggingface.co/mosaicml/mpt-7b-chat/blob/main/attention.py +# + +import warnings +import torch +from einops import rearrange +import math +import torch.nn.functional as F +from bigdl.llm.utils.common import invalidInputError +from bigdl.llm.transformers.models.utils import extend_kv_cache, init_kv_cache, append_kv_cache + + +KV_CACHE_ALLOC_BLOCK_LENGTH = 256 + + +def mpt_multihead_attention_forward(self, x, past_key_value=None, attn_bias=None, + attention_mask=None, is_causal=True, needs_weights=False): + qkv = self.Wqkv(x) + if self.clip_qkv: + qkv.clamp_(min=-self.clip_qkv, max=self.clip_qkv) + (query, key, value) = qkv.chunk(3, dim=2) + key_padding_mask = attention_mask + if self.qk_ln: + dtype = query.dtype + query = self.q_ln(query).to(dtype) + key = self.k_ln(key).to(dtype) + (context, attn_weights, past_key_value) = \ + mpt_scaled_multihead_dot_product_attention(query, key, value, self.n_heads, + past_key_value=past_key_value, + softmax_scale=self.softmax_scale, + attn_bias=attn_bias, + key_padding_mask=key_padding_mask, + is_causal=is_causal, + dropout_p=self.attn_dropout_p, + training=self.training, + needs_weights=needs_weights) + return (self.out_proj(context), attn_weights, past_key_value) + + +def mpt_scaled_multihead_dot_product_attention(query, key, value, n_heads, + past_key_value=None, + softmax_scale=None, + attn_bias=None, + key_padding_mask=None, + is_causal=False, + dropout_p=0.0, + training=False, + needs_weights=False, + multiquery=False): + q = rearrange(query, 'b s (h d) -> b h s d', h=n_heads) + bsz, n_heads, q_len, head_dim = q.size() + device = q.device + kv_n_heads = 1 if multiquery else n_heads + k = rearrange(key, 'b s (h d) -> b h d s', h=kv_n_heads) + v = rearrange(value, 'b s (h d) -> b h s d', h=kv_n_heads) + kv_seq_len = k.shape[-1] + if past_key_value is not None: + if len(past_key_value) != 0: + # k = torch.cat([past_key_value[0], k], dim=3) + # v = torch.cat([past_key_value[1], v], dim=2) + cache_k = past_key_value[0].transpose(2, 3) + cache_v = past_key_value[1] + kv_seq_len += cache_k.shape[-2] + if cache_k.stride()[1] <= cache_k.size(2) * cache_k.size(3): + # allocate new + new_cache_k, new_cache_v = extend_kv_cache(bsz, + kv_n_heads, # Support GQA + head_dim, + cache_k.size(2), + kv_seq_len + KV_CACHE_ALLOC_BLOCK_LENGTH, + dtype=cache_k.dtype, + device=device) + new_cache_k[:] = cache_k + new_cache_v[:] = cache_v + cache_k = new_cache_k + cache_v = new_cache_v + key_states, value_states = append_kv_cache(cache_k, cache_v, k.transpose(2, 3), v) + k = key_states.transpose(2, 3) + v = value_states + else: + max_cache_length = kv_seq_len + KV_CACHE_ALLOC_BLOCK_LENGTH + new_key_states, new_value_states = init_kv_cache(bsz, + kv_n_heads, + head_dim, + kv_seq_len, + max_cache_length, + dtype=k.dtype, + device=device) + new_key_states[:] = k.transpose(2, 3) + new_value_states[:] = v + k = new_key_states.transpose(2, 3) + v = new_value_states + past_key_value = (k, v) + (b, _, s_q, d) = q.shape + s_k = k.size(-1) + if softmax_scale is None: + softmax_scale = 1 / math.sqrt(d) + attn_weight = q.matmul(k) * softmax_scale + if attn_bias is not None: + _s_q = max(0, attn_bias.size(2) - s_q) + _s_k = max(0, attn_bias.size(3) - s_k) + attn_bias = attn_bias[:, :, _s_q:, _s_k:] + if attn_bias.size(-1) != 1 and attn_bias.size(-1) != s_k \ + or (attn_bias.size(-2) != 1 and attn_bias.size(-2) != s_q): + invalidInputError(False, f'attn_bias (shape: {attn_bias.shape}) ' + f'is expected to broadcast to shape: 
{attn_weight.shape}.') + attn_weight = attn_weight + attn_bias + min_val = torch.finfo(q.dtype).min + if key_padding_mask is not None: + if attn_bias is not None: + warnings.warn('Propogating key_padding_mask to the attention module ' + + 'and applying it within the attention module can cause ' + + 'unneccessary computation/memory usage. Consider integrating ' + + 'into attn_bias once and passing that to each attention ' + + 'module instead.') + attn_weight = attn_weight.masked_fill(~key_padding_mask.view((b, 1, 1, s_k)), min_val) + if is_causal and (not q.size(2) == 1): + s = max(s_q, s_k) + causal_mask = attn_weight.new_ones(s, s, dtype=torch.float16) + causal_mask = causal_mask.tril() + causal_mask = causal_mask.to(torch.bool) + causal_mask = ~causal_mask + causal_mask = causal_mask[-s_q:, -s_k:] + attn_weight = attn_weight.masked_fill(causal_mask.view(1, 1, s_q, s_k), min_val) + attn_weight = torch.softmax(attn_weight, dim=-1) + if dropout_p: + attn_weight = torch.nn.functional.dropout(attn_weight, p=dropout_p, + training=training, inplace=True) + out = attn_weight.to(v.dtype).matmul(v) + out = rearrange(out, 'b h s d -> b s (h d)') + if needs_weights: + return (out, attn_weight, past_key_value) + return (out, None, past_key_value) From 023555fb1f14c73091425d71e88d649d00bd156e Mon Sep 17 00:00:00 2001 From: JinBridge <89779290+JinBridger@users.noreply.github.com> Date: Fri, 22 Sep 2023 14:46:30 +0800 Subject: [PATCH 07/10] LLM: Add one-click installer for Windows (#8999) * LLM: init one-click installer for windows * LLM: fix typo in one-click installer readme * LLM: one-click installer try except logic * LLM: one-click installer add dependency * LLM: one-click installer adjust README.md * LLM: one-click installer split README and add zip compress in setup.bat * LLM: one-click installer verified internlm and llama2 and replace gif * LLM: remove one-click installer images * LLM: finetune the one-click installer README.md * LLM: fix typo in one-click installer README.md * LLM: rename one-click installer to protable executable * LLM: rename other places to protable executable * LLM: rename the zip filename to executable * LLM: update .gitignore * LLM: add colorama to setup.bat --- python/llm/portable-executable/.gitignore | 2 + python/llm/portable-executable/README.md | 33 ++++++ python/llm/portable-executable/chat.bat | 8 ++ python/llm/portable-executable/chat.py | 116 ++++++++++++++++++++++ python/llm/portable-executable/setup.bat | 23 +++++ python/llm/portable-executable/setup.md | 5 + 6 files changed, 187 insertions(+) create mode 100644 python/llm/portable-executable/.gitignore create mode 100644 python/llm/portable-executable/README.md create mode 100644 python/llm/portable-executable/chat.bat create mode 100644 python/llm/portable-executable/chat.py create mode 100644 python/llm/portable-executable/setup.bat create mode 100644 python/llm/portable-executable/setup.md diff --git a/python/llm/portable-executable/.gitignore b/python/llm/portable-executable/.gitignore new file mode 100644 index 00000000..23c0161c --- /dev/null +++ b/python/llm/portable-executable/.gitignore @@ -0,0 +1,2 @@ +python-embed +portable-executable.zip \ No newline at end of file diff --git a/python/llm/portable-executable/README.md b/python/llm/portable-executable/README.md new file mode 100644 index 00000000..0f1df88f --- /dev/null +++ b/python/llm/portable-executable/README.md @@ -0,0 +1,33 @@ +# BigDL-LLM Portable Executable For Windows: User Guide + +This portable executable includes everything you need to 
run LLM (except models). Please refer to the How to use section to get started. + +## 13B model running on an Intel 11-Gen Core PC (real-time screen capture) + +
+<!-- demo video: 13B model real-time screen capture -->
+ +## Verified Models + +- ChatGLM2-6b +- Baichuan-13B-Chat +- Baichuan2-7B-Chat +- internlm-chat-7b-8k +- Llama-2-7b-chat-hf + +## How to use + +1. Download the model to your computer. Please ensure there is a file named `config.json` in the model folder, otherwise the script won't work. + + ![](https://llm-assets.readthedocs.io/en/latest/_images/one-click-installer-user-guide-step1.png) + +2. Run `chat.bat` in Terminal and input the path of the model (e.g. `path\to\model`, note that there's no slash at the end of the path). + + ![](https://llm-assets.readthedocs.io/en/latest/_images/one-click-installer-user-guide-step2.png) + +3. Press Enter and wait until model finishes loading. Then enjoy chatting with the model! +4. If you want to stop chatting, just input `stop` and the model will stop running. + + ![](https://llm-assets.readthedocs.io/en/latest/_images/one-click-installer-user-guide-step34.png) diff --git a/python/llm/portable-executable/chat.bat b/python/llm/portable-executable/chat.bat new file mode 100644 index 00000000..b02c9615 --- /dev/null +++ b/python/llm/portable-executable/chat.bat @@ -0,0 +1,8 @@ +@echo off + + +:: execute chat script +set PYTHONUNBUFFERED=1 + +set /p modelpath="Please enter the model path: " +.\python-embed\python.exe .\chat.py --model-path="%modelpath%" \ No newline at end of file diff --git a/python/llm/portable-executable/chat.py b/python/llm/portable-executable/chat.py new file mode 100644 index 00000000..8a282a97 --- /dev/null +++ b/python/llm/portable-executable/chat.py @@ -0,0 +1,116 @@ +# +# Copyright 2016 The BigDL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import torch +import argparse +import sys + +# todo: support more model class +from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, AutoConfig +from transformers import TextIteratorStreamer +from transformers.tools.agents import StopSequenceCriteria +from transformers.generation.stopping_criteria import StoppingCriteriaList + +from colorama import Fore + +from bigdl.llm import optimize_model + +SYSTEM_PROMPT = "A chat between a curious human and an artificial intelligence assistant .\ +The assistant gives helpful, detailed, and polite answers to the human's questions." 
+HUMAN_ID = "" +BOT_ID = "" + +# chat_history formated in [(iput_str, output_str)] +def format_prompt(input_str, + chat_history): + prompt = [f"{SYSTEM_PROMPT}\n"] + for history_input_str, history_output_str in chat_history: + prompt.append(f"{HUMAN_ID} {history_input_str}\n{BOT_ID} {history_output_str}\n") + prompt.append(f"{HUMAN_ID} {input_str}\n{BOT_ID} ") + + return "".join(prompt) + +def stream_chat(model, + tokenizer, + stopping_criteria, + input_str, + chat_history): + prompt = format_prompt(input_str, chat_history) + # print(prompt) + input_ids = tokenizer([prompt], return_tensors="pt") + streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True) + generate_kwargs = dict(input_ids, streamer=streamer, max_new_tokens=512, stopping_criteria=stopping_criteria) + + from threading import Thread + # to ensure non-blocking access to the generated text, generation process should be ran in a separate thread + thread = Thread(target=model.generate, kwargs=generate_kwargs) + thread.start() + + output_str = [] + print(Fore.BLUE+"BigDL-LLM: "+Fore.RESET, end="") + for partial_output_str in streamer: + output_str.append(partial_output_str) + # remove the last HUMAN_ID if exists + print(partial_output_str.replace(f"{HUMAN_ID}", ""), end="") + + chat_history.append((input_str, "".join(output_str).replace(f"{HUMAN_ID}", "").rstrip())) + +def auto_select_model(model_name): + try: + try: + model = AutoModelForCausalLM.from_pretrained(model_path, + low_cpu_mem_usage=True, + torch_dtype="auto", + trust_remote_code=True, + use_cache=True) + except: + model = AutoModel.from_pretrained(model_path, + low_cpu_mem_usage=True, + torch_dtype="auto", + trust_remote_code=True, + use_cache=True) + except: + print("Sorry, the model you entered is not supported in installer.") + sys.exit() + + return model + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--model-path", type=str, help="path to an llm") + args = parser.parse_args() + + model_path = args.model_path + + model = auto_select_model(model_path) + model = optimize_model(model) + + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + + stopping_criteria = StoppingCriteriaList([StopSequenceCriteria(HUMAN_ID, tokenizer)]) + + chat_history = [] + + while True: + with torch.inference_mode(): + user_input = input(Fore.GREEN+"\nHuman: "+Fore.RESET) + if user_input == "stop": # let's stop the conversation when user input "stop" + break + stream_chat(model=model, + tokenizer=tokenizer, + stopping_criteria=stopping_criteria, + input_str=user_input, + chat_history=chat_history) \ No newline at end of file diff --git a/python/llm/portable-executable/setup.bat b/python/llm/portable-executable/setup.bat new file mode 100644 index 00000000..de8ad28c --- /dev/null +++ b/python/llm/portable-executable/setup.bat @@ -0,0 +1,23 @@ +:: download python and extract zip +powershell -Command "Start-BitsTransfer -Source https://www.python.org/ftp/python/3.9.13/python-3.9.13-embed-amd64.zip -Destination python-3.9.13-embed-amd64.zip" +powershell -Command "Expand-Archive .\python-3.9.13-embed-amd64.zip -DestinationPath .\python-embed" +del .\python-3.9.13-embed-amd64.zip + +set "python-embed=.\python-embed\python.exe" + +:: download get-pip.py and install +powershell -Command "Invoke-WebRequest https://bootstrap.pypa.io/get-pip.py -OutFile .\python-embed\get-pip.py" +%python-embed% .\python-embed\get-pip.py + +:: enable run site.main() automatically +cd .\python-embed +set "search=#import 
site" +set "replace=import site" +powershell -Command "(gc python39._pth) -replace '%search%', '%replace%' | Out-File -encoding ASCII python39._pth" +cd .. + +:: install pip packages +%python-embed% -m pip install bigdl-llm[all] transformers_stream_generator tiktoken einops colorama + +:: compress the python and scripts +powershell -Command "Compress-Archive -Path '.\python-embed', '.\chat.bat', '.\chat.py', '.\README.md' -DestinationPath .\portable-executable.zip" diff --git a/python/llm/portable-executable/setup.md b/python/llm/portable-executable/setup.md new file mode 100644 index 00000000..22520c64 --- /dev/null +++ b/python/llm/portable-executable/setup.md @@ -0,0 +1,5 @@ +# BigDL-LLM Portable Executable Setup Script For Windows + +# How to use + +Just simply run `setup.bat` and it will download and install all dependency and generate a zip file for user to use. From 26213a58296c5e8419320c196e19231fe6ebb1d9 Mon Sep 17 00:00:00 2001 From: Cengguang Zhang Date: Fri, 22 Sep 2023 17:38:38 +0800 Subject: [PATCH 08/10] LLM: Change benchmark bf16 load format. (#9035) * LLM: Change benchmark bf16 load format. * comment on bf16 chatglm. * fix. --- python/llm/dev/benchmark/all-in-one/run.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/python/llm/dev/benchmark/all-in-one/run.py b/python/llm/dev/benchmark/all-in-one/run.py index fd35ceea..9d3115e8 100644 --- a/python/llm/dev/benchmark/all-in-one/run.py +++ b/python/llm/dev/benchmark/all-in-one/run.py @@ -176,18 +176,16 @@ def run_pytorch_autocast_bf16(repo_id, st = time.perf_counter() if repo_id in ['THUDM/chatglm-6b', 'THUDM/chatglm2-6b']: # TODO: need verify chatglm family run bf16. - model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype='auto').float() - #model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype='auto').bfloat() - tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + invalidInputError(False, "Currently pytorch do not support bfloat16 on cpu for chatglm models.") elif repo_id in ['meta-llama/Llama-2-7b-chat-hf','meta-llama/Llama-2-13b-chat-hf', 'meta-llama/Llama-2-70b-chat-hf','decapoda-research/llama-7b-hf', 'decapoda-research/llama-65b-hf','lmsys/vicuna-7b-v1.5', 'lmsys/vicuna-13b-v1.3','project-baize/merged-baize-30b']: - model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, torch_dtype='auto') + model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16) # Need to use LlamaTokenizer, reason please refer to issue: https://github.com/intel-analytics/BigDL/issues/8944 tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True) else: - model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, torch_dtype='auto') + model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16) tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) end = time.perf_counter() print(">> loading of model costs {}s".format(end - st)) From 4b843d1dbfb9a0b7db359711193296ef0c78a302 Mon Sep 17 00:00:00 2001 From: Heyang Sun <60865256+Uxito-Ada@users.noreply.github.com> Date: Mon, 25 Sep 2023 09:28:44 +0800 Subject: [PATCH 09/10] change lora-model output behavior on k8s (#9038) Co-authored-by: leonardozcm --- docker/llm/finetune/lora/README.md | 8 +++++--- .../lora/docker/bigdl-lora-finetuing-entrypoint.sh | 7 +++---- 
 .../kubernetes/templates/bigdl-lora-finetuning-job.yaml | 8 +-------
 .../templates/bigdl-lora-finetuning-tdx-job.yaml | 6 ------
 docker/llm/finetune/lora/kubernetes/values.yaml | 3 +--
 5 files changed, 10 insertions(+), 22 deletions(-)

diff --git a/docker/llm/finetune/lora/README.md b/docker/llm/finetune/lora/README.md
index 81b61add..1492cf90 100644
--- a/docker/llm/finetune/lora/README.md
+++ b/docker/llm/finetune/lora/README.md
@@ -22,13 +22,13 @@ Follow [here](https://github.com/kubeflow/mpi-operator/tree/master#installation)
 
 Follow [here](https://github.com/intel-analytics/BigDL/tree/main/docker/llm/finetune/lora/docker#prepare-bigdl-image-for-lora-finetuning) to prepare BigDL Lora Finetuning image in your cluster.
 
-As finetuning is from a base model, first download [Llama 7b hf model from the public download site of Hugging Face](https://huggingface.co/decapoda-research/llama-7b-hf/tree/main). Then, download [cleaned alpaca data](https://raw.githubusercontent.com/tloen/alpaca-lora/main/alpaca_data_cleaned_archive.json), which contains all kinds of general knowledge and has already been cleaned. Next, move the downloaded files to a shared directory on your NFS server. In addition, make an empty directory under the same destination to save the finetuned model output later.
+As finetuning is from a base model, first download [Llama 7b hf model from the public download site of Hugging Face](https://huggingface.co/decapoda-research/llama-7b-hf/tree/main). Then, download [cleaned alpaca data](https://raw.githubusercontent.com/tloen/alpaca-lora/main/alpaca_data_cleaned_archive.json), which contains all kinds of general knowledge and has already been cleaned. Next, move the downloaded files to a shared directory on your NFS server.
 
 ### 3. Deploy through Helm Chart
 
 You are allowed to edit and experiment with different parameters in `./kubernetes/values.yaml` to improve finetuning performance and accuracy. For example, you can adjust `trainerNum` and `cpuPerPod` according to node and CPU core numbers in your cluster to make full use of these resources, and different `microBatchSize` results in different training speed and loss (here note that `microBatchSize`×`trainerNum` should not be more than 128, as it is the batch size).
 
-**Note: `dataSubPath`, `modelSubPath` and `outputPath` need to have the same names as files under the NFS directory in step 2.**
+**Note: `dataSubPath` and `modelSubPath` need to have the same names as files under the NFS directory in step 2.**
 
 After preparing parameters in `./kubernetes/values.yaml`, submit the job as below:
 
@@ -52,7 +52,9 @@ kubectl exec -it bash -n bigdl-ppml-finetuning # enter launc
 cat launcher.log # display logs collected from other workers
 ```
 
-From the log, you can see whether finetuning process has been invoked successfully in all MPI worker pods, and a progress bar with finetuning speed and estimated time will be showed after some data preprocessing steps (this may take quiet a while). For the fine-tuned model, it is written by the worker 0 (who holds rank 0), so you can find the model output inside the pod or the `output` folder under the NFS path (because it has been mounted to worker 0 as output path).
+From the log, you can see whether the finetuning process has been invoked successfully in all MPI worker pods, and a progress bar with finetuning speed and estimated time will be shown after some data preprocessing steps (this may take quite a while).
+ +For the fine-tuned model, it is written by the worker 0 (who holds rank 0), so you can find the model output inside the pod, which can be saved to host by command tools like `kubectl cp` or `scp`. ## To run in TDX-CoCo and enable Remote Attestation API diff --git a/docker/llm/finetune/lora/docker/bigdl-lora-finetuing-entrypoint.sh b/docker/llm/finetune/lora/docker/bigdl-lora-finetuing-entrypoint.sh index 1f9873b0..f9008c8d 100644 --- a/docker/llm/finetune/lora/docker/bigdl-lora-finetuing-entrypoint.sh +++ b/docker/llm/finetune/lora/docker/bigdl-lora-finetuing-entrypoint.sh @@ -8,7 +8,6 @@ if [ "$WORKER_ROLE" = "launcher" ] then sed "s/:1/ /g" /etc/mpi/hostfile > /home/mpiuser/hostfile export DATA_PATH="/ppml/data/$DATA_SUB_PATH" - export SAVE_PATH="/ppml/output" sleep 10 mpirun \ -n $WORLD_SIZE \ @@ -22,13 +21,13 @@ then python /ppml/lora_finetune.py \ --base_model '/ppml/model/' \ --data_path "$DATA_PATH" \ - --output_dir "$SAVE_PATH/finetuned_model" \ + --output_dir "/home/mpiuser/finetuned_model" \ --micro_batch_size $MICRO_BATCH_SIZE \ - --bf16 > $SAVE_PATH/launcher.log 2>&1 + --bf16 > /home/mpiuser/launcher.log 2>&1 exit_status=$? if [ $exit_status -ne 0 ]; then - cat $SAVE_PATH/launcher.log + cat /home/mpiuser/launcher.log exit $exit_status else while true diff --git a/docker/llm/finetune/lora/kubernetes/templates/bigdl-lora-finetuning-job.yaml b/docker/llm/finetune/lora/kubernetes/templates/bigdl-lora-finetuning-job.yaml index 63d50461..4c22b068 100644 --- a/docker/llm/finetune/lora/kubernetes/templates/bigdl-lora-finetuning-job.yaml +++ b/docker/llm/finetune/lora/kubernetes/templates/bigdl-lora-finetuning-job.yaml @@ -51,9 +51,6 @@ spec: - name: nfs-storage subPath: {{ .Values.dataSubPath }} mountPath: "/ppml/data/{{ .Values.dataSubPath }}" - - name: nfs-storage - subPath: {{ .Values.outputSubPath }} - mountPath: "/ppml/output" Worker: replicas: {{ .Values.trainerNum }} template: @@ -86,9 +83,6 @@ spec: - name: nfs-storage subPath: {{ .Values.dataSubPath }} mountPath: "/ppml/data/{{ .Values.dataSubPath }}" - - name: nfs-storage - subPath: {{ .Values.outputSubPath }} - mountPath: "/ppml/output" resources: requests: cpu: {{ .Values.cpuPerPod }} @@ -96,4 +90,4 @@ spec: - name: nfs-storage persistentVolumeClaim: claimName: nfs-pvc -{{- end }} \ No newline at end of file +{{- end }} diff --git a/docker/llm/finetune/lora/kubernetes/templates/bigdl-lora-finetuning-tdx-job.yaml b/docker/llm/finetune/lora/kubernetes/templates/bigdl-lora-finetuning-tdx-job.yaml index cd4d260b..ed00ea45 100644 --- a/docker/llm/finetune/lora/kubernetes/templates/bigdl-lora-finetuning-tdx-job.yaml +++ b/docker/llm/finetune/lora/kubernetes/templates/bigdl-lora-finetuning-tdx-job.yaml @@ -71,9 +71,6 @@ spec: - name: nfs-storage subPath: {{ .Values.dataSubPath }} mountPath: "/ppml/data/{{ .Values.dataSubPath }}" - - name: nfs-storage - subPath: {{ .Values.outputSubPath }} - mountPath: "/ppml/output" - name: dev mountPath: /dev {{- if eq .Values.enableTLS true }} @@ -118,9 +115,6 @@ spec: - name: nfs-storage subPath: {{ .Values.dataSubPath }} mountPath: "/ppml/data/{{ .Values.dataSubPath }}" - - name: nfs-storage - subPath: {{ .Values.outputSubPath }} - mountPath: "/ppml/output" - name: dev mountPath: /dev resources: diff --git a/docker/llm/finetune/lora/kubernetes/values.yaml b/docker/llm/finetune/lora/kubernetes/values.yaml index 70691935..92df0493 100644 --- a/docker/llm/finetune/lora/kubernetes/values.yaml +++ b/docker/llm/finetune/lora/kubernetes/values.yaml @@ -6,11 +6,10 @@ nfsServerIp: your_nfs_server_ip 
nfsPath: a_nfs_shared_folder_path_on_the_server dataSubPath: alpaca_data_cleaned_archive.json # a subpath of the data file under nfs directory modelSubPath: llama-7b-hf # a subpath of the model file (dir) under nfs directory -outputSubPath: output # a subpath of the empty directory under the nfs directory to save finetuned model, for example, if you make an empty dir named 'output' at the nfsPath, the value should be 'output' ompNumThreads: 14 cpuPerPod: 42 attestionApiServicePort: 9870 enableTLS: false # true or false base64ServerCrt: "your_base64_format_server_crt" -base64ServerKey: "your_base64_format_server_key" \ No newline at end of file +base64ServerKey: "your_base64_format_server_key" From 975da86e00768a7141375ea43c9353b262a72635 Mon Sep 17 00:00:00 2001 From: Ruonan Wang <105281011+rnwang04@users.noreply.github.com> Date: Mon, 25 Sep 2023 13:03:57 +0800 Subject: [PATCH 10/10] LLM: fix gptneox kv cache (#9044) --- python/llm/src/bigdl/llm/transformers/models/gptneox.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/llm/src/bigdl/llm/transformers/models/gptneox.py b/python/llm/src/bigdl/llm/transformers/models/gptneox.py index 8e31a14a..1f70491f 100644 --- a/python/llm/src/bigdl/llm/transformers/models/gptneox.py +++ b/python/llm/src/bigdl/llm/transformers/models/gptneox.py @@ -34,7 +34,7 @@ import torch from typing import Optional, Tuple from bigdl.llm.transformers.models.utils import apply_rotary_pos_emb -from bigdl.llm.transformers.models.utils import init_kv_cache, extend_kv_cache +from bigdl.llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache KV_CACHE_ALLOC_BLOCK_LENGTH = 256
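# Illustrative aside (a hedged sketch, not part of the gptneox patch above and not
# the BigDL-LLM implementation): the patch wires `append_kv_cache` in next to
# `init_kv_cache`/`extend_kv_cache` and keeps `KV_CACHE_ALLOC_BLOCK_LENGTH = 256`.
# The general idea behind such helpers is to pre-allocate the KV cache with spare
# room so each decoding step is written in place instead of re-allocating and
# copying the whole cache every token. The code below is a minimal, self-contained
# PyTorch version of that pattern; the (batch, heads, seq, head_dim) layout and the
# helper names `init_cache`/`has_room`/`append` are assumptions for illustration only.
import torch

BLOCK = 256  # spare slots reserved per allocation, analogous to KV_CACHE_ALLOC_BLOCK_LENGTH


def init_cache(bsz, heads, head_dim, seq_len, dtype=torch.float32, device="cpu"):
    # Reserve seq_len + BLOCK positions but expose only the first seq_len,
    # so later tokens can be written into the spare storage without copying.
    buf_k = torch.empty(bsz, heads, seq_len + BLOCK, head_dim, dtype=dtype, device=device)
    buf_v = torch.empty(bsz, heads, seq_len + BLOCK, head_dim, dtype=dtype, device=device)
    return buf_k[:, :, :seq_len, :], buf_v[:, :, :seq_len, :]


def has_room(cache_k, extra):
    # Room is left for `extra` more tokens while the per-head stride still covers
    # (current_seq_len + extra) * head_dim elements of the underlying buffer.
    return cache_k.stride(1) >= (cache_k.size(2) + extra) * cache_k.size(3)


def append(cache_k, cache_v, new_k, new_v):
    # Grow the exposed view over the same storage and copy only the new tokens.
    new_len = cache_k.size(2) + new_k.size(2)
    shape = (cache_k.size(0), cache_k.size(1), new_len, cache_k.size(3))
    out_k = cache_k.as_strided(shape, cache_k.stride(), cache_k.storage_offset())
    out_v = cache_v.as_strided(shape, cache_v.stride(), cache_v.storage_offset())
    out_k[:, :, -new_k.size(2):, :] = new_k
    out_v[:, :, -new_v.size(2):, :] = new_v
    return out_k, out_v


if __name__ == "__main__":
    k, v = init_cache(bsz=1, heads=8, head_dim=64, seq_len=16)
    step_k = torch.randn(1, 8, 1, 64)
    step_v = torch.randn(1, 8, 1, 64)
    if has_room(k, 1):   # still inside the reserved block: append in place
        k, v = append(k, v, step_k, step_v)
    else:                # out of room: a real implementation would re-allocate a larger buffer here
        pass
    print(k.shape)       # torch.Size([1, 8, 17, 64])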