From f99d348954ee68fd0617554b45c73d81549d671b Mon Sep 17 00:00:00 2001
From: Ruonan Wang <105281011+rnwang04@users.noreply.github.com>
Date: Tue, 20 Jun 2023 13:39:35 +0800
Subject: [PATCH] LLM: convert and quantize support for StarCoder (#8359)

* basic support for starcoder
* update from_pretrained
* fix bug and fix style
---
 python/llm/src/bigdl/llm/ggml/convert.py       |  12 +-
 .../llm/src/bigdl/llm/ggml/convert_model.py    |  12 +-
 python/llm/src/bigdl/llm/ggml/quantize.py      |  15 +-
 .../src/bigdl/llm/ggml/transformers/model.py   |  13 +-
 .../llm/src/bigdl/llm/utils/convert_util.py    | 173 +++++++++++++++++-
 5 files changed, 206 insertions(+), 19 deletions(-)

diff --git a/python/llm/src/bigdl/llm/ggml/convert.py b/python/llm/src/bigdl/llm/ggml/convert.py
index 1814965e..b645c481 100644
--- a/python/llm/src/bigdl/llm/ggml/convert.py
+++ b/python/llm/src/bigdl/llm/ggml/convert.py
@@ -72,6 +72,10 @@ def _convert_bloom(model_path, outfile_dir, outtype):
     _convert_bloom_hf_to_ggml(model_path, outfile_dir, outtype)
 
 
+def _convert_starcoder(model_path, outfile_dir, outtype):
+    _convert_starcoder_hf_to_ggml(model_path, outfile_dir, outtype)
+
+
 def _convert_to_ggml(model_path: str, outfile_dir: str,
                      model_family: str = 'llama', outtype: str="fp16"):
     """
@@ -84,12 +88,12 @@ def _convert_to_ggml(model_path: str, outfile_dir: str,
            For lora finetuned model, the path should be pointed to a merged weight.
     :param outfile_dir: str, the directory to save ggml compatible file, for example `./models`.
     :param model_family: Which model family your input model belongs to. Default to `llama`.
-           Now only `llama`/`bloom`/`gptneox` are supported.
+           Now only `llama`/`bloom`/`gptneox`/`starcoder` are supported.
     :param outtype: specify the output format. Default to `fp16`. Now `fp32`/`fp16` are supported.
     """
-    invalidInputError(model_family in ['llama', 'bloom', 'gptneox'],
+    invalidInputError(model_family in ['llama', 'bloom', 'gptneox', 'starcoder'],
                       "Now we only support quantization of model \
-                      family('llama', 'bloom', 'gptneox')",
+                      family('llama', 'bloom', 'gptneox', 'starcoder')",
                       "{} is not in the list.".format(model_family))
     invalidInputError(os.path.exists(model_path),
                       "The file {} was not found".format(model_path))
@@ -108,3 +112,5 @@ def _convert_to_ggml(model_path: str, outfile_dir: str,
         _convert_gptneox(model_path, outfile_dir, outtype)
     if model_family == 'bloom':
         _convert_bloom(model_path, outfile_dir, outtype)
+    if model_family == 'starcoder':
+        _convert_starcoder(model_path, outfile_dir, outtype)
diff --git a/python/llm/src/bigdl/llm/ggml/convert_model.py b/python/llm/src/bigdl/llm/ggml/convert_model.py
index cb04b6fc..495eac3f 100644
--- a/python/llm/src/bigdl/llm/ggml/convert_model.py
+++ b/python/llm/src/bigdl/llm/ggml/convert_model.py
@@ -39,7 +39,7 @@ def convert_model(input_path: str,
     :param output_path: Save path of output quantized model. You must pass a *directory* to
            save all related output.
     :param model_family: Which model family your input model belongs to.
-           Now only `llama`/`bloom`/`gptneox` are supported.
+           Now only `llama`/`bloom`/`gptneox`/`starcoder` are supported.
     :param dtype: Which quantized precision will be converted.
           Now only `int4` and `int8` are supported, and `int8` only works for `llama`
          and `gptneox`.
@@ -53,9 +53,9 @@ def convert_model(input_path: str,
     # make sure directory exists
     os.makedirs(output_path, exist_ok=True)
     # check input value
-    invalidInputError(model_family in ['llama', 'bloom', 'gptneox'],
+    invalidInputError(model_family in ['llama', 'bloom', 'gptneox', 'starcoder'],
                       "Now we only support quantization of model \
-                      family('llama', 'bloom', 'gptneox')",
+                      family('llama', 'bloom', 'gptneox', 'starcoder')",
                       "{} is not in the list.".format(model_family))
     invalidInputError(os.path.isdir(output_path),
                       "The output_path {} was not a directory".format(output_path))
@@ -72,9 +72,9 @@ def convert_model(input_path: str,
         dtype = 'q4_0'
     elif dtype == 'int8':
         dtype = 'q8_0'
-        invalidInputError(model_family in ['llama', 'gptneox'],
+        invalidInputError(model_family in ['llama', 'gptneox', 'starcoder'],
                           "Now we only support int8 quantization of model \
-                          family('llama', 'gptneox')",
+                          family('llama', 'gptneox', 'starcoder')",
                           "{} is not in the list.".format(model_family))
 
     if tmp_path is not None:
@@ -110,7 +110,7 @@ def main():
                         help=("output_path,save path of output quantized model."))
     parser.add_argument('-x', '--model_family', type=str, required=True,
                         help=("model_family: Which model family your input model belongs to."
-                              "Now only `llama`/`bloom`/`gptneox` are supported."))
+                              "Now only `llama`/`bloom`/`gptneox`/`starcoder` are supported."))
     parser.add_argument('-t', '--dtype', type=str, default="int4",
                         help="Which quantized precision will be converted.")
     parser.add_argument('-p', '--tmp_path', type=str, default=None,
diff --git a/python/llm/src/bigdl/llm/ggml/quantize.py b/python/llm/src/bigdl/llm/ggml/quantize.py
index 31f76962..8a20477a 100644
--- a/python/llm/src/bigdl/llm/ggml/quantize.py
+++ b/python/llm/src/bigdl/llm/ggml/quantize.py
@@ -36,10 +36,16 @@ _gptneox_quantize_type = {"q4_0": 2,
                           "q5_0": 8,
                           "q5_1": 9,
                           "q8_0": 7}
+_starcoder_quantize_type = {"q4_0": 2,
+                            "q4_1": 3,
+                            "q5_0": 8,
+                            "q5_1": 9,
+                            "q8_0": 7}
 
 _quantize_type = {"llama": _llama_quantize_type,
                   "bloom": _bloom_quantize_type,
-                  "gptneox": _gptneox_quantize_type}
+                  "gptneox": _gptneox_quantize_type,
+                  "starcoder": _starcoder_quantize_type}
 
 
 def quantize(input_path: str, output_path: str,
@@ -52,19 +58,20 @@ def quantize(input_path: str, output_path: str,
            save all related output. Filename of quantized model will be like
           `bigdl_llm_llama_q4_0.bin`.
     :param model_family: Which model family your input model belongs to.
-           Now only `llama`/`bloom`/`gptneox` are supported.
+           Now only `llama`/`bloom`/`gptneox`/`starcoder` are supported.
     :param dtype: Quantization method which differs in the resulting model disk size and
           inference speed. Default to `q4_0`.
          Different model families may support different types, now the supported list is:
          llama : "q4_0", "q4_1", "q4_2"
          bloom : "q4_0", "q4_1"
          gptneox : "q4_0", "q4_1", "q5_0", "q5_1", "q8_0"
+         starcoder : "q4_0", "q4_1", "q5_0", "q5_1", "q8_0"
 
     :return: the path str to the converted ggml binary checkpoint
     """
-    invalidInputError(model_family in ['llama', 'bloom', 'gptneox'],
+    invalidInputError(model_family in ['llama', 'bloom', 'gptneox', 'starcoder'],
                       "Now we only support quantization of model \
-                      family('llama', 'bloom', 'gptneox')",
+                      family('llama', 'bloom', 'gptneox', 'starcoder')",
                       "{} is not in the list.".format(model_family))
     invalidInputError(os.path.isfile(input_path),
                       "The file {} was not found".format(input_path))
diff --git a/python/llm/src/bigdl/llm/ggml/transformers/model.py b/python/llm/src/bigdl/llm/ggml/transformers/model.py
index 43ede645..9fc8557f 100644
--- a/python/llm/src/bigdl/llm/ggml/transformers/model.py
+++ b/python/llm/src/bigdl/llm/ggml/transformers/model.py
@@ -50,10 +50,10 @@ class AutoModelForCausalLM:
                3. a str for huggingface hub repo id.
 
         :param model_family: the model family of the pretrained checkpoint.
-               Currently we support ``"llama"``, ``"bloom"``, ``"gptneox"``.
+               Currently we support ``"llama"``, ``"bloom"``, ``"gptneox"`` and ``"starcoder"``.
         :param dtype: Which quantized precision will be converted.
                Now only `int4` and `int8` are supported, and `int8` only works for `llama`
-               and `gptneox`.
+               , `gptneox` and `starcoder`.
         :param cache_dir: (optional) this parameter will only be used when
                ``pretrained_model_name_or_path`` is a huggingface checkpoint or hub repo id.
                It indicates the saving path for the converted low precision model.
@@ -63,9 +63,9 @@ class AutoModelForCausalLM:
 
         :return: a model instance
         """
-        invalidInputError(model_family in ['llama', 'gptneox', 'bloom'],
-                          "Now we only support model family: 'llama', 'gptneox', 'bloom', "
-                          "'{}' is not in the list.".format(model_family))
+        invalidInputError(model_family in ['llama', 'gptneox', 'bloom', 'starcoder'],
+                          "Now we only support model family: 'llama', 'gptneox', 'bloom',"
+                          " 'starcoder', '{}' is not in the list.".format(model_family))
         invalidInputError(dtype.lower() in ['int4', 'int8'],
                           "Now we only support int4 and int8 as data type for weight")
@@ -110,3 +110,6 @@ class AutoModelForCausalLM:
         elif model_family == 'bloom':
             from bigdl.llm.ggml.model.bloom import Bloom
             return Bloom(model_path=ggml_model_path, **kwargs)
+        elif model_family == 'starcoder':
+            from bigdl.llm.ggml.model.starcoder import Starcoder
+            return Starcoder(model_path=ggml_model_path, **kwargs)
diff --git a/python/llm/src/bigdl/llm/utils/convert_util.py b/python/llm/src/bigdl/llm/utils/convert_util.py
index 78833385..d00eb76a 100644
--- a/python/llm/src/bigdl/llm/utils/convert_util.py
+++ b/python/llm/src/bigdl/llm/utils/convert_util.py
@@ -87,7 +87,8 @@ __all__ = ['Params',
            'load_vocab',
            'default_outfile',
            '_convert_gptneox_hf_to_ggml',
-           '_convert_bloom_hf_to_ggml']
+           '_convert_bloom_hf_to_ggml',
+           '_convert_starcoder_hf_to_ggml']
 
 
 @dataclass(frozen=True)
@@ -1415,3 +1416,173 @@ def _convert_bloom_hf_to_ggml(model_path, outfile_dir, outtype):
         data.tofile(fout)
 
     fout.close()
+
+
+def _convert_starcoder_hf_to_ggml(model_path, outfile_dir, outtype):
+    from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
+    import torch
+    tokenizer = AutoTokenizer.from_pretrained(model_path)
+    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
+    hparams = config.to_dict()
+    model = AutoModelForCausalLM.from_pretrained(model_path, config=config,
+                                                 torch_dtype=torch.float16
+                                                 if outtype == "f16" else torch.float32,
+                                                 low_cpu_mem_usage=True,
+                                                 trust_remote_code=True,
+                                                 offload_state_dict=True)
+
+    list_vars = model.state_dict()
+
+    encoder = tokenizer.vocab
+    # Add added_tokens (special tokens) to the encoder
+    encoder.update(tokenizer.get_added_vocab())
+
+    filestem = Path(model_path).stem
+    fn_out = os.path.join(outfile_dir, f"ggml-{filestem}-{outtype}.bin")
+    fout = open(fn_out, "wb")
+
+    if outtype == "f16":
+        ftype = 1
+    else:
+        ftype = 0
+
+    fout.write(struct.pack("i", 0x67676d6c))  # magic: ggml in hex
+    vocab_size = hparams["vocab_size"]
+    fout.write(struct.pack("i", vocab_size))
+    # fout.write(struct.pack("i", len(encoder)))
+    fout.write(struct.pack("i", hparams["n_positions"]))
+    fout.write(struct.pack("i", hparams["n_embd"]))
+    fout.write(struct.pack("i", hparams["n_head"]))
+    fout.write(struct.pack("i", hparams["n_layer"]))
+    fout.write(struct.pack("i", ftype))
+
+    byte_encoder = bytes_to_unicode()
+    byte_decoder = {v: k for k, v in byte_encoder.items()}
+
+    fout.write(struct.pack("i", vocab_size))
+
+    counter = 0
+    # sort by value
+    for key in sorted(encoder, key=encoder.get):
+        text = bytearray([byte_decoder[c] for c in key])
+        fout.write(struct.pack("i", len(text)))
+        fout.write(text)
+        counter += 1
+
+    # TODO: Repeat last token until vocab_size
+    while counter < vocab_size:
+        fout.write(struct.pack("i", len(text)))
+        fout.write(text)
+        counter += 1
+
+    for name in list_vars.keys():
+        data = list_vars[name].squeeze().numpy()
+        print("Processing variable: " + name + " with shape: ", data.shape)
+
+        # rename headers to keep compatibility
+        if name == "transformer.ln_f.weight":
+            name = "model/ln_f/g"
+        elif name == "transformer.ln_f.bias":
+            name = "model/ln_f/b"
+        elif name == "transformer.wte.weight":
+            name = "model/wte"
+        elif name == "transformer.wpe.weight":
+            name = "model/wpe"
+        elif name == "lm_head.weight":
+            name = "model/lm_head"
+        elif re.match(r"transformer.h\.\d+\.ln_1\.weight", name):
+            i = re.findall("\d+", name)[0]
+            name = f"model/h{i}/ln_1/g"
+        elif re.match(r"transformer.h\.\d+\.ln_1\.bias", name):
+            i = re.findall("\d+", name)[0]
+            name = f"model/h{i}/ln_1/b"
+        elif re.match(r"transformer.h\.\d+\.attn\.c_attn\.weight", name):
+            i = re.findall("\d+", name)[0]
+            name = f"model/h{i}/attn/c_attn/w"
+        elif re.match(r"transformer.h\.\d+\.attn\.c_attn\.bias", name):
+            i = re.findall("\d+", name)[0]
+            name = f"model/h{i}/attn/c_attn/b"
+        elif re.match(r"transformer.h\.\d+\.attn\.c_proj\.weight", name):
+            i = re.findall("\d+", name)[0]
+            name = f"model/h{i}/attn/c_proj/w"
+        elif re.match(r"transformer.h.\d+.attn.c_proj.bias", name):
+            i = re.findall("\d+", name)[0]
+            name = f"model/h{i}/attn/c_proj/b"
+        elif re.match(r"transformer.h.\d+.ln_2.weight", name):
+            i = re.findall("\d+", name)[0]
+            name = f"model/h{i}/ln_2/g"
+        elif re.match(r"transformer.h.\d+.ln_2.bias", name):
+            i = re.findall("\d+", name)[0]
+            name = f"model/h{i}/ln_2/b"
+        elif re.match(r"transformer.h.\d+.mlp.c_fc.weight", name):
+            i = re.findall("\d+", name)[0]
+            name = f"model/h{i}/mlp/c_fc/w"
+        elif re.match(r"transformer.h.\d+.mlp.c_fc.bias", name):
+            i = re.findall("\d+", name)[0]
+            name = f"model/h{i}/mlp/c_fc/b"
+        elif re.match(r"transformer.h.\d+.mlp.c_proj.weight", name):
+            i = re.findall("\d+", name)[0]
+            name = f"model/h{i}/mlp/c_proj/w"
+        elif re.match(r"transformer.h.\d+.mlp.c_proj.bias", name):
+            i = re.findall("\d+", name)[0]
+            name = f"model/h{i}/mlp/c_proj/b"
+        else:
+            print("Unrecognized variable name. %s", name)
+
+        # we don't need these
+        if name.endswith("attn.masked_bias") or name.endswith(".attn.bias"):
+            print("  Skipping variable: " + name)
+            continue
+
+        n_dims = len(data.shape)
+
+        ftype_cur = 0
+        if ftype == 1:
+            if (name == "model/wte" or name == "model/lm_head" or name[-2:] == "/g" or
+                    name[-2:] == "/w") and n_dims == 2:
+                print("  Converting to float16")
+                data = data.astype(np.float16)
+                ftype_cur = 1
+            else:
+                print("  Converting to float32")
+                data = data.astype(np.float32)
+                ftype_cur = 0
+
+        "model/h.*/attn/c_attn/w"
+        "model/h.*/attn/c_proj/w"
+        "model/h.*/mlp/c_fc/w"
+        "model/h.*/mlp/c_proj/w"
+        if name[-14:] == "/attn/c_attn/w" or name[-14:] == "/attn/c_attn/b":
+            print("  Duplicate K,V heads to use MHA instead of MQA")
+
+            embed_dim = hparams["n_embd"]
+            head_dim = embed_dim // hparams["n_head"]
+
+            # ((n_heads + 2) * head_dim, hidden_dim) -> (3 * n_heads * head_dim, hidden_dim)
+            q, k, v = np.split(data,
+                               (hparams["n_head"] * head_dim,
+                                (hparams["n_head"] + 1) * head_dim),
+                               axis=0)
+            # duplicate k, v along the first axis (head_dim, hidden_dim) ->
+            # (n_heads * head_dim, hidden_dim)
+            if len(k.shape) == 2:
+                k = np.tile(k, (hparams["n_head"], 1))
+                v = np.tile(v, (hparams["n_head"], 1))
+            elif len(k.shape) == 1:
+                k = np.tile(k, (hparams["n_head"]))
+                v = np.tile(v, (hparams["n_head"]))
+            # concat q, k, v along the first axis (n_heads * head_dim, hidden_dim) ->
+            # (3 * n_heads * head_dim, hidden_dim)
+            data = np.concatenate((q, k, v), axis=0)
+
+        # header
+        str = name.encode('utf-8')
+        fout.write(struct.pack("iii", n_dims, len(str), ftype_cur))
+        for i in range(n_dims):
+            fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
+        fout.write(str)
+
+        # data
+        data.tofile(fout)
+
+    fout.close()
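
Reviewer note (not part of the patch): the c_attn handling above rewrites StarCoder's
multi-query attention weights into a standard multi-head layout before they are written
out. The standalone numpy sketch below mirrors that np.split/np.tile/np.concatenate
logic with hypothetical toy sizes (the real StarCoder-15B config uses 48 heads and a
6144-dim hidden state).

    import numpy as np

    # Toy hyperparameters, for illustration only.
    n_head, head_dim = 4, 8
    n_embd = n_head * head_dim

    # Fused QKV projection in MQA layout: n_head query heads plus a single
    # shared key head and a single shared value head.
    c_attn_w = np.random.rand((n_head + 2) * head_dim, n_embd).astype(np.float32)

    # Split off Q, the shared K head and the shared V head along axis 0.
    q, k, v = np.split(c_attn_w,
                       (n_head * head_dim, (n_head + 1) * head_dim),
                       axis=0)

    # Duplicate K and V once per head so the layout matches MHA.
    k = np.tile(k, (n_head, 1))
    v = np.tile(v, (n_head, 1))

    # Re-fuse into the (3 * n_head * head_dim, n_embd) tensor the ggml loader expects.
    mha_w = np.concatenate((q, k, v), axis=0)
    assert mha_w.shape == (3 * n_head * head_dim, n_embd)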
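
Reviewer note (not part of the patch): the ggml header written by
_convert_starcoder_hf_to_ggml is seven int32 values in the order of the fout.write()
calls above (magic, vocab_size, n_positions, n_embd, n_head, n_layer, ftype). A quick
way to sanity-check an output file; the filename below is a placeholder.

    import struct

    with open("ggml-starcoder-hf-f16.bin", "rb") as f:  # placeholder path
        magic, vocab_size, n_positions, n_embd, n_head, n_layer, ftype = \
            struct.unpack("7i", f.read(7 * 4))
    assert magic == 0x67676d6c  # "ggml"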
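
Reviewer note (not part of the patch): a minimal usage sketch of the new `starcoder`
path added here. The import paths are inferred from the file layout in this diff, and
./starcoder-hf and ./starcoder-ggml are placeholder directories, so treat this as an
illustration under those assumptions rather than documented API.

    from bigdl.llm.ggml.convert_model import convert_model
    from bigdl.llm.ggml.transformers.model import AutoModelForCausalLM

    # Convert a local HuggingFace StarCoder checkpoint to ggml and quantize to int4.
    convert_model(input_path="./starcoder-hf",
                  output_path="./starcoder-ggml",
                  model_family="starcoder",
                  dtype="int4")

    # Or let from_pretrained drive conversion + quantization and return a
    # bigdl.llm.ggml.model.starcoder.Starcoder instance.
    model = AutoModelForCausalLM.from_pretrained("./starcoder-hf",
                                                 model_family="starcoder",
                                                 dtype="int4",
                                                 cache_dir="./starcoder-ggml")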