LLM: convert and quantize support for StarCoder (#8359)
* basic support for starcoder
* update from_pretrained
* fix bug and fix style
parent 5f4f399ca7
commit f99d348954
5 changed files with 206 additions and 19 deletions
In the ggml conversion helpers, a `_convert_starcoder` wrapper is added and wired into the `_convert_to_ggml` dispatch:

```diff
@@ -72,6 +72,10 @@ def _convert_bloom(model_path, outfile_dir, outtype):
     _convert_bloom_hf_to_ggml(model_path, outfile_dir, outtype)
 
 
+def _convert_starcoder(model_path, outfile_dir, outtype):
+    _convert_starcoder_hf_to_ggml(model_path, outfile_dir, outtype)
+
+
 def _convert_to_ggml(model_path: str, outfile_dir: str,
                      model_family: str = 'llama', outtype: str="fp16"):
     """
@@ -84,12 +88,12 @@ def _convert_to_ggml(model_path: str, outfile_dir: str,
            For lora finetuned model, the path should be pointed to a merged weight.
     :param outfile_dir: str, the directory to save ggml compatible file, for example `./models`.
     :param model_family: Which model family your input model belongs to. Default to `llama`.
-           Now only `llama`/`bloom`/`gptneox` are supported.
+           Now only `llama`/`bloom`/`gptneox`/`starcoder` are supported.
     :param outtype: specify the output format. Defalut to `fp16`. Now `fp32`/`fp16` are supported.
     """
-    invalidInputError(model_family in ['llama', 'bloom', 'gptneox'],
+    invalidInputError(model_family in ['llama', 'bloom', 'gptneox', 'starcoder'],
                       "Now we only support quantization of model \
-                       family('llama', 'bloom', 'gptneox')",
+                       family('llama', 'bloom', 'gptneox', 'starcoder')",
                       "{} is not in the list.".format(model_family))
     invalidInputError(os.path.exists(model_path),
                       "The file {} was not found".format(model_path))
@@ -108,3 +112,5 @@ def _convert_to_ggml(model_path: str, outfile_dir: str,
         _convert_gptneox(model_path, outfile_dir, outtype)
     if model_family == 'bloom':
         _convert_bloom(model_path, outfile_dir, outtype)
+    if model_family == 'starcoder':
+        _convert_starcoder(model_path, outfile_dir, outtype)
```
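With the dispatch above in place, converting a StarCoder HuggingFace checkpoint follows the same pattern as the other families. A minimal sketch (the import path and the checkpoint/output paths are assumptions for illustration):

```python
# Minimal sketch: convert a HuggingFace StarCoder checkpoint to a ggml fp16 file.
# The module path and the two paths below are placeholders, not part of the diff.
from bigdl.llm.ggml.convert import _convert_to_ggml  # assumed import path

_convert_to_ggml(model_path="/path/to/starcoder-hf-checkpoint",
                 outfile_dir="./models",
                 model_family="starcoder",
                 outtype="fp16")
```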
`convert_model` and its CLI entry point now accept `starcoder` as a model family, including for `int8` quantization:

```diff
@@ -39,7 +39,7 @@ def convert_model(input_path: str,
     :param output_path: Save path of output quantized model. You must pass a *directory* to
            save all related output.
     :param model_family: Which model family your input model belongs to.
-           Now only `llama`/`bloom`/`gptneox` are supported.
+           Now only `llama`/`bloom`/`gptneox`/`starcoder` are supported.
     :param dtype: Which quantized precision will be converted.
            Now only `int4` and `int8` are supported, and `int8` only works for `llama`
            and `gptneox`.
@@ -53,9 +53,9 @@ def convert_model(input_path: str,
     # make sure directory exists
     os.makedirs(output_path, exist_ok=True)
     # check input value
-    invalidInputError(model_family in ['llama', 'bloom', 'gptneox'],
+    invalidInputError(model_family in ['llama', 'bloom', 'gptneox', 'starcoder'],
                       "Now we only support quantization of model \
-                       family('llama', 'bloom', 'gptneox')",
+                       family('llama', 'bloom', 'gptneox', 'starcoder')",
                       "{} is not in the list.".format(model_family))
     invalidInputError(os.path.isdir(output_path),
                       "The output_path {} was not a directory".format(output_path))
@@ -72,9 +72,9 @@ def convert_model(input_path: str,
         dtype = 'q4_0'
     elif dtype == 'int8':
         dtype = 'q8_0'
-        invalidInputError(model_family in ['llama', 'gptneox'],
+        invalidInputError(model_family in ['llama', 'gptneox', 'starcoder'],
                           "Now we only support int8 quantization of model \
-                           family('llama', 'gptneox')",
+                           family('llama', 'gptneox', 'starcoder')",
                           "{} is not in the list.".format(model_family))
 
     if tmp_path is not None:
@@ -110,7 +110,7 @@ def main():
                         help=("output_path,save path of output quantized model."))
     parser.add_argument('-x', '--model_family', type=str, required=True,
                         help=("model_family: Which model family your input model belongs to."
-                              "Now only `llama`/`bloom`/`gptneox` are supported."))
+                              "Now only `llama`/`bloom`/`gptneox`/`starcoder` are supported."))
     parser.add_argument('-t', '--dtype', type=str, default="int4",
                         help="Which quantized precision will be converted.")
     parser.add_argument('-p', '--tmp_path', type=str, default=None,
```
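For illustration, a hedged sketch of the one-call convert-and-quantize path after this change; the import location and all paths are assumptions, and `'int8'` is shown only because the check above now allows it for `starcoder`:

```python
# Hedged sketch: convert + quantize a StarCoder checkpoint via convert_model.
# Import path and file paths are assumptions, not taken from the diff.
from bigdl.llm.ggml import convert_model  # assumed import path

convert_model(input_path="/path/to/starcoder-hf-checkpoint",
              output_path="./quantized-models",   # must be a directory
              model_family="starcoder",
              dtype="int4")                        # 'int8' is also accepted for starcoder now
```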
`quantize` registers a StarCoder quantization-type table and accepts the new family:

```diff
@@ -36,10 +36,16 @@ _gptneox_quantize_type = {"q4_0": 2,
                           "q5_0": 8,
                           "q5_1": 9,
                           "q8_0": 7}
+_starcoder_quantize_type = {"q4_0": 2,
+                            "q4_1": 3,
+                            "q5_0": 8,
+                            "q5_1": 9,
+                            "q8_0": 7}
 
 _quantize_type = {"llama": _llama_quantize_type,
                   "bloom": _bloom_quantize_type,
-                  "gptneox": _gptneox_quantize_type}
+                  "gptneox": _gptneox_quantize_type,
+                  "starcoder": _starcoder_quantize_type}
 
 
 def quantize(input_path: str, output_path: str,
@@ -52,19 +58,20 @@ def quantize(input_path: str, output_path: str,
            save all related output. Filename of quantized model will be like
            `bigdl_llm_llama_q4_0.bin`.
     :param model_family: Which model family your input model belongs to.
-           Now only `llama`/`bloom`/`gptneox` are supported.
+           Now only `llama`/`bloom`/`gptneox`/`starcoder` are supported.
     :param dtype: Quantization method which differs in the resulting model disk size and
            inference speed. Defalut to `q4_0`. Difference model family may support
            different types, now the supported list is:
            llama : "q4_0", "q4_1", "q4_2"
            bloom : "q4_0", "q4_1"
            gptneox : "q4_0", "q4_1", "q5_0", "q5_1", "q8_0"
+           starcoder : "q4_0", "q4_1", "q5_0", "q5_1", "q8_0"
 
     :return: the path str to the converted ggml binary checkpoint
     """
-    invalidInputError(model_family in ['llama', 'bloom', 'gptneox'],
+    invalidInputError(model_family in ['llama', 'bloom', 'gptneox', 'starcoder'],
                       "Now we only support quantization of model \
-                       family('llama', 'bloom', 'gptneox')",
+                       family('llama', 'bloom', 'gptneox', 'starcoder')",
                       "{} is not in the list.".format(model_family))
     invalidInputError(os.path.isfile(input_path),
                       "The file {} was not found".format(input_path))
```
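A hedged usage sketch for quantizing an already-converted StarCoder ggml file; the import path and file names are assumptions, while the parameter names and supported `dtype` values come from the docstring above:

```python
# Hedged sketch: quantize a ggml-format StarCoder checkpoint to q4_0.
# Import path and file paths below are assumptions for illustration.
from bigdl.llm.ggml.quantize import quantize  # assumed import path

quantized_path = quantize(input_path="./models/ggml-starcoder-fp16.bin",
                          output_path="./models",          # directory for the output
                          model_family="starcoder",
                          dtype="q4_0")
print(quantized_path)  # per the docstring, something like ./models/bigdl_llm_starcoder_q4_0.bin
```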
`AutoModelForCausalLM.from_pretrained` accepts `starcoder` and dispatches to the new `Starcoder` model class:

```diff
@@ -50,10 +50,10 @@ class AutoModelForCausalLM:
                3. a str for huggingface hub repo id.
 
         :param model_family: the model family of the pretrained checkpoint.
-               Currently we support ``"llama"``, ``"bloom"``, ``"gptneox"``.
+               Currently we support ``"llama"``, ``"bloom"``, ``"gptneox"`` and ``"starcoder"``.
         :param dtype: Which quantized precision will be converted.
                Now only `int4` and `int8` are supported, and `int8` only works for `llama`
-               and `gptneox`.
+               , `gptneox` and `starcoder`.
         :param cache_dir: (optional) this parameter will only be used when
                ``pretrained_model_name_or_path`` is a hugginface checkpoint or hub repo id.
                It indicates the saving path for the converted low precision model.
@@ -63,9 +63,9 @@ class AutoModelForCausalLM:
 
         :return: a model instance
         """
-        invalidInputError(model_family in ['llama', 'gptneox', 'bloom'],
+        invalidInputError(model_family in ['llama', 'gptneox', 'bloom', 'starcoder'],
                           "Now we only support model family: 'llama', 'gptneox', 'bloom',"
-                          "'{}' is not in the list.".format(model_family))
+                          " 'starcoder', '{}' is not in the list.".format(model_family))
         invalidInputError(dtype.lower() in ['int4', 'int8'],
                           "Now we only support int4 and int8 as date type for weight")
 
@@ -110,3 +110,6 @@ class AutoModelForCausalLM:
         elif model_family == 'bloom':
             from bigdl.llm.ggml.model.bloom import Bloom
             return Bloom(model_path=ggml_model_path, **kwargs)
+        elif model_family == 'starcoder':
+            from bigdl.llm.ggml.model.starcoder import Starcoder
+            return Starcoder(model_path=ggml_model_path, **kwargs)
```
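Putting it together, the high-level entry point can now load a StarCoder checkpoint end to end. A hedged sketch, where the import path and the hub repo id are assumptions and only the parameter names come from the docstring above:

```python
# Hedged sketch: load a StarCoder checkpoint through the updated from_pretrained.
# The import path and the repo id are assumptions for illustration only.
from bigdl.llm.ggml.transformers import AutoModelForCausalLM  # assumed import path

llm = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path="bigcode/starcoderbase-1b",  # hypothetical hub repo id
    model_family="starcoder",
    dtype="int4",
    cache_dir="./converted-models",
)
```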
Finally, the HF-to-ggml conversion utilities export and implement `_convert_starcoder_hf_to_ggml`:

```diff
@@ -87,7 +87,8 @@ __all__ = ['Params',
            'load_vocab',
            'default_outfile',
            '_convert_gptneox_hf_to_ggml',
-           '_convert_bloom_hf_to_ggml']
+           '_convert_bloom_hf_to_ggml',
+           '_convert_starcoder_hf_to_ggml']
 
 
 @dataclass(frozen=True)
@@ -1415,3 +1416,173 @@ def _convert_bloom_hf_to_ggml(model_path, outfile_dir, outtype):
         data.tofile(fout)
 
     fout.close()
+
+
+def _convert_starcoder_hf_to_ggml(model_path, outfile_dir, outtype):
+    from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
+    import torch
+    tokenizer = AutoTokenizer.from_pretrained(model_path)
+    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
+    hparams = config.to_dict()
+    model = AutoModelForCausalLM.from_pretrained(model_path, config=config,
+                                                 torch_dtype=torch.float16
+                                                 if outtype == "f16" else torch.float32,
+                                                 low_cpu_mem_usage=True,
+                                                 trust_remote_code=True,
+                                                 offload_state_dict=True)
+
+    list_vars = model.state_dict()
+
+    encoder = tokenizer.vocab
+    # Add added_tokens (special tokens) to the encoder
+    encoder.update(tokenizer.get_added_vocab())
+
+    filestem = Path(model_path).stem
+    fn_out = os.path.join(outfile_dir, f"ggml-{filestem}-{outtype}.bin")
+    fout = open(fn_out, "wb")
+
+    if outtype == "f16":
+        ftype = 1
+    else:
+        ftype = 0
+
+    fout.write(struct.pack("i", 0x67676d6c))  # magic: ggml in hex
+    vocab_size = hparams["vocab_size"]
+    fout.write(struct.pack("i", vocab_size))
+    # fout.write(struct.pack("i", len(encoder)))
+    fout.write(struct.pack("i", hparams["n_positions"]))
+    fout.write(struct.pack("i", hparams["n_embd"]))
+    fout.write(struct.pack("i", hparams["n_head"]))
+    fout.write(struct.pack("i", hparams["n_layer"]))
+    fout.write(struct.pack("i", ftype))
+
+    byte_encoder = bytes_to_unicode()
+    byte_decoder = {v: k for k, v in byte_encoder.items()}
+
+    fout.write(struct.pack("i", vocab_size))
+
+    counter = 0
+    # sort by value
+    for key in sorted(encoder, key=encoder.get):
+        text = bytearray([byte_decoder[c] for c in key])
+        fout.write(struct.pack("i", len(text)))
+        fout.write(text)
+        counter += 1
+
+    # TODO: Repeat last token until vocab_size
+    while counter < vocab_size:
+        fout.write(struct.pack("i", len(text)))
+        fout.write(text)
+        counter += 1
+
+    for name in list_vars.keys():
+        data = list_vars[name].squeeze().numpy()
+        print("Processing variable: " + name + " with shape: ", data.shape)
+
+        # rename headers to keep compatibility
+        if name == "transformer.ln_f.weight":
+            name = "model/ln_f/g"
+        elif name == "transformer.ln_f.bias":
+            name = "model/ln_f/b"
+        elif name == "transformer.wte.weight":
+            name = "model/wte"
+        elif name == "transformer.wpe.weight":
+            name = "model/wpe"
+        elif name == "lm_head.weight":
+            name = "model/lm_head"
+        elif re.match(r"transformer.h\.\d+\.ln_1\.weight", name):
+            i = re.findall("\d+", name)[0]
+            name = f"model/h{i}/ln_1/g"
+        elif re.match(r"transformer.h\.\d+\.ln_1\.bias", name):
+            i = re.findall("\d+", name)[0]
+            name = f"model/h{i}/ln_1/b"
+        elif re.match(r"transformer.h\.\d+\.attn\.c_attn\.weight", name):
+            i = re.findall("\d+", name)[0]
+            name = f"model/h{i}/attn/c_attn/w"
+        elif re.match(r"transformer.h\.\d+\.attn\.c_attn\.bias", name):
+            i = re.findall("\d+", name)[0]
+            name = f"model/h{i}/attn/c_attn/b"
+        elif re.match(r"transformer.h\.\d+\.attn\.c_proj\.weight", name):
+            i = re.findall("\d+", name)[0]
+            name = f"model/h{i}/attn/c_proj/w"
+        elif re.match(r"transformer.h.\d+.attn.c_proj.bias", name):
+            i = re.findall("\d+", name)[0]
+            name = f"model/h{i}/attn/c_proj/b"
+        elif re.match(r"transformer.h.\d+.ln_2.weight", name):
+            i = re.findall("\d+", name)[0]
+            name = f"model/h{i}/ln_2/g"
+        elif re.match(r"transformer.h.\d+.ln_2.bias", name):
+            i = re.findall("\d+", name)[0]
+            name = f"model/h{i}/ln_2/b"
+        elif re.match(r"transformer.h.\d+.mlp.c_fc.weight", name):
+            i = re.findall("\d+", name)[0]
+            name = f"model/h{i}/mlp/c_fc/w"
+        elif re.match(r"transformer.h.\d+.mlp.c_fc.bias", name):
+            i = re.findall("\d+", name)[0]
+            name = f"model/h{i}/mlp/c_fc/b"
+        elif re.match(r"transformer.h.\d+.mlp.c_proj.weight", name):
+            i = re.findall("\d+", name)[0]
+            name = f"model/h{i}/mlp/c_proj/w"
+        elif re.match(r"transformer.h.\d+.mlp.c_proj.bias", name):
+            i = re.findall("\d+", name)[0]
+            name = f"model/h{i}/mlp/c_proj/b"
+        else:
+            print("Unrecognized variable name. %s", name)
+
+        # we don't need these
+        if name.endswith("attn.masked_bias") or name.endswith(".attn.bias"):
+            print("  Skipping variable: " + name)
+            continue
+
+        n_dims = len(data.shape)
+
+        ftype_cur = 0
+        if ftype == 1:
+            if (name == "model/wte" or name == "model/lm_head" or name[-2:] == "/g" or
+                    name[-2:] == "/w") and n_dims == 2:
+                print("  Converting to float16")
+                data = data.astype(np.float16)
+                ftype_cur = 1
+            else:
+                print("  Converting to float32")
+                data = data.astype(np.float32)
+                ftype_cur = 0
+
+        "model/h.*/attn/c_attn/w"
+        "model/h.*/attn/c_proj/w"
+        "model/h.*/mlp/c_fc/w"
+        "model/h.*/mlp/c_proj/w"
+        if name[-14:] == "/attn/c_attn/w" or name[-14:] == "/attn/c_attn/b":
+            print("  Duplicate K,V heads to use MHA instead of MQA")
+
+            embed_dim = hparams["n_embd"]
+            head_dim = embed_dim // hparams["n_head"]
+
+            # ((n_heads + 2) * head_dim, hidden_dim) -> (3 * n_heads * head_dim, hidden_dim)
+            q, k, v = np.split(data,
+                               (hparams["n_head"] * head_dim,
+                                (hparams["n_head"] + 1) * head_dim),
+                               axis=0)
+            # duplicate k, v along the first axis (head_dim, hidden_dim) ->
+            # (n_heads * head_dim, hidden_dim)
+            if len(k.shape) == 2:
+                k = np.tile(k, (hparams["n_head"], 1))
+                v = np.tile(v, (hparams["n_head"], 1))
+            elif len(k.shape) == 1:
+                k = np.tile(k, (hparams["n_head"]))
+                v = np.tile(v, (hparams["n_head"]))
+            # concat q, k, v along the first axis (n_heads * head_dim, hidden_dim) ->
+            # (3 * n_heads * head_dim, hidden_dim)
+            data = np.concatenate((q, k, v), axis=0)
+
+        # header
+        str = name.encode('utf-8')
+        fout.write(struct.pack("iii", n_dims, len(str), ftype_cur))
+        for i in range(n_dims):
+            fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
+        fout.write(str)
+
+        # data
+        data.tofile(fout)
+
+    fout.close()
```
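The only non-mechanical step in the new converter is widening StarCoder's multi-query attention (a single shared K/V head) into multi-head layout before the `c_attn` tensor is written. A standalone sketch of that reshaping on toy data; the sizes below are made up purely for illustration, the real values come from `hparams`:

```python
# Toy illustration of the MQA -> MHA widening applied to c_attn weights above.
import numpy as np

n_head, head_dim, hidden_dim = 4, 3, 12                        # illustrative sizes only
data = np.random.rand((n_head + 2) * head_dim, hidden_dim)     # packed [Q | single K | single V]

# Split into Q (n_head * head_dim rows), K (head_dim rows), V (head_dim rows).
q, k, v = np.split(data, (n_head * head_dim, (n_head + 1) * head_dim), axis=0)

# Duplicate the single K/V head across all heads, then repack as Q, K, V.
k = np.tile(k, (n_head, 1))
v = np.tile(v, (n_head, 1))
mha = np.concatenate((q, k, v), axis=0)
assert mha.shape == (3 * n_head * head_dim, hidden_dim)
```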