From 9daf543e2f25931ce84e86ecdd3772e03645721c Mon Sep 17 00:00:00 2001 From: Ruonan Wang <105281011+rnwang04@users.noreply.github.com> Date: Thu, 15 Jun 2023 16:28:50 +0800 Subject: [PATCH] LLM: Update convert of gptneox to sync with new libgptneox.so (#8345) --- python/llm/src/bigdl/llm/utils/convert_util.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/python/llm/src/bigdl/llm/utils/convert_util.py b/python/llm/src/bigdl/llm/utils/convert_util.py index ac4ecebc..78833385 100644 --- a/python/llm/src/bigdl/llm/utils/convert_util.py +++ b/python/llm/src/bigdl/llm/utils/convert_util.py @@ -1276,8 +1276,13 @@ def _convert_gptneox_hf_to_ggml(model_path, outfile_dir, outtype): fout.write(struct.pack("i", ftype)) dot_token = tokenizer.encode(".")[0] + vocab = tokenizer.vocab + id2token = {v: k for k, v in vocab.items()} for i in range(hparams["vocab_size"]): - text = tokenizer.decode([i]).encode('utf-8') + if i in id2token: + text = id2token[i].encode('utf-8') + else: + text = tokenizer.decode([i]).encode('utf-8') fout.write(struct.pack("i", len(text))) fout.write(text)