From 9daf543e2f25931ce84e86ecdd3772e03645721c Mon Sep 17 00:00:00 2001
From: Ruonan Wang <105281011+rnwang04@users.noreply.github.com>
Date: Thu, 15 Jun 2023 16:28:50 +0800
Subject: [PATCH] LLM: Update convert of gptneox to sync with new libgptneox.so
 (#8345)

---
 python/llm/src/bigdl/llm/utils/convert_util.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/python/llm/src/bigdl/llm/utils/convert_util.py b/python/llm/src/bigdl/llm/utils/convert_util.py
index ac4ecebc..78833385 100644
--- a/python/llm/src/bigdl/llm/utils/convert_util.py
+++ b/python/llm/src/bigdl/llm/utils/convert_util.py
@@ -1276,8 +1276,13 @@ def _convert_gptneox_hf_to_ggml(model_path, outfile_dir, outtype):
     fout.write(struct.pack("i", ftype))
 
     dot_token = tokenizer.encode(".")[0]
+    vocab = tokenizer.vocab
+    id2token = {v: k for k, v in vocab.items()}
     for i in range(hparams["vocab_size"]):
-        text = tokenizer.decode([i]).encode('utf-8')
+        if i in id2token:
+            text = id2token[i].encode('utf-8')
+        else:
+            text = tokenizer.decode([i]).encode('utf-8')
         fout.write(struct.pack("i", len(text)))
         fout.write(text)