From d0d9c9d636090ec76ebdef60342c3c9d34add8d7 Mon Sep 17 00:00:00 2001
From: Yishuo Wang
Date: Fri, 7 Feb 2025 11:21:29 +0800
Subject: [PATCH] remove load_in_8bit usage as it is not supported a long time
 ago (#12779)

---
 python/llm/dev/benchmark/harness/README.md            | 12 ++++++------
 .../GPU/LLM-Finetuning/axolotl/llama3-qlora.yml        |  1 -
 .../llm/example/GPU/LLM-Finetuning/axolotl/lora.yml    |  1 -
 .../llm/example/GPU/LLM-Finetuning/axolotl/qlora.yml   |  1 -
 .../gen_ea_answer_llama2chat_e2_ipex_optimize.py       |  7 +++----
 python/llm/src/ipex_llm/transformers/model.py          |  1 -
 python/llm/src/ipex_llm/transformers/npu_model.py      |  1 -
 7 files changed, 9 insertions(+), 15 deletions(-)

diff --git a/python/llm/dev/benchmark/harness/README.md b/python/llm/dev/benchmark/harness/README.md
index 69a78209..ccb0960c 100644
--- a/python/llm/dev/benchmark/harness/README.md
+++ b/python/llm/dev/benchmark/harness/README.md
@@ -1,5 +1,5 @@
 # Harness Evaluation
-[Harness evaluation](https://github.com/EleutherAI/lm-evaluation-harness) allows users to eaisly get accuracy on various datasets. Here we have enabled harness evaluation with IPEX-LLM under 
+[Harness evaluation](https://github.com/EleutherAI/lm-evaluation-harness) allows users to eaisly get accuracy on various datasets. Here we have enabled harness evaluation with IPEX-LLM under
 [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) settings.
 Before running, make sure to have [ipex-llm](../../../README.md) installed.
 
@@ -53,21 +53,21 @@ AutoModelForCausalLM.from_pretrained = partial(AutoModelForCausalLM.from_pretrai
 ```
 to the following codes to load the low bit models.
 ```python
-class ModifiedAutoModelForCausalLM(AutoModelForCausalLM): 
+class ModifiedAutoModelForCausalLM(AutoModelForCausalLM):
     @classmethod
     def load_low_bit(cls,*args,**kwargs):
-        for k in ['load_in_low_bit', 'device_map', 'max_memory', 'load_in_8bit','load_in_4bit']:
+        for k in ['load_in_low_bit', 'device_map', 'max_memory','load_in_4bit']:
             kwargs.pop(k)
         return super().load_low_bit(*args, **kwargs)
 
 AutoModelForCausalLM.from_pretrained=partial(ModifiedAutoModelForCausalLM.load_low_bit, *self.bigdl_llm_kwargs)
 ```
 ### 2.Please pass the argument `trust_remote_code=True` to allow custom code to be run.
-`lm-evaluation-harness` doesn't pass `trust_remote_code=true` argument to datasets. This may cause errors similar to the following one: 
+`lm-evaluation-harness` doesn't pass `trust_remote_code=true` argument to datasets. This may cause errors similar to the following one:
 ```
-RuntimeError: Job config of task=winogrande, precision=sym_int4 failed. 
+RuntimeError: Job config of task=winogrande, precision=sym_int4 failed.
 Error Message: The repository for winogrande contains custom code which must be executed to correctly load the dataset.
 You can inspect the repository content at https://hf.co/datasets/winogrande.
-please pass the argument trust_remote_code=True to allow custom code to be run. 
+please pass the argument trust_remote_code=True to allow custom code to be run.
 ```
 Please refer to these:
diff --git a/python/llm/example/GPU/LLM-Finetuning/axolotl/llama3-qlora.yml b/python/llm/example/GPU/LLM-Finetuning/axolotl/llama3-qlora.yml
index 401f4c10..03ccc159 100644
--- a/python/llm/example/GPU/LLM-Finetuning/axolotl/llama3-qlora.yml
+++ b/python/llm/example/GPU/LLM-Finetuning/axolotl/llama3-qlora.yml
@@ -3,7 +3,6 @@ base_model: meta-llama/Meta-Llama-3-8B
 model_type: AutoModelForCausalLM
 tokenizer_type: AutoTokenizer
 
-load_in_8bit: false
 load_in_4bit: true
 strict: false
 
diff --git a/python/llm/example/GPU/LLM-Finetuning/axolotl/lora.yml b/python/llm/example/GPU/LLM-Finetuning/axolotl/lora.yml
index b77612c7..094f84b4 100644
--- a/python/llm/example/GPU/LLM-Finetuning/axolotl/lora.yml
+++ b/python/llm/example/GPU/LLM-Finetuning/axolotl/lora.yml
@@ -3,7 +3,6 @@ base_model: NousResearch/Llama-2-7b-hf
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
 
-load_in_8bit: false
 load_in_4bit: true
 strict: false
 
diff --git a/python/llm/example/GPU/LLM-Finetuning/axolotl/qlora.yml b/python/llm/example/GPU/LLM-Finetuning/axolotl/qlora.yml
index b18efd4e..606f2650 100644
--- a/python/llm/example/GPU/LLM-Finetuning/axolotl/qlora.yml
+++ b/python/llm/example/GPU/LLM-Finetuning/axolotl/qlora.yml
@@ -4,7 +4,6 @@ model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
 is_llama_derived_model: true
 
-load_in_8bit: false
 load_in_4bit: true
 strict: false
 
diff --git a/python/llm/example/GPU/Speculative-Decoding/EAGLE/evaluation/gen_ea_answer_llama2chat_e2_ipex_optimize.py b/python/llm/example/GPU/Speculative-Decoding/EAGLE/evaluation/gen_ea_answer_llama2chat_e2_ipex_optimize.py
index 091bef31..95f90e2a 100644
--- a/python/llm/example/GPU/Speculative-Decoding/EAGLE/evaluation/gen_ea_answer_llama2chat_e2_ipex_optimize.py
+++ b/python/llm/example/GPU/Speculative-Decoding/EAGLE/evaluation/gen_ea_answer_llama2chat_e2_ipex_optimize.py
@@ -312,7 +312,6 @@ def get_model_answers(
         torch_dtype=torch.float16,
         # torch_dtype=torch.float32,
         low_cpu_mem_usage=True,
-        # load_in_8bit=True,
         total_token=args.total_token,
         depth=args.depth,
         top_k=args.top_k,
@@ -384,7 +383,7 @@ def get_model_answers(
                 ]
                 if len(stop_token_ids_index) > 0:
                     output_ids = output_ids[: stop_token_ids_index[0]]
-                
+
                 output = tokenizer.decode(
                     output_ids,
                     spaces_between_special_tokens=False,
@@ -572,8 +571,8 @@ if __name__ == "__main__":
     )
 
     parser.add_argument(
-        "--enable-ipex-llm", 
-        action='store_true', 
+        "--enable-ipex-llm",
+        action='store_true',
         help="Enable ipex-llm optimization"
     )
     args = parser.parse_args()
diff --git a/python/llm/src/ipex_llm/transformers/model.py b/python/llm/src/ipex_llm/transformers/model.py
index 788a2edb..971e4349 100644
--- a/python/llm/src/ipex_llm/transformers/model.py
+++ b/python/llm/src/ipex_llm/transformers/model.py
@@ -233,7 +233,6 @@ class _BaseAutoModelClass:
             optimize_model = False
             kwargs["modules_to_not_convert"] = ["lm_head"]
 
-        load_in_8bit = kwargs.pop("load_in_8bit", False)
         from ipex_llm.llm_patching import bigdl_patched
         if bigdl_patched == 'Train':
             global patched_training_mode
diff --git a/python/llm/src/ipex_llm/transformers/npu_model.py b/python/llm/src/ipex_llm/transformers/npu_model.py
index ae6d7a73..725fff86 100644
--- a/python/llm/src/ipex_llm/transformers/npu_model.py
+++ b/python/llm/src/ipex_llm/transformers/npu_model.py
@@ -117,7 +117,6 @@ class _BaseAutoModelClass:
         # ignore following arguments
         ignore_argument(kwargs, "model_hub")
         ignore_argument(kwargs, "load_in_4bit")
-        ignore_argument(kwargs, "load_in_8bit")
         ignore_argument(kwargs, "imatrix")
         ignore_argument(kwargs, "cpu_embedding")
         ignore_argument(kwargs, "embedding_qtype")