From d0d9c9d636090ec76ebdef60342c3c9d34add8d7 Mon Sep 17 00:00:00 2001
From: Yishuo Wang
Date: Fri, 7 Feb 2025 11:21:29 +0800
Subject: [PATCH] remove load_in_8bit usage as it is not supported a long time
 ago (#12779)

---
 python/llm/dev/benchmark/harness/README.md            | 12 ++++++------
 .../GPU/LLM-Finetuning/axolotl/llama3-qlora.yml        |  1 -
 .../llm/example/GPU/LLM-Finetuning/axolotl/lora.yml    |  1 -
 .../llm/example/GPU/LLM-Finetuning/axolotl/qlora.yml   |  1 -
 .../gen_ea_answer_llama2chat_e2_ipex_optimize.py       |  7 +++----
 python/llm/src/ipex_llm/transformers/model.py          |  1 -
 python/llm/src/ipex_llm/transformers/npu_model.py      |  1 -
 7 files changed, 9 insertions(+), 15 deletions(-)

diff --git a/python/llm/dev/benchmark/harness/README.md b/python/llm/dev/benchmark/harness/README.md
index 69a78209..ccb0960c 100644
--- a/python/llm/dev/benchmark/harness/README.md
+++ b/python/llm/dev/benchmark/harness/README.md
@@ -1,5 +1,5 @@
 # Harness Evaluation
-[Harness evaluation](https://github.com/EleutherAI/lm-evaluation-harness) allows users to eaisly get accuracy on various datasets. Here we have enabled harness evaluation with IPEX-LLM under 
+[Harness evaluation](https://github.com/EleutherAI/lm-evaluation-harness) allows users to eaisly get accuracy on various datasets. Here we have enabled harness evaluation with IPEX-LLM under
 [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) settings.
 Before running, make sure to have [ipex-llm](../../../README.md) installed.
 
@@ -53,21 +53,21 @@ AutoModelForCausalLM.from_pretrained = partial(AutoModelForCausalLM.from_pretrai
 ```
 to the following codes to load the low bit models.
 ```python
-class ModifiedAutoModelForCausalLM(AutoModelForCausalLM): 
+class ModifiedAutoModelForCausalLM(AutoModelForCausalLM):
     @classmethod
     def load_low_bit(cls,*args,**kwargs):
-        for k in ['load_in_low_bit', 'device_map', 'max_memory', 'load_in_8bit','load_in_4bit']:
+        for k in ['load_in_low_bit', 'device_map', 'max_memory','load_in_4bit']:
             kwargs.pop(k)
         return super().load_low_bit(*args, **kwargs)
 
 AutoModelForCausalLM.from_pretrained=partial(ModifiedAutoModelForCausalLM.load_low_bit, *self.bigdl_llm_kwargs)
 ```
 ### 2.Please pass the argument `trust_remote_code=True` to allow custom code to be run.
-`lm-evaluation-harness` doesn't pass `trust_remote_code=true` argument to datasets. This may cause errors similar to the following one: 
+`lm-evaluation-harness` doesn't pass `trust_remote_code=true` argument to datasets. This may cause errors similar to the following one:
 ```
-RuntimeError: Job config of task=winogrande, precision=sym_int4 failed. 
+RuntimeError: Job config of task=winogrande, precision=sym_int4 failed.
 Error Message: The repository for winogrande contains custom code which must be executed to correctly load the dataset.
 You can inspect the repository content at https://hf.co/datasets/winogrande.
-please pass the argument trust_remote_code=True to allow custom code to be run. 
+please pass the argument trust_remote_code=True to allow custom code to be run.
 ```
 Please refer to these:
diff --git a/python/llm/example/GPU/LLM-Finetuning/axolotl/llama3-qlora.yml b/python/llm/example/GPU/LLM-Finetuning/axolotl/llama3-qlora.yml
index 401f4c10..03ccc159 100644
--- a/python/llm/example/GPU/LLM-Finetuning/axolotl/llama3-qlora.yml
+++ b/python/llm/example/GPU/LLM-Finetuning/axolotl/llama3-qlora.yml
@@ -3,7 +3,6 @@ base_model: meta-llama/Meta-Llama-3-8B
 model_type: AutoModelForCausalLM
 tokenizer_type: AutoTokenizer
 
-load_in_8bit: false
 load_in_4bit: true
 strict: false
 
diff --git a/python/llm/example/GPU/LLM-Finetuning/axolotl/lora.yml b/python/llm/example/GPU/LLM-Finetuning/axolotl/lora.yml
index b77612c7..094f84b4 100644
--- a/python/llm/example/GPU/LLM-Finetuning/axolotl/lora.yml
+++ b/python/llm/example/GPU/LLM-Finetuning/axolotl/lora.yml
@@ -3,7 +3,6 @@ base_model: NousResearch/Llama-2-7b-hf
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
 
-load_in_8bit: false
 load_in_4bit: true
 strict: false
 
diff --git a/python/llm/example/GPU/LLM-Finetuning/axolotl/qlora.yml b/python/llm/example/GPU/LLM-Finetuning/axolotl/qlora.yml
index b18efd4e..606f2650 100644
--- a/python/llm/example/GPU/LLM-Finetuning/axolotl/qlora.yml
+++ b/python/llm/example/GPU/LLM-Finetuning/axolotl/qlora.yml
@@ -4,7 +4,6 @@ model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
 is_llama_derived_model: true
 
-load_in_8bit: false
 load_in_4bit: true
 strict: false
 
diff --git a/python/llm/example/GPU/Speculative-Decoding/EAGLE/evaluation/gen_ea_answer_llama2chat_e2_ipex_optimize.py b/python/llm/example/GPU/Speculative-Decoding/EAGLE/evaluation/gen_ea_answer_llama2chat_e2_ipex_optimize.py
index 091bef31..95f90e2a 100644
--- a/python/llm/example/GPU/Speculative-Decoding/EAGLE/evaluation/gen_ea_answer_llama2chat_e2_ipex_optimize.py
+++ b/python/llm/example/GPU/Speculative-Decoding/EAGLE/evaluation/gen_ea_answer_llama2chat_e2_ipex_optimize.py
@@ -312,7 +312,6 @@ def get_model_answers(
         torch_dtype=torch.float16,
         # torch_dtype=torch.float32,
         low_cpu_mem_usage=True,
-        # load_in_8bit=True,
         total_token=args.total_token,
         depth=args.depth,
         top_k=args.top_k,
@@ -384,7 +383,7 @@ def get_model_answers(
                 ]
                 if len(stop_token_ids_index) > 0:
                     output_ids = output_ids[: stop_token_ids_index[0]]
-                
+
                 output = tokenizer.decode(
                     output_ids,
                     spaces_between_special_tokens=False,
@@ -572,8 +571,8 @@ if __name__ == "__main__":
     )
 
     parser.add_argument(
-        "--enable-ipex-llm", 
-        action='store_true', 
+        "--enable-ipex-llm",
+        action='store_true',
         help="Enable ipex-llm optimization"
     )
     args = parser.parse_args()
diff --git a/python/llm/src/ipex_llm/transformers/model.py b/python/llm/src/ipex_llm/transformers/model.py
index 788a2edb..971e4349 100644
--- a/python/llm/src/ipex_llm/transformers/model.py
+++ b/python/llm/src/ipex_llm/transformers/model.py
@@ -233,7 +233,6 @@ class _BaseAutoModelClass:
             optimize_model = False
             kwargs["modules_to_not_convert"] = ["lm_head"]
 
-        load_in_8bit = kwargs.pop("load_in_8bit", False)
         from ipex_llm.llm_patching import bigdl_patched
         if bigdl_patched == 'Train':
             global patched_training_mode
diff --git a/python/llm/src/ipex_llm/transformers/npu_model.py b/python/llm/src/ipex_llm/transformers/npu_model.py
index ae6d7a73..725fff86 100644
--- a/python/llm/src/ipex_llm/transformers/npu_model.py
+++ b/python/llm/src/ipex_llm/transformers/npu_model.py
@@ -117,7 +117,6 @@ class _BaseAutoModelClass:
         # ignore following arguments
         ignore_argument(kwargs, "model_hub")
         ignore_argument(kwargs, "load_in_4bit")
-        ignore_argument(kwargs, "load_in_8bit")
         ignore_argument(kwargs, "imatrix")
         ignore_argument(kwargs, "cpu_embedding")
         ignore_argument(kwargs, "embedding_qtype")