remove load_in_8bit usage as it is not supported a long time ago (#12779)

This commit is contained in:
Yishuo Wang 2025-02-07 11:21:29 +08:00 committed by GitHub
parent 9e9b6c9f2b
commit d0d9c9d636
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 9 additions and 15 deletions

View file

@ -1,5 +1,5 @@
# Harness Evaluation
[Harness evaluation](https://github.com/EleutherAI/lm-evaluation-harness) allows users to easily get accuracy on various datasets. Here we have enabled harness evaluation with IPEX-LLM under
[Harness evaluation](https://github.com/EleutherAI/lm-evaluation-harness) allows users to easily get accuracy on various datasets. Here we have enabled harness evaluation with IPEX-LLM under
[Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) settings.
Before running, make sure to have [ipex-llm](../../../README.md) installed.
@ -53,21 +53,21 @@ AutoModelForCausalLM.from_pretrained = partial(AutoModelForCausalLM.from_pretrai
```
to the following codes to load the low bit models.
```python
class ModifiedAutoModelForCausalLM(AutoModelForCausalLM):
class ModifiedAutoModelForCausalLM(AutoModelForCausalLM):
@classmethod
def load_low_bit(cls,*args,**kwargs):
for k in ['load_in_low_bit', 'device_map', 'max_memory', 'load_in_8bit','load_in_4bit']:
for k in ['load_in_low_bit', 'device_map', 'max_memory','load_in_4bit']:
kwargs.pop(k)
return super().load_low_bit(*args, **kwargs)
AutoModelForCausalLM.from_pretrained=partial(ModifiedAutoModelForCausalLM.load_low_bit, *self.bigdl_llm_kwargs)
```
### 2. Please pass the argument `trust_remote_code=True` to allow custom code to be run.
`lm-evaluation-harness` doesn't pass `trust_remote_code=true` argument to datasets. This may cause errors similar to the following one:
`lm-evaluation-harness` doesn't pass `trust_remote_code=true` argument to datasets. This may cause errors similar to the following one:
```
RuntimeError: Job config of task=winogrande, precision=sym_int4 failed.
RuntimeError: Job config of task=winogrande, precision=sym_int4 failed.
Error Message: The repository for winogrande contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/winogrande.
please pass the argument trust_remote_code=True to allow custom code to be run.
please pass the argument trust_remote_code=True to allow custom code to be run.
```
Please refer to these:

View file

@ -3,7 +3,6 @@ base_model: meta-llama/Meta-Llama-3-8B
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
load_in_8bit: false
load_in_4bit: true
strict: false

View file

@ -3,7 +3,6 @@ base_model: NousResearch/Llama-2-7b-hf
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
load_in_8bit: false
load_in_4bit: true
strict: false

View file

@ -4,7 +4,6 @@ model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
is_llama_derived_model: true
load_in_8bit: false
load_in_4bit: true
strict: false

View file

@ -312,7 +312,6 @@ def get_model_answers(
torch_dtype=torch.float16,
# torch_dtype=torch.float32,
low_cpu_mem_usage=True,
# load_in_8bit=True,
total_token=args.total_token,
depth=args.depth,
top_k=args.top_k,
@ -384,7 +383,7 @@ def get_model_answers(
]
if len(stop_token_ids_index) > 0:
output_ids = output_ids[: stop_token_ids_index[0]]
output = tokenizer.decode(
output_ids,
spaces_between_special_tokens=False,
@ -572,8 +571,8 @@ if __name__ == "__main__":
)
parser.add_argument(
"--enable-ipex-llm",
action='store_true',
"--enable-ipex-llm",
action='store_true',
help="Enable ipex-llm optimization"
)
args = parser.parse_args()

View file

@ -233,7 +233,6 @@ class _BaseAutoModelClass:
optimize_model = False
kwargs["modules_to_not_convert"] = ["lm_head"]
load_in_8bit = kwargs.pop("load_in_8bit", False)
from ipex_llm.llm_patching import bigdl_patched
if bigdl_patched == 'Train':
global patched_training_mode

View file

@ -117,7 +117,6 @@ class _BaseAutoModelClass:
# ignore following arguments
ignore_argument(kwargs, "model_hub")
ignore_argument(kwargs, "load_in_4bit")
ignore_argument(kwargs, "load_in_8bit")
ignore_argument(kwargs, "imatrix")
ignore_argument(kwargs, "cpu_embedding")
ignore_argument(kwargs, "embedding_qtype")