remove load_in_8bit usage, as it has not been supported for a long time (#12779)

Yishuo Wang committed on 2025-02-07 11:21:29 +08:00 (via GitHub)
parent 9e9b6c9f2b
commit d0d9c9d636
7 changed files with 9 additions and 15 deletions
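
Since load_in_8bit is no longer honored anywhere in the codebase, 8-bit loading goes through the low-bit path instead. Below is a minimal sketch of the replacement call, assuming the ipex-llm from_pretrained API with load_in_low_bit; the model id is only a placeholder, not part of this commit:

    from ipex_llm.transformers import AutoModelForCausalLM

    # Sketch only: load a model in 8-bit via the low-bit path; "sym_int8"
    # stands in for the removed load_in_8bit=True flag.
    model = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Meta-Llama-3-8B",  # placeholder model id
        load_in_low_bit="sym_int8",
        trust_remote_code=True,
    )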


@@ -56,7 +56,7 @@ to the following codes to load the low bit models.
 class ModifiedAutoModelForCausalLM(AutoModelForCausalLM):
     @classmethod
     def load_low_bit(cls,*args,**kwargs):
-        for k in ['load_in_low_bit', 'device_map', 'max_memory', 'load_in_8bit','load_in_4bit']:
+        for k in ['load_in_low_bit', 'device_map', 'max_memory','load_in_4bit']:
             kwargs.pop(k)
         return super().load_low_bit(*args, **kwargs)
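
For context, the patched subclass in this hunk pops the loading-related kwargs before delegating to the low-bit loader, and since each key is popped without a default, a caller has to pass every listed key. A hedged usage sketch follows; the checkpoint path and argument values are placeholders, not from this commit:

    # Hypothetical call to the subclass shown above; the override pops each
    # listed key unconditionally, so all of them must be present in kwargs.
    model = ModifiedAutoModelForCausalLM.load_low_bit(
        "./llama-3-8b-low-bit",  # placeholder path to a saved low-bit model
        load_in_low_bit="sym_int4",
        device_map="auto",
        max_memory=None,
        load_in_4bit=False,
    )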


@@ -3,7 +3,6 @@ base_model: meta-llama/Meta-Llama-3-8B
 model_type: AutoModelForCausalLM
 tokenizer_type: AutoTokenizer
-load_in_8bit: false
 load_in_4bit: true
 strict: false


@@ -3,7 +3,6 @@ base_model: NousResearch/Llama-2-7b-hf
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
-load_in_8bit: false
 load_in_4bit: true
 strict: false


@@ -4,7 +4,6 @@ model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
 is_llama_derived_model: true
-load_in_8bit: false
 load_in_4bit: true
 strict: false


@@ -312,7 +312,6 @@ def get_model_answers(
         torch_dtype=torch.float16,
         # torch_dtype=torch.float32,
         low_cpu_mem_usage=True,
-        # load_in_8bit=True,
         total_token=args.total_token,
         depth=args.depth,
         top_k=args.top_k,


@@ -233,7 +233,6 @@ class _BaseAutoModelClass:
             optimize_model = False
             kwargs["modules_to_not_convert"] = ["lm_head"]
-        load_in_8bit = kwargs.pop("load_in_8bit", False)
         from ipex_llm.llm_patching import bigdl_patched
         if bigdl_patched == 'Train':
             global patched_training_mode


@@ -117,7 +117,6 @@ class _BaseAutoModelClass:
         # ignore following arguments
         ignore_argument(kwargs, "model_hub")
         ignore_argument(kwargs, "load_in_4bit")
-        ignore_argument(kwargs, "load_in_8bit")
         ignore_argument(kwargs, "imatrix")
         ignore_argument(kwargs, "cpu_embedding")
         ignore_argument(kwargs, "embedding_qtype")