remove load_in_8bit usage as it has not been supported for a long time (#12779)
parent 9e9b6c9f2b
commit d0d9c9d636

7 changed files with 9 additions and 15 deletions
@@ -56,7 +56,7 @@ to the following codes to load the low bit models.
 class ModifiedAutoModelForCausalLM(AutoModelForCausalLM):
     @classmethod
     def load_low_bit(cls, *args, **kwargs):
-        for k in ['load_in_low_bit', 'device_map', 'max_memory', 'load_in_8bit', 'load_in_4bit']:
+        for k in ['load_in_low_bit', 'device_map', 'max_memory', 'load_in_4bit']:
             kwargs.pop(k)
         return super().load_low_bit(*args, **kwargs)
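The hunk above edits a documentation snippet that wraps the (ipex-llm-patched) AutoModelForCausalLM so that quantization-related kwargs are stripped before delegating to load_low_bit; load_in_8bit simply disappears from the strip list. As a reading aid, here is the full class as it stands after the change, plus an illustrative call. This is a minimal sketch: it assumes transformers has already been patched by ipex-llm so that load_low_bit exists, and the checkpoint path is a placeholder.

from transformers import AutoModelForCausalLM

class ModifiedAutoModelForCausalLM(AutoModelForCausalLM):
    @classmethod
    def load_low_bit(cls, *args, **kwargs):
        # Strip kwargs the low-bit loader handles itself; after this
        # commit, load_in_8bit is no longer among them.
        for k in ['load_in_low_bit', 'device_map', 'max_memory', 'load_in_4bit']:
            kwargs.pop(k, None)  # default avoids KeyError when a key is absent
        return super().load_low_bit(*args, **kwargs)

# Placeholder path, for illustration only:
# model = ModifiedAutoModelForCausalLM.load_low_bit('path/to/low-bit-checkpoint')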
@@ -3,7 +3,6 @@ base_model: meta-llama/Meta-Llama-3-8B
 model_type: AutoModelForCausalLM
 tokenizer_type: AutoTokenizer
 
-load_in_8bit: false
 load_in_4bit: true
 strict: false
 
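This and the next two hunks make the same edit to three axolotl-style training configs: the redundant load_in_8bit: false line goes away and load_in_4bit: true stays. For readers wiring this up in plain transformers instead, the current way to request 4-bit weights is a quantization config object rather than bare boolean flags. A minimal sketch, assuming bitsandbytes is installed and reusing the base_model from the hunk header:

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Matches load_in_4bit: true in the config above.
bnb_config = BitsAndBytesConfig(load_in_4bit=True,
                                bnb_4bit_compute_dtype=torch.float16)
model = AutoModelForCausalLM.from_pretrained('meta-llama/Meta-Llama-3-8B',
                                             quantization_config=bnb_config)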
@@ -3,7 +3,6 @@ base_model: NousResearch/Llama-2-7b-hf
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
 
-load_in_8bit: false
 load_in_4bit: true
 strict: false
 
@@ -4,7 +4,6 @@ model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
 is_llama_derived_model: true
 
-load_in_8bit: false
 load_in_4bit: true
 strict: false
 
@@ -312,7 +312,6 @@ def get_model_answers(
             torch_dtype=torch.float16,
             # torch_dtype=torch.float32,
             low_cpu_mem_usage=True,
-            # load_in_8bit=True,
             total_token=args.total_token,
             depth=args.depth,
             top_k=args.top_k,
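Here only the commented-out load_in_8bit=True line is dropped from a from_pretrained-style call in get_model_answers; the surviving kwargs are untouched. Of those, torch_dtype and low_cpu_mem_usage map directly onto the standard Hugging Face loader, while total_token, depth, and top_k are parameters of the benchmark's own loader (they are not standard transformers kwargs). A runnable sketch of the standard part only, with an illustrative model id:

import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    'meta-llama/Meta-Llama-3-8B',  # illustrative model id
    torch_dtype=torch.float16,     # half precision, as in the diff
    low_cpu_mem_usage=True,        # lower peak host memory while loading
)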
@@ -233,7 +233,6 @@ class _BaseAutoModelClass:
             optimize_model = False
             kwargs["modules_to_not_convert"] = ["lm_head"]
 
-        load_in_8bit = kwargs.pop("load_in_8bit", False)
         from ipex_llm.llm_patching import bigdl_patched
         if bigdl_patched == 'Train':
             global patched_training_mode
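This hunk stops popping load_in_8bit in _BaseAutoModelClass, so the keyword is no longer silently swallowed. If 8-bit weights are still wanted, ipex-llm's documented route is the load_in_low_bit argument. A minimal sketch, assuming the ipex-llm transformers-style API and an illustrative model id:

from ipex_llm.transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    'meta-llama/Meta-Llama-3-8B',  # illustrative model id
    load_in_low_bit='sym_int8',    # symmetric int8, the supported 8-bit path
)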
@@ -117,7 +117,6 @@ class _BaseAutoModelClass:
         # ignore following arguments
         ignore_argument(kwargs, "model_hub")
         ignore_argument(kwargs, "load_in_4bit")
-        ignore_argument(kwargs, "load_in_8bit")
         ignore_argument(kwargs, "imatrix")
         ignore_argument(kwargs, "cpu_embedding")
         ignore_argument(kwargs, "embedding_qtype")
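The last hunk deletes load_in_8bit from the block of arguments this backend explicitly ignores. For readers unfamiliar with the helper, here is a plausible sketch of what an ignore_argument along these lines does; the body is an assumption for illustration, not the actual ipex-llm implementation:

import warnings

def ignore_argument(kwargs: dict, key: str):
    # Remove an unsupported kwarg, warning only if the caller actually set it.
    value = kwargs.pop(key, None)
    if value is not None:
        warnings.warn(f'argument `{key}` is not supported and will be ignored')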