enable inference mode for deepspeed tp serving (#11742)
parent 9e65cf00b3
commit 107f7aafd0

1 changed file with 3 additions and 1 deletion
@@ -121,6 +121,8 @@ def load_model(model_path, low_bit):
     current_accel = XPU_Accelerator()
     set_accelerator(current_accel)
 
+    model=model.eval()
+
     # Move model back to xpu
     model = model.to(f"xpu:{local_rank}")
     model = BenchmarkWrapper(model)
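For context, the added model.eval() call takes the module out of training mode, so layers such as dropout and batch norm behave deterministically before the model is moved back to the XPU for serving. The sketch below shows the usual eval-plus-inference-mode pattern on a serving path. It is a minimal illustration assuming plain torch and transformers; the function names, model path handling, and generate() arguments are hypothetical and are not taken from this repository's actual serving code.

    # Minimal sketch (assumptions: torch and transformers installed; model
    # path and generation parameters are illustrative, not from the commit).
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    def load_model(model_path: str) -> torch.nn.Module:
        model = AutoModelForCausalLM.from_pretrained(model_path)
        # Mirrors the commit's change: put the module in eval mode so
        # training-only behavior (e.g. dropout) is disabled during serving.
        model = model.eval()
        return model

    def generate_text(model, tokenizer, prompt: str) -> str:
        inputs = tokenizer(prompt, return_tensors="pt")
        # inference_mode() disables autograd tracking entirely, the usual
        # companion to eval() on an inference/serving path.
        with torch.inference_mode():
            output_ids = model.generate(**inputs, max_new_tokens=32)
        return tokenizer.decode(output_ids[0], skip_special_tokens=True)

Note that eval() only changes module behavior; it does not stop gradient bookkeeping. Wrapping the generation call in torch.inference_mode() (or torch.no_grad()) is what avoids autograd overhead, which appears to be the "inference mode" the commit title refers to.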