enable inference mode for deepspeed tp serving (#11742)
This commit is contained in:
parent 9e65cf00b3
commit 107f7aafd0
1 changed file with 3 additions and 1 deletion
@@ -116,11 +116,13 @@ def load_model(model_path, low_bit):
     # Use IPEX-LLM `optimize_model` to convert the model into optimized low bit format
     # Convert the rest of the model into float16 to reduce allreduce traffic
     model = optimize_model(model.module.to(f"cpu"), low_bit=low_bit).to(torch.float16)
 
     # Next, use XPU as accelerator to speed up inference
     current_accel = XPU_Accelerator()
     set_accelerator(current_accel)
-
+
+    model=model.eval()
+
     # Move model back to xpu
     model = model.to(f"xpu:{local_rank}")
     model = BenchmarkWrapper(model)
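
For context, the hunk above sits inside the example's load_model: the model is first sharded on CPU by DeepSpeed AutoTP, converted to an optimized low-bit format with IPEX-LLM optimize_model (with the remaining weights cast to float16 to cut allreduce traffic), switched to eval/inference mode (the line this commit adds), and finally moved to the local XPU device. The sketch below is a minimal reconstruction of that flow under stated assumptions, not the file's actual contents: the imports, the WORLD_SIZE/LOCAL_RANK handling, the from_pretrained arguments, and the omission of the BenchmarkWrapper helper are assumptions based on the usual IPEX-LLM Deepspeed-AutoTP example.

import os
import torch
import deepspeed
from transformers import AutoModelForCausalLM
from ipex_llm import optimize_model
from deepspeed.accelerator import set_accelerator
from intel_extension_for_deepspeed import XPU_Accelerator

def load_model(model_path, low_bit):
    # Assumption: rank/world size come from the distributed launcher's environment.
    world_size = int(os.environ.get("WORLD_SIZE", "1"))
    local_rank = int(os.environ.get("LOCAL_RANK", "0"))

    # Load on CPU first, then let DeepSpeed AutoTP shard the model across ranks.
    model = AutoModelForCausalLM.from_pretrained(model_path,
                                                 torch_dtype=torch.bfloat16,
                                                 trust_remote_code=True)
    model = deepspeed.init_inference(model,
                                     tensor_parallel={"tp_size": world_size},
                                     dtype=torch.bfloat16,
                                     replace_with_kernel_inject=False)

    # Use IPEX-LLM `optimize_model` to convert the model into optimized low bit format;
    # convert the rest of the model into float16 to reduce allreduce traffic.
    model = optimize_model(model.module.to("cpu"), low_bit=low_bit).to(torch.float16)

    # Use XPU as the DeepSpeed accelerator to speed up inference.
    set_accelerator(XPU_Accelerator())

    # The change in this commit: switch to eval (inference) mode before serving.
    model = model.eval()

    # Move the sharded module to the local XPU device.
    # (The real example additionally wraps the model in a BenchmarkWrapper helper.)
    model = model.to(f"xpu:{local_rank}")
    return model

Switching to eval mode disables training-only behaviors such as dropout, so the served tensor-parallel model produces stable outputs across requests.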