[LLM] vLLM: Delete last_kv_cache before prefilling (#9619)

Remove last_kv_cache before prefilling to reduce peak memory usage.
This commit is contained in:
Xiangyu Tian 2023-12-07 11:32:33 +08:00 committed by GitHub
parent 48b85593b3
commit deee65785c

View file

@@ -186,6 +186,8 @@ class BigDLLlamaForCausalLM(BigDLModelForCausalLM):
"use_cache": True,
# "return_dict": True,
}
if self.last_kv_cache:
del self.last_kv_cache
# pdb.set_trace()
if self.device.type == 'xpu':