Empty cache only for the first token, not for the remaining tokens, to speed up generation (#11665)

2024-07-26 16:46:21 +08:00 · 2024-07-26 16:46:21 +08:00 · ba01b85c13
commit ba01b85c13
parent fc7f8feb83
1 changed file with 6 additions and 2 deletions
--- a/python/llm/src/ipex_llm/transformers/pipeline_parallel.py
+++ b/python/llm/src/ipex_llm/transformers/pipeline_parallel.py
@@ -959,9 +959,13 @@ def llama_causallm_forward_4_37_lowmem(
        logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)]  # noqa
        logits = torch.cat(logits, dim=-1)
    else:
-        torch.xpu.empty_cache()
+        # Only empty cache for first token
+        if hidden_states.shape[1] > 1:
+            torch.xpu.empty_cache()
        logits = self.lm_head(hidden_states)
-        torch.xpu.empty_cache()
+        # Only empty cache for first token
+        if hidden_states.shape[1] > 1:
+            torch.xpu.empty_cache()
    # logits = logits.float()

    # ipex-llm change ends