From 762ad49362ab8500bd5d152d917c6fc7747b0c89 Mon Sep 17 00:00:00 2001
From: Qiyuan Gong
Date: Thu, 1 Aug 2024 18:16:21 +0800
Subject: [PATCH] Add RANK_WAIT_TIME into DeepSpeed-AutoTP to avoid CPU memory
 OOM (#11704)

* DeepSpeed-AutoTP starts multiple processes to load models and convert them
  in CPU memory. If model size/rank_num is large, this leads to OOM. Add
  RANK_WAIT_TIME to reduce peak memory usage by controlling model-loading
  parallelism.
---
 python/llm/example/GPU/Deepspeed-AutoTP/README.md           | 1 +
 python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py | 5 +++++
 2 files changed, 6 insertions(+)

diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/README.md b/python/llm/example/GPU/Deepspeed-AutoTP/README.md
index 8c5f5365..515a74fe 100644
--- a/python/llm/example/GPU/Deepspeed-AutoTP/README.md
+++ b/python/llm/example/GPU/Deepspeed-AutoTP/README.md
@@ -87,3 +87,4 @@ bash run_mistral_7b_instruct_flex_2_card.sh
 
 ### Known Issue
 - In our example scripts, tcmalloc is enabled through `export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so:${LD_PRELOAD}` which speed up inference, but this may raise `munmap_chunk(): invalid pointer` error after finishing inference.
+- CPU memory OOM during model conversion. In this example, multiple processes load the model into CPU memory at the same time. If model size/rank_num is very large, it will lead to OOM. Please `export RANK_WAIT_TIME=xxx`, where `xxx` is a sleep time in seconds. You can increase `RANK_WAIT_TIME` to avoid using too much memory.
\ No newline at end of file
diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py b/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py
index f16a670f..e2ebbaf4 100644
--- a/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py
+++ b/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py
@@ -66,6 +66,11 @@ if __name__ == '__main__':
     # Convert to deepspeed model and apply IPEX-LLM optimization on CPU to decrease GPU memory usage
     current_accel = CPU_Accelerator()
     set_accelerator(current_accel)
+    # Stagger model loading by rank to avoid CPU memory OOM; increase
+    # RANK_WAIT_TIME if loading in parallel still uses too much memory.
+    rank_wait_time = int(os.environ.get("RANK_WAIT_TIME", 0))
+    if rank_wait_time != 0:
+        time.sleep(local_rank * rank_wait_time)
     model = AutoModelForCausalLM.from_pretrained(args.repo_id_or_model_path,
                                                  device_map={"": "cpu"},
                                                  low_cpu_mem_usage=True,
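
A note on the mechanism (not part of the patch): with this change, each rank sleeps `local_rank * RANK_WAIT_TIME` seconds before calling `from_pretrained`, so ranks read the model one after another instead of all at once. Below is a minimal standalone sketch of the same staggering pattern; the helper name `staggered_load` is hypothetical, and it assumes the launcher exposes the rank via a `LOCAL_RANK` environment variable:

    import os
    import time

    def staggered_load(load_model):
        # LOCAL_RANK is commonly set by launchers such as torchrun; default to 0.
        local_rank = int(os.environ.get("LOCAL_RANK", 0))
        # RANK_WAIT_TIME=0 (the default) disables staggering entirely.
        rank_wait_time = int(os.environ.get("RANK_WAIT_TIME", 0))
        if rank_wait_time != 0:
            # Rank 0 starts immediately, rank 1 after rank_wait_time seconds,
            # rank 2 after 2 * rank_wait_time seconds, and so on, so fewer
            # ranks hold a full copy of the model in CPU memory at once.
            time.sleep(local_rank * rank_wait_time)
        return load_model()

For example, with `RANK_WAIT_TIME=30` and 4 ranks, loads start at 0 s, 30 s, 60 s, and 90 s; a value close to the time one rank needs to read and convert the model keeps peak memory lowest without idling ranks longer than necessary.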