From db8e90796af7a1fec8289d8ddc5715f66c5797ed Mon Sep 17 00:00:00 2001
From: binbin Deng <108676127+plusbang@users.noreply.github.com>
Date: Fri, 19 Jan 2024 15:09:57 +0800
Subject: [PATCH] LLM: add avg token latency information and benchmark guide of autotp (#9940)

---
 python/llm/dev/benchmark/README.md            | 24 +++++++++++++++++++
 .../example/GPU/Deepspeed-AutoTP/README.md    | 13 ++++++++++
 .../GPU/Deepspeed-AutoTP/deepspeed_autotp.py  |  4 +++-
 3 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/python/llm/dev/benchmark/README.md b/python/llm/dev/benchmark/README.md
index 89c2fa6c..bd297133 100644
--- a/python/llm/dev/benchmark/README.md
+++ b/python/llm/dev/benchmark/README.md
@@ -29,6 +29,7 @@ Output will be like:
 ```
 
 ## GPU Usage
+### Inference on a single GPU
 Just put this file into your benchmark directory, and then wrap your transformer int4 model with `BenchmarkWrapper` (`model = BenchmarkWrapper(model)`). Take `chatglm-6b` as an example:
 ```python
@@ -57,6 +58,29 @@ with torch.inference_mode():
     output = model.generate(input_ids, do_sample=False, max_new_tokens=32)
     output_str = tokenizer.decode(output[0], skip_special_tokens=True)
 ```
+
+### Inference on multiple GPUs
+Similarly, put this file into your benchmark directory, and then wrap your optimized model with `BenchmarkWrapper` (`model = BenchmarkWrapper(model)`).
+For example, you only need to apply the following code patch to the [Deepspeed AutoTP example code](https://github.com/intel-analytics/BigDL/blob/main/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py) to measure the first and rest token performance:
+```python
+ import torch
+ import transformers
+ import deepspeed
++from benchmark_util import BenchmarkWrapper
+ 
+ def get_int_from_env(env_keys, default):
+     """Returns the first positive env value found in the `env_keys` list or the default."""
+@@ -98,6 +99,7 @@ if __name__ == '__main__':
+     init_distributed()
+ 
+     print(model)
++    model = BenchmarkWrapper(model, do_print=True)
+ 
+     # Load tokenizer
+     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+```
+
+### Sample Output
 Output will be like:
 ```bash
 =========First token cost xx.xxxxs=========
diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/README.md b/python/llm/example/GPU/Deepspeed-AutoTP/README.md
index 50f02c1b..94757822 100644
--- a/python/llm/example/GPU/Deepspeed-AutoTP/README.md
+++ b/python/llm/example/GPU/Deepspeed-AutoTP/README.md
@@ -47,6 +47,19 @@ bash run_vicuna_33b_arc_2_card.sh
 ```
 
 > **Note**: You could change `NUM_GPUS` to the number of GPUs you have on your machine.
 
+### 3. Sample Output
+
+```bash
+[0] Inference time of generating 32 tokens: xxx s, average token latency is xxx ms/token.
+[0] -------------------- Prompt --------------------
+[0] Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun
+[0] -------------------- Output --------------------
+[0] Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun. She was a curious girl, and she loved to learn new things.
+[0] 
+[0] One day, she decided to go on a journey to find the legendary
+```
+
+**Important**: The first token latency is much larger than the rest token latency. You could use [our benchmark tool](https://github.com/intel-analytics/BigDL/blob/main/python/llm/dev/benchmark/README.md) to obtain more details about the first and rest token latency.
 ### Known Issue
 - In our example scripts, tcmalloc is enabled through `export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so:${LD_PRELOAD}`, which speeds up inference but may raise a `munmap_chunk(): invalid pointer` error after inference finishes.
diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py b/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py
index 13ee9d65..f69c66e0 100644
--- a/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py
+++ b/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py
@@ -124,8 +124,10 @@ if __name__ == '__main__':
         end = time.time()
         if local_rank == 0:
             output = output.cpu()
+            actual_output_len = output.shape[1] - input_ids.shape[1]
             output_str = tokenizer.decode(output[0], skip_special_tokens=True)
-            print(f'Inference time: {end-st} s')
+            avg_time = (end - st) / actual_output_len * 1000
+            print(f'Inference time of generating {actual_output_len} tokens: {end-st} s, average token latency is {avg_time} ms/token.')
             print('-'*20, 'Prompt', '-'*20)
             print(prompt)
             print('-'*20, 'Output', '-'*20)
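For reference, the sketch below illustrates how a `BenchmarkWrapper`-style wrapper can separate first-token latency from the average latency of the remaining tokens by timestamping each forward pass issued by `generate()`. It is a simplified, hypothetical stand-in: the class name `SimpleBenchmarkWrapper` and its internals are assumptions for illustration, not the actual `benchmark_util.BenchmarkWrapper` used in the patch above.

```python
# Hypothetical sketch of a BenchmarkWrapper-style timer (NOT the real benchmark_util.BenchmarkWrapper).
# With KV cache enabled, generate() runs one forward pass per generated token, so the first
# timestamp marks the first token and the remaining timestamps cover the rest tokens.
import time


class SimpleBenchmarkWrapper:
    """Wraps a Hugging Face-style model and reports first vs. rest token latency."""

    def __init__(self, model, do_print=True):
        self.model = model
        self.do_print = do_print

    def __getattr__(self, name):
        # Delegate attributes we don't override (config, device, ...) to the wrapped model.
        return getattr(self.model, name)

    def generate(self, *args, **kwargs):
        timestamps = []

        def record(module, inputs, outputs):
            # Called after every forward pass of the wrapped model.
            timestamps.append(time.perf_counter())

        handle = self.model.register_forward_hook(record)
        start = time.perf_counter()
        try:
            output = self.model.generate(*args, **kwargs)
        finally:
            handle.remove()

        if self.do_print and timestamps:
            first = timestamps[0] - start
            print(f'=========First token cost {first:.4f}s=========')
            rest = timestamps[1:]
            if rest:
                avg_rest = (rest[-1] - timestamps[0]) / len(rest)
                print(f'=========Rest tokens cost average {avg_rest:.4f}s ({len(rest)} tokens in all)=========')
        return output
```

Wrapping the optimized model this way (`model = BenchmarkWrapper(model, do_print=True)` in the patch above) reports these per-token timings in addition to the average token latency line printed by `deepspeed_autotp.py`.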