From 2e75bbccf97eab84db2f151604b882b5cf3c74ee Mon Sep 17 00:00:00 2001
From: Guancheng Fu <110874468+gc-fu@users.noreply.github.com>
Date: Wed, 12 Jun 2024 17:43:06 +0800
Subject: [PATCH] Add more control arguments for benchmark_vllm_throughput
 (#11291)

---
 .../xpu/docker/benchmark_vllm_throughput.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/docker/llm/serving/xpu/docker/benchmark_vllm_throughput.py b/docker/llm/serving/xpu/docker/benchmark_vllm_throughput.py
index 6defa576..a1102f8a 100644
--- a/docker/llm/serving/xpu/docker/benchmark_vllm_throughput.py
+++ b/docker/llm/serving/xpu/docker/benchmark_vllm_throughput.py
@@ -77,6 +77,7 @@ def run_vllm(
     gpu_memory_utilization: float = 0.9,
     load_in_low_bit: str = "sym_int4",
     max_num_batched_tokens: int = 5000,
+    max_num_seqs: int = 256,
 ) -> float:
     from vllm import SamplingParams
     from ipex_llm.vllm.xpu.engine import IPEXLLMClass as LLM
@@ -94,7 +95,8 @@ def run_vllm(
         device=device,
         enable_prefix_caching=enable_prefix_caching,
         load_in_low_bit=load_in_low_bit,
-        max_num_batched_tokens=max_num_batched_tokens,)
+        max_num_batched_tokens=max_num_batched_tokens,
+        max_num_seqs=max_num_seqs,)
 
 
     # Add the requests to the engine.
@@ -238,7 +240,8 @@ def main(args: argparse.Namespace):
         args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
         args.trust_remote_code, args.dtype, args.max_model_len,
         args.enforce_eager, args.kv_cache_dtype, args.device,
-        args.enable_prefix_caching, args.gpu_memory_utilization, args.load_in_low_bit, args.max_num_batched_tokens)
+        args.enable_prefix_caching, args.gpu_memory_utilization, args.load_in_low_bit,
+        args.max_num_batched_tokens, args.max_num_seqs)
     elif args.backend == "hf":
         assert args.tensor_parallel_size == 1
         elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
@@ -348,9 +351,15 @@ if __name__ == "__main__":
 
     parser.add_argument('--max-num-batched-tokens',
                         type=int,
-                        default=5000,
+                        default=4096,
                         help='maximum number of batched tokens per iteration')
+    parser.add_argument('--max-num-seqs',
+                        type=int,
+                        default=256,
+                        help='Maximum number of sequences per iteration.')
+
+
 
     args = parser.parse_args()
     if args.tokenizer is None:
         args.tokenizer = args.model
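
For context on the two knobs this patch threads through: in vLLM's scheduler, max_num_batched_tokens caps the token budget packed into one engine iteration, while max_num_seqs caps how many sequences are scheduled concurrently in that iteration, so lowering either trades throughput for lower peak memory. Below is a minimal standalone sketch (not part of the patch) of the construction path the hunks above modify; the model id and prompt are placeholders, and it assumes IPEXLLMClass forwards these keyword arguments exactly as shown in run_vllm():

    from vllm import SamplingParams
    from ipex_llm.vllm.xpu.engine import IPEXLLMClass as LLM

    # Placeholder model id; any local or hub path the wrapper accepts works.
    llm = LLM(model="meta-llama/Llama-2-7b-chat-hf",
              device="xpu",
              load_in_low_bit="sym_int4",   # low-bit weight format, as in the patch
              max_num_batched_tokens=4096,  # token budget per scheduler step
              max_num_seqs=256)             # concurrent-sequence cap per step

    # One request; max_tokens bounds the generated length.
    sampling_params = SamplingParams(n=1, temperature=1.0, top_p=1.0, max_tokens=128)
    outputs = llm.generate(["The capital of France is"], sampling_params)
    print(outputs[0].outputs[0].text)

On the command line, the same limits are exposed through the argparse hunk as --max-num-batched-tokens (default now 4096) and the new --max-num-seqs (default 256).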