diff --git a/docker/llm/serving/xpu/docker/vllm_online_benchmark.py b/docker/llm/serving/xpu/docker/vllm_online_benchmark.py
index 59bfb588..429a9a18 100644
--- a/docker/llm/serving/xpu/docker/vllm_online_benchmark.py
+++ b/docker/llm/serving/xpu/docker/vllm_online_benchmark.py
@@ -176,7 +176,7 @@ def benchmark(llm_urls, model, prompt, num_requests, max_concurrent_requests, ma
     cur_len = len(cur_llm_urls)
     if dataset is None:
         payload = {
-            "model": model_name,
+            "model": model,
             "prompt": prompt,
             "n": 1,
             "best_of": 1,
@@ -193,7 +193,7 @@ def benchmark(llm_urls, model, prompt, num_requests, max_concurrent_requests, ma
         for index in range(num_requests):
             prompt, prompt_len, output_len = sampled_requests[index]
             payload = {
-                "model": model_name,
+                "model": model,
                 "prompt": prompt,
                 "n": 1,
                 "best_of": 1,
@@ -279,7 +279,7 @@ max_batch=int(max_seq)
 
 for MAX_CONCURRENT_REQUESTS in [max_batch]:
     NUM_WARMUP = 2 * MAX_CONCURRENT_REQUESTS
-    NUM_REQUESTS = 5 * MAX_CONCURRENT_REQUESTS  # total number of requests
+    NUM_REQUESTS = 4 * MAX_CONCURRENT_REQUESTS  # total number of requests
 
     # to avoid warm_up time out
     benchmark(LLM_URLS, MODEL, PROMPT_1024, 2, 1, 32, is_warmup = True)
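
The patch replaces the undefined `model_name` with the `model` parameter that `benchmark()` actually receives, so the payload sent to the server names the served model correctly, and it lowers the request count from 5x to 4x the concurrency to keep warm-up from timing out. Below is a minimal sketch (not part of the patch) of what one such request looks like when posted to a vLLM OpenAI-compatible completions endpoint; the URL and model path are placeholder assumptions, and the response shape assumes the standard `/v1/completions` format.

```python
# Hypothetical single-request example mirroring the corrected payload.
import requests

LLM_URL = "http://localhost:8000/v1/completions"  # assumed vLLM endpoint
MODEL = "/llm/models/Llama-2-7b-chat-hf"          # hypothetical served model path

payload = {
    "model": MODEL,            # the `model` argument passed to benchmark(), not an undefined model_name
    "prompt": "Hello, world",
    "n": 1,
    "best_of": 1,
    "max_tokens": 32,
    "stream": False,
}

resp = requests.post(LLM_URL, json=payload, timeout=60)
resp.raise_for_status()
# Assuming an OpenAI-compatible completions response, the generated text is here:
print(resp.json()["choices"][0]["text"])
```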