diff --git a/docker/llm/serving/xpu/docker/Dockerfile b/docker/llm/serving/xpu/docker/Dockerfile
index 45f38cd8..3f20fe55 100644
--- a/docker/llm/serving/xpu/docker/Dockerfile
+++ b/docker/llm/serving/xpu/docker/Dockerfile
@@ -90,6 +90,7 @@ COPY ./vllm_offline_inference.py /llm/
 COPY ./payload-1024.lua /llm/
 COPY ./start-vllm-service.sh /llm/
 COPY ./benchmark_vllm_throughput.py /llm/
+COPY ./benchmark_vllm_latency.py /llm/
 COPY ./start-fastchat-service.sh /llm/
 COPY ./start-pp_serving-service.sh /llm/
 COPY ./start-lightweight_serving-service.sh /llm/
diff --git a/docker/llm/serving/xpu/docker/benchmark_vllm_latency.py b/docker/llm/serving/xpu/docker/benchmark_vllm_latency.py
new file mode 100644
index 00000000..3a80fcbb
--- /dev/null
+++ b/docker/llm/serving/xpu/docker/benchmark_vllm_latency.py
@@ -0,0 +1,315 @@
+"""Benchmark the latency of processing a single batch of requests."""
+import argparse
+import json
+import time
+from pathlib import Path
+from typing import List, Optional
+
+import numpy as np
+import torch
+from tqdm import tqdm
+
+from vllm import SamplingParams
+from vllm.engine.arg_utils import EngineArgs
+# IPEX-LLM's IPEXLLMClass replaces vLLM's LLM entry point to enable low-bit loading on XPU.
+from ipex_llm.vllm.xpu.engine import IPEXLLMClass as LLM
+from vllm.inputs import PromptInputs
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+from vllm.utils import FlexibleArgumentParser
+
+
+def main(args: argparse.Namespace):
+    print(args)
+
+    # NOTE(woosuk): If the request cannot be processed in a single batch,
+    # the engine will automatically process the request in multiple batches.
+    llm = LLM(
+        model=args.model,
+        speculative_model=args.speculative_model,
+        num_speculative_tokens=args.num_speculative_tokens,
+        speculative_draft_tensor_parallel_size=\
+            args.speculative_draft_tensor_parallel_size,
+        tokenizer=args.tokenizer,
+        quantization=args.quantization,
+        tensor_parallel_size=args.tensor_parallel_size,
+        trust_remote_code=args.trust_remote_code,
+        dtype=args.dtype,
+        max_model_len=args.max_model_len,
+        enforce_eager=args.enforce_eager,
+        kv_cache_dtype=args.kv_cache_dtype,
+        quantization_param_path=args.quantization_param_path,
+        device=args.device,
+        ray_workers_use_nsight=args.ray_workers_use_nsight,
+        use_v2_block_manager=args.use_v2_block_manager,
+        enable_chunked_prefill=args.enable_chunked_prefill,
+        download_dir=args.download_dir,
+        block_size=args.block_size,
+        gpu_memory_utilization=args.gpu_memory_utilization,
+        load_format=args.load_format,
+        distributed_executor_backend=args.distributed_executor_backend,
+        otlp_traces_endpoint=args.otlp_traces_endpoint,
+        enable_prefix_caching=args.enable_prefix_caching,
+        load_in_low_bit=args.load_in_low_bit,
+        max_num_batched_tokens=args.max_num_batched_tokens,
+        max_num_seqs=args.max_num_seqs,
+    )
+
+    sampling_params = SamplingParams(
+        n=args.n,
+        temperature=0.0 if args.use_beam_search else 1.0,
+        top_p=1.0,
+        use_beam_search=args.use_beam_search,
+        ignore_eos=True,
+        max_tokens=args.output_len,
+    )
+    print(sampling_params)
+    dummy_prompt_token_ids = np.random.randint(10000,
+                                               size=(args.batch_size,
+                                                     args.input_len))
+    dummy_inputs: List[PromptInputs] = [{
+        "prompt_token_ids": batch
+    } for batch in dummy_prompt_token_ids.tolist()]
+
+    def run_to_completion(profile_dir: Optional[str] = None):
+        if profile_dir:
+            if args.device == "xpu":
+                # XPU profiling goes through the legacy autograd profiler.
+                with torch.autograd.profiler_legacy.profile(enabled=True, use_xpu=True) as p:
+                    llm.generate(dummy_inputs,
+                                 sampling_params=sampling_params,
+                                 use_tqdm=False)
+                print("Sort by CPU time total...")
print(p.key_averages().table(sort_by="self_cpu_time_total", row_limit=-1)) + print("Sort by XPU time total...") + print(p.key_averages().table(sort_by="self_xpu_time_total", row_limit=-1)) + else: + with torch.profiler.profile( + activities=[ + torch.profiler.ProfilerActivity.CPU, + # torch.profiler.ProfilerActivity.XPU, + torch.profiler.ProfilerActivity.CUDA, + ], + on_trace_ready=torch.profiler.tensorboard_trace_handler( + str(profile_dir))) as p: + llm.generate(dummy_inputs, + sampling_params=sampling_params, + use_tqdm=False) + print(p.key_averages()) + else: + start_time = time.perf_counter() + llm.generate(dummy_inputs, + sampling_params=sampling_params, + use_tqdm=False) + end_time = time.perf_counter() + latency = end_time - start_time + return latency + + print("Warming up...") + for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"): + run_to_completion(profile_dir=None) + + if args.profile: + profile_dir = args.profile_result_dir + if not profile_dir: + profile_dir = Path( + "." + ) / "vllm_benchmark_result" / f"latency_result_{time.time()}" + print(f"Profiling (results will be saved to '{profile_dir}')...") + run_to_completion(profile_dir=profile_dir) + return + + # Benchmark. + latencies = [] + for _ in tqdm(range(args.num_iters), desc="Profiling iterations"): + latencies.append(run_to_completion(profile_dir=None)) + latencies = np.array(latencies) + percentages = [10, 25, 50, 75, 90, 99] + percentiles = np.percentile(latencies, percentages) + print(f'Avg latency: {np.mean(latencies)} seconds') + for percentage, percentile in zip(percentages, percentiles): + print(f'{percentage}% percentile latency: {percentile} seconds') + + # Output JSON results if specified + if args.output_json: + results = { + "avg_latency": np.mean(latencies), + "latencies": latencies.tolist(), + "percentiles": dict(zip(percentages, percentiles.tolist())), + } + with open(args.output_json, "w") as f: + json.dump(results, f, indent=4) + + +if __name__ == '__main__': + parser = FlexibleArgumentParser( + description='Benchmark the latency of processing a single batch of ' + 'requests till completion.') + parser.add_argument('--model', type=str, default='facebook/opt-125m') + parser.add_argument('--speculative-model', type=str, default=None) + parser.add_argument('--num-speculative-tokens', type=int, default=None) + parser.add_argument('--speculative-draft-tensor-parallel-size', + '-spec-draft-tp', + type=int, + default=None) + parser.add_argument('--tokenizer', type=str, default=None) + parser.add_argument('--quantization', + '-q', + choices=[*QUANTIZATION_METHODS, None], + default=None) + parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1) + parser.add_argument('--input-len', type=int, default=32) + parser.add_argument('--output-len', type=int, default=128) + parser.add_argument('--batch-size', type=int, default=8) + parser.add_argument('--n', + type=int, + default=1, + help='Number of generated sequences per prompt.') + parser.add_argument('--use-beam-search', action='store_true') + parser.add_argument('--num-iters-warmup', + type=int, + default=10, + help='Number of iterations to run for warmup.') + parser.add_argument('--num-iters', + type=int, + default=30, + help='Number of iterations to run.') + parser.add_argument('--trust-remote-code', + action='store_true', + help='trust remote code from huggingface') + parser.add_argument( + '--max-model-len', + type=int, + default=None, + help='Maximum length of a sequence (including prompt and output). 
+        'If None, will be derived from the model.')
+    parser.add_argument(
+        '--dtype',
+        type=str,
+        default='auto',
+        choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
+        help='data type for model weights and activations. '
+        'The "auto" option will use FP16 precision '
+        'for FP32 and FP16 models, and BF16 precision '
+        'for BF16 models.')
+    parser.add_argument('--enforce-eager',
+                        action='store_true',
+                        help='enforce eager mode and disable CUDA graph')
+    parser.add_argument(
+        '--kv-cache-dtype',
+        type=str,
+        choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
+        default="auto",
+        help='Data type for kv cache storage. If "auto", will use model '
+        'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
+        'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
+    parser.add_argument(
+        '--quantization-param-path',
+        type=str,
+        default=None,
+        help='Path to the JSON file containing the KV cache scaling factors. '
+        'This should generally be supplied when KV cache dtype is FP8. '
+        'Otherwise, KV cache scaling factors default to 1.0, which may cause '
+        'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
+        'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
+        'instead supported for common inference criteria.')
+    parser.add_argument(
+        '--profile',
+        action='store_true',
+        help='profile the generation process of a single batch')
+    parser.add_argument(
+        '--profile-result-dir',
+        type=str,
+        default=None,
+        help=('path to save the pytorch profiler output. Can be visualized '
+              'with ui.perfetto.dev or Tensorboard.'))
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="auto",
+        choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"],
+        help='device type for vLLM execution, supporting CUDA, OpenVINO, '
+        'CPU, TPU, and XPU.')
+    parser.add_argument('--block-size',
+                        type=int,
+                        default=16,
+                        help='block size of key/value cache')
+    parser.add_argument(
+        '--enable-chunked-prefill',
+        action='store_true',
+        help='If True, the prefill requests can be chunked based on the '
+        'max_num_batched_tokens')
+    parser.add_argument("--enable-prefix-caching",
+                        action='store_true',
+                        help="Enable automatic prefix caching")
+    parser.add_argument('--use-v2-block-manager', action='store_true')
+    parser.add_argument(
+        "--ray-workers-use-nsight",
+        action='store_true',
+        help="If specified, use nsight to profile ray workers",
+    )
+    parser.add_argument('--download-dir',
+                        type=str,
+                        default=None,
+                        help='directory to download and load the weights, '
+                        'default to the default cache dir of huggingface')
+    parser.add_argument(
+        '--output-json',
+        type=str,
+        default=None,
+        help='Path to save the latency results in JSON format.')
+    parser.add_argument('--gpu-memory-utilization',
+                        type=float,
+                        default=0.9,
+                        help='the fraction of GPU memory to be used for '
+                        'the model executor, which can range from 0 to 1. '
+                        'If unspecified, will use the default value of 0.9.')
+    parser.add_argument(
+        '--load-format',
+        type=str,
+        default=EngineArgs.load_format,
+        choices=[
+            'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer',
+            'bitsandbytes'
+        ],
+        help='The format of the model weights to load.\n\n'
+        '* "auto" will try to load the weights in the safetensors format '
+        'and fall back to the pytorch bin format if safetensors format '
+        'is not available.\n'
+        '* "pt" will load the weights in the pytorch bin format.\n'
+        '* "safetensors" will load the weights in the safetensors format.\n'
+        '* "npcache" will load the weights in pytorch format and store '
+        'a numpy cache to speed up the loading.\n'
+        '* "dummy" will initialize the weights with random values, '
+        'which is mainly for profiling.\n'
+        '* "tensorizer" will load the weights using tensorizer from '
+        'CoreWeave. See the Tensorize vLLM Model script in the Examples '
+        'section for more information.\n'
+        '* "bitsandbytes" will load the weights using bitsandbytes '
+        'quantization.\n')
+    parser.add_argument(
+        '--distributed-executor-backend',
+        choices=['ray', 'mp'],
+        default=None,
+        help='Backend to use for distributed serving. When more than 1 GPU '
+        'is used, will be automatically set to "ray" if installed '
+        'or "mp" (multiprocessing) otherwise.')
+    parser.add_argument(
+        '--otlp-traces-endpoint',
+        type=str,
+        default=None,
+        help='Target URL to which OpenTelemetry traces will be sent.')
+    parser.add_argument(
+        "--load-in-low-bit",
+        type=str,
+        choices=["sym_int4", "fp8", "fp8_e4m3", "fp16", "fp6"],
+        default="sym_int4",
+        help="Low-bit format quantization with IPEX-LLM")
+    parser.add_argument('--max-num-batched-tokens',
+                        type=int,
+                        default=4096,
+                        help='maximum number of batched tokens per iteration')
+
+    parser.add_argument('--max-num-seqs',
+                        type=int,
+                        default=256,
+                        help='Maximum number of sequences per iteration.')
+    args = parser.parse_args()
+    main(args)
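
Usage sketch: one way the new latency benchmark could be invoked inside the container, assuming the /llm directory the Dockerfile copies the script into; the model path and the batch/input/output sizes are illustrative placeholders, and only flags defined by the script above are used.

    cd /llm
    python benchmark_vllm_latency.py \
        --model /llm/models/Llama-2-7b-chat-hf \
        --device xpu \
        --load-in-low-bit sym_int4 \
        --gpu-memory-utilization 0.9 \
        --batch-size 1 \
        --input-len 1024 \
        --output-len 128 \
        --num-iters-warmup 3 \
        --num-iters 10 \
        --output-json /llm/latency_result.json

With --output-json set, the script writes the average latency, the per-iteration latencies, and the 10/25/50/75/90/99th percentiles to the given file; passing --profile (optionally with --profile-result-dir) captures a profiler trace instead of running the timing loop.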