ipex-llm/python/llm/example/GPU/Deepspeed-AutoTP-FastAPI/benchmark.py
2024-06-07 09:33:14 +08:00

268 lines
9.4 KiB
Python

#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import requests
import time
from concurrent.futures import ThreadPoolExecutor
import concurrent
import numpy as np
from tqdm import tqdm
import json
import argparse
from typing import List, Tuple
# Execute single request
def perform_request(session, url, payload, headers):
start_time = time.perf_counter()
with session.post(url, json=payload, headers=headers, stream=True) as response:
response.raise_for_status()
first_token_time = None
last_token_time = 0
first_token_inference_time = None
next_token_inference_time = None
next_token_time = []
i = 0
for line in response.iter_lines():
token_time = time.perf_counter() - start_time
if line:
data = line.decode("utf-8").strip()
i = i + 1
try:
json_data = json.loads(data)
if json_data["message"] is not None:
if first_token_time is None:
first_token_time = token_time
else:
next_token_time.append(token_time - last_token_time)
last_token_time = token_time
except json.JSONDecodeError:
pass
end_time = time.perf_counter()
return (
first_token_time,
np.mean(next_token_time),
end_time - start_time,
first_token_inference_time,
next_token_inference_time,
)
def extend_list_to_length(lst, target_length):
if target_length <= len(lst):
return lst[:]
times = target_length // len(lst)
remainder = target_length % len(lst)
extended_list = lst * times + lst[:remainder]
return extended_list
def benchmark(
llm_urls,
prompt,
num_requests,
max_concurrent_requests,
max_tokens,
is_warmup=False,
):
headers = {"Content-Type": "application/json"}
first_token_latencies = []
next_token_latencies = []
total_responce_times = []
first_token_inference_times = []
next_token_inference_times = []
cur_url_index = 0
with requests.Session() as session:
with ThreadPoolExecutor(max_workers=max_concurrent_requests) as executor:
llm_url = llm_urls[cur_url_index]
cur_url_index = (cur_url_index + 1) % len(llm_urls)
cur_llm_urls = extend_list_to_length(llm_urls, max_concurrent_requests)
cur_len = len(cur_llm_urls)
payload = {
"prompt": prompt,
"n_predict": max_tokens,
}
futures = [
executor.submit(
perform_request,
session,
cur_llm_urls[index % cur_len],
payload,
headers,
)
for index in range(num_requests)
]
start_time = time.perf_counter()
if is_warmup:
phase = "Warm Up"
else:
phase = "Benchmarking"
with tqdm(total=num_requests, desc=phase, unit="req", ncols=100) as pbar:
for future in concurrent.futures.as_completed(futures):
try:
(
first_token_latency,
next_token_latency,
total_responce_time,
first_token_inference_time,
next_token_inference_time,
) = future.result()
first_token_latencies.append(first_token_latency)
next_token_latencies.append(next_token_latency)
total_responce_times.append(total_responce_time)
if first_token_inference_time:
first_token_inference_times.append(
first_token_inference_time
)
if next_token_inference_time:
next_token_inference_times.append(next_token_inference_time)
except Exception as e:
print(f"Request failed: {e}")
pbar.update(1)
if is_warmup:
return
total_time = time.perf_counter() - start_time
log_file = f"{max_concurrent_requests}.log"
with open(log_file, "w") as file:
print(
f"Total time for {num_requests} requests with {max_concurrent_requests} concurrent requests: {total_time} seconds.",
file=file,
)
print(
f"Average response time: {np.mean(total_responce_times)}", file=file
)
print(
f"Token throughput: {num_requests * max_tokens / total_time}",
file=file,
)
print(
f"Total token throughput: {(128 + 1024) * num_requests / total_time}",
file=file,
)
print(file=file)
if first_token_latencies:
average_first_token_latency = sum(first_token_latencies) / len(
first_token_latencies
)
p90_first_token_latency = np.percentile(first_token_latencies, 90)
p95_first_token_latency = np.percentile(first_token_latencies, 95)
average_first_token_inference_latency = np.mean(
first_token_inference_times
)
print(
f"Average first token latency: {average_first_token_latency * 1000} milliseconds.",
file=file,
)
print(
f"P90 first token latency: {p90_first_token_latency * 1000} milliseconds.",
file=file,
)
print(
f"P95 first token latency: {p95_first_token_latency * 1000} milliseconds.",
file=file,
)
print(
f"Average first token inference latency: {average_first_token_inference_latency * 1000} milliseconds.",
file=file,
)
print(file=file)
if next_token_latencies:
average_next_token_latency = sum(next_token_latencies) / len(
next_token_latencies
)
p90_next_token_latency = np.percentile(next_token_latencies, 90)
p95_next_token_latency = np.percentile(next_token_latencies, 95)
average_next_token_inference_latency = np.mean(
next_token_inference_times
)
print(
f"Average next token latency: {average_next_token_latency * 1000} milliseconds.",
file=file,
)
print(
f"P90 next token latency: {p90_next_token_latency * 1000} milliseconds.",
file=file,
)
print(
f"P95 next token latency: {p95_next_token_latency * 1000} milliseconds.",
file=file,
)
print(
f"Average next token inference latency: {average_next_token_inference_latency * 1000} milliseconds.",
file=file,
)
print(file=file)
LLM_URLS = [f"http://localhost:{PORT}/generate_stream/" for PORT in [8000]]
parser = argparse.ArgumentParser(description="Set prompt length.")
parser.add_argument(
"--prompt_length",
type=int,
choices=[32, 1024, 2048],
default=1024,
help="Length of the prompt: 32, 1024, or 2048",
)
parser.add_argument(
"--max_concurrent_requests",
type=int,
nargs="+",
default=[1, 2, 4, 5, 6],
help="List of maximum concurrent requests to test.",
)
parser.add_argument(
"--max_new_tokens",
type=int,
default=128,
help="Maximum number of new tokens that the model will generate per request.",
)
args = parser.parse_args()
PROMPT_LENGTH = args.prompt_length
PROMPT = open(f"prompt/continuation/{PROMPT_LENGTH}.txt", "r").read()
MAX_TOKENS = args.max_new_tokens
for MAX_CONCURRENT_REQUESTS in args.max_concurrent_requests:
NUM_WARMUP = 5 * MAX_CONCURRENT_REQUESTS
NUM_REQUESTS = 10 * MAX_CONCURRENT_REQUESTS
# warm up
benchmark(
LLM_URLS,
PROMPT,
NUM_WARMUP,
MAX_CONCURRENT_REQUESTS,
MAX_TOKENS,
is_warmup=True,
)
benchmark(LLM_URLS, PROMPT, NUM_REQUESTS, MAX_CONCURRENT_REQUESTS, MAX_TOKENS)