#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import requests
import time
from concurrent.futures import ThreadPoolExecutor
import concurrent
import numpy as np
from tqdm import tqdm
import json
import argparse
from typing import List, Tuple


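# Benchmark client for a streaming text-generation service: it sends concurrent
# requests to the configured endpoints, times the first and subsequent tokens of
# each streamed response, and reports latency percentiles and token throughput.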
# Execute a single streaming request and time each token as it arrives.
def perform_request(session, url, payload, headers):
    start_time = time.perf_counter()
    with session.post(url, json=payload, headers=headers, stream=True) as response:
        response.raise_for_status()

        first_token_time = None
        last_token_time = 0
        # Placeholders for server-side inference times; this script never
        # populates them, so they are returned as None.
        first_token_inference_time = None
        next_token_inference_time = None
        next_token_time = []
        i = 0
        for line in response.iter_lines():

            token_time = time.perf_counter() - start_time
            if line:
                data = line.decode("utf-8").strip()
                i = i + 1
                try:
                    # Each streamed line is expected to be a JSON object whose
                    # "message" field carries the next generated token.
                    json_data = json.loads(data)
                    if json_data["message"] is not None:
                        if first_token_time is None:
                            first_token_time = token_time
                        else:
                            next_token_time.append(token_time - last_token_time)
                        last_token_time = token_time
                except json.JSONDecodeError:
                    pass
        end_time = time.perf_counter()
        return (
            first_token_time,
            np.mean(next_token_time),
            end_time - start_time,
            first_token_inference_time,
            next_token_inference_time,
        )


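# Cycle lst until it has exactly target_length elements,
# e.g. extend_list_to_length(["a", "b"], 5) -> ["a", "b", "a", "b", "a"].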
def extend_list_to_length(lst, target_length):
    if target_length <= len(lst):
        return lst[:]
    times = target_length // len(lst)
    remainder = target_length % len(lst)
    extended_list = lst * times + lst[:remainder]

    return extended_list


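# Submit num_requests generation requests with up to max_concurrent_requests in
# flight, collect per-request latencies, and (unless warming up) write summary
# statistics to "<max_concurrent_requests>.log".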
def benchmark(
    llm_urls,
    prompt,
    num_requests,
    max_concurrent_requests,
    max_tokens,
    is_warmup=False,
):

    headers = {"Content-Type": "application/json"}

    first_token_latencies = []
    next_token_latencies = []
    total_response_times = []
    first_token_inference_times = []
    next_token_inference_times = []
    cur_url_index = 0

    with requests.Session() as session:
        with ThreadPoolExecutor(max_workers=max_concurrent_requests) as executor:
            llm_url = llm_urls[cur_url_index]
            cur_url_index = (cur_url_index + 1) % len(llm_urls)

            cur_llm_urls = extend_list_to_length(llm_urls, max_concurrent_requests)
            cur_len = len(cur_llm_urls)

            payload = {
                "prompt": prompt,
                "n_predict": max_tokens,
            }
            # Distribute the requests across the available URLs round-robin.
            futures = [
                executor.submit(
                    perform_request,
                    session,
                    cur_llm_urls[index % cur_len],
                    payload,
                    headers,
                )
                for index in range(num_requests)
            ]

            start_time = time.perf_counter()

            if is_warmup:
                phase = "Warm Up"
            else:
                phase = "Benchmarking"
            with tqdm(total=num_requests, desc=phase, unit="req", ncols=100) as pbar:
                for future in concurrent.futures.as_completed(futures):
                    try:
                        (
                            first_token_latency,
                            next_token_latency,
                            total_response_time,
                            first_token_inference_time,
                            next_token_inference_time,
                        ) = future.result()
                        first_token_latencies.append(first_token_latency)
                        next_token_latencies.append(next_token_latency)
                        total_response_times.append(total_response_time)
                        if first_token_inference_time:
                            first_token_inference_times.append(
                                first_token_inference_time
                            )
                        if next_token_inference_time:
                            next_token_inference_times.append(next_token_inference_time)
                    except Exception as e:
                        print(f"Request failed: {e}")
                    pbar.update(1)

            if is_warmup:
                return
            total_time = time.perf_counter() - start_time
            log_file = f"{max_concurrent_requests}.log"

            with open(log_file, "w") as file:
                print(
                    f"Total time for {num_requests} requests with {max_concurrent_requests} concurrent requests: {total_time} seconds.",
                    file=file,
                )
                print(
                    f"Average response time: {np.mean(total_response_times)}", file=file
                )

                print(
                    f"Token throughput: {num_requests * max_tokens / total_time}",
                    file=file,
                )
                # "Total token throughput" counts 128 generated tokens plus a
                # 1024-token prompt per request.
                print(
                    f"Total token throughput: {(128 + 1024) * num_requests / total_time}",
                    file=file,
                )
                print(file=file)

                if first_token_latencies:
                    average_first_token_latency = sum(first_token_latencies) / len(
                        first_token_latencies
                    )
                    p90_first_token_latency = np.percentile(first_token_latencies, 90)
                    p95_first_token_latency = np.percentile(first_token_latencies, 95)
                    average_first_token_inference_latency = np.mean(
                        first_token_inference_times
                    )
                    print(
                        f"Average first token latency: {average_first_token_latency * 1000} milliseconds.",
                        file=file,
                    )
                    print(
                        f"P90 first token latency: {p90_first_token_latency * 1000} milliseconds.",
                        file=file,
                    )
                    print(
                        f"P95 first token latency: {p95_first_token_latency * 1000} milliseconds.",
                        file=file,
                    )
                    print(
                        f"Average first token inference latency: {average_first_token_inference_latency * 1000} milliseconds.",
                        file=file,
                    )
                    print(file=file)

                if next_token_latencies:
                    average_next_token_latency = sum(next_token_latencies) / len(
                        next_token_latencies
                    )
                    p90_next_token_latency = np.percentile(next_token_latencies, 90)
                    p95_next_token_latency = np.percentile(next_token_latencies, 95)
                    average_next_token_inference_latency = np.mean(
                        next_token_inference_times
                    )
                    print(
                        f"Average next token latency: {average_next_token_latency * 1000} milliseconds.",
                        file=file,
                    )
                    print(
                        f"P90 next token latency: {p90_next_token_latency * 1000} milliseconds.",
                        file=file,
                    )
                    print(
                        f"P95 next token latency: {p95_next_token_latency * 1000} milliseconds.",
                        file=file,
                    )
                    print(
                        f"Average next token inference latency: {average_next_token_inference_latency * 1000} milliseconds.",
                        file=file,
                    )
                    print(file=file)


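# Streaming generation endpoints to benchmark; by default a single local server
# listening on port 8000.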
LLM_URLS = [f"http://localhost:{PORT}/generate_stream/" for PORT in [8000]]

parser = argparse.ArgumentParser(description="Benchmark a streaming LLM serving endpoint.")
parser.add_argument(
    "--prompt_length",
    type=int,
    choices=[32, 1024, 2048],
    default=1024,
    help="Length of the prompt: 32, 1024, or 2048",
)
parser.add_argument(
    "--max_concurrent_requests",
    type=int,
    nargs="+",
    default=[1, 2, 4, 5, 6],
    help="List of maximum concurrent requests to test.",
)
parser.add_argument(
    "--max_new_tokens",
    type=int,
    default=128,
    help="Maximum number of new tokens that the model will generate per request.",
)
args = parser.parse_args()
PROMPT_LENGTH = args.prompt_length
with open(f"prompt/continuation/{PROMPT_LENGTH}.txt", "r") as prompt_file:
    PROMPT = prompt_file.read()
MAX_TOKENS = args.max_new_tokens


for MAX_CONCURRENT_REQUESTS in args.max_concurrent_requests:
    NUM_WARMUP = 5 * MAX_CONCURRENT_REQUESTS
    NUM_REQUESTS = 10 * MAX_CONCURRENT_REQUESTS

    # Warm up the server before measuring.
    benchmark(
        LLM_URLS,
        PROMPT,
        NUM_WARMUP,
        MAX_CONCURRENT_REQUESTS,
        MAX_TOKENS,
        is_warmup=True,
    )

    benchmark(LLM_URLS, PROMPT, NUM_REQUESTS, MAX_CONCURRENT_REQUESTS, MAX_TOKENS)
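# Example invocation (the script file name is illustrative; prompt files such as
# prompt/continuation/1024.txt must exist):
#   python benchmark.py --prompt_length 1024 --max_concurrent_requests 1 2 4 --max_new_tokens 128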