ipex-llm/docker/llm/serving/xpu/docker/gradio_web_server.patch

--- gradio_web_server.py 2024-06-20 14:21:48.013518726 +0800
+++ gradio_web_server_new.py 2024-06-20 14:23:09.822830709 +0800
@@ -9,8 +9,10 @@
 import json
 import os
 import random
+import pandas as pd
 import time
 import uuid
+import numpy as np
 
 import gradio as gr
 import requests
@@ -241,7 +243,7 @@
     ip = get_ip(request)
     logger.info(f"clear_history. ip: {ip}")
     state = None
-    return (state, [], "", None) + (disable_btn,) * 5
+    return (state, [], "", None, "", "", "", "") + (disable_btn,) * 5
 
 
 def get_ip(request: gr.Request):
@@ -354,6 +356,18 @@
     return None
 
 
+def handle_latency_metrics(first_token_time, next_token_time):
+    # next token time is a numpy array...
+    # first token time might be None
+    first_token_latency = "None"
+    next_token_latency = "None"
+    if first_token_time is not None:
+        first_token_latency = str(first_token_time * 1000) + " ms"
+    if next_token_time.size > 0:
+        next_token_latency = str(np.mean(next_token_time) * 1000) + " ms"
+    return first_token_latency, next_token_latency
+
+
 def bot_response(
     state,
     temperature,
@@ -372,7 +386,7 @@
     if state.skip_next:
         # This generate call is skipped due to invalid inputs
         state.skip_next = False
-        yield (state, state.to_gradio_chatbot()) + (no_change_btn,) * 5
+        yield (state, state.to_gradio_chatbot(), "None", "None", "None", "None") + (no_change_btn,) * 5
         return
 
     if apply_rate_limit:
@@ -381,7 +395,7 @@
             error_msg = RATE_LIMIT_MSG + "\n\n" + ret["reason"]
             logger.info(f"rate limit reached. ip: {ip}. error_msg: {ret['reason']}")
             state.conv.update_last_message(error_msg)
-            yield (state, state.to_gradio_chatbot()) + (no_change_btn,) * 5
+            yield (state, state.to_gradio_chatbot(), "None", "None", "None", "None") + (no_change_btn,) * 5
             return
 
     conv, model_name = state.conv, state.model_name
@@ -404,6 +418,10 @@
         yield (
             state,
             state.to_gradio_chatbot(),
+            "None",
+            "None",
+            "None",
+            "None",
             disable_btn,
             disable_btn,
             disable_btn,
@@ -444,18 +462,32 @@
         )
 
     conv.update_last_message("▌")
-    yield (state, state.to_gradio_chatbot()) + (disable_btn,) * 5
+    # We probably need to change this method
+    yield (state, state.to_gradio_chatbot(), "None", "None", "None", "None") + (disable_btn,) * 5
+    prompt_tokens = 0
+    generated_tokens = 0
+    first_token_latency = None
+    next_token_latencies = np.array([])
+    start_time = time.time()
 
     try:
         for i, data in enumerate(stream_iter):
             if data["error_code"] == 0:
+                prompt_tokens = data["usage"]["prompt_tokens"]
+                generated_tokens = data["usage"]["completion_tokens"]
                 output = data["text"].strip()
                 conv.update_last_message(output + "▌")
-                yield (state, state.to_gradio_chatbot()) + (disable_btn,) * 5
+                if first_token_latency is None:
+                    first_token_latency = time.time() - start_time
+                else:
+                    next_token_latencies = np.append(next_token_latencies, time.time() - start_time)
+                start_time = time.time()
+                first_latency, next_latency = handle_latency_metrics(first_token_latency, next_token_latencies)
+                yield (state, state.to_gradio_chatbot(), prompt_tokens, generated_tokens, first_latency, next_latency) + (disable_btn,) * 5
             else:
                 output = data["text"] + f"\n\n(error_code: {data['error_code']})"
                 conv.update_last_message(output)
-                yield (state, state.to_gradio_chatbot()) + (
+                yield (state, state.to_gradio_chatbot(), "None", "None", "None", "None") + (
                     disable_btn,
                     disable_btn,
                     disable_btn,
@@ -465,13 +497,14 @@
                 return
         output = data["text"].strip()
         conv.update_last_message(output)
-        yield (state, state.to_gradio_chatbot()) + (enable_btn,) * 5
+        first_latency, next_latency = handle_latency_metrics(first_token_latency, next_token_latencies)
+        yield (state, state.to_gradio_chatbot(), prompt_tokens, generated_tokens, first_latency, next_latency) + (enable_btn,) * 5
     except requests.exceptions.RequestException as e:
         conv.update_last_message(
             f"{SERVER_ERROR_MSG}\n\n"
             f"(error_code: {ErrorCode.GRADIO_REQUEST_ERROR}, {e})"
         )
-        yield (state, state.to_gradio_chatbot()) + (
+        yield (state, state.to_gradio_chatbot(), "None", "None", "None", "None") + (
            disable_btn,
            disable_btn,
            disable_btn,
@@ -484,7 +517,7 @@
             f"{SERVER_ERROR_MSG}\n\n"
             f"(error_code: {ErrorCode.GRADIO_STREAM_UNKNOWN_ERROR}, {e})"
         )
-        yield (state, state.to_gradio_chatbot()) + (
+        yield (state, state.to_gradio_chatbot(), "None", "None", "None", "None") + (
            disable_btn,
            disable_btn,
            disable_btn,
@@ -646,7 +679,8 @@
     )
 
     notice_markdown = f"""
-# 🏔️ Chat with Open Large Language Models
+# 🏔️ ChatBot based on Xeon-W & Arc GPUs
+### Deployed with IPEX-LLM
 {promotion}
 """
 
@@ -691,6 +725,26 @@
         regenerate_btn = gr.Button(value="🔄 Regenerate", interactive=False)
         clear_btn = gr.Button(value="🗑️ Clear history", interactive=False)
 
+    with gr.Row():
+        with gr.Column():
+            gr.Markdown("### Performance Metrics")
+            prompt_token = gr.Textbox(
+                label="Prompt token length:",
+                interactive=False,
+            )
+            next_token = gr.Textbox(
+                label="Generated token length:",
+                interactive=False,
+            )
+            first_token_latency = gr.Textbox(
+                interactive=False,
+                label="First token Latency:",
+            )
+            next_token_latency = gr.Textbox(
+                interactive=False,
+                label="Next token Latency:",
+            )
+
     with gr.Accordion("Parameters", open=False) as parameter_row:
         temperature = gr.Slider(
             minimum=0.0,
@@ -743,9 +797,9 @@
     ).then(
         bot_response,
         [state, temperature, top_p, max_output_tokens],
-        [state, chatbot] + btn_list,
+        [state, chatbot, prompt_token, next_token, first_token_latency, next_token_latency] + btn_list,
     )
-    clear_btn.click(clear_history, None, [state, chatbot, textbox, imagebox] + btn_list)
+    clear_btn.click(clear_history, None, [state, chatbot, textbox, imagebox, prompt_token, next_token, first_token_latency, next_token_latency] + btn_list)
 
     model_selector.change(
         clear_history, None, [state, chatbot, textbox, imagebox] + btn_list
@@ -758,7 +812,7 @@
     ).then(
         bot_response,
         [state, temperature, top_p, max_output_tokens],
-        [state, chatbot] + btn_list,
+        [state, chatbot, prompt_token, next_token, first_token_latency, next_token_latency] + btn_list,
     )
     send_btn.click(
         add_text,
@@ -767,7 +821,7 @@
     ).then(
         bot_response,
         [state, temperature, top_p, max_output_tokens],
-        [state, chatbot] + btn_list,
+        [state, chatbot, prompt_token, next_token, first_token_latency, next_token_latency] + btn_list,
     )
 
     return [state, model_selector]
@@ -775,7 +829,7 @@
 
 def build_demo(models):
     with gr.Blocks(
-        title="Chat with Open Large Language Models",
+        title="ChatBot based on Xeon-W & Arc GPUs",
         theme=gr.themes.Default(),
         css=block_css,
     ) as demo:
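
The hunks above do two things: bot_response times each chunk coming back from the streaming iterator (the gap before the first chunk is the first-token latency, and each later gap is appended to a numpy array that handle_latency_metrics averages), and the UI gains four read-only textboxes that the regenerate, submit, and send handlers list as extra outputs. Two caveats visible in the patch: latency is measured per streamed chunk while token counts come from the server's usage field, and model_selector.change still wires the old, shorter outputs list to clear_history, which now returns four extra values. Below is a minimal standalone sketch of the timing logic, using the same handle_latency_metrics helper as the patch; fake_stream is a hypothetical stand-in for FastChat's stream_iter.

import time

import numpy as np


def handle_latency_metrics(first_token_time, next_token_time):
    # next_token_time is a numpy array of inter-chunk gaps (seconds);
    # first_token_time may be None if no chunk ever arrived
    first_token_latency = "None"
    next_token_latency = "None"
    if first_token_time is not None:
        first_token_latency = str(first_token_time * 1000) + " ms"
    if next_token_time.size > 0:
        next_token_latency = str(np.mean(next_token_time) * 1000) + " ms"
    return first_token_latency, next_token_latency


def fake_stream(n_chunks=5, delay=0.05):
    # hypothetical stand-in for FastChat's streaming iterator
    for i in range(n_chunks):
        time.sleep(delay)
        yield {"error_code": 0, "text": f"chunk {i}"}


first_token_latency = None
next_token_latencies = np.array([])
start_time = time.time()
for data in fake_stream():
    now = time.time()
    if first_token_latency is None:
        first_token_latency = now - start_time  # time to first chunk
    else:
        # gap since the previous chunk
        next_token_latencies = np.append(next_token_latencies, now - start_time)
    start_time = now

first, rest = handle_latency_metrics(first_token_latency, next_token_latencies)
print(f"first token: {first}, next token (mean): {rest}")

On the Gradio side, the invariant to keep is that every yield from bot_response produces exactly as many values as the outputs list wired into .then(...), in the same order: state, chatbot, the four metric boxes, then the five buttons. A minimal sketch of that wiring pattern follows; respond and the component names here are hypothetical placeholders, not FastChat code.

import gradio as gr


def respond(msg):
    # yields one value per output component, in the wired order
    yield msg.upper(), str(len(msg.split())), "42.0 ms"


with gr.Blocks() as demo:
    textbox = gr.Textbox(label="Input")
    chat_out = gr.Textbox(label="Output", interactive=False)
    prompt_token = gr.Textbox(label="Prompt token length:", interactive=False)
    first_token_latency = gr.Textbox(label="First token Latency:", interactive=False)
    textbox.submit(respond, [textbox], [chat_out, prompt_token, first_token_latency])

if __name__ == "__main__":
    demo.launch()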