* replace gradio_web_server.patch to adjust webui

* fix patch problem

---------

Co-authored-by: ATMxsp01 <shou.xu@intel.com>
209 lines · 8.1 KiB
--- a/gradio_web_server.py
+++ b/gradio_web_server_new.py
@@ -9,8 +9,10 @@ import hashlib
 import json
 import os
 import random
+import pandas as pd
 import time
 import uuid
+import numpy as np

 import gradio as gr
 import requests
@@ -241,7 +243,7 @@ def clear_history(request: gr.Request):
     ip = get_ip(request)
     logger.info(f"clear_history. ip: {ip}")
     state = None
-    return (state, [], "", None) + (disable_btn,) * 5
+    return (state, [], "", None, "", "", "", "") + (disable_btn,) * 5


 def get_ip(request: gr.Request):
@@ -354,6 +356,18 @@ def is_limit_reached(model_name, ip):
     return None


+def handle_latency_metrics(first_token_time, next_token_time):
+    # next_token_time is a numpy array of per-chunk deltas;
+    # first_token_time may still be None if no token has arrived yet
+    first_token_latency = "None"
+    next_token_latency = "None"
+    if first_token_time is not None:
+        first_token_latency = f"{first_token_time * 1000:.2f} ms"
+    if next_token_time.size > 0:
+        next_token_latency = f"{np.mean(next_token_time) * 1000:.2f} ms"
+    return first_token_latency, next_token_latency
+
+
 def bot_response(
     state,
     temperature,
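For reference, the helper's intended behavior can be checked standalone. The sketch below restates the function from the hunk above and exercises it with made-up timings; the assertions are illustrative and not part of the patch:

```python
import numpy as np

# Restated from the hunk above so this runs on its own.
def handle_latency_metrics(first_token_time, next_token_time):
    # next_token_time is a numpy array of per-chunk deltas;
    # first_token_time may still be None if no token has arrived yet.
    first_token_latency = "None"
    next_token_latency = "None"
    if first_token_time is not None:
        first_token_latency = f"{first_token_time * 1000:.2f} ms"
    if next_token_time.size > 0:
        next_token_latency = f"{np.mean(next_token_time) * 1000:.2f} ms"
    return first_token_latency, next_token_latency

# No data yet: both labels render the string "None".
assert handle_latency_metrics(None, np.array([])) == ("None", "None")
# 0.5 s to the first token, then 40 ms and 60 ms gaps -> mean 50 ms.
assert handle_latency_metrics(0.5, np.array([0.04, 0.06])) == ("500.00 ms", "50.00 ms")
```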
@@ -372,7 +386,7 @@ def bot_response(
     if state.skip_next:
         # This generate call is skipped due to invalid inputs
         state.skip_next = False
-        yield (state, state.to_gradio_chatbot()) + (no_change_btn,) * 5
+        yield (state, state.to_gradio_chatbot(), "None", "None", "None", "None") + (no_change_btn,) * 5
         return

     if apply_rate_limit:
@@ -381,7 +395,7 @@ def bot_response(
             error_msg = RATE_LIMIT_MSG + "\n\n" + ret["reason"]
             logger.info(f"rate limit reached. ip: {ip}. error_msg: {ret['reason']}")
             state.conv.update_last_message(error_msg)
-            yield (state, state.to_gradio_chatbot()) + (no_change_btn,) * 5
+            yield (state, state.to_gradio_chatbot(), "None", "None", "None", "None") + (no_change_btn,) * 5
             return

     conv, model_name = state.conv, state.model_name
@@ -404,6 +418,10 @@ def bot_response(
         yield (
             state,
             state.to_gradio_chatbot(),
+            "None",
+            "None",
+            "None",
+            "None",
             disable_btn,
             disable_btn,
             disable_btn,
@@ -444,18 +462,32 @@ def bot_response(
     )

     conv.update_last_message("▌")
-    yield (state, state.to_gradio_chatbot()) + (disable_btn,) * 5
+    # placeholder metrics until the first streamed chunk arrives
+    yield (state, state.to_gradio_chatbot(), "None", "None", "None", "None") + (disable_btn,) * 5
+    prompt_tokens = 0
+    generated_tokens = 0
+    first_token_latency = None
+    next_token_latencies = np.array([])
+    start_time = time.time()

     try:
         for i, data in enumerate(stream_iter):
             if data["error_code"] == 0:
+                prompt_tokens = data["usage"]["prompt_tokens"]
+                generated_tokens = data["usage"]["completion_tokens"]
                 output = data["text"].strip()
                 conv.update_last_message(output + "▌")
-                yield (state, state.to_gradio_chatbot()) + (disable_btn,) * 5
+                if first_token_latency is None:
+                    first_token_latency = time.time() - start_time
+                else:
+                    next_token_latencies = np.append(next_token_latencies, time.time() - start_time)
+                start_time = time.time()
+                first_latency, next_latency = handle_latency_metrics(first_token_latency, next_token_latencies)
+                yield (state, state.to_gradio_chatbot(), prompt_tokens, generated_tokens, first_latency, next_latency) + (disable_btn,) * 5
             else:
                 output = data["text"] + f"\n\n(error_code: {data['error_code']})"
                 conv.update_last_message(output)
-                yield (state, state.to_gradio_chatbot()) + (
+                yield (state, state.to_gradio_chatbot(), "None", "None", "None", "None") + (
                     disable_btn,
                     disable_btn,
                     disable_btn,
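The loop above restarts `start_time` after every streamed chunk: the first delta becomes the first-token latency, and each later delta is appended to `next_token_latencies`. A minimal self-contained sketch of that timing pattern, with a fake generator standing in for `stream_iter` (all names and delays here are illustrative):

```python
import time

import numpy as np

def fake_stream(n_chunks=5, delay=0.01):
    # Stand-in for stream_iter: yields cumulative-text chunks with a fixed gap.
    for _ in range(n_chunks):
        time.sleep(delay)
        yield {"text": "...more text..."}

first_token_latency = None
next_token_latencies = np.array([])
start_time = time.time()

for data in fake_stream():
    now = time.time()
    if first_token_latency is None:
        first_token_latency = now - start_time   # delay until the first chunk
    else:
        next_token_latencies = np.append(next_token_latencies, now - start_time)
    start_time = now                             # restart the clock per chunk

print(f"first token:     {first_token_latency * 1000:.2f} ms")
print(f"mean next token: {np.mean(next_token_latencies) * 1000:.2f} ms")
```

One caveat: each `data` payload carries the cumulative text so far, and a single chunk may cover several new tokens, so the per-chunk delta approximates rather than exactly equals per-token latency.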
@@ -465,13 +497,14 @@ def bot_response(
                 return
         output = data["text"].strip()
         conv.update_last_message(output)
-        yield (state, state.to_gradio_chatbot()) + (enable_btn,) * 5
+        first_latency, next_latency = handle_latency_metrics(first_token_latency, next_token_latencies)
+        yield (state, state.to_gradio_chatbot(), prompt_tokens, generated_tokens, first_latency, next_latency) + (enable_btn,) * 5
     except requests.exceptions.RequestException as e:
         conv.update_last_message(
             f"{SERVER_ERROR_MSG}\n\n"
             f"(error_code: {ErrorCode.GRADIO_REQUEST_ERROR}, {e})"
         )
-        yield (state, state.to_gradio_chatbot()) + (
+        yield (state, state.to_gradio_chatbot(), "None", "None", "None", "None") + (
             disable_btn,
             disable_btn,
             disable_btn,
@@ -484,7 +517,7 @@ def bot_response(
             f"{SERVER_ERROR_MSG}\n\n"
             f"(error_code: {ErrorCode.GRADIO_STREAM_UNKNOWN_ERROR}, {e})"
         )
-        yield (state, state.to_gradio_chatbot()) + (
+        yield (state, state.to_gradio_chatbot(), "None", "None", "None", "None") + (
             disable_btn,
             disable_btn,
             disable_btn,
@@ -646,7 +679,8 @@ def build_single_model_ui(models, add_promotion_links=False):
     )

     notice_markdown = f"""
-# 🏔️ Chat with Open Large Language Models
+# 🏔️ ChatBot based on Xeon-W & Arc GPUs
+### Deployed with IPEX-LLM
 {promotion}
 """

@@ -717,6 +751,22 @@ def build_single_model_ui(models, add_promotion_links=False):
             label="Max output tokens",
         )

+    with gr.Row():
+        with gr.Column():
+            gr.Markdown("### Performance Metrics")
+            prompt_token = gr.Label(
+                label="Prompt token length:",
+            )
+            next_token = gr.Label(
+                label="Generated token length:",
+            )
+            first_token_latency = gr.Label(
+                label="First token latency:",
+            )
+            next_token_latency = gr.Label(
+                label="Next token latency:",
+            )
+
     if add_promotion_links:
         gr.Markdown(acknowledgment_md, elem_id="ack_markdown")
@@ -743,9 +793,9 @@ def build_single_model_ui(models, add_promotion_links=False):
     ).then(
         bot_response,
         [state, temperature, top_p, max_output_tokens],
-        [state, chatbot] + btn_list,
+        [state, chatbot, prompt_token, next_token, first_token_latency, next_token_latency] + btn_list,
     )
-    clear_btn.click(clear_history, None, [state, chatbot, textbox, imagebox] + btn_list)
+    clear_btn.click(clear_history, None, [state, chatbot, textbox, imagebox, prompt_token, next_token, first_token_latency, next_token_latency] + btn_list)

     model_selector.change(
-        clear_history, None, [state, chatbot, textbox, imagebox] + btn_list
+        clear_history, None, [state, chatbot, textbox, imagebox, prompt_token, next_token, first_token_latency, next_token_latency] + btn_list
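Every event chain that lists the four metric labels as outputs requires `bot_response` to yield exactly six values plus the five button states, in order. A minimal sketch of that Gradio contract, independent of the patch (component names and values are made up, and the button updates are omitted to keep it small):

```python
import gradio as gr

def fake_bot_response(message):
    # One value per declared output, in declaration order.
    yield [[message, "hello"]], "12", "34", "150.00 ms", "25.00 ms"

with gr.Blocks() as demo:
    textbox = gr.Textbox()
    chatbot = gr.Chatbot()
    prompt_token = gr.Label(label="Prompt token length:")
    next_token = gr.Label(label="Generated token length:")
    first_token_latency = gr.Label(label="First token latency:")
    next_token_latency = gr.Label(label="Next token latency:")
    send = gr.Button("Send")
    send.click(
        fake_bot_response,
        [textbox],
        [chatbot, prompt_token, next_token, first_token_latency, next_token_latency],
    )

demo.launch()
```

If a handler returns more or fewer values than its outputs list declares, Gradio raises an error when the event fires; that is why `clear_history` grows four extra empty strings and why every outputs list wired to it must name the four metric labels as well.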
@@ -758,7 +808,7 @@ def build_single_model_ui(models, add_promotion_links=False):
     ).then(
         bot_response,
         [state, temperature, top_p, max_output_tokens],
-        [state, chatbot] + btn_list,
+        [state, chatbot, prompt_token, next_token, first_token_latency, next_token_latency] + btn_list,
     )
     send_btn.click(
         add_text,
@@ -767,7 +817,7 @@ def build_single_model_ui(models, add_promotion_links=False):
     ).then(
         bot_response,
         [state, temperature, top_p, max_output_tokens],
-        [state, chatbot] + btn_list,
+        [state, chatbot, prompt_token, next_token, first_token_latency, next_token_latency] + btn_list,
     )

     return [state, model_selector]
@@ -775,7 +825,7 @@ def build_single_model_ui(models, add_promotion_links=False):

 def build_demo(models):
     with gr.Blocks(
-        title="Chat with Open Large Language Models",
+        title="ChatBot based on Xeon-W & Arc GPUs",
         theme=gr.themes.Default(),
         css=block_css,
     ) as demo:
@@ -885,3 +935,4 @@ if __name__ == "__main__":
         auth=auth,
         root_path=args.gradio_root_path,
     )
+