Add vllm api_server input output log (#12962)
This commit is contained in:
parent 3941f322c5
commit c8a0462507
1 changed file with 45 additions and 1 deletion
@@ -361,6 +361,33 @@ async def show_version():
     return JSONResponse(content=ver)
 
+save_dict = {}
+import os
+flag = os.getenv("VLLM_LOG_OUTPUT", None)
+
+
+async def stream_generator(generator, request, request_id):
+    async for chunk in generator:
+        if request_id not in save_dict:
+            save_dict[request_id] = ""
+        import json
+        try:
+            data = chunk.strip()
+            if data.startswith('data: '):
+                data = data[len('data: '):]
+            else:
+                yield chunk
+            json_data = json.loads(data)
+            if 'choices' in json_data and len(json_data['choices']) > 0:
+                choice = json_data['choices'][0]
+                if 'delta' in choice:
+                    save_dict[request_id] += choice["delta"]["content"]
+                elif 'text' in choice:
+                    save_dict[request_id] += choice["text"]
+        except json.JSONDecodeError:
+            print(f"Received request_id: {request_id}, request: {request} content: {save_dict[request_id]}")
+            pass  # Done
+        yield chunk
+
 
 @router.post("/v1/chat/completions")
 @with_cancellation
 async def create_chat_completion(request: ChatCompletionRequest,
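For reference, a minimal standalone sketch of the accumulation logic in stream_generator above. The sample SSE chunks and request id here are illustrative, not taken from the commit:

import json

save_dict = {}

def accumulate(chunk: str, request_id: str) -> None:
    # Mirror of the parsing in stream_generator: strip the SSE "data: "
    # prefix, decode the JSON payload, and append the streamed text.
    save_dict.setdefault(request_id, "")
    data = chunk.strip()
    if data.startswith('data: '):
        data = data[len('data: '):]
    try:
        json_data = json.loads(data)
    except json.JSONDecodeError:
        return  # e.g. the terminal "data: [DONE]" sentinel
    if json_data.get('choices'):
        choice = json_data['choices'][0]
        if 'delta' in choice:
            save_dict[request_id] += choice['delta'].get('content', '')
        elif 'text' in choice:
            save_dict[request_id] += choice['text']

accumulate('data: {"choices": [{"delta": {"content": "Hello"}}]}', 'chatcmpl-demo')
accumulate('data: {"choices": [{"delta": {"content": ", world"}}]}', 'chatcmpl-demo')
accumulate('data: [DONE]', 'chatcmpl-demo')
print(save_dict['chatcmpl-demo'])  # Hello, world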
@@ -370,6 +397,11 @@ async def create_chat_completion(request: ChatCompletionRequest,
         return base(raw_request).create_error_response(
             message="The model does not support Chat Completions API")
 
+    if flag is not None:
+        request_id = "chatcmpl-" \
+            f"{handler._base_request_id(raw_request, request.request_id)}"
+        print(f"First received request_id: {request_id}, request: {request}")
+
     generator = await handler.create_chat_completion(request, raw_request)
 
     if isinstance(generator, ErrorResponse):
@@ -377,8 +409,12 @@ async def create_chat_completion(request: ChatCompletionRequest,
                                 status_code=generator.code)
 
     elif isinstance(generator, ChatCompletionResponse):
+        if flag is not None:
+            print(f"Received request-id:{request_id}, request:{request}, Output:{generator.model_dump()}")
         return JSONResponse(content=generator.model_dump())
 
+    if flag is not None:
+        return StreamingResponse(content=stream_generator(generator, request, request_id), media_type="text/event-stream")
     return StreamingResponse(content=generator, media_type="text/event-stream")
 
 
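The flag-guarded StreamingResponse path above swaps the raw generator for stream_generator, which observes each chunk and forwards it unchanged. A minimal sketch of that tee pattern, with made-up names for illustration:

import asyncio

async def source():
    # Stand-in for the handler's SSE generator.
    for piece in ('data: one\n\n', 'data: two\n\n'):
        yield piece

async def tee(gen, seen):
    # Observe every chunk for logging, then forward it unchanged,
    # so the client still receives the full event stream.
    async for chunk in gen:
        seen.append(chunk)
        yield chunk

async def main():
    seen = []
    async for chunk in tee(source(), seen):
        pass  # a real server would write each chunk to the response
    print(seen)

asyncio.run(main())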
@@ -390,13 +426,21 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
         return base(raw_request).create_error_response(
             message="The model does not support Completions API")
 
+    if flag is not None:
+        request_id = f"cmpl-{handler._base_request_id(raw_request)}"
+        print(f"First received request_id: {request_id}, request: {request}")
+
     generator = await handler.create_completion(request, raw_request)
     if isinstance(generator, ErrorResponse):
         return JSONResponse(content=generator.model_dump(),
                             status_code=generator.code)
     elif isinstance(generator, CompletionResponse):
+        if flag is not None:
+            print(f"Received request-id:{request_id}, request:{request}, Output:{generator.model_dump()}")
         return JSONResponse(content=generator.model_dump())
 
+    if flag is not None:
+        return StreamingResponse(content=stream_generator(generator, request, request_id), media_type="text/event-stream")
     return StreamingResponse(content=generator, media_type="text/event-stream")
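To try the change: the prints fire whenever VLLM_LOG_OUTPUT is set to any value in the server's environment (the code only checks that os.getenv did not return None), so export it before launching the OpenAI-compatible server and send a request. A hedged client sketch using only the standard library; the port, model name, and prompt are placeholders:

import json
import urllib.request

req = urllib.request.Request(
    'http://localhost:8000/v1/completions',
    data=json.dumps({
        'model': 'my-model',   # placeholder model name
        'prompt': 'Hello',
        'max_tokens': 8,
    }).encode(),
    headers={'Content-Type': 'application/json'},
)
# With VLLM_LOG_OUTPUT set, the server's stdout shows the
# "First received request_id: ..." and "Received request-id: ..." prints.
with urllib.request.urlopen(req) as resp:
    print(json.loads(resp.read()))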