Add lightweight-serving whisper asr example (#11847)
* add asr init
* update for pp
* update style
* update readme
* update readme
parent a8e2573421
commit 5c4ed00593
6 changed files with 177 additions and 54 deletions
@@ -22,6 +22,10 @@ conda install -c conda-forge -y gperftools=2.10 # to enable tcmalloc
# for internlm-xcomposer2-vl-7b
pip install transformers==4.31.0
pip install accelerate timm==0.4.12 sentencepiece==0.1.99 gradio==3.44.4 markdown2==2.4.10 xlsxwriter==3.1.2 einops

# for whisper-large-v3
pip install transformers==4.36.2
pip install datasets soundfile librosa # required by audio processing
```

#### 1.2 Installation on Windows

@@ -35,6 +39,14 @@ pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-exte
pip install fastapi uvicorn openai
pip install gradio # for gradio web UI
conda install -c conda-forge -y gperftools=2.10 # to enable tcmalloc

# for internlm-xcomposer2-vl-7b
pip install transformers==4.31.0
pip install accelerate timm==0.4.12 sentencepiece==0.1.99 gradio==3.44.4 markdown2==2.4.10 xlsxwriter==3.1.2 einops

# for whisper-large-v3
pip install transformers==4.36.2
pip install datasets soundfile librosa # required by audio processing
```

### 2. Configure OneAPI environment variables for Linux

@@ -180,7 +192,7 @@ curl http://localhost:8000/v1/chat/completions \

Image input is currently only supported for [internlm-xcomposer2-vl-7b](https://huggingface.co/internlm/internlm-xcomposer2-vl-7b), and it requires transformers==4.31.0 to run.
```bash
wget -O ./test.jpg http://farm6.staticflickr.com/5268/5602445367_3504763978_z.jpg
wget -O /llm/lightweight_serving/test.jpg http://farm6.staticflickr.com/5268/5602445367_3504763978_z.jpg
curl http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{

@@ -219,6 +231,17 @@ curl http://localhost:8000/v1/completions \
}'
```

#### v1/audio/transcriptions

ASR currently only supports [whisper-large-v3](https://huggingface.co/openai/whisper-large-v3), and `whisper-large-v3` can only be used for audio transcription. The audio file type must be one that `librosa.load` supports.
```bash
curl http://localhost:8000/v1/audio/transcriptions \
  -H "Content-Type: multipart/form-data" \
  -F file="@/llm/test.mp3" \
  -F model="whisper-large-v3" \
  -F language="zh"
```
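
The same request can also be made from Python; a minimal sketch using the `requests` package (assumptions: the server is reachable at `localhost:8000` and `test.mp3` is a local audio file readable by the server-side `librosa`):
```python
# Minimal Python client sketch for the transcription endpoint above.
# The file/model/language multipart form fields mirror the curl example.
import requests

with open("test.mp3", "rb") as audio_file:
    response = requests.post(
        "http://localhost:8000/v1/audio/transcriptions",
        files={"file": audio_file},
        data={"model": "whisper-large-v3", "language": "zh"},
    )
print(response.json()["text"])  # transcribed text returned by the server
```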

### 6. Benchmark with wrk

Please refer to [here](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/GPU/Pipeline-Parallel-Serving#4-benchmark-with-wrk) for more details.

@@ -39,12 +39,19 @@ async def main():
    model_path = args.repo_id_or_model_path
    low_bit = args.low_bit

    local_model = ModelWorker(model_path, low_bit)
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, padding_side='left')
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    myapp = FastApp(local_model, tokenizer)
    processor = None
    if "whisper" not in model_path.lower():
        local_model = ModelWorker(model_path, low_bit)
        # Load tokenizer
        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, padding_side='left')
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
    else:
        local_model = ModelWorker(model_path, low_bit, "audio", torch_dtype=torch.float32)
        from transformers import WhisperProcessor
        processor = WhisperProcessor.from_pretrained(model_path)
        tokenizer = processor.tokenizer
    myapp = FastApp(local_model, tokenizer, processor)
    config = uvicorn.Config(app=myapp.app, host="0.0.0.0", port=args.port)
    server = uvicorn.Server(config)
    await server.serve()

@@ -27,6 +27,8 @@ import uuid
from typing import List, Optional, Union, Dict
from fastapi.middleware.cors import CORSMiddleware
from .tgi_protocol import Parameters
from typing_extensions import Literal
from fastapi import File, UploadFile, Form
from .openai_protocol import (
    ChatCompletionResponseStreamChoice,
    ChatCompletionStreamResponse,

@@ -38,6 +40,8 @@ from .openai_protocol import (
    CompletionResponse,
    CompletionResponseStreamChoice,
    CompletionStreamResponse,
    TranscriptionRequest,
    TranscriptionResponse,
)

result_dict: Dict[str, str] = {}

@@ -50,6 +54,7 @@ class InputsRequest(BaseModel):
    image_list: Optional[list] = None
    stream: Optional[bool] = False
    req_type: str = 'completion'
    transcription_request: Optional[TranscriptionRequest] = None


class ChatCompletionRequest(BaseModel):

@@ -92,20 +97,27 @@ app.add_middleware(

global tokenizer
global local_model
global processor


class FastApp():
    def __init__(self, model, mytokenizer):
    def __init__(self, model, mytokenizer, myprocessor=None):
        global tokenizer
        global local_model
        global processor
        local_model = model
        tokenizer = mytokenizer
        processor = myprocessor
        self.app = app


def get_queue_next_token(delta_text_queue):
    timeout = int(os.getenv("IPEX_LLM_FASTAPI_TIMEOUT", 60))
    delta_text = delta_text_queue.text_queue.get(timeout=timeout)
    if "whisper" in local_model.model_name.lower():
        if delta_text is not None and "<|" in delta_text and "|>" in delta_text:
            import re
            delta_text = re.sub(r'<\|.*?\|>', '', delta_text)
    if delta_text is None:
        remain = 0
    else:
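
For reference, the `re.sub` call above strips Whisper's `<|...|>` control tokens from the streamed text; a quick standalone check (the token strings below are real Whisper special tokens, used here only as sample input):
```python
# Standalone check of the special-token stripping applied to whisper streaming output.
import re

delta_text = "<|startoftranscript|><|zh|><|transcribe|><|notimestamps|>你好，世界"
print(re.sub(r'<\|.*?\|>', '', delta_text))  # -> 你好，世界
```
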
@@ -385,6 +397,32 @@ async def create_completion(request: CompletionRequest):
    return result


@app.post("/v1/audio/transcriptions")
async def transcriptions(
    file: UploadFile=File(...),
    model: Optional[str]=Form("default_model"),
    language: Optional[str]=Form("zh"),
    prompt: Optional[str]=Form(None),
    response_format: Optional[Literal["json", "text", "srt", "verbose_json", "vtt"]]=Form(None),
    temperature: Optional[float]=Form(None),
    timestamp_granularities: Optional[List[Literal["word", "segment"]]]=Form(None)
):
    file_path = "./" + file.filename
    if not os.path.exists(file_path):
        with open(file_path, "wb") as f:
            f.write(await file.read())
    inputs_request = InputsRequest(
        inputs="transcriptions",
        parameters=None,
        stream=False,
        req_type="completion",
        transcription_request=TranscriptionRequest(file=file_path, model=model, language=language)
    )
    request_id, result = await generate(inputs_request)
    rsp = TranscriptionResponse(text=result)
    return rsp


@app.on_event("startup")
async def startup_event():
    asyncio.create_task(process_requests(local_model, result_dict))

@@ -393,4 +431,4 @@ async def startup_event():
async def process_requests(local_model, result_dict):
    while True:
        await asyncio.sleep(0)
        await local_model.process_step(tokenizer, result_dict)
        await local_model.process_step(tokenizer, result_dict, processor)

@@ -23,37 +23,69 @@ logger = logging.get_logger(__name__)


class ModelWorker:
    def __init__(self, checkpoint, low_bit, torch_dtype=torch.float16):
    def __init__(self, checkpoint, low_bit, model_type="normal", torch_dtype=torch.float16):
        self.dtype = torch_dtype
        start = time.perf_counter()
        model = self.load_model(checkpoint, low_bit)
        from ipex_llm.utils import BenchmarkWrapper
        self.model = BenchmarkWrapper(model, do_print=True)
        if model_type == "audio":
            self.model = self.load_model(checkpoint, low_bit, "audio")
        else:
            model = self.load_model(checkpoint, low_bit)
            from ipex_llm.utils import BenchmarkWrapper
            self.model = BenchmarkWrapper(model, do_print=True)
        end = time.perf_counter()
        logger.info(f"Time to load weights: {end - start:.2f}s")
        self.waiting_requests = asyncio.Queue()
        self.streamer = {}
        self.model_name = checkpoint

    def load_model(self, model_path, low_bit='sym_int4'):
        from ipex_llm.transformers import AutoModelForCausalLM, AutoModel
        try:
            model = AutoModelForCausalLM.from_pretrained(model_path,
                                                         load_in_low_bit=low_bit,
                                                         torch_dtype=self.dtype,
                                                         optimize_model=True,
                                                         trust_remote_code=True,
                                                         use_cache=True,)
        except:
            model = AutoModel.from_pretrained(model_path,
                                              load_in_low_bit=low_bit,
                                              torch_dtype=self.dtype,
                                              optimize_model=True,
                                              trust_remote_code=True,
                                              use_cache=True,)
    def load_model(self, model_path, low_bit='sym_int4', model_type="normal"):
        if model_type == "audio":
            from ipex_llm.transformers import AutoModelForSpeechSeq2Seq
            model = AutoModelForSpeechSeq2Seq.from_pretrained(model_path,
                                                              load_in_low_bit=low_bit,
                                                              torch_dtype=self.dtype,
                                                              optimize_model=True,
                                                              trust_remote_code=True,
                                                              use_cache=True)
        else:
            from ipex_llm.transformers import AutoModelForCausalLM, AutoModel
            try:
                model = AutoModelForCausalLM.from_pretrained(model_path,
                                                             load_in_low_bit=low_bit,
                                                             torch_dtype=self.dtype,
                                                             optimize_model=True,
                                                             trust_remote_code=True,
                                                             use_cache=True,)
            except:
                model = AutoModel.from_pretrained(model_path,
                                                  load_in_low_bit=low_bit,
                                                  torch_dtype=self.dtype,
                                                  optimize_model=True,
                                                  trust_remote_code=True,
                                                  use_cache=True,)
        model = model.eval().to("xpu")
        return model

    async def add_asr_request(self, processor):
        if self.waiting_requests.empty():
            return
        tmp_result = await self.waiting_requests.get()
        request_id, request = tmp_result
        transcription_request = request.transcription_request
        forced_decoder_ids = processor.get_decoder_prompt_ids(
            language=transcription_request.language, task="transcribe")
        audio_path = transcription_request.file
        import librosa
        raw_speech, sampling_rate = librosa.load(audio_path,
                                                 sr=processor.feature_extractor.sampling_rate)
        input_features = processor(
            raw_speech,
            sampling_rate=sampling_rate,
            return_tensors="pt",
            return_attention_mask=True,
        ).input_features.to('xpu')
        return input_features, forced_decoder_ids, request_id

    async def add_request(self, tokenizer):
        if self.waiting_requests.empty():
            return

@@ -91,33 +123,41 @@ class ModelWorker:
        return input_ids, parameters, request_id, inputs_embeds

    @torch.no_grad()
    async def process_step(self, tokenizer, result_dict):
    async def process_step(self, tokenizer, result_dict, processor=None):
        if not self.waiting_requests.empty():
            input_ids, parameters, request_id, inputs_embeds = await self.add_request(tokenizer)
            self.streamer[request_id] = TextIteratorStreamer(tokenizer, skip_prompt=True)
            if processor is not None and "whisper" in self.model_name.lower():
                input_features, decoder_ids, request_id = await self.add_asr_request(processor)
                self.streamer[request_id] = TextIteratorStreamer(tokenizer, skip_prompt=True)

            def model_generate():
                generate_kwargs = {k: v for k, v in parameters.dict().items() if v is not None}
                if "codegeex" in self.model_name.lower():
                    eos_token_id = [tokenizer.eos_token_id,
                                    tokenizer.convert_tokens_to_ids("<|user|>"),
                                    tokenizer.convert_tokens_to_ids("<|observation|>")]
                    generate_kwargs["eos_token_id"] = eos_token_id
                elif "internlm-xcomposer2-vl-7b" in self.model_name.lower():
                    eos_token_id = [
                        tokenizer.eos_token_id,
                        tokenizer.convert_tokens_to_ids(['[UNUSED_TOKEN_145]'])[0]
                    ]
                    generate_kwargs["eos_token_id"] = eos_token_id
                if input_ids is not None:
                    self.model.generate(input_ids,
                                        streamer=self.streamer[request_id], **generate_kwargs)
                elif inputs_embeds is not None:
                    self.model.generate(inputs_embeds=inputs_embeds,
                                        streamer=self.streamer[request_id], **generate_kwargs)
                torch.xpu.empty_cache()
                torch.xpu.synchronize()
                def model_generate():
                    self.model.generate(input_features,
                                        streamer=self.streamer[request_id],
                                        forced_decoder_ids=decoder_ids)
            else:
                input_ids, parameters, request_id, inputs_embeds = await self.add_request(tokenizer)
                self.streamer[request_id] = TextIteratorStreamer(tokenizer, skip_prompt=True)

                def model_generate():
                    generate_kwargs = {k: v for k, v in parameters.dict().items() if v is not None}
                    if "codegeex" in self.model_name.lower():
                        eos_token_id = [tokenizer.eos_token_id,
                                        tokenizer.convert_tokens_to_ids("<|user|>"),
                                        tokenizer.convert_tokens_to_ids("<|observation|>")]
                        generate_kwargs["eos_token_id"] = eos_token_id
                    elif "internlm-xcomposer2-vl-7b" in self.model_name.lower():
                        eos_token_id = [
                            tokenizer.eos_token_id,
                            tokenizer.convert_tokens_to_ids(['[UNUSED_TOKEN_145]'])[0]
                        ]
                        generate_kwargs["eos_token_id"] = eos_token_id
                    if input_ids is not None:
                        self.model.generate(input_ids,
                                            streamer=self.streamer[request_id], **generate_kwargs)
                    elif inputs_embeds is not None:
                        self.model.generate(inputs_embeds=inputs_embeds,
                                            streamer=self.streamer[request_id], **generate_kwargs)
                    torch.xpu.empty_cache()
                    torch.xpu.synchronize()
            from threading import Thread
            t1 = Thread(target=model_generate)
            t1.start()
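
Outside the serving loop, the whisper path added above can be exercised directly; a minimal standalone sketch based on the same calls (assumptions: an Intel GPU with ipex-llm XPU support installed, `openai/whisper-large-v3` available as the model path, and a local `test.mp3` readable by `librosa`):
```python
# Standalone sketch of the low-bit whisper path used by ModelWorker/add_asr_request.
import librosa
import torch
from transformers import WhisperProcessor
from ipex_llm.transformers import AutoModelForSpeechSeq2Seq

model_path = "openai/whisper-large-v3"  # assumption: downloaded locally or fetched from HF
processor = WhisperProcessor.from_pretrained(model_path)
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_path,
                                                  load_in_low_bit="sym_int4",
                                                  torch_dtype=torch.float32,
                                                  optimize_model=True,
                                                  trust_remote_code=True,
                                                  use_cache=True)
model = model.eval().to("xpu")

# Load audio at whisper's expected sampling rate and build input features.
raw_speech, sampling_rate = librosa.load("test.mp3",
                                         sr=processor.feature_extractor.sampling_rate)
input_features = processor(raw_speech, sampling_rate=sampling_rate,
                           return_tensors="pt").input_features.to("xpu")

# Force Chinese transcription, mirroring the server's default language="zh".
forced_decoder_ids = processor.get_decoder_prompt_ids(language="zh", task="transcribe")
with torch.no_grad():
    predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)
print(processor.batch_decode(predicted_ids, skip_special_tokens=True)[0])
```
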
@@ -24,6 +24,7 @@ from openai.types.chat import ChatCompletionMessageParam
from pydantic import BaseModel, ConfigDict, Field, model_validator
from typing_extensions import Annotated
from ipex_llm.utils.common import invalidInputError
from typing_extensions import Literal


# from vllm.sampling_params import SamplingParams

@@ -31,6 +32,20 @@ def random_uuid() -> str:
    return str(uuid.uuid4().hex)


class TranscriptionRequest(BaseModel):
    file: str = None
    model: Optional[str] = "default_model"
    language: Optional[str] = "zh"
    prompt: Optional[str] = None
    response_format: Optional[Literal["json", "text", "srt", "verbose_json", "vtt"]] = None
    temperature: Optional[float] = None
    timestamp_granularities: Optional[List[Literal["word", "segment"]]] = None


class TranscriptionResponse(BaseModel):
    text: str


class OpenAIBaseModel(BaseModel):
    # OpenAI API does not allow extra fields
    model_config = ConfigDict(extra="forbid")

@@ -800,7 +800,7 @@ class PPModelWorker:
            _stream_tasks.append(self.streamer[request_id].put((remain, printable_text)))
        await asyncio.gather(*_stream_tasks)

    async def process_step(self, tokenizer, result_dict):
    async def process_step(self, tokenizer, result_dict, processor=None):
        cur_batch = None
        torch.xpu.synchronize(self.device)
        if self.rank == 0: