Enable FastChat benchmark latency measurement (#11017)
* Enable FastChat benchmark
* Add README
* Update README
* Update
This commit is contained in:
		
							parent
							
								
									93d40ab127
								
							
						
					
					
						commit
						2084ebe4ee
					
				
					 1 changed files with 9 additions and 0 deletions
				
			
		| 
						 | 
				
			
			@ -71,6 +71,7 @@ class BigDLLLMWorker(BaseModelWorker):
 | 
			
		|||
        speculative: bool = False,
 | 
			
		||||
        load_low_bit_model: bool = False,
 | 
			
		||||
        stream_interval: int = 4,
 | 
			
		||||
        benchmark: str = "true",
 | 
			
		||||
    ):
 | 
			
		||||
        super().__init__(
 | 
			
		||||
            controller_addr,
 | 
			
		||||
| 
						 | 
				
			
			@ -103,6 +104,10 @@ class BigDLLLMWorker(BaseModelWorker):
 | 
			
		|||
            speculative,
 | 
			
		||||
            load_low_bit_model,
 | 
			
		||||
        )
 | 
			
		||||
        if benchmark.lower() == "true":
 | 
			
		||||
            from ipex_llm.utils.benchmark_util import BenchmarkWrapper
 | 
			
		||||
            self.model = BenchmarkWrapper(self.model, do_print=True)
 | 
			
		||||
            logger.info(f"enable benchmark successfully")
 | 
			
		||||
        self.stream_interval = stream_interval
 | 
			
		||||
        self.context_len = get_context_length(self.model.config)
 | 
			
		||||
        self.embed_in_truncate = embed_in_truncate
 | 
			
		||||
| 
						 | 
				
			
			@ -495,6 +500,9 @@ if __name__ == "__main__":
 | 
			
		|||
        default=False,
 | 
			
		||||
        help="To use self-speculative or not",
 | 
			
		||||
    )
 | 
			
		||||
    parser.add_argument(
 | 
			
		||||
        "--benchmark", type=str, default="true", help="To print model generation latency or not"
 | 
			
		||||
    )
 | 
			
		||||
    parser.add_argument(
 | 
			
		||||
        "--trust-remote-code",
 | 
			
		||||
        action="store_true",
 | 
			
		||||
| 
						 | 
				
			
			@ -527,5 +535,6 @@ if __name__ == "__main__":
 | 
			
		|||
        args.speculative,
 | 
			
		||||
        args.load_low_bit_model,
 | 
			
		||||
        args.stream_interval,
 | 
			
		||||
        args.benchmark,
 | 
			
		||||
    )
 | 
			
		||||
    uvicorn.run(app, host=args.host, port=args.port, log_level="info")
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in a new issue