Update oneccl used (#11647)

* Add internal oneccl
* fix
* fix
* add oneccl
parent a4d30a8211
commit 86fc0492f4

6 changed files with 259 additions and 3 deletions
Serving Dockerfile:

@@ -1,3 +1,22 @@
+FROM intelanalytics/ipex-llm-serving-xpu:latest as build
+
+ARG http_proxy
+ARG https_proxy
+
+ADD ./oneccl-binding.patch  /tmp/oneccl-binding.patch
+
+RUN cd /tmp/ && \
+    pip install --upgrade setuptools wheel twine && \
+    pip install "setuptools<70.0.0" && \
+    git clone https://github.com/intel/torch-ccl -b v2.1.100+xpu && \
+    cd torch-ccl && \
+    patch -p1 < /tmp/oneccl-binding.patch && \
+    git submodule sync && \
+    git submodule update --init --recursive && \
+    COMPUTE_BACKEND=dpcpp python setup.py sdist bdist_wheel && \
+    mv /tmp/torch-ccl/dist/oneccl_bind_pt-2.1.100+xpu-cp311-cp311-linux_x86_64.whl /tmp/
+
+
 FROM intelanalytics/ipex-llm-xpu:2.1.0-SNAPSHOT
 
 ARG http_proxy
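The build stage above exists only to produce the patched torch-ccl wheel; the runtime stage copies nothing else out of it. A quick smoke test of the wheel (a sketch, not from the commit: it assumes a Python 3.11 XPU environment with intel-extension-for-pytorch already set up, and that oneccl_bindings_for_pytorch is the module name the wheel installs, as in upstream torch-ccl 2.x):

    pip install /tmp/oneccl_bind_pt-2.1.100+xpu-cp311-cp311-linux_x86_64.whl
    python -c "import torch; import oneccl_bindings_for_pytorch"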
@@ -5,12 +24,15 @@ ARG https_proxy
 
 # Disable pip's cache behavior
 ARG PIP_NO_CACHE_DIR=false
+COPY --from=build /tmp/oneccl_bind_pt-2.1.100+xpu-cp311-cp311-linux_x86_64.whl /tmp/
+ADD ./gradio_web_server.patch /tmp/gradio_web_server.patch
 
 # Install Serving Dependencies
 # Install ipex-llm[serving] only will update ipex_llm source code without updating
 # bigdl-core-xe, which will lead to problems
 RUN apt-get update && \
     apt-get install -y --no-install-recommends libfabric-dev wrk libaio-dev && \
+    apt-get install -y intel-opencl-icd intel-level-zero-gpu=1.3.26241.33-647~22.04 level-zero level-zero-dev --allow-downgrades && \
     pip install --pre --upgrade ipex-llm[xpu,serving] && \
     pip install transformers==4.37.0 gradio==4.19.2 && \
     # Install vLLM-v2 dependencies
@@ -24,7 +46,16 @@ RUN apt-get update && \
     pip install transformers_stream_generator einops tiktoken && \
     # For pipeline serving support
     pip install mpi4py fastapi uvicorn openai && \
-    pip install gradio # for gradio web UI
+    # for gradio web UI
+    pip install gradio && \
+    # Install internal oneccl && \
+    cd /tmp/ && \
+    wget https://sourceforge.net/projects/oneccl-wks/files/oneccl_wks_installer_2024.0.0.2.sh && \
+    bash oneccl_wks_installer_2024.0.0.2.sh && \
+    pip uninstall -y oneccl_bind_pt && \
+    pip install /tmp/oneccl_bind_pt-2.1.100+xpu-cp311-cp311-linux_x86_64.whl && \
+    rm /tmp/oneccl_bind_pt-2.1.100+xpu-cp311-cp311-linux_x86_64.whl && \
+    patch /usr/local/lib/python3.11/dist-packages/fastchat/serve/gradio_web_server.py < /tmp/gradio_web_server.patch
 
 COPY ./vllm_online_benchmark.py        /llm/
 COPY ./vllm_offline_inference.py       /llm/
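After these hunks the runtime image carries both the internal oneCCL runtime (installed under /opt/intel/1ccl-wks by the wks installer) and the matching patched binding wheel. A build sketch, assuming it runs from the directory holding this Dockerfile together with oneccl-binding.patch and gradio_web_server.patch (the tag is arbitrary):

    docker build \
        --build-arg http_proxy=$http_proxy \
        --build-arg https_proxy=$https_proxy \
        -t ipex-llm-serving-xpu:internal-oneccl .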
							
								
								
									
docker/llm/serving/xpu/docker/gradio_web_server.patch  (new file, 208 lines)
@@ -0,0 +1,208 @@
--- gradio_web_server.py	2024-06-20 14:21:48.013518726 +0800
+++ gradio_web_server_new.py	2024-06-20 14:23:09.822830709 +0800
@@ -9,8 +9,10 @@
 import json
 import os
 import random
+import pandas as pd
 import time
 import uuid
+import numpy as np
 
 import gradio as gr
 import requests
@@ -241,7 +243,7 @@
     ip = get_ip(request)
     logger.info(f"clear_history. ip: {ip}")
     state = None
-    return (state, [], "", None) + (disable_btn,) * 5
+    return (state, [], "", None, "", "", "", "") + (disable_btn,) * 5
 
 
 def get_ip(request: gr.Request):
@@ -354,6 +356,18 @@
         return None
 
 
+def handle_latency_metrics(first_token_time, next_token_time):
+    # next token time is a numpy array...
+    # first token time might be None
+    first_token_latency = "None"
+    next_token_latency = "None"
+    if first_token_time is not None:
+        first_token_latency = str(first_token_time * 1000) + " ms"
+    if next_token_time.size > 0:
+        next_token_latency = str(np.mean(next_token_time) * 1000) + " ms"
+    return first_token_latency, next_token_latency
+
+
 def bot_response(
     state,
     temperature,
@@ -372,7 +386,7 @@
     if state.skip_next:
         # This generate call is skipped due to invalid inputs
         state.skip_next = False
-        yield (state, state.to_gradio_chatbot()) + (no_change_btn,) * 5
+        yield (state, state.to_gradio_chatbot(), "None", "None", "None", "None") + (no_change_btn,) * 5
         return
 
     if apply_rate_limit:
@@ -381,7 +395,7 @@
             error_msg = RATE_LIMIT_MSG + "\n\n" + ret["reason"]
             logger.info(f"rate limit reached. ip: {ip}. error_msg: {ret['reason']}")
             state.conv.update_last_message(error_msg)
-            yield (state, state.to_gradio_chatbot()) + (no_change_btn,) * 5
+            yield (state, state.to_gradio_chatbot(), "None", "None", "None", "None") + (no_change_btn,) * 5
             return
 
     conv, model_name = state.conv, state.model_name
@@ -404,6 +418,10 @@
             yield (
                 state,
                 state.to_gradio_chatbot(),
+                "None",
+                "None",
+                "None",
+                "None",
                 disable_btn,
                 disable_btn,
                 disable_btn,
@@ -444,18 +462,32 @@
         )
 
     conv.update_last_message("▌")
-    yield (state, state.to_gradio_chatbot()) + (disable_btn,) * 5
+    # We probably need to change this method
+    yield (state, state.to_gradio_chatbot(), "None", "None", "None", "None") + (disable_btn,) * 5
+    prompt_tokens = 0
+    generated_tokens = 0
+    first_token_latency = None
+    next_token_latencies = np.array([])
+    start_time = time.time()
 
     try:
         for i, data in enumerate(stream_iter):
             if data["error_code"] == 0:
+                prompt_tokens = data["usage"]["prompt_tokens"]
+                generated_tokens = data["usage"]["completion_tokens"]
                 output = data["text"].strip()
                 conv.update_last_message(output + "▌")
-                yield (state, state.to_gradio_chatbot()) + (disable_btn,) * 5
+                if first_token_latency is None:
+                    first_token_latency = time.time() - start_time
+                else:
+                    next_token_latencies = np.append(next_token_latencies, time.time() - start_time)
+                start_time = time.time()
+                first_latency, next_latency = handle_latency_metrics(first_token_latency, next_token_latencies)
+                yield (state, state.to_gradio_chatbot(), prompt_tokens, generated_tokens, first_latency, next_latency) + (disable_btn,) * 5
             else:
                 output = data["text"] + f"\n\n(error_code: {data['error_code']})"
                 conv.update_last_message(output)
-                yield (state, state.to_gradio_chatbot()) + (
+                yield (state, state.to_gradio_chatbot(), "None", "None", "None", "None") + (
                     disable_btn,
                     disable_btn,
                     disable_btn,
@@ -465,13 +497,14 @@
                 return
         output = data["text"].strip()
         conv.update_last_message(output)
-        yield (state, state.to_gradio_chatbot()) + (enable_btn,) * 5
+        first_latency, next_latency = handle_latency_metrics(first_token_latency, next_token_latencies)
+        yield (state, state.to_gradio_chatbot(), prompt_tokens, generated_tokens, first_latency, next_latency) + (enable_btn,) * 5
     except requests.exceptions.RequestException as e:
         conv.update_last_message(
             f"{SERVER_ERROR_MSG}\n\n"
             f"(error_code: {ErrorCode.GRADIO_REQUEST_ERROR}, {e})"
         )
-        yield (state, state.to_gradio_chatbot()) + (
+        yield (state, state.to_gradio_chatbot(), "None", "None", "None", "None") + (
             disable_btn,
             disable_btn,
             disable_btn,
@@ -484,7 +517,7 @@
             f"{SERVER_ERROR_MSG}\n\n"
             f"(error_code: {ErrorCode.GRADIO_STREAM_UNKNOWN_ERROR}, {e})"
         )
-        yield (state, state.to_gradio_chatbot()) + (
+        yield (state, state.to_gradio_chatbot(), "None", "None", "None", "None") + (
             disable_btn,
             disable_btn,
             disable_btn,
@@ -646,7 +679,8 @@
     )
 
     notice_markdown = f"""
-# 🏔️ Chat with Open Large Language Models
+# 🏔️ ChatBot based Xeon-W & Arc GPUs
+###         Deployed with IPEX-LLM
 {promotion}
 """
 
@@ -691,6 +725,26 @@
         regenerate_btn = gr.Button(value="🔄  Regenerate", interactive=False)
         clear_btn = gr.Button(value="🗑️  Clear history", interactive=False)
 
+    with gr.Row():
+        with gr.Column():
+            gr.Markdown("### Performance Metrics")
+            prompt_token = gr.Textbox(
+                label="Prompt token length:",
+                interactive=False,
+            )
+            next_token = gr.Textbox(
+                label="Generated token length:",
+                interactive=False,
+            )
+            first_token_latency = gr.Textbox(
+                interactive=False,
+                label="First token Latency:",
+            )
+            next_token_latency = gr.Textbox(
+                interactive=False,
+                label="Next token Latency:",
+            )
+
     with gr.Accordion("Parameters", open=False) as parameter_row:
         temperature = gr.Slider(
             minimum=0.0,
@@ -743,9 +797,9 @@
     ).then(
         bot_response,
         [state, temperature, top_p, max_output_tokens],
-        [state, chatbot] + btn_list,
+        [state, chatbot, prompt_token, next_token, first_token_latency, next_token_latency] + btn_list,
     )
-    clear_btn.click(clear_history, None, [state, chatbot, textbox, imagebox] + btn_list)
+    clear_btn.click(clear_history, None, [state, chatbot, textbox, imagebox, prompt_token, next_token, first_token_latency, next_token_latency] + btn_list)
 
     model_selector.change(
         clear_history, None, [state, chatbot, textbox, imagebox] + btn_list
@@ -758,7 +812,7 @@
     ).then(
         bot_response,
         [state, temperature, top_p, max_output_tokens],
-        [state, chatbot] + btn_list,
+        [state, chatbot, prompt_token, next_token, first_token_latency, next_token_latency] + btn_list,
     )
     send_btn.click(
         add_text,
@@ -767,7 +821,7 @@
     ).then(
         bot_response,
         [state, temperature, top_p, max_output_tokens],
-        [state, chatbot] + btn_list,
+        [state, chatbot, prompt_token, next_token, first_token_latency, next_token_latency] + btn_list,
     )
 
     return [state, model_selector]
@@ -775,7 +829,7 @@
 
 def build_demo(models):
     with gr.Blocks(
-        title="Chat with Open Large Language Models",
+        title="ChatBot based Xeon-W & Arc GPUs",
         theme=gr.themes.Default(),
         css=block_css,
     ) as demo:
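The patch threads four new outputs through bot_response: the first streamed chunk sets first_token_latency, every later chunk appends its inter-chunk gap to next_token_latencies, and handle_latency_metrics renders the first-token time and the mean next-token time in milliseconds, alongside prompt and generated token counts taken from the stream's usage field. The values land in a new "Performance Metrics" row of read-only textboxes, which clear_history also resets. To see the panel, start FastChat inside the image as usual (a sketch using the standard FastChat entrypoints; the model path matches the lightweight-serving script below, and the port is a placeholder):

    python -m fastchat.serve.controller &
    python -m fastchat.serve.model_worker --model-path /llm/models/Llama-2-7b-chat-hf &
    python -m fastchat.serve.gradio_web_server --port 8002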
							
								
								
									
docker/llm/serving/xpu/docker/oneccl-binding.patch  (new file, 14 lines)
@@ -0,0 +1,14 @@
diff --git a/src/gpu/dpcpp_ccl.cpp b/src/gpu/dpcpp_ccl.cpp
index 3bd8087..c5b5ce3 100644
--- a/src/gpu/dpcpp_ccl.cpp
+++ b/src/gpu/dpcpp_ccl.cpp
@@ -689,7 +689,8 @@ c10::intrusive_ptr<ProcessGroupCCL::AsyncWorkCCL> XPUCCLStubs::allreduce_(std::v
                                             stream,
                                             attr), stream.get_native());
       });
-    // printf("Use One CCL allreduce.\n");
+    stream.get_native().wait();
+   // printf("Use One CCL allreduce.\n");
     return ret_evt;
   },
   c10d::OpType::ALLREDUCE);
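The functional change here is a single line: stream.get_native().wait() blocks on the underlying SYCL queue before allreduce_ returns its event, making the collective effectively synchronous. Presumably this guarantees the reduced buffers are complete before dependent kernels launch when running against the internal oneCCL runtime installed above.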
Lightweight-serving start script:

@@ -1,4 +1,5 @@
 cd /llm/lightweight_serving
 model_path="/llm/models/Llama-2-7b-chat-hf"
 low_bit="sym_int4"
+source /opt/intel/1ccl-wks/setvars.sh
 python lightweight_serving.py --repo-id-or-model-path $model_path --low-bit $low_bit
Serving environment setup script:

@@ -6,7 +6,8 @@ export OMP_NUM_THREADS=32
 export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so
 basekit_root=/opt/intel/oneapi
 source $basekit_root/setvars.sh --force
-source $basekit_root/ccl/latest/env/vars.sh --force
+# source $basekit_root/ccl/latest/env/vars.sh --force
+source /opt/intel/1ccl-wks/setvars.sh
 
 export USE_XETLA=OFF
 if [[ $KERNEL_VERSION != *"6.5"* ]]; then
vLLM API-server start script:

@@ -2,6 +2,7 @@
 model="YOUR_MODEL_PATH"
 served_model_name="YOUR_MODEL_NAME"
 
+source /opt/intel/1ccl-wks/setvars.sh
 
 python -m ipex_llm.vllm.xpu.entrypoints.openai.api_server \
   --served-model-name $served_model_name \
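All three start scripts converge on the same runtime pattern: source /opt/intel/1ccl-wks/setvars.sh (instead of the basekit's ccl vars) before launching. A quick sanity check that the environment took effect (a sketch; it assumes the installer's setvars.sh prepends its library directories to LD_LIBRARY_PATH, in the style of oneAPI setvars scripts):

    source /opt/intel/1ccl-wks/setvars.sh
    echo $LD_LIBRARY_PATH | tr ':' '\n' | grep 1ccl-wks
    python -c "import torch; import oneccl_bindings_for_pytorch"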