#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Some parts of this file are adapted from
# https://github.com/vllm-project/vllm/blob/v0.2.1.post1/examples/offline_inference.py
# which is licensed under Apache License 2.0
#
# Copyright 2023 The vLLM team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

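# Offline batched inference example: generate completions for the sample
# prompts below with vLLM, using IPEX-LLM's engine wrapper (IPEXLLMClass)
# to run the model on an Intel GPU (XPU) with low-bit (sym_int4) weights.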
from vllm import SamplingParams
from ipex_llm.vllm.engine import IPEXLLMClass as LLM

# Sample prompts.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Create an LLM.
llm = LLM(model="YOUR_MODEL",            # local path or Hugging Face id of the model
          device="xpu",                  # run on Intel GPU via IPEX-LLM
          dtype="float16",               # data type for model weights and activations
          enforce_eager=True,            # always run in eager mode (no graph capture)
          load_in_low_bit="sym_int4",    # IPEX-LLM low-bit format: symmetric INT4 weights
          tensor_parallel_size=1)        # single-card inference
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
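# To try this example (assuming an ipex-llm XPU environment with its vLLM
# integration installed), replace "YOUR_MODEL" above with a local model path
# or a Hugging Face model id and run the script with python.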