#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Some parts of this file are adapted from
# https://github.com/vllm-project/vllm/blob/v0.2.1.post1/examples/offline_inference.py
# which is licensed under Apache License 2.0
#
# Copyright 2023 The vLLM team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from bigdl.llm.vllm.entrypoints.llm import LLM
from bigdl.llm.vllm.sampling_params import SamplingParams

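# LLM and SamplingParams mirror the upstream vLLM classes of the same names:
# LLM wraps model loading and batched generation, while SamplingParams holds
# the decoding configuration (this correspondence is assumed from the vLLM
# example this file is adapted from).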
# Sample prompts.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
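# The call above samples with temperature 0.8 and nucleus (top-p) 0.95. A
# minimal sketch of common alternatives, assuming this port keeps upstream
# vLLM's SamplingParams fields such as `temperature` and `max_tokens`:
#
#   sampling_params = SamplingParams(temperature=0)  # greedy decoding
#   sampling_params = SamplingParams(temperature=0.8, top_p=0.95,
#                                    max_tokens=128)  # cap generation length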

# Create an LLM.
# llm = LLM(model="facebook/opt-125m")
llm = LLM(model="YOUR_MODEL_PATH", dtype="bfloat16", device="xpu")
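# Note: YOUR_MODEL_PATH is a placeholder for the local path of a downloaded
# model. device="xpu" targets an Intel GPU and is assumed to require an
# XPU-enabled bigdl-llm installation with a matching oneAPI runtime.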
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
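
# To run this example (assuming the script is saved as offline_inference.py
# and, for XPU, that the oneAPI environment has been sourced first, e.g.
# `source /opt/intel/oneapi/setvars.sh`):
#
#   python offline_inference.py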