* Add ipex-llm side change * add runable offline_inference * refactor to call vllm2 * Verified async server * add new v2 example * add README * fix * change dir * refactor readme.md * add experimental * fix
60 lines
2.2 KiB
Python
60 lines
2.2 KiB
Python
#
|
|
# Copyright 2016 The BigDL Authors.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
#
|
|
# Some parts of this file is adapted from
|
|
# https://github.com/vllm-project/vllm/blob/v0.2.1.post1/examples/offline_inference.py
|
|
# which is licensed under Apache License 2.0
|
|
#
|
|
# Copyright 2023 The vLLM team. All rights reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
from vllm import SamplingParams
|
|
from ipex_llm.vllm2.engine import IPEXLLMClass as LLM
|
|
|
|
# Sample prompts.
|
|
prompts = [
|
|
"Hello, my name is",
|
|
"The president of the United States is",
|
|
"The capital of France is",
|
|
"The future of AI is",
|
|
]
|
|
# Create a sampling params object.
|
|
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
|
|
|
# Create an LLM.
|
|
llm = LLM(model="YOUR_MODEL",
|
|
device="xpu",
|
|
dtype="float16",
|
|
enforce_eager=True,
|
|
load_in_low_bit="sym_int4")
|
|
# Generate texts from the prompts. The output is a list of RequestOutput objects
|
|
# that contain the prompt, generated text, and other information.
|
|
outputs = llm.generate(prompts, sampling_params)
|
|
# Print the outputs.
|
|
for output in outputs:
|
|
prompt = output.prompt
|
|
generated_text = output.outputs[0].text
|
|
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|