Fix the non-stop generation issue in the llama3 examples (#10860)
* fix not stop issue in GPU/HF-Transformers-AutoModels
* fix not stop issue in GPU/PyTorch-Models/Model/llama3
* fix not stop issue in CPU/HF-Transformers-AutoModels/Model/llama3
* fix not stop issue in CPU/PyTorch-Models/Model/llama3
* update the output in readme
* update format
* add reference
* update prompt format
* update output format in readme
* update example output in readme
parent 5c9eb5d0f5
commit 328b1a1de9

8 changed files with 70 additions and 28 deletions
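Every llama3 example below gets the same two-part fix: the prompt builder switches to the official Llama 3 chat template (a blank line, i.e. `\n\n`, after each header marker), and `model.generate` is given both the tokenizer's EOS token and `<|eot_id|>` as terminators, so generation stops at the end of the assistant turn instead of running until `max_new_tokens`. A minimal sketch of the combined change (the model path and prompt are placeholders, and the model is loaded with plain `transformers` here rather than the ipex-llm low-bit loaders used in the actual examples):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "meta-llama/Meta-Llama-3-8B-Instruct"  # placeholder

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16)

# Official Llama 3 chat format: a blank line (\n\n) after every header marker.
prompt = ("<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"
          "What is AI?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n")

# Stop on either the regular EOS token or Llama 3's end-of-turn token.
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]

with torch.inference_mode():
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    output = model.generate(input_ids,
                            eos_token_id=terminators,
                            max_new_tokens=32)
    print(tokenizer.decode(output[0], skip_special_tokens=False))
```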
@@ -57,12 +57,16 @@ numactl -C 0-47 -m 0 python ./generate.py
 Inference time: xxxx s
 -------------------- Prompt --------------------
 <|begin_of_text|><|start_header_id|>user<|end_header_id|>
+
 What is AI?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
 
+
 -------------------- Output (skip_special_tokens=False) --------------------
 <|begin_of_text|><|start_header_id|>user<|end_header_id|>
-What is AI?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
-A question that gets to the heart of the 21st century!
+
+What is AI?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
 
-Artificial Intelligence (AI) refers to the development of computer systems that can perform tasks that
+Artificial Intelligence (AI) refers to the development of computer systems that can perform tasks that would typically require human intelligence, such as:
+
+1. Learning: AI
 ```
@@ -31,13 +31,13 @@ def get_prompt(user_input: str, chat_history: list[tuple[str, str]],
     prompt_texts = [f'<|begin_of_text|>']
 
     if system_prompt != '':
-        prompt_texts.append(f'<|start_header_id|>system<|end_header_id|>\n{system_prompt}<|eot_id|>')
+        prompt_texts.append(f'<|start_header_id|>system<|end_header_id|>\n\n{system_prompt}<|eot_id|>')
 
     for history_input, history_response in chat_history:
-        prompt_texts.append(f'<|start_header_id|>user<|end_header_id|>\n{history_input.strip()}<|eot_id|>')
-        prompt_texts.append(f'<|start_header_id|>assistant<|end_header_id|>\n{history_response.strip()}<|eot_id|>')
+        prompt_texts.append(f'<|start_header_id|>user<|end_header_id|>\n\n{history_input.strip()}<|eot_id|>')
+        prompt_texts.append(f'<|start_header_id|>assistant<|end_header_id|>\n\n{history_response.strip()}<|eot_id|>')
 
-    prompt_texts.append(f'<|start_header_id|>user<|end_header_id|>\n{user_input.strip()}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n')
+    prompt_texts.append(f'<|start_header_id|>user<|end_header_id|>\n\n{user_input.strip()}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n')
     return ''.join(prompt_texts)
 
 if __name__ == '__main__':
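For reference, with the patched `get_prompt` a single-turn call (no system prompt, empty history) now produces exactly the prompt shown in the README output above; a hypothetical quick check:

```python
# Hypothetical usage of the patched get_prompt: no system prompt, empty history.
prompt = get_prompt("What is AI?", [], system_prompt='')
print(prompt)
# <|begin_of_text|><|start_header_id|>user<|end_header_id|>
#
# What is AI?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
#
```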
@@ -63,6 +63,12 @@ if __name__ == '__main__':
 
     # Load tokenizer
     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
+    # here the terminators refer to https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct#transformers-automodelforcausallm
+    terminators = [
+        tokenizer.eos_token_id,
+        tokenizer.convert_tokens_to_ids("<|eot_id|>"),
+    ]
+
     # Generate predicted tokens
     with torch.inference_mode():
@@ -70,6 +76,7 @@ if __name__ == '__main__':
         input_ids = tokenizer.encode(prompt, return_tensors="pt")
         st = time.time()
         output = model.generate(input_ids,
+                                eos_token_id=terminators,
                                 max_new_tokens=args.n_predict)
         end = time.time()
         output_str = tokenizer.decode(output[0], skip_special_tokens=False)
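A quick way to confirm the fix on the CPU examples (hypothetical snippet, reusing `tokenizer`, `input_ids`, `output` and `args` from the code above): when one of the terminators is hit, the decoded text ends at `<|eot_id|>` (or the EOS token) rather than running on; otherwise the model simply used up its `max_new_tokens` budget.

```python
# Hypothetical sanity check: generation should now stop at a terminator token
# unless it exhausted the max_new_tokens budget.
output_str = tokenizer.decode(output[0], skip_special_tokens=False)
stopped_at_terminator = output_str.rstrip().endswith(("<|eot_id|>", tokenizer.eos_token))
hit_token_budget = output.shape[1] - input_ids.shape[1] >= args.n_predict
assert stopped_at_terminator or hit_token_budget
```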
@@ -57,12 +57,16 @@ In the example, several arguments can be passed to satisfy your requirements:
 Inference time: xxxx s
 -------------------- Prompt --------------------
 <|begin_of_text|><|start_header_id|>user<|end_header_id|>
+
 What is AI?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
 
+
 -------------------- Output (skip_special_tokens=False) --------------------
 <|begin_of_text|><|start_header_id|>user<|end_header_id|>
-What is AI?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
-A question that gets to the heart of the 21st century!
+
+What is AI?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
 
-Artificial Intelligence (AI) refers to the development of computer systems that can perform tasks that
+Artificial Intelligence (AI) refers to the development of computer systems that can perform tasks that would typically require human intelligence, such as:
+
+1. Learning: AI
 ```
@@ -31,13 +31,13 @@ def get_prompt(user_input: str, chat_history: list[tuple[str, str]],
     prompt_texts = [f'<|begin_of_text|>']
 
     if system_prompt != '':
-        prompt_texts.append(f'<|start_header_id|>system<|end_header_id|>\n{system_prompt}<|eot_id|>')
+        prompt_texts.append(f'<|start_header_id|>system<|end_header_id|>\n\n{system_prompt}<|eot_id|>')
 
     for history_input, history_response in chat_history:
-        prompt_texts.append(f'<|start_header_id|>user<|end_header_id|>\n{history_input.strip()}<|eot_id|>')
-        prompt_texts.append(f'<|start_header_id|>assistant<|end_header_id|>\n{history_response.strip()}<|eot_id|>')
+        prompt_texts.append(f'<|start_header_id|>user<|end_header_id|>\n\n{history_input.strip()}<|eot_id|>')
+        prompt_texts.append(f'<|start_header_id|>assistant<|end_header_id|>\n\n{history_response.strip()}<|eot_id|>')
 
-    prompt_texts.append(f'<|start_header_id|>user<|end_header_id|>\n{user_input.strip()}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n')
+    prompt_texts.append(f'<|start_header_id|>user<|end_header_id|>\n\n{user_input.strip()}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n')
     return ''.join(prompt_texts)
 
 if __name__ == '__main__':
@@ -65,6 +65,12 @@ if __name__ == '__main__':
 
     # Load tokenizer
     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
+    # here the terminators refer to https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct#transformers-automodelforcausallm
+    terminators = [
+        tokenizer.eos_token_id,
+        tokenizer.convert_tokens_to_ids("<|eot_id|>"),
+    ]
+
     # Generate predicted tokens
     with torch.inference_mode():
@@ -72,6 +78,7 @@ if __name__ == '__main__':
         input_ids = tokenizer.encode(prompt, return_tensors="pt")
         st = time.time()
         output = model.generate(input_ids,
+                                eos_token_id=terminators,
                                 max_new_tokens=args.n_predict)
         end = time.time()
         output_str = tokenizer.decode(output[0], skip_special_tokens=False)
@@ -125,12 +125,14 @@ Arguments info:
 Inference time: xxxx s
 -------------------- Prompt --------------------
 <|begin_of_text|><|start_header_id|>user<|end_header_id|>
+
 What is AI?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
 
+
 -------------------- Output (skip_special_tokens=False) --------------------
 <|begin_of_text|><|start_header_id|>user<|end_header_id|>
-What is AI?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
-A question that gets to the heart of the 21st century!
+
+What is AI?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
 
-Artificial Intelligence (AI) refers to the development of computer systems that can perform tasks that
+Artificial Intelligence (AI) refers to the development of computer systems that can perform tasks that would typically require human intelligence, such as learning, problem-solving, decision
 ```
@@ -31,13 +31,13 @@ def get_prompt(user_input: str, chat_history: list[tuple[str, str]],
     prompt_texts = [f'<|begin_of_text|>']
 
     if system_prompt != '':
-        prompt_texts.append(f'<|start_header_id|>system<|end_header_id|>\n{system_prompt}<|eot_id|>')
+        prompt_texts.append(f'<|start_header_id|>system<|end_header_id|>\n\n{system_prompt}<|eot_id|>')
 
     for history_input, history_response in chat_history:
-        prompt_texts.append(f'<|start_header_id|>user<|end_header_id|>\n{history_input.strip()}<|eot_id|>')
-        prompt_texts.append(f'<|start_header_id|>assistant<|end_header_id|>\n{history_response.strip()}<|eot_id|>')
+        prompt_texts.append(f'<|start_header_id|>user<|end_header_id|>\n\n{history_input.strip()}<|eot_id|>')
+        prompt_texts.append(f'<|start_header_id|>assistant<|end_header_id|>\n\n{history_response.strip()}<|eot_id|>')
 
-    prompt_texts.append(f'<|start_header_id|>user<|end_header_id|>\n{user_input.strip()}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n')
+    prompt_texts.append(f'<|start_header_id|>user<|end_header_id|>\n\n{user_input.strip()}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n')
     return ''.join(prompt_texts)
 
 if __name__ == '__main__':
@@ -67,17 +67,25 @@
     # Load tokenizer
     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
+    # here the terminators refer to https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct#transformers-automodelforcausallm
+    terminators = [
+        tokenizer.eos_token_id,
+        tokenizer.convert_tokens_to_ids("<|eot_id|>"),
+    ]
+
     # Generate predicted tokens
     with torch.inference_mode():
         prompt = get_prompt(args.prompt, [], system_prompt=DEFAULT_SYSTEM_PROMPT)
         input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu')
         # ipex_llm model needs a warmup, then inference time can be accurate
         output = model.generate(input_ids,
+                                eos_token_id=terminators,
                                 max_new_tokens=args.n_predict)
 
         # start inference
         st = time.time()
         output = model.generate(input_ids,
+                                eos_token_id=terminators,
                                 max_new_tokens=args.n_predict)
         torch.xpu.synchronize()
         end = time.time()
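In the GPU variants only the `eos_token_id` argument is new; the existing warmup-then-measure pattern is kept because the first ipex-llm `generate` call carries one-time setup cost and XPU work is asynchronous. A small helper expressing the same timing pattern (hypothetical, assuming `model` and `input_ids` are already on the XPU device):

```python
import time
import torch

def timed_generate(model, input_ids, terminators, n_predict):
    # Warmup run: the first generate call includes one-time optimization cost.
    model.generate(input_ids, eos_token_id=terminators, max_new_tokens=n_predict)

    st = time.time()
    output = model.generate(input_ids, eos_token_id=terminators, max_new_tokens=n_predict)
    torch.xpu.synchronize()  # wait for queued XPU work before reading the clock
    end = time.time()
    return output, end - st
```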
@@ -126,12 +126,14 @@ In the example, several arguments can be passed to satisfy your requirements:
 Inference time: xxxx s
 -------------------- Prompt --------------------
 <|begin_of_text|><|start_header_id|>user<|end_header_id|>
+
 What is AI?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
 
+
 -------------------- Output (skip_special_tokens=False) --------------------
 <|begin_of_text|><|start_header_id|>user<|end_header_id|>
-What is AI?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
-A question that gets to the heart of the 21st century!
+
+What is AI?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
 
-Artificial Intelligence (AI) refers to the development of computer systems that can perform tasks that
+Artificial Intelligence (AI) refers to the development of computer systems that can perform tasks that would typically require human intelligence, such as learning, problem-solving, decision
 ```
@@ -31,13 +31,13 @@ def get_prompt(user_input: str, chat_history: list[tuple[str, str]],
     prompt_texts = [f'<|begin_of_text|>']
 
     if system_prompt != '':
-        prompt_texts.append(f'<|start_header_id|>system<|end_header_id|>\n{system_prompt}<|eot_id|>')
+        prompt_texts.append(f'<|start_header_id|>system<|end_header_id|>\n\n{system_prompt}<|eot_id|>')
 
     for history_input, history_response in chat_history:
-        prompt_texts.append(f'<|start_header_id|>user<|end_header_id|>\n{history_input.strip()}<|eot_id|>')
-        prompt_texts.append(f'<|start_header_id|>assistant<|end_header_id|>\n{history_response.strip()}<|eot_id|>')
+        prompt_texts.append(f'<|start_header_id|>user<|end_header_id|>\n\n{history_input.strip()}<|eot_id|>')
+        prompt_texts.append(f'<|start_header_id|>assistant<|end_header_id|>\n\n{history_response.strip()}<|eot_id|>')
 
-    prompt_texts.append(f'<|start_header_id|>user<|end_header_id|>\n{user_input.strip()}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n')
+    prompt_texts.append(f'<|start_header_id|>user<|end_header_id|>\n\n{user_input.strip()}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n')
     return ''.join(prompt_texts)
 
 if __name__ == '__main__':
@@ -69,6 +69,12 @@
 
     # Load tokenizer
     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
+    # here the terminators refer to https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct#transformers-automodelforcausallm
+    terminators = [
+        tokenizer.eos_token_id,
+        tokenizer.convert_tokens_to_ids("<|eot_id|>"),
+    ]
+
     # Generate predicted tokens
     with torch.inference_mode():
@@ -76,11 +82,13 @@
         input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu')
         # ipex_llm model needs a warmup, then inference time can be accurate
         output = model.generate(input_ids,
+                                eos_token_id=terminators,
                                 max_new_tokens=args.n_predict)
 
         # start inference
         st = time.time()
         output = model.generate(input_ids,
+                                eos_token_id=terminators,
                                 max_new_tokens=args.n_predict)
         torch.xpu.synchronize()
         end = time.time()