Fix the non-stop generation issue in the llama3 examples (#10860)
* fix not stop issue in GPU/HF-Transformers-AutoModels
* fix not stop issue in GPU/PyTorch-Models/Model/llama3
* fix not stop issue in CPU/HF-Transformers-AutoModels/Model/llama3
* fix not stop issue in CPU/PyTorch-Models/Model/llama3
* update the output in readme
* update format
* add reference
* update prompt format
* update output format in readme
* update example output in readme
parent 5c9eb5d0f5
commit 328b1a1de9

8 changed files with 70 additions and 28 deletions
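Every llama3 example below gets the same two-part fix: the prompt builder switches to the official Llama 3 chat template (a blank line, i.e. `\n\n`, after each header marker), and `model.generate` is given both the tokenizer's EOS token and `<|eot_id|>` as terminators, so generation stops at the end of the assistant turn instead of running until `max_new_tokens`. A minimal sketch of the combined change (the model path and prompt are placeholders, and the model is loaded with plain `transformers` here rather than the ipex-llm low-bit loaders used in the actual examples):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "meta-llama/Meta-Llama-3-8B-Instruct"  # placeholder

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16)

# Official Llama 3 chat format: a blank line (\n\n) after every header marker.
prompt = ("<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"
          "What is AI?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n")

# Stop on either the regular EOS token or Llama 3's end-of-turn token.
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]

with torch.inference_mode():
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    output = model.generate(input_ids,
                            eos_token_id=terminators,
                            max_new_tokens=32)
    print(tokenizer.decode(output[0], skip_special_tokens=False))
```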
@@ -57,12 +57,16 @@ numactl -C 0-47 -m 0 python ./generate.py
 Inference time: xxxx s
 -------------------- Prompt --------------------
 <|begin_of_text|><|start_header_id|>user<|end_header_id|>
+
 What is AI?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
 
+
 -------------------- Output (skip_special_tokens=False) --------------------
 <|begin_of_text|><|start_header_id|>user<|end_header_id|>
-What is AI?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
-A question that gets to the heart of the 21st century!
+
+What is AI?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
 
-Artificial Intelligence (AI) refers to the development of computer systems that can perform tasks that
+Artificial Intelligence (AI) refers to the development of computer systems that can perform tasks that would typically require human intelligence, such as:
+
+1. Learning: AI
 ```
@@ -31,13 +31,13 @@ def get_prompt(user_input: str, chat_history: list[tuple[str, str]],
     prompt_texts = [f'<|begin_of_text|>']
 
     if system_prompt != '':
-        prompt_texts.append(f'<|start_header_id|>system<|end_header_id|>\n{system_prompt}<|eot_id|>')
+        prompt_texts.append(f'<|start_header_id|>system<|end_header_id|>\n\n{system_prompt}<|eot_id|>')
 
     for history_input, history_response in chat_history:
-        prompt_texts.append(f'<|start_header_id|>user<|end_header_id|>\n{history_input.strip()}<|eot_id|>')
-        prompt_texts.append(f'<|start_header_id|>assistant<|end_header_id|>\n{history_response.strip()}<|eot_id|>')
+        prompt_texts.append(f'<|start_header_id|>user<|end_header_id|>\n\n{history_input.strip()}<|eot_id|>')
+        prompt_texts.append(f'<|start_header_id|>assistant<|end_header_id|>\n\n{history_response.strip()}<|eot_id|>')
 
-    prompt_texts.append(f'<|start_header_id|>user<|end_header_id|>\n{user_input.strip()}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n')
+    prompt_texts.append(f'<|start_header_id|>user<|end_header_id|>\n\n{user_input.strip()}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n')
     return ''.join(prompt_texts)
 
 if __name__ == '__main__':
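For reference, with the patched `get_prompt` a single-turn call (no system prompt, empty history) now produces exactly the prompt shown in the README output above; a hypothetical quick check:

```python
# Hypothetical usage of the patched get_prompt: no system prompt, empty history.
prompt = get_prompt("What is AI?", [], system_prompt='')
print(prompt)
# <|begin_of_text|><|start_header_id|>user<|end_header_id|>
#
# What is AI?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
#
```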
@@ -63,6 +63,12 @@ if __name__ == '__main__':
 
     # Load tokenizer
     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
+    # here the terminators refer to https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct#transformers-automodelforcausallm
+    terminators = [
+        tokenizer.eos_token_id,
+        tokenizer.convert_tokens_to_ids("<|eot_id|>"),
+    ]
+
     # Generate predicted tokens
     with torch.inference_mode():
@@ -70,6 +76,7 @@ if __name__ == '__main__':
         input_ids = tokenizer.encode(prompt, return_tensors="pt")
         st = time.time()
         output = model.generate(input_ids,
+                                eos_token_id=terminators,
                                 max_new_tokens=args.n_predict)
         end = time.time()
         output_str = tokenizer.decode(output[0], skip_special_tokens=False)
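A quick way to confirm the fix on the CPU examples (hypothetical snippet, reusing `tokenizer`, `input_ids`, `output` and `args` from the code above): when one of the terminators is hit, the decoded text ends at `<|eot_id|>` (or the EOS token) rather than running on; otherwise the model simply used up its `max_new_tokens` budget.

```python
# Hypothetical sanity check: generation should now stop at a terminator token
# unless it exhausted the max_new_tokens budget.
output_str = tokenizer.decode(output[0], skip_special_tokens=False)
stopped_at_terminator = output_str.rstrip().endswith(("<|eot_id|>", tokenizer.eos_token))
hit_token_budget = output.shape[1] - input_ids.shape[1] >= args.n_predict
assert stopped_at_terminator or hit_token_budget
```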
@@ -57,12 +57,16 @@ In the example, several arguments can be passed to satisfy your requirements:
 Inference time: xxxx s
 -------------------- Prompt --------------------
 <|begin_of_text|><|start_header_id|>user<|end_header_id|>
+
 What is AI?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
 
+
 -------------------- Output (skip_special_tokens=False) --------------------
 <|begin_of_text|><|start_header_id|>user<|end_header_id|>
-What is AI?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
-A question that gets to the heart of the 21st century!
+
+What is AI?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
 
-Artificial Intelligence (AI) refers to the development of computer systems that can perform tasks that
+Artificial Intelligence (AI) refers to the development of computer systems that can perform tasks that would typically require human intelligence, such as:
+
+1. Learning: AI
 ```
@@ -31,13 +31,13 @@ def get_prompt(user_input: str, chat_history: list[tuple[str, str]],
     prompt_texts = [f'<|begin_of_text|>']
 
     if system_prompt != '':
-        prompt_texts.append(f'<|start_header_id|>system<|end_header_id|>\n{system_prompt}<|eot_id|>')
+        prompt_texts.append(f'<|start_header_id|>system<|end_header_id|>\n\n{system_prompt}<|eot_id|>')
 
     for history_input, history_response in chat_history:
-        prompt_texts.append(f'<|start_header_id|>user<|end_header_id|>\n{history_input.strip()}<|eot_id|>')
-        prompt_texts.append(f'<|start_header_id|>assistant<|end_header_id|>\n{history_response.strip()}<|eot_id|>')
+        prompt_texts.append(f'<|start_header_id|>user<|end_header_id|>\n\n{history_input.strip()}<|eot_id|>')
+        prompt_texts.append(f'<|start_header_id|>assistant<|end_header_id|>\n\n{history_response.strip()}<|eot_id|>')
 
-    prompt_texts.append(f'<|start_header_id|>user<|end_header_id|>\n{user_input.strip()}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n')
+    prompt_texts.append(f'<|start_header_id|>user<|end_header_id|>\n\n{user_input.strip()}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n')
     return ''.join(prompt_texts)
 
 if __name__ == '__main__':
@@ -65,6 +65,12 @@ if __name__ == '__main__':
 
     # Load tokenizer
     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
+    # here the terminators refer to https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct#transformers-automodelforcausallm
+    terminators = [
+        tokenizer.eos_token_id,
+        tokenizer.convert_tokens_to_ids("<|eot_id|>"),
+    ]
+
     # Generate predicted tokens
     with torch.inference_mode():
@@ -72,6 +78,7 @@ if __name__ == '__main__':
         input_ids = tokenizer.encode(prompt, return_tensors="pt")
         st = time.time()
         output = model.generate(input_ids,
+                                eos_token_id=terminators,
                                 max_new_tokens=args.n_predict)
         end = time.time()
         output_str = tokenizer.decode(output[0], skip_special_tokens=False)
@@ -125,12 +125,14 @@ Arguments info:
 Inference time: xxxx s
 -------------------- Prompt --------------------
 <|begin_of_text|><|start_header_id|>user<|end_header_id|>
+
 What is AI?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
 
+
 -------------------- Output (skip_special_tokens=False) --------------------
 <|begin_of_text|><|start_header_id|>user<|end_header_id|>
-What is AI?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
-A question that gets to the heart of the 21st century!
+
+What is AI?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
 
-Artificial Intelligence (AI) refers to the development of computer systems that can perform tasks that
+Artificial Intelligence (AI) refers to the development of computer systems that can perform tasks that would typically require human intelligence, such as learning, problem-solving, decision
 ```
@@ -31,13 +31,13 @@ def get_prompt(user_input: str, chat_history: list[tuple[str, str]],
     prompt_texts = [f'<|begin_of_text|>']
 
     if system_prompt != '':
-        prompt_texts.append(f'<|start_header_id|>system<|end_header_id|>\n{system_prompt}<|eot_id|>')
+        prompt_texts.append(f'<|start_header_id|>system<|end_header_id|>\n\n{system_prompt}<|eot_id|>')
 
     for history_input, history_response in chat_history:
-        prompt_texts.append(f'<|start_header_id|>user<|end_header_id|>\n{history_input.strip()}<|eot_id|>')
-        prompt_texts.append(f'<|start_header_id|>assistant<|end_header_id|>\n{history_response.strip()}<|eot_id|>')
+        prompt_texts.append(f'<|start_header_id|>user<|end_header_id|>\n\n{history_input.strip()}<|eot_id|>')
+        prompt_texts.append(f'<|start_header_id|>assistant<|end_header_id|>\n\n{history_response.strip()}<|eot_id|>')
 
-    prompt_texts.append(f'<|start_header_id|>user<|end_header_id|>\n{user_input.strip()}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n')
+    prompt_texts.append(f'<|start_header_id|>user<|end_header_id|>\n\n{user_input.strip()}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n')
     return ''.join(prompt_texts)
 
 if __name__ == '__main__':
@@ -67,17 +67,25 @@
     # Load tokenizer
     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
+    # here the terminators refer to https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct#transformers-automodelforcausallm
+    terminators = [
+        tokenizer.eos_token_id,
+        tokenizer.convert_tokens_to_ids("<|eot_id|>"),
+    ]
+
     # Generate predicted tokens
     with torch.inference_mode():
         prompt = get_prompt(args.prompt, [], system_prompt=DEFAULT_SYSTEM_PROMPT)
         input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu')
         # ipex_llm model needs a warmup, then inference time can be accurate
         output = model.generate(input_ids,
+                                eos_token_id=terminators,
                                 max_new_tokens=args.n_predict)
 
         # start inference
         st = time.time()
         output = model.generate(input_ids,
+                                eos_token_id=terminators,
                                 max_new_tokens=args.n_predict)
         torch.xpu.synchronize()
         end = time.time()
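In the GPU variants only the `eos_token_id` argument is new; the existing warmup-then-measure pattern is kept because the first ipex-llm `generate` call carries one-time setup cost and XPU work is asynchronous. A small helper expressing the same timing pattern (hypothetical, assuming `model` and `input_ids` are already on the XPU device):

```python
import time
import torch

def timed_generate(model, input_ids, terminators, n_predict):
    # Warmup run: the first generate call includes one-time optimization cost.
    model.generate(input_ids, eos_token_id=terminators, max_new_tokens=n_predict)

    st = time.time()
    output = model.generate(input_ids, eos_token_id=terminators, max_new_tokens=n_predict)
    torch.xpu.synchronize()  # wait for queued XPU work before reading the clock
    end = time.time()
    return output, end - st
```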
@@ -126,12 +126,14 @@ In the example, several arguments can be passed to satisfy your requirements:
 Inference time: xxxx s
 -------------------- Prompt --------------------
 <|begin_of_text|><|start_header_id|>user<|end_header_id|>
+
 What is AI?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
 
+
 -------------------- Output (skip_special_tokens=False) --------------------
 <|begin_of_text|><|start_header_id|>user<|end_header_id|>
-What is AI?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
-A question that gets to the heart of the 21st century!
+
+What is AI?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
 
-Artificial Intelligence (AI) refers to the development of computer systems that can perform tasks that
+Artificial Intelligence (AI) refers to the development of computer systems that can perform tasks that would typically require human intelligence, such as learning, problem-solving, decision
 ```
@@ -31,13 +31,13 @@ def get_prompt(user_input: str, chat_history: list[tuple[str, str]],
     prompt_texts = [f'<|begin_of_text|>']
 
     if system_prompt != '':
-        prompt_texts.append(f'<|start_header_id|>system<|end_header_id|>\n{system_prompt}<|eot_id|>')
+        prompt_texts.append(f'<|start_header_id|>system<|end_header_id|>\n\n{system_prompt}<|eot_id|>')
 
     for history_input, history_response in chat_history:
-        prompt_texts.append(f'<|start_header_id|>user<|end_header_id|>\n{history_input.strip()}<|eot_id|>')
-        prompt_texts.append(f'<|start_header_id|>assistant<|end_header_id|>\n{history_response.strip()}<|eot_id|>')
+        prompt_texts.append(f'<|start_header_id|>user<|end_header_id|>\n\n{history_input.strip()}<|eot_id|>')
+        prompt_texts.append(f'<|start_header_id|>assistant<|end_header_id|>\n\n{history_response.strip()}<|eot_id|>')
 
-    prompt_texts.append(f'<|start_header_id|>user<|end_header_id|>\n{user_input.strip()}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n')
+    prompt_texts.append(f'<|start_header_id|>user<|end_header_id|>\n\n{user_input.strip()}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n')
     return ''.join(prompt_texts)
 
 if __name__ == '__main__':
@@ -69,6 +69,12 @@
 
     # Load tokenizer
     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
+    # here the terminators refer to https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct#transformers-automodelforcausallm
+    terminators = [
+        tokenizer.eos_token_id,
+        tokenizer.convert_tokens_to_ids("<|eot_id|>"),
+    ]
+
     # Generate predicted tokens
     with torch.inference_mode():
@@ -76,11 +82,13 @@
         input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu')
         # ipex_llm model needs a warmup, then inference time can be accurate
         output = model.generate(input_ids,
+                                eos_token_id=terminators,
                                 max_new_tokens=args.n_predict)
 
         # start inference
         st = time.time()
         output = model.generate(input_ids,
+                                eos_token_id=terminators,
                                 max_new_tokens=args.n_predict)
         torch.xpu.synchronize()
         end = time.time()