From ca5a4b6e3a70622159ac364f94137aff06bae3b0 Mon Sep 17 00:00:00 2001
From: binbin Deng <108676127+plusbang@users.noreply.github.com>
Date: Wed, 28 Jun 2023 13:15:50 +0800
Subject: [PATCH] LLM: update bloom and starcoder usage in
 transformers_int4_pipeline (#8406)

---
 python/llm/example/transformers/README.md     | 62 +++++++++++++------
 .../llm/example/transformers/int4_pipeline.py |  7 +--
 2 files changed, 46 insertions(+), 23 deletions(-)

diff --git a/python/llm/example/transformers/README.md b/python/llm/example/transformers/README.md
index efed0629..172b0bf2 100644
--- a/python/llm/example/transformers/README.md
+++ b/python/llm/example/transformers/README.md
@@ -33,6 +33,10 @@ arguments info:
 ## Sample Output for Inference
 ### Model family LLaMA
 ```log
+-------------------- bigdl-llm based tokenizer --------------------
+Inference time: xxxx s
+Output:
+[' It’s the acronym for “Central Processing Unit,” and in modern personal computers it means a single microprocessor chip that is used to control various']
 -------------------- HuggingFace transformers tokenizer --------------------
 Please note that the loading of HuggingFace transformers tokenizer may take some time.
 
@@ -42,10 +46,6 @@ The class this function is called from is 'LlamaTokenizer'.
 
 Inference time: xxxx s
 Output:
 ["The Central Processing Unit (CPU) is the brains of your computer, and is also known as the microprocessor. It's where all the action"]
--------------------- bigdl-llm based tokenizer --------------------
-Inference time: xxxx s
-Output:
-[' It’s the acronym for “Central Processing Unit,” and in modern personal computers it means a single microprocessor chip that is used to control various']
 -------------------- fast forward --------------------
 Llama.generate: prefix-match hit
@@ -61,16 +61,16 @@
 
 ### Model family GPT-NeoX
 ```log
+-------------------- bigdl-llm based tokenizer --------------------
+Inference time: xxxx s
+Output:
+[' Central processing unit, also known as processor, is a specialized microchip designed to execute all the instructions of computer programs rapidly and efficiently. Most personal computers have one or']
 -------------------- HuggingFace transformers tokenizer --------------------
 Please note that the loading of HuggingFace transformers tokenizer may take some time.
 
 Inference time: xxxx s
 Output:
 [' The Central Processing Unit, or CPU, is the component of a computer that executes all instructions for carrying out different functions. It is the brains of the operation, and']
--------------------- bigdl-llm based tokenizer --------------------
-Inference time: xxxx s
-Output:
-[' Central processing unit, also known as processor, is a specialized microchip designed to execute all the instructions of computer programs rapidly and efficiently. Most personal computers have one or']
 -------------------- fast forward --------------------
 Gptneox.generate: prefix-match hit
 
@@ -86,24 +86,50 @@ Output:
 
 ### Model family BLOOM
 ```log
+-------------------- bigdl-llm based tokenizer --------------------
+Inference time: xxxx s
+Output:
+[' Central Processing UnitThe present invention relates to a method of manufacturing an LED device, and more particularly to the manufacture of high-powered LED devices. The inventive']
+-------------------- HuggingFace transformers tokenizer --------------------
+Please note that the loading of HuggingFace transformers tokenizer may take some time.
+
+Inference time: xxxx s
+Output:
+[' Central Processing UnitThe present invention relates to a method of manufacturing an LED device, and more particularly to the manufacture of high-powered LED devices. The inventive']
+-------------------- fast forward --------------------
+
+
 inference: mem per token = 24471324 bytes
 inference: sample time = xxxx ms
-inference: evel prompt time = xxxx ms / 5 tokens / xxxx ms per token
-inference: predict time = xxxx ms / 3 tokens / xxxx ms per token
+inference: evel prompt time = xxxx ms / 1 tokens / xxxx ms per token
+inference: predict time = xxxx ms / 4 tokens / xxxx ms per token
 inference: total time = xxxx ms
 Inference time (fast forward): xxxx s
 Output:
-{'id': 'cmpl-a0ab2953-e08c-449c-b476-e21ad5bb84b0', 'object': 'text_completion', 'created': 1686557434, 'model': './bigdl_llm_bloom_q4_0.bin', 'choices': [{'text': 'Q: What is CPU? A: central processing unit', 'index': 0, 'logprobs': None, 'finish_reason': None}], 'usage': {'prompt_tokens': None, 'completion_tokens': None, 'total_tokens': None}}
+{'id': 'cmpl-4ec29030-f0c4-43d6-80b0-5f5fb76c169d', 'object': 'text_completion', 'created': 1687852341, 'model': './bigdl_llm_bloom_q4_0.bin', 'choices': [{'text': ' the Central Processing Unit', 'index': 0, 'logprobs': None, 'finish_reason': None}], 'usage': {'prompt_tokens': 6, 'completion_tokens': 5, 'total_tokens': 11}}
 ```
 
 ### Model family StarCoder
 ```log
-bigdl-llm: mem per token = 313912 bytes
-bigdl-llm: load time = xxxx ms
-bigdl-llm: sample time = xxxx ms
-bigdl-llm: predict time = xxxx ms / xxxx ms per token
-bigdl-llm: total time = xxxx ms
+-------------------- bigdl-llm based tokenizer --------------------
+Inference time: xxxx s
+Output:
+[' 2.56 GHz, 2.56 GHz, 2.56 GHz, 2.56 GHz, ']
+-------------------- HuggingFace transformers tokenizer --------------------
+Please note that the loading of HuggingFace transformers tokenizer may take some time.
+
+Inference time: xxxx s
+Output:
+[' 2.56 GHz, 2.56 GHz, 2.56 GHz, 2.56 GHz, ']
+-------------------- fast forward --------------------
+
+
+bigdl-llm: mem per token = 313720 bytes
+bigdl-llm: sample time = xxxx ms
+bigdl-llm: evel prompt time = xxxx ms
+bigdl-llm: predict time = xxxx ms / 31 tokens / xxxx ms per token
+bigdl-llm: total time = xxxx ms
 Inference time (fast forward): xxxx s
 Output:
-{'id': 'cmpl-c2358898-cad0-47f4-af5b-84bb575eb942', 'object': 'text_completion', 'created': 1687864692, 'model': './output_starcoder/bigdl_llm_starcoder_q4_0.bin', 'choices': [{'text': ' %s" % cpu)\n print("Q: What is RAM? A: %s GB" % ram)\n print("Q: What is BAT? A: %s" % bat)\n print("Q: What is the number of CPU\'s? A: %s" % ncpus)', 'index': 0, 'logprobs': None, 'finish_reason': None}], 'usage': {'prompt_tokens': 8, 'completion_tokens': 64, 'total_tokens': 72}}
-```
\ No newline at end of file
+{'id': 'cmpl-72bc4d13-d8c9-4bcb-b3f4-50a69863d534', 'object': 'text_completion', 'created': 1687852580, 'model': './bigdl_llm_starcoder_q4_0.bin', 'choices': [{'text': ' 0.50, B: 0.25, C: 0.125, D: 0.0625', 'index': 0, 'logprobs': None, 'finish_reason': None}], 'usage': {'prompt_tokens': 8, 'completion_tokens': 32, 'total_tokens': 40}}
+```
diff --git a/python/llm/example/transformers/int4_pipeline.py b/python/llm/example/transformers/int4_pipeline.py
index a6c271d8..ee5cd10c 100644
--- a/python/llm/example/transformers/int4_pipeline.py
+++ b/python/llm/example/transformers/int4_pipeline.py
@@ -41,7 +41,7 @@ def load(model_path, model_family, n_threads):
 
 
 def inference(llm, repo_id_or_model_path, model_family, prompt):
-    if model_family in ['llama', 'gptneox']:
+    if model_family in ['llama', 'gptneox', 'bloom', 'starcoder']:
         # ------ Option 1: Use bigdl-llm based tokenizer
         print('-'*20, ' bigdl-llm based tokenizer ', '-'*20)
         st = time.time()
@@ -79,15 +79,12 @@ def inference(llm, repo_id_or_model_path, model_family, prompt):
         print(f'Inference time: {time.time()-st} s')
         print(f'Output:\n{output}')
 
-
-    if model_family in ['llama', 'gptneox', 'bloom', 'starcoder']:
         # Option 3: fast forward
-        # note that currently Bloom/Starcoder family model only supports the fast forward inference method
         print('-'*20, ' fast forward ', '-'*20)
         st = time.time()
 
         output = llm(prompt, # please note that the prompt here can ONLY be a string
-                    max_tokens=32)
+                     max_tokens=32)
         print(f'Inference time (fast forward): {time.time()-st} s')
         print(f'Output:\n{output}')