Yishuo Wang 
								
							 
						 
						
							
							
							
							
								
							
							
								24b34b6e46 
								
							 
						 
						
							
							
								
								change xmx condition ( #10000 )  
							
							 
							
							
							
						 
						
							2024-01-25 17:48:11 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Ziteng Zhang 
								
							 
						 
						
							
							
							
							
								
							
							
								8b08ad408b 
								
							 
						 
						
							
							
								
								Add batch_size in all_in_one ( #9999 )  
							
							 
							
							... 
							
							
							
							Add batch_size in all_in_one, except run_native_int4 
							
						 
						
							2024-01-25 17:43:49 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Wang, Jian4 
								
							 
						 
						
							
							
							
							
								
							
							
								093e6f8f73 
								
							 
						 
						
							
							
								
								LLM: Add qwen CPU speculative example ( #9985 )  
							
							 
							
							... 
							
							
							
							* init from gpu
* update for cpu
* update
* update
* fix xpu readme
* update
* update example prompt
* update prompt and add 72b
* update
* update 
							
						 
						
							2024-01-25 17:01:34 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yishuo Wang 
								
							 
						 
						
							
							
							
							
								
							
							
								bf65548d29 
								
							 
						 
						
							
							
								
								Add quantize kv cache support for chatglm2/3 ( #9996 )  
							
							 
							
							
							
						 
						
							2024-01-25 16:55:59 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Chen, Zhentao 
								
							 
						 
						
							
							
							
							
								
							
							
								86055d76d5 
								
							 
						 
						
							
							
								
								fix optimize_model not working ( #9995 )  
							
							 
							
							
							
						 
						
							2024-01-25 16:39:05 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Wang, Jian4 
								
							 
						 
						
							
							
							
							
								
							
							
								9bff84e6fd 
								
							 
						 
						
							
							
								
								LLM: Convert draft_model kv_cache from bf16 to fp32 ( #9964 )  
							
							 
							
							... 
							
							
							
							* convert bf16 to fp32
* update
* change when init
* init first and cut off after
* init and exchange
* update python type
* update
* fix bug
* update
* update 
							
						 
						
							2024-01-25 11:20:27 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									ZehuaCao 
								
							 
						 
						
							
							
							
							
								
							
							
								51aa8b62b2 
								
							 
						 
						
							
							
								
								add gradio_web_ui to llm-serving image ( #9918 )  
							
							 
							
							
							
						 
						
							2024-01-25 11:11:39 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yina Chen 
								
							 
						 
						
							
							
							
							
								
							
							
								99ff6cf048 
								
							 
						 
						
							
							
								
								Update gpu spec decoding baichuan2 example dependency ( #9990 )  
							
							 
							
							... 
							
							
							
							* add dependency
* update
* update 
							
						 
						
							2024-01-25 11:05:04 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yina Chen 
								
							 
						 
						
							
							
							
							
								
							
							
								27338540c3 
								
							 
						 
						
							
							
								
								Fix repetition_penalty not activated issue ( #9989 )  
							
							 
							
							
							
						 
						
							2024-01-25 10:40:41 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Jason Dai 
								
							 
						 
						
							
							
							
							
								
							
							
								3bc3d0bbcd 
								
							 
						 
						
							
							
								
								Update self-speculative readme ( #9986 )  
							
							 
							
							
							
						 
						
							2024-01-24 22:37:32 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yuwen Hu 
								
							 
						 
						
							
							
							
							
								
							
							
								b27e5a27b9 
								
							 
						 
						
							
							
								
								Remove the check for meta device in _replace_with_low_bit_linear ( #9984 )  
							
							 
							
							
							
						 
						
							2024-01-24 18:15:39 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Ruonan Wang 
								
							 
						 
						
							
							
							
							
								
							
							
								d4f65a6033 
								
							 
						 
						
							
							
								
								LLM: add mistral speculative example ( #9976 )  
							
							 
							
							... 
							
							
							
							* add mistral example
* update 
							
						 
						
							2024-01-24 17:35:15 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yina Chen 
								
							 
						 
						
							
							
							
							
								
							
							
								b176cad75a 
								
							 
						 
						
							
							
								
								LLM: Add baichuan2 gpu spec example ( #9973 )  
							
							 
							
							... 
							
							
							
							* add baichuan2 gpu spec example
* update readme & example
* remove print
* fix typo
* meet comments
* revert
* update 
							
						 
						
							2024-01-24 16:40:16 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Jinyi Wan 
								
							 
						 
						
							
							
							
							
								
							
							
								ec2d9de0ea 
								
							 
						 
						
							
							
								
								Fix README.md for solar ( #9957 )  
							
							 
							
							
							
						 
						
							2024-01-24 15:50:54 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Mingyu Wei 
								
							 
						 
						
							
							
							
							
								
							
							
								bc9cff51a8 
								
							 
						 
						
							
							
								
								LLM GPU Example Update for Windows Support ( #9902 )  
							
							 
							
							... 
							
							
							
							* Update README in LLM GPU Examples
* Update reference of Intel GPU
* add cpu_embedding=True in comment
* small fixes
* update GPU/README.md and add explanation for cpu_embedding=True
* address comments
* fix small typos
* add backtick for cpu_embedding=True
* remove extra backtick in the doc
* add period mark
* update readme 
							
						 
						
							2024-01-24 13:42:27 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Chen, Zhentao 
								
							 
						 
						
							
							
							
							
								
							
							
								e0db44dcb6 
								
							 
						 
						
							
							
								
								fix unexpected keyword argument 'device'  ( #9982 )  
							
							 
							
							... 
							
							
							
							* add device for chatglm3 only
* add comment for this change
* fix style
* fix style
* fix style again..
* finally fixed style 
							
						 
						
							2024-01-24 13:20:46 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Lilac09 
								
							 
						 
						
							
							
							
							
								
							
							
								de27ddd81a 
								
							 
						 
						
							
							
								
								Update Dockerfile ( #9981 )  
							
							 
							
							
							
						 
						
							2024-01-24 11:10:06 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Lilac09 
								
							 
						 
						
							
							
							
							
								
							
							
								a2718038f7 
								
							 
						 
						
							
							
								
								Fix qwen model adapter in docker ( #9969 )  
							
							 
							
							... 
							
							
							
							* fix qwen in docker
* add patch for model_adapter.py in fastchat
* add patch for model_adapter.py in fastchat 
							
						 
						
							2024-01-24 11:01:29 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Mingyu Wei 
								
							 
						 
						
							
							
							
							
								
							
							
								50a851e3b3 
								
							 
						 
						
							
							
								
								LLM: separate arc ut for disable XMX ( #9953 )  
							
							 
							
							... 
							
							
							
							* separate test_optimize_model api with disabled xmx
* delete test_optimize_model in test_transformers_api.py
* set env variable in .sh/ put back test_optimize_model
* unset env variable
* remove env setting in .py
* address errors in action
* remove import ipex
* lower tolerance 
							
						 
						
							2024-01-23 19:04:47 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yuwen Hu 
								
							 
						 
						
							
							
							
							
								
							
							
								8d28aa8e2b 
								
							 
						 
						
							
							
								
								[LLM] Fix the model.device problem when cpu_embedding=True ( #9971 )  
							
							 
							
							... 
							
							
							
							* Overwrite the device attribute for CPUPinnedParam
* Expose cpu_embedding=True for Linux users
* Fix python style 
							
						 
						
							2024-01-23 18:51:11 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yishuo Wang 
								
							 
						 
						
							
							
							
							
								
							
							
								f82782cd3b 
								
							 
						 
						
							
							
								
								fix starcoder ( #9975 )  
							
							 
							
							
							
						 
						
							2024-01-23 17:24:53 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									WeiguangHan 
								
							 
						 
						
							
							
							
							
								
							
							
								be5836bee1 
								
							 
						 
						
							
							
								
								LLM: fix outlier value ( #9945 )  
							
							 
							
							... 
							
							
							
							* fix outlier value
* small fix 
							
						 
						
							2024-01-23 17:04:13 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yishuo Wang 
								
							 
						 
						
							
							
							
							
								
							
							
								2c8a9aaf0d 
								
							 
						 
						
							
							
								
								fix qwen causal mask when quantize_kv_cache=True ( #9968 )  
							
							 
							
							
							
						 
						
							2024-01-23 16:34:05 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yina Chen 
								
							 
						 
						
							
							
							
							
								
							
							
								5aa4b32c1b 
								
							 
						 
						
							
							
								
								LLM: Add qwen spec gpu example ( #9965 )  
							
							 
							
							... 
							
							
							
							* add qwen spec gpu example
* update readme
---------
Co-authored-by: rnwang04 <ruonan1.wang@intel.com> 
							
						 
						
							2024-01-23 15:59:43 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yina Chen 
								
							 
						 
						
							
							
							
							
								
							
							
								36c665667d 
								
							 
						 
						
							
							
								
								Add logits processor & qwen eos stop in speculative decoding ( #9963 )  
							
							 
							
							... 
							
							
							
							* add logits processor & qwen eos
* fix style
* fix
* fix
* fix style
* fix style
* support transformers 4.31
* fix style
* fix style
---------
Co-authored-by: rnwang04 <ruonan1.wang@intel.com> 
							
						 
						
							2024-01-23 15:57:28 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Ruonan Wang 
								
							 
						 
						
							
							
							
							
								
							
							
								60b35db1f1 
								
							 
						 
						
							
							
								
								LLM: add chatglm3 speculative decoding example ( #9966 )  
							
							 
							
							... 
							
							
							
							* add chatglm3 example
* update
* fix 
							
						 
						
							2024-01-23 15:54:12 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Xin Qiu 
								
							 
						 
						
							
							
							
							
								
							
							
								da4687c917 
								
							 
						 
						
							
							
								
								fix fp16 ( #9970 )  
							
							 
							
							
							
						 
						
							2024-01-23 15:53:32 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Lilac09 
								
							 
						 
						
							
							
							
							
								
							
							
								052962dfa5 
								
							 
						 
						
							
							
								
								Using original fastchat and add bigdl worker in docker image ( #9967 )  
							
							 
							
							... 
							
							
							
							* add vllm worker
* add options in entrypoint 
							
						 
						
							2024-01-23 14:17:05 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Chen, Zhentao 
								
							 
						 
						
							
							
							
							
								
							
							
								301425e377 
								
							 
						 
						
							
							
								
								harness tests on pvc multiple xpus ( #9908 )  
							
							 
							
							... 
							
							
							
							* add run_multi_llb.py
* update readme
* add job hint 
							
						 
						
							2024-01-23 13:20:37 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Ruonan Wang 
								
							 
						 
						
							
							
							
							
								
							
							
								27b19106f3 
								
							 
						 
						
							
							
								
								LLM: add readme for speculative decoding gpu examples ( #9961 )  
							
							 
							
							... 
							
							
							
							* add readme
* add readme
* meet code review 
							
						 
						
							2024-01-23 12:54:19 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Chen, Zhentao 
								
							 
						 
						
							
							
							
							
								
							
							
								39219b7e9a 
								
							 
						 
						
							
							
								
								add default device meta when lcmu enabled ( #9941 )  
							
							 
							
							
							
						 
						
							2024-01-23 11:00:49 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Xin Qiu 
								
							 
						 
						
							
							
							
							
								
							
							
								dacf680294 
								
							 
						 
						
							
							
								
								add fused rotary pos emb for qwen ( #9956 )  
							
							 
							
							... 
							
							
							
							* add fused rotary pos emb for qwen
* update 
							
						 
						
							2024-01-23 10:37:56 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Ruonan Wang 
								
							 
						 
						
							
							
							
							
								
							
							
								7b1d9ad7c0 
								
							 
						 
						
							
							
								
								LLM: limit esimd sdp usage for k_len < 8 ( #9959 )  
							
							 
							
							... 
							
							
							
							* update
* fix 
							
						 
						
							2024-01-23 09:28:23 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Ruonan Wang 
								
							 
						 
						
							
							
							
							
								
							
							
								3e601f9a5d 
								
							 
						 
						
							
							
								
								LLM: Support speculative decoding in bigdl-llm ( #9951 )  
							
							 
							
							... 
							
							
							
							* first commit
* fix error, add llama example
* hidden print
* update api usage
* change to api v3
* update
* meet code review
* meet code review, fix style
* add reference, fix style
* fix style
* fix first token time 
							
						 
						
							2024-01-22 19:14:56 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Jinyi Wan 
								
							 
						 
						
							
							
							
							
								
							
							
								6341c498b3 
								
							 
						 
						
							
							
								
								Fix the links of BlueLM and SOLAR ( #9954 )  
							
							 
							
							
							
						 
						
							2024-01-22 15:58:10 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Cheen Hau, 俊豪 
								
							 
						 
						
							
							
							
							
								
							
							
								947b1e27b7 
								
							 
						 
						
							
							
								
								Add readme for Whisper Test ( #9944 )  
							
							 
							
							... 
							
							
							
							* Fix local data path
* Remove non-essential files
* Add readme
* Minor fixes to script
* Bugfix, refactor
* Add references to original source. Bugfixes.
* Reviewer comments
* Properly print and explain output
* Move files to dev/benchmark
* Fixes 
							
						 
						
							2024-01-22 15:11:33 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Xin Qiu 
								
							 
						 
						
							
							
							
							
								
							
							
								6fb3f40f7e 
								
							 
						 
						
							
							
								
								fix error for benchmark_util.py running on cpu ( #9949 )  
							
							 
							
							
							
						 
						
							2024-01-22 10:14:40 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Heyang Sun 
								
							 
						 
						
							
							
							
							
								
							
							
								fb91c97fe8 
								
							 
						 
						
							
							
								
								support for Baichuan/Baichuan2 13B Chat running speculative decoding ( #9921 )  
							
							 
							
							... 
							
							
							
							* support for Baichuan/Baichuan2 13B Chat running speculative decoding
* fix style 
							
						 
						
							2024-01-22 09:11:44 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Xin Qiu 
								
							 
						 
						
							
							
							
							
								
							
							
								97f0cd8975 
								
							 
						 
						
							
							
								
								optimize Decilm 7b ( #9922 )  
							
							 
							
							... 
							
							
							
							* optimize deci
* update
* decilm attention forward 
							
						 
						
							2024-01-19 17:31:13 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Wang, Jian4 
								
							 
						 
						
							
							
							
							
								
							
							
								bcaeb05272 
								
							 
						 
						
							
							
								
								Update optimize qwen ( #9943 )  
							
							 
							
							... 
							
							
							
							* update for n tokens input
* fix dtype
* update 
							
						 
						
							2024-01-19 16:54:59 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									binbin Deng 
								
							 
						 
						
							
							
							
							
								
							
							
								db8e90796a 
								
							 
						 
						
							
							
								
								LLM: add avg token latency information and benchmark guide of autotp ( #9940 )  
							
							 
							
							
							
						 
						
							2024-01-19 15:09:57 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Ruonan Wang 
								
							 
						 
						
							
							
							
							
								
							
							
								bf37b3a670 
								
							 
						 
						
							
							
								
								LLM: optimize CPU speculative decoding of chatglm3 ( #9928 )  
							
							 
							
							... 
							
							
							
							* update
* fix style
* meet code review 
							
						 
						
							2024-01-19 14:10:22 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Shaojun Liu 
								
							 
						 
						
							
							
							
							
								
							
							
								967714bac8 
								
							 
						 
						
							
							
								
								gguf memory optimization for mixtral ( #9939 )  
							
							 
							
							
							
						 
						
							2024-01-19 11:13:15 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Xin Qiu 
								
							 
						 
						
							
							
							
							
								
							
							
								610b5226be 
								
							 
						 
						
							
							
								
								move reserved memory to benchmark_utils.py ( #9907 )  
							
							 
							
							... 
							
							
							
							* move reserved memory to benchmark_utils.py
* meet code review 
							
						 
						
							2024-01-19 09:44:30 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Lilac09 
								
							 
						 
						
							
							
							
							
								
							
							
								7032a2ad73 
								
							 
						 
						
							
							
								
								Optimize gguf load memory for mistral ( #9923 )  
							
							 
							
							... 
							
							
							
							* optimize gguf load for mistral
* fix output of gguf mistral
* reset 
							
						 
						
							2024-01-19 09:14:39 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Shaojun Liu 
								
							 
						 
						
							
							
							
							
								
							
							
								9a46f019d7 
								
							 
						 
						
							
							
								
								gguf memory optimization for baichuan ( #9937 )  
							
							 
							
							
							
						 
						
							2024-01-19 09:11:02 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Guancheng Fu 
								
							 
						 
						
							
							
							
							
								
							
							
								2e1448f08e 
								
							 
						 
						
							
							
								
								[Serving] Add vllm_worker to fastchat serving framework ( #9934 )  
							
							 
							
							... 
							
							
							
							* add worker
* finish
* finish
* add license
* add more comments 
							
						 
						
							2024-01-18 21:33:36 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Chen, Zhentao 
								
							 
						 
						
							
							
							
							
								
							
							
								a8c866c32b 
								
							 
						 
						
							
							
								
								add ppl benchmark ( #9914 )  
							
							 
							
							... 
							
							
							
							* add ppl benchmark
* add license
* add readme
* add dataset argument
* add dataset usage
* fixed low bit args
* correct result
* fix terminal display
* fix ppl update
* enable fp16 fp32 bf16
* format the desc
* fix model_kwargs
* add more readme 
							
						 
						
							2024-01-18 17:54:28 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									WeiguangHan 
								
							 
						 
						
							
							
							
							
								
							
							
								100e0a87e5 
								
							 
						 
						
							
							
								
								LLM: add compressed chatglm3 model ( #9892 )  
							
							 
							
							... 
							
							
							
							* LLM: add compressed chatglm3 model
* small fix
* revert github action 
							
						 
						
							2024-01-18 17:48:15 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yuwen Hu 
								
							 
						 
						
							
							
							
							
								
							
							
								9e2ac5291b 
								
							 
						 
						
							
							
								
								Add rwkv v4 back for igpu perf test 32-512 ( #9938 )  
							
							 
							
							
							
						 
						
							2024-01-18 17:15:28 +08:00