Guancheng Fu 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								a465111cf4 
								
							 
						 
						
							
							
								
								Update README.md ( #11003 )  
							
							 
							
							
							
						 
						
							2024-05-13 16:44:48 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Guancheng Fu 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								74997a3ed1 
								
							 
						 
						
							
							
								
								Adding load_low_bit interface for ipex_llm_worker ( #11000 )  
							
							 
							
							... 
							
							
							
							* initial implementation, need tests
* fix
* fix baichuan issue
* fix typo 
							
						 
						
							2024-05-13 15:30:19 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yishuo Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								1b3c7a6928 
								
							 
						 
						
							
							
								
								remove phi3 empty cache ( #10997 )  
							
							 
							
							
							
						 
						
							2024-05-13 14:09:55 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yishuo Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								ad96f32ce0 
								
							 
						 
						
							
							
								
								optimize phi3 1st token performance ( #10981 )  
							
							 
							
							
							
						 
						
							2024-05-10 17:33:46 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Cengguang Zhang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								cfed76b2ed 
								
							 
						 
						
							
							
								
								LLM: add long-context support for Qwen1.5-7B/Baichuan2-7B/Mistral-7B. ( #10937 )  
							
							 
							
							... 
							
							
							
							* LLM: add split tensor support for baichuan2-7b and qwen1.5-7b.
* fix style.
* fix style.
* fix style.
* add support for mistral and fix condition threshold.
* fix  style.
* fix comments. 
							
						 
						
							2024-05-10 16:40:15 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Kai Huang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								a6342cc068 
								
							 
						 
						
							
							
								
								Empty cache after phi first attention to support 4k input ( #10972 )  
							
							 
							
							... 
							
							
							
							* empty cache
* fix style 
							
						 
						
							2024-05-09 19:50:04 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yishuo Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								e753125880 
								
							 
						 
						
							
							
								
								use fp16_sdp when head_dim=96 ( #10976 )  
							
							 
							
							
							
						 
						
							2024-05-09 17:02:59 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yishuo Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								697ca79eca 
								
							 
						 
						
							
							
								
								use quantize kv and sdp in phi3-mini ( #10973 )  
							
							 
							
							
							
						 
						
							2024-05-09 15:16:18 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Wang, Jian4 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								3209d6b057 
								
							 
						 
						
							
							
								
								Fix speculative llama3 no stop error ( #10963 )  
							
							 
							
							... 
							
							
							
							* fix normal
* add eos_tokens_id on sp and add list if
* update
* no none 
							
						 
						
							2024-05-08 17:09:47 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yishuo Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								2ebec0395c 
								
							 
						 
						
							
							
								
								optimize phi-3-mini-128 ( #10959 )  
							
							 
							
							
							
						 
						
							2024-05-08 16:33:17 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Zhao Changmin 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								0d6e12036f 
								
							 
						 
						
							
							
								
								Disable fast_init_ in load_low_bit ( #10945 )  
							
							 
							
							... 
							
							
							
							* fast_init_ disable 
							
						 
						
							2024-05-08 10:46:19 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yishuo Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								c801c37bc6 
								
							 
						 
						
							
							
								
								optimize phi3 again: use quantize kv if possible ( #10953 )  
							
							 
							
							
							
						 
						
							2024-05-07 17:26:19 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yishuo Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								aa2fa9fde1 
								
							 
						 
						
							
							
								
								optimize phi3 again: use sdp if possible ( #10951 )  
							
							 
							
							
							
						 
						
							2024-05-07 15:53:08 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Qiyuan Gong 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								d7ca5d935b 
								
							 
						 
						
							
							
								
								Upgrade Peft version to 0.10.0 for LLM finetune ( #10886 )  
							
							 
							
							... 
							
							
							
							* Upgrade Peft version to 0.10.0
* Upgrade Peft version in ARC unit test and HF-Peft example. 
							
						 
						
							2024-05-07 15:09:14 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Wang, Jian4 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								191b184341 
								
							 
						 
						
							
							
								
								LLM: Optimize cohere model ( #10878 )  
							
							 
							
							... 
							
							
							
							* use mlp and rms
* optimize kv_cache
* add fuse qkv
* add flash attention and fp16 sdp
* error fp8 sdp
* fix optimized
* fix style
* update
* add for pp 
							
						 
						
							2024-05-07 10:19:50 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Guancheng Fu 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								49ab5a2b0e 
								
							 
						 
						
							
							
								
								Add embeddings ( #10931 )  
							
							 
							
							
							
						 
						
							2024-05-07 09:07:02 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Wang, Jian4 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								0e0bd309e2 
								
							 
						 
						
							
							
								
								LLM: Enable Speculative on Fastchat ( #10909 )  
							
							 
							
							... 
							
							
							
							* init
* enable streamer
* update
* update
* remove deprecated
* update
* update
* add gpu example 
							
						 
						
							2024-05-06 10:06:20 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Cengguang Zhang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								75dbf240ec 
								
							 
						 
						
							
							
								
								LLM: update split tensor conditions. ( #10872 )  
							
							 
							
							... 
							
							
							
							* LLM: update split tensor condition.
* add cond for split tensor.
* update priority of env.
* fix style.
* update env name. 
							
						 
						
							2024-04-30 17:07:21 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Guancheng Fu 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								2c64754eb0 
								
							 
						 
						
							
							
								
								Add vLLM to ipex-llm serving image ( #10807 )  
							
							 
							
							... 
							
							
							
							* add vllm
* done
* doc work
* fix done
* temp
* add docs
* format
* add start-fastchat-service.sh
* fix 
							
						 
						
							2024-04-29 17:25:42 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yishuo Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								d884c62dc4 
								
							 
						 
						
							
							
								
								remove new_layout parameter ( #10906 )  
							
							 
							
							
							
						 
						
							2024-04-29 10:31:50 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Guancheng Fu 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								fbcd7bc737 
								
							 
						 
						
							
							
								
								Fix Loader issue with dtype fp16 ( #10907 )  
							
							 
							
							
							
						 
						
							2024-04-29 10:16:02 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Guancheng Fu 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								c9fac8c26b 
								
							 
						 
						
							
							
								
								Fix sdp logic ( #10896 )  
							
							 
							
							... 
							
							
							
							* fix
* fix 
							
						 
						
							2024-04-28 22:02:14 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yina Chen 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								015d07a58f 
								
							 
						 
						
							
							
								
								Fix lookahead sample error & add update strategy ( #10894 )  
							
							 
							
							... 
							
							
							
							* Fix sample error & add update strategy
* add mtl config
* fix style
* remove print 
							
						 
						
							2024-04-28 17:21:00 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Cengguang Zhang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								9752ffe979 
								
							 
						 
						
							
							
								
								LLM: update split qkv native sdp. ( #10895 )  
							
							 
							
							... 
							
							
							
							* LLM: update split qkv native sdp.
* fix typo. 
							
						 
						
							2024-04-26 18:47:35 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Guancheng Fu 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								990535b1cf 
								
							 
						 
						
							
							
								
								Add tensor parallel for vLLM ( #10879 )  
							
							 
							
							... 
							
							
							
							* initial
* test initial tp
* initial sup
* fix format
* fix
* fix 
							
						 
						
							2024-04-26 17:10:49 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yishuo Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								46ba962168 
								
							 
						 
						
							
							
								
								use new quantize kv ( #10888 )  
							
							 
							
							
							
						 
						
							2024-04-26 14:42:17 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Wang, Jian4 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								3e8ed54270 
								
							 
						 
						
							
							
								
								LLM: Fix bigdl_ipex_int8 warning ( #10890 )  
							
							 
							
							
							
						 
						
							2024-04-26 11:18:44 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yina Chen 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								8811f268ff 
								
							 
						 
						
							
							
								
								Use new fp16 sdp in Qwen and modify the constraint ( #10882 )  
							
							 
							
							
							
						 
						
							2024-04-25 19:23:37 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yang Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								1ce8d7bcd9 
								
							 
						 
						
							
							
								
								Support the desc_act feature in GPTQ model ( #10851 )  
							
							 
							
							... 
							
							
							
							* support act_order
* update versions
* fix style
* fix bug
* clean up 
							
						 
						
							2024-04-24 10:17:13 -07:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yina Chen 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								dc27b3bc35 
								
							 
						 
						
							
							
								
								Use sdp when rest token seq_len > 1 in llama & mistral (for lookup & spec) ( #10790 )  
							
							 
							
							... 
							
							
							
							* update sdp condition
* update
* fix
* update & test llama
* mistral
* fix style
* update
* fix style
* remove pvc constrain
* update ds on arc
* fix style 
							
						 
						
							2024-04-24 17:24:01 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									binbin Deng 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								c9feffff9a 
								
							 
						 
						
							
							
								
								LLM: support Qwen1.5-MoE-A2.7B-Chat pipeline parallel inference ( #10864 )  
							
							 
							
							
							
						 
						
							2024-04-24 16:02:27 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yishuo Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								2d210817ff 
								
							 
						 
						
							
							
								
								add phi3 optimization ( #10871 )  
							
							 
							
							
							
						 
						
							2024-04-24 15:17:40 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Cengguang Zhang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								763413b7e1 
								
							 
						 
						
							
							
								
								LLM: support llama split tensor for long context in transformers>=4.36. ( #10844 )  
							
							 
							
							... 
							
							
							
							* LLM: support llama split tensor for long context in transformers>=4.36.
* fix dtype.
* fix style.
* fix style.
* fix style.
* fix style.
* fix dtype.
* fix style. 
							
						 
						
							2024-04-23 16:13:25 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									ZehuaCao 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								92ea54b512 
								
							 
						 
						
							
							
								
								Fix speculative decoding bug ( #10855 )  
							
							 
							
							
							
						 
						
							2024-04-23 14:28:31 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Wang, Jian4 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								18c032652d 
								
							 
						 
						
							
							
								
								LLM: Add mixtral speculative CPU example ( #10830 )  
							
							 
							
							... 
							
							
							
							* init mixtral sp example
* use different prompt_format
* update output
* update 
							
						 
						
							2024-04-23 10:05:51 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yishuo Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								fe5a082b84 
								
							 
						 
						
							
							
								
								add phi-2 optimization ( #10843 )  
							
							 
							
							
							
						 
						
							2024-04-22 18:56:47 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Guancheng Fu 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								47bd5f504c 
								
							 
						 
						
							
							
								
								[vLLM]Remove vllm-v1, refactor v2 ( #10842 )  
							
							 
							
							... 
							
							
							
							* remove vllm-v1
* fix format 
							
						 
						
							2024-04-22 17:51:32 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Wang, Jian4 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								23c6a52fb0 
								
							 
						 
						
							
							
								
								LLM: Fix ipex torchscript=True error ( #10832 )  
							
							 
							
							... 
							
							
							
							* remove
* update
* remove torchscript 
							
						 
						
							2024-04-22 15:53:09 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yina Chen 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								3daad242b8 
								
							 
						 
						
							
							
								
								Fix No module named 'transformers.cache_utils' with transformers < 4.36 ( #10835 )  
							
							 
							
							... 
							
							
							
							* update sdp condition
* update
* fix
* fix 431 error
* revert sdp & style fix
* fix
* meet comments 
							
						 
						
							2024-04-22 14:05:50 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Guancheng Fu 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								caf75beef8 
								
							 
						 
						
							
							
								
								Disable sdpa ( #10814 )  
							
							 
							
							
							
						 
						
							2024-04-19 17:33:18 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yishuo Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								57edf2033c 
								
							 
						 
						
							
							
								
								fix lookahead with transformers >= 4.36 ( #10808 )  
							
							 
							
							
							
						 
						
							2024-04-19 16:24:56 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Ovo233 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								1a885020ee 
								
							 
						 
						
							
							
								
								Updated importing of top_k_top_p_filtering for transformers>=4.39.0 ( #10794 )  
							
							 
							
							... 
							
							
							
							* In transformers>=4.39.0, the top_k_top_p_filtering function has been deprecated and moved to the hugging face package trl. Thus, for versions >= 4.39.0, import this function from trl. 
							
						 
						
							2024-04-19 15:34:39 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yishuo Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								08458b4f74 
								
							 
						 
						
							
							
								
								remove rms norm copy ( #10793 )  
							
							 
							
							
							
						 
						
							2024-04-19 13:57:48 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Ruonan Wang 
								
							 
						 
						
							
							
							
							
								
							
							
								754b0ffecf 
								
							 
						 
						
							
							
								
								Fix pvc llama ( #10798 )  
							
							 
							
							... 
							
							
							
							* fix
* update 
							
						 
						
							2024-04-18 10:44:57 -07:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Ruonan Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								439c834ed3 
								
							 
						 
						
							
							
								
								LLM: add mixed precision for lm_head ( #10795 )  
							
							 
							
							... 
							
							
							
							* add mixed_quantization
* meet code review
* update
* fix style
* meet review 
							
						 
						
							2024-04-18 19:11:31 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yina Chen 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								8796401b08 
								
							 
						 
						
							
							
								
								Support q4k in ipex-llm ( #10796 )  
							
							 
							
							... 
							
							
							
							* support q4k
* update 
							
						 
						
							2024-04-18 18:55:28 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Ruonan Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								0e8aac19e3 
								
							 
						 
						
							
							
								
								add q6k precision in ipex-llm ( #10792 )  
							
							 
							
							... 
							
							
							
							* add q6k
* add initial 16k
* update
* fix style 
							
						 
						
							2024-04-18 16:52:09 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Wang, Jian4 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								14ca42a048 
								
							 
						 
						
							
							
								
								LLM: Fix moe index error on cpu ( #10791 )  
							
							 
							
							
							
						 
						
							2024-04-18 15:56:52 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Guancheng Fu 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								cbe7b5753f 
								
							 
						 
						
							
							
								
								Add vLLM[xpu] related code ( #10779 )  
							
							 
							
							... 
							
							
							
							* Add ipex-llm side change
* add runnable offline_inference
* refactor to call vllm2
* Verified async server
* add new v2 example
* add README
* fix
* change dir
* refactor readme.md
* add experimental
* fix 
							
						 
						
							2024-04-18 15:29:20 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Wang, Jian4 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								209c3501e6 
								
							 
						 
						
							
							
								
								LLM: Optimize qwen1.5 moe model ( #10706 )  
							
							 
							
							... 
							
							
							
							* update moe block
* fix style
* enable optimize MLP
* enable kv_cache
* enable fuse rope
* enable fused qkv
* enable flash_attention
* error sdp quantize
* use old api
* use fuse
* use xetla
* fix python style
* update moe_blocks num
* fix output error
* add cpu sdpa
* update
* update
* update 
							
						 
						
							2024-04-18 14:54:05 +08:00