Wang, Jian4 | 0193f29411 | 2024-03-13 10:19:18 +08:00
LLM: Enable gguf float16 and Yuan2 model (#10372)
* enable float16
* add yuan files
* enable yuan
* enable set low_bit on yuan2
* update
* update license
* update generate
* update readme
* update python style
* update

Yina Chen | f5d65203c0 | 2024-03-13 10:11:32 +08:00
First token lm_head optimization (#10318)
* add lm head linear
* update
* address comments and fix style
* address comment

Xin Qiu | 28c4a8cf5c | 2024-03-12 17:39:00 +08:00
Qwen fused qkv (#10368)
* fused qkv + rope for qwen
* quantized kv cache
* fix
* update qwen
* fixed quantized qkv
* fix
* meet code review
* update split
* convert.py
* extend when not enough kv
* fix

Yishuo Wang | 741c2bf1df | 2024-03-12 17:29:51 +08:00
use new rms norm (#10384)

Xiangyu Tian | 0ded0b4b13 | 2024-03-12 17:08:50 +08:00
LLM: Enable BigDL IPEX optimization for int4 (#10319)

Zhao Changmin | df2b84f7de | 2024-03-12 16:46:04 +08:00
Enable kv cache on arc batch (#10308)

Guancheng Fu | cc4148636d | 2024-03-12 10:54:59 +08:00
[FastChat-integration] Add initial implementation for loader (#10323)
* add initial implementation for loader
* add test method for model_loader
* data
* Refine

binbin Deng | dbcfc5c2fa | 2024-03-11 16:19:17 +08:00
LLM: fix error of 'AI-ModelScope/phi-2' hosted by ModelScope hub (#10364)

Chen, Zhentao | a425eaabfc | 2024-03-11 16:06:12 +08:00
fix from_pretrained when device_map=None (#10361)
* pr trigger
* fix error when device_map=None
* fix device_map=None
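
A minimal sketch of the loading pattern this fix covers (placeholder checkpoint; assuming bigdl-llm's transformers-style AutoModel API):

    # Hedged sketch: the checkpoint name is a placeholder and the kwargs
    # follow bigdl-llm's drop-in AutoModel wrapper.
    from bigdl.llm.transformers import AutoModelForCausalLM

    model = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Llama-2-7b-chat-hf",  # placeholder checkpoint
        load_in_4bit=True,                # bigdl-llm low-bit loading
        device_map=None,                  # the case fixed by #10361
    )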

Yina Chen | d7b765fd3f | 2024-03-11 15:21:22 +08:00
serving xpu memory opt (#10358)

Ruonan Wang | be29833b2b | 2024-03-11 09:29:08 +08:00
LLM: fix qwen2 (#10356)

Zhicun | 9026c08633 | 2024-03-08 16:24:50 +08:00
Fix llamaindex AutoTokenizer bug (#10345)
* fix tokenizer
* fix AutoTokenizer bug
* modify code style

Keyan (Kyrie) Zhang | 7a621a4db0 | 2024-03-08 13:38:52 +08:00
Fix device_map bug by raising an error when using device_map=xpu (#10340)
* Fix device_map bug by raising an error when using device_map=xpu
* Fix sync error
* Fix python style
* Use invalidInputError instead of invalidOperationError
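
The fix rejects device_map="xpu" with invalidInputError; the supported route is an explicit move to XPU. A hedged sketch (placeholder checkpoint):

    # device_map="xpu" now raises an error; move the model explicitly instead.
    from bigdl.llm.transformers import AutoModelForCausalLM

    model = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Llama-2-7b-chat-hf",  # placeholder checkpoint
        load_in_4bit=True,
    )
    model = model.to("xpu")  # explicit device move instead of device_map="xpu"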

Yishuo Wang | 1ac193ba02 | 2024-03-07 17:27:19 +08:00
add rope theta argument (#10343)

Cengguang Zhang | 496d18ab6d | 2024-03-07 16:17:38 +08:00
LLM: add quantize kv cache support for baichuan 7b and 13b. (#10330)
* add quantize kv cache for baichuan 7b and 13b.
* fix typo.
* fix.
* fix style.
* fix style.

Yina Chen | 9ea499ca68 | 2024-03-06 09:54:21 +08:00
Optimize speculative decoding PVC memory usage (#10329)
* optimize memory
* update
* update
* update
* support other models
* update
* fix style

dingbaorong | cc796848ea | 2024-03-05 18:38:22 +08:00
fix typos (#10274)
Co-authored-by: Ariadne <wyn2000330@126.com>

Yishuo Wang | 0011ff9f64 | 2024-03-05 17:06:03 +08:00
optimize bge large performance (#10324)

Cengguang Zhang | 30d009bca7 | 2024-03-05 16:23:50 +08:00
LLM: support quantized kv cache for Mistral in transformers >=4.36.0 (#10326)
* support quantize kv for mistral in transformers 4.36
* update mistral support.
* fix style.

dingbaorong | 1e6f0c6f1a | 2024-03-05 13:36:00 +08:00
Add llamaindex gpu example (#10314)
* add llamaindex example
* fix core dump
* refine readme
* add troubleshooting
* refine readme
Co-authored-by: Ariadne <wyn2000330@126.com>

dingbaorong | fc7f10cd12 | 2024-03-05 13:33:57 +08:00
add langchain gpu example (#10277)
* first draft
* fix
* add readme for transformer_int4_gpu
* fix doc
* check device_map
* add arc ut test
* fix ut test
* fix langchain ut
* Refine README
* fix gpu mem too high
* fix ut test
Co-authored-by: Ariadne <wyn2000330@126.com>

Cengguang Zhang | ab9fc2485f | 2024-03-04 10:33:35 +08:00
LLM: add quantize kv support for llama transformer 4.36 (#10298)
* add quantize kv support for llama transformer 4.36
* fix style.
* fix style.

SONG Ge | 0ab40917fb | 2024-03-01 16:48:55 +08:00
[LLM] Split merged_qk to separated q/k linear (#10299)
* modify merge_qk_linear to separated q/k linear
* update

Yang Wang | f4d7dbcde2 | 2024-03-01 16:46:35 +08:00
use fused qkv forward in qwen2 (#10185)
* use fused qkv forward in qwen2
* support both
* fix style
* fix rope
* remove print
* fix style
* clean up

Wang, Jian4 | beb9433cec | 2024-03-01 13:48:23 +08:00
LLM: Reduce speculative _ipex_optimize_model memory use (#10281)
* use tpp
* update ipex

Yuwen Hu | f0ff0eebe1 | 2024-03-01 13:35:42 +08:00
[LLM] Support quantize kv cache for Baichuan2 7B (#10280)
* Add quantized kv cache framework for Baichuan2 7B
* Support quantize kv cache for baichuan2
* Small fix
* Fix python style

SONG Ge | 273de341d7 | 2024-03-01 10:11:37 +08:00
hot-fix silu error import (#10292)

Xin Qiu | 232273a1b5 | 2024-02-29 16:53:24 +08:00
Enable Gemma fused mlp + Gelu (#10276)
* update llama mlp forward
* add all
* fix style check
* split
* update
* update
* update
* fix style

Guancheng Fu | 2d930bdca8 | 2024-02-29 16:33:42 +08:00
Add vLLM bf16 support (#10278)
* add argument load_in_low_bit
* add docs
* modify gpu doc
* done
Co-authored-by: ivy-lv11 <lvzc@lamda.nju.edu.cn>
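
The new load_in_low_bit argument selects the serving precision. An illustrative sketch; only the argument name comes from the log, the entry-point import is an assumption:

    # Illustrative only: load_in_low_bit comes from this PR; the import path
    # for BigDL's vLLM entry point is an assumption.
    from bigdl.llm.vllm.entrypoints.llm import LLM  # assumed entry point

    llm = LLM(
        model="meta-llama/Llama-2-7b-chat-hf",  # placeholder checkpoint
        load_in_low_bit="bf16",                 # new argument from #10278
    )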

SONG Ge | 13b0bc9075 | 2024-02-29 16:33:26 +08:00
[LLM] Add quantize_kv optimization for yuan2 model (#10243)
* add initial quantize_kv support for yuan2 model
* fix yuan2 quantize_kv generation
* apply fp16 conv layer optimizations
* disable mlp for quantize_kv

Zhicun | 4e6cc424f1 | 2024-02-29 15:21:19 +08:00
Add LlamaIndex RAG (#10263)
* run demo
* format code
* add llamaindex
* add custom LLM with bigdl
* update
* add readme
* begin ut
* add unit test
* add license
* add license
* revised
* update
* modify docs
* remove data folder
* update
* modify prompt
* fixed
* fixed
* fixed

Ruonan Wang | a9fd20b6ba | 2024-02-29 12:49:53 +08:00
LLM: Update qkv fusion for GGUF-IQ2 (#10271)
* first commit
* update mistral
* fix transformers==4.36.0
* fix
* disable qk for mixtral now
* fix style

Ruonan Wang | 4b08bc1417 | 2024-02-29 09:39:55 +08:00
LLM: relax batch check of flash attention by double-checking attention mask (#10270)
* relax batch check
* fix
* fix style

Yina Chen | 07f36fbfcc | 2024-02-29 09:39:27 +08:00
Fix gptj failed to extend (#10269)

Yishuo Wang | cccb02dad1 | 2024-02-28 17:20:20 +08:00
fix baichuan2 13b 2k input (#10267)

Heyang Sun | 7244fd1ba5 | 2024-02-28 17:07:08 +08:00
Fix Arc StarCoder wrong query_shape when input is long (#10268)
* Fix Arc StarCoder wrong query_shape when input is long
* Update gptbigcode.py

Cengguang Zhang | a4de3095f3 | 2024-02-28 14:08:08 +08:00
LLM: Support quantize kv cache in mistral. (#10261)
* init
* update quantize kv.

Zhicun | 308e637d0d | 2024-02-28 10:12:09 +08:00
Add DeepSeek-MoE-16B-Chat (#10155)
* dsmoe-hf add
* add dsmoe pytorch
* update README
* modify comment
* remove GPU example
* update model name
* format code

Yang Wang | c581c6db30 | 2024-02-27 14:55:16 -08:00
draft mmint4 (#10031)
* change to llm.cpp
* support transposed format
* revert
* implement qkv fuse
* fix style
* change to vertically pack
* change to enable_xetla
* fix mlp_fusion_check
* remove comments
* address comments
* add some comments
* fix style

Yishuo Wang | b4fa4ab46f | 2024-02-27 14:51:42 +08:00
optimize yuan 2.0 again (#10252)

Heyang Sun | 36a9e88104 | 2024-02-27 09:57:29 +08:00
Speculative Starcoder on CPU (#10138)
* Speculative Starcoder on CPU
* enable kv-cache pre-allocation
* refine codes
* refine
* fix style
* fix style
* fix style
* refine
* refine
* Update speculative.py
* Update gptbigcode.py
* fix style
* Update speculative.py
* enable mixed-datatype layernorm on top of torch API
* adaptive dtype
* Update README.md

Yishuo Wang | a47989c860 | 2024-02-26 17:20:10 +08:00
optimize yuan 2.0 performance (#10244)

Wang, Jian4 | 6c74b99a28 | 2024-02-26 17:03:09 +08:00
LLM: Update qwen readme (#10245)

Wang, Jian4 | f9b75f900b | 2024-02-26 16:41:12 +08:00
LLM: Enable qwen target_model ipex (#10232)
* change order
* enable qwen ipex
* update qwen example
* update
* fix style
* update

Yuwen Hu | e38e29511c | 2024-02-26 15:10:08 +08:00
[LLM] Yuan2 MLP and Rotary optimization (#10231)
* Add optimization for rotary embedding
* Add mlp fused optimization
* Python style fix
* Fix rotary embedding due to logits difference
* Small fix

SONG Ge | df2f3885ba | 2024-02-26 11:29:48 +08:00
[LLM] Enable kv_cache and forward_qkv optimizations for yuan2 (#10225)
* add init kv_cache support for yuan2
* add forward qkv in yuan

Ruonan Wang | 28513f3978 | 2024-02-23 17:26:24 +08:00
LLM: support fp16 embedding & add mlp fusion for iq2_xxs (#10219)
* add fp16 embed
* small fixes
* fix style
* fix style
* fix comment

Yuwen Hu | eeecd9fc08 | 2024-02-23 17:21:23 +08:00
Python style fix (#10230)

Yuwen Hu | e511bbd8f1 | 2024-02-23 17:05:00 +08:00
[LLM] Add basic optimization framework for Yuan2 (#10227)
* Add basic optimization framework for Yuan2
* Small fix
* Python style fix
* Small fix
* Small fix

Xin Qiu | 30795bdfbc | 2024-02-23 10:07:24 +08:00
Gemma optimization: rms_norm, kv_cache, fused_rope, fused_rope+qkv (#10212)
* gemma optimization
* update
* update
* fix style
* meet code review

Guoqiong Song | 63681af97e | 2024-02-22 17:04:40 -08:00
falcon for transformers 4.36 (#9960)

Yina Chen | ce5840a8b7 | 2024-02-22 16:25:12 +08:00
GPT-J rope optimization on xpu (#10182)
* optimize
* update
* fix style & move use_fuse_rope
* add ipex version check
* fix style
* update
* fix style
* meet comments
* address comments
* fix style

Xiangyu Tian | f445217d02 | 2024-02-22 16:01:11 +08:00
LLM: Update IPEX to 2.2.0+cpu and Refactor for _ipex_optimize (#10189)

Heyang Sun | c876d9b5ca | 2024-02-22 15:16:31 +08:00
Support for MPT rotary embedding (#10208)

Ruonan Wang | 5e1fee5e05 | 2024-02-22 14:18:45 +08:00
LLM: add GGUF-IQ2 examples (#10207)
* add iq2 examples
* small fix
* meet code review
* fix
* meet review
* small fix

SONG Ge | ca1166a0e5 | 2024-02-22 13:43:35 +08:00
[LLM] Add quantize kv_cache for Baichuan2-13B (#10203)
* add quantize kv_cache for baichuan2-13b
* style fix

Ruonan Wang | 34ee1aa91f | 2024-02-22 13:37:16 +08:00
LLM: add esimd sdp support for chatglm3 (#10205)
* add esimd sdp support
* fix style

Ruonan Wang | f7c96b19ef | 2024-02-21 16:00:29 +08:00
LLM: support iq2 for mixtral (#10191)
* support name mapping for mixtral
* support mixtral mixed quantization
* fix style
* fix

Xin Qiu | 56ad781f2f | 2024-02-21 11:23:51 +08:00
qwen2 cpu fix (#10187)

Zhao Changmin | 4fbf449c2d | 2024-02-21 10:11:10 +08:00
for rwkv4 (#10179)

Ruonan Wang | 3288acb8de | 2024-02-20 16:56:57 +08:00
LLM: Support embedding quantization (only q2k now) (#10170)
* basic logic added
* basic support
* support save&load, update mixed strategy
* fix style
* use int8 for lm_head
* add check for xpu

binbin Deng | 2bb96c775c | 2024-02-20 09:52:59 +08:00
LLM: fix device setting during saving optimized model (#10154)

Xin Qiu | 1f6d5b9f30 | 2024-02-20 08:33:09 +08:00
enable fused rmsnorm and rope qwen2 (#10163)
* qwen2
* change convert
* cleanup

Zhao Changmin | f8730e8dc1 | 2024-02-19 15:56:42 +08:00
Skip rescale rwkv linear when load_low_bit (#10164)
* rwkv_ld

Heyang Sun | 3e2af5ec0a | 2024-02-19 15:27:34 +08:00
Fix IPEX Baichuan Speculative (#10162)
* Fix IPEX Baichuan Speculative
* compatible with 13B
* Update speculative.py

Yina Chen | 23c91cdce6 | 2024-02-19 14:31:41 +08:00
[LLM] Add min_step_draft in speculative decoding (#10142)
* Fix gptj kvcache & position id
* Add min_draft_tokens in speculative decoding
* fix style
* update

Wang, Jian4 | f2417e083c | 2024-02-19 13:38:32 +08:00
LLM: enable chatglm3-6b target_model ipex (#10085)
* init
* always make causal_mask
* not return last tensor
* update
* optimize_model = False
* enable optimized=False
* enable optimized_model=true
* speed_up ipex target_model
* remove if True
* use group_size
* update python style
* update
* update

Yina Chen | 1508d6b089 | 2024-02-18 10:02:49 +08:00
Fix gptj kvcache & position id (#10141)

Yishuo Wang | 4d33aac7f9 | 2024-02-08 17:04:59 +08:00
quick fix qwen2 fp8 kv cache (#10135)

Cengguang Zhang | 39d90839aa | 2024-02-08 16:49:22 +08:00
LLM: add quantize kv cache for llama. (#10086)
* feat: add quantize kv cache for llama.
* fix style.
* add quantized attention forward function.
* revert style.
* fix style.
* fix style.
* update quantized kv cache and add quantize_qkv
* fix style.
* fix style.
* optimize quantize kv cache.
* fix style.
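
At this point the quantized KV cache was opt-in. A hedged sketch of enabling it; the BIGDL_QUANTIZE_KV_CACHE switch is an assumption, not stated in this log:

    # Assumed toggle: BIGDL_QUANTIZE_KV_CACHE is not named in this log.
    import os
    os.environ["BIGDL_QUANTIZE_KV_CACHE"] = "1"  # set before loading the model

    from bigdl.llm.transformers import AutoModelForCausalLM
    model = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Llama-2-7b-hf",  # placeholder checkpoint
        load_in_4bit=True,
    ).to("xpu")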

Yishuo Wang | d848efe17c | 2024-02-08 16:17:21 +08:00
add quantize kv cache support for qwen2 (#10134)

SONG Ge | 3f79128ed7 | 2024-02-08 14:20:26 +08:00
[LLM] Enable kv_cache optimization for Qwen2 on transformers-v4.37.0 (#10131)
* add support for kv_cache optimization on transformers-v4.37.0
* enable attention forward
* style fix
* disable rotary for now

Ruonan Wang | 063dc145ac | 2024-02-08 13:52:01 +08:00
LLM: basic support for q2k (#10132)
* basic support for q2k
* fix style

Cengguang Zhang | 0cf6a12691 | 2024-02-08 10:24:16 +08:00
LLM: add default torch_dtype for fp16. (#10124)
* set default torch_dtype for fp16.
* fix style.
* bug fix.
* update bug fix.
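
A sketch of what the default removes from the caller's side, assuming load_in_low_bit="fp16" now implies torch_dtype=torch.float16 unless overridden:

    # With the new default, an explicit torch_dtype is no longer needed for fp16.
    from bigdl.llm.transformers import AutoModelForCausalLM

    model = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Llama-2-7b-hf",  # placeholder checkpoint
        load_in_low_bit="fp16",      # assumed to default torch_dtype to float16
    )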

Yishuo Wang | 1aa0c623ce | 2024-02-08 10:20:01 +08:00
disable fused layer norm on UHD (#10130)

Yuwen Hu | a8450fc300 | 2024-02-08 09:15:34 +08:00
[LLM] Support MLP optimization for Qwen1.5 (#10123)

binbin Deng | 925f82107e | 2024-02-07 16:46:36 +08:00
LLM: support models hosted by modelscope (#10106)
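
A hedged sketch of ModelScope-hosted loading; only ModelScope support itself comes from the log, the model_hub keyword is an assumption:

    # Assumption: a model_hub switch selects ModelScope instead of Hugging Face Hub.
    from bigdl.llm.transformers import AutoModelForCausalLM

    model = AutoModelForCausalLM.from_pretrained(
        "qwen/Qwen-7B-Chat",     # placeholder ModelScope model id
        load_in_4bit=True,
        model_hub="modelscope",  # assumed keyword enabling ModelScope download
    )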

Xiangyu Tian | 8953acd7d6 | 2024-02-07 10:27:10 +08:00
[LLM] Fix log condition for BIGDL_OPT_IPEX (#10115)

Yuwen Hu | 518ef95abc | 2024-02-06 14:58:52 +08:00
Small fix for Nonetype error (#10104)

Ruonan Wang | d61f4905ac | 2024-02-06 14:58:32 +08:00
LLM: 2bit quantization initial support (#10042)
* basic quantize support
* fix new module name
* small update
* and mixed int4 with iq2_xxs
* remove print
* code refactor
* fix style
* meet code review

Jiao Wang | 33b9e7744d | 2024-02-05 15:07:38 -08:00
fix dimension (#10097)

Zhicun | 7d2be7994f | 2024-02-05 11:12:47 +08:00
add phixtral and optimize phi-moe (#10052)

Zhicun | 676d6923f2 | 2024-02-05 10:42:10 +08:00
LLM: modify transformersembeddings.embed() in langchain (#10051)

Jin Qiao | ad050107b3 | 2024-02-05 10:17:07 +08:00
LLM: fix mpt load_low_bit issue (#10075)
* fix
* retry
* retry

Ruonan Wang | 8e33cb0f38 | 2024-02-04 13:26:42 +08:00
LLM: support speecht5_tts (#10077)
* support speecht5_tts
* fix

ivy-lv11 | 428b7105f6 | 2024-02-04 10:25:55 +08:00
Add HF and PyTorch example InternLM2 (#10061)

Yina Chen | 77be19bb97 | 2024-02-02 14:54:55 +08:00
LLM: Support gpt-j in speculative decoding (#10067)
* gptj
* support gptj in speculative decoding
* fix
* update readme
* small fix

Xin Qiu | 6e0f1a1e92 | 2024-02-01 15:40:49 +08:00
use apply_rotary_pos_emb_cache_freq_xpu in mixtral (#10060)
* use apply_rotary_pos_emb_cache_freq_xpu in mixtral
* fix style

Heyang Sun | 601024f418 | 2024-02-01 10:52:32 +08:00
Mistral CPU example of speculative decoding (#10024)
* Mistral CPU example of speculative decoding
* update transformers version
* update example
* Update README.md

Heyang Sun | 968e70544d | 2024-02-01 10:48:16 +08:00
Enable IPEX Mistral in Speculative (#10059)

Yina Chen | 3ca03d4e97 | 2024-02-01 09:57:02 +08:00
Add deepmind sample into bigdl-llm speculative decoding (#10041)
* migrate deepmind sample
* update
* meet comments
* fix style
* fix style

Wang, Jian4 | 7e5cd42a5c | 2024-01-31 10:59:55 +08:00
LLM: Update optimize ipex bf16 (#10038)
* use 4.35.2 and remove
* update rmsnorm
* remove
* remove
* update python style
* update
* update python style
* update
* fix style
* update
* remove whitespace

Ruonan Wang | 3685622f29 | 2024-01-31 10:31:10 +08:00
LLM: fix llama 4.36 forward (#10047)

Yishuo Wang | 53a5140eff | 2024-01-31 10:01:11 +08:00
Optimize rwkv v5 rest token again (#10043)

Ruonan Wang | 6b63ba23d1 | 2024-01-30 14:43:07 +08:00
LLM: add full module name during convert (#10035)

Yishuo Wang | 7dfa6dbe46 | 2024-01-30 14:10:55 +08:00
add rwkv time shift optimization (#10032)

Xiangyu Tian | f57d0fda8b | 2024-01-30 09:11:06 +08:00
[LLM] Use IPEX Optimization for Self Speculative Decoding (#9997)

Ruonan Wang | ccf8f613fb | 2024-01-29 18:25:26 +08:00
LLM: update fp16 Linear on ARC/FLEX (#10023)

Shaojun Liu | 824c8029d7 | 2024-01-29 16:18:04 +08:00
Fix "local variable 'model' referenced before assignment" (#10022)

Xiangyu Tian | f37e4702bc | 2024-01-29 11:28:25 +08:00
[LLM] Use IPEX Optimization for BF16 Model (#9988)
Use IPEX Optimization for BF16 Model by env BIGDL_OPT_IPEX=true
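
The commit message names the BIGDL_OPT_IPEX=true switch; a minimal sketch of opting in (placeholder checkpoint; the kwargs follow bigdl-llm's usual API, and the load_in_low_bit="bf16" spelling is an assumption):

    # BIGDL_OPT_IPEX=true comes from the commit message; set it before loading.
    import os
    os.environ["BIGDL_OPT_IPEX"] = "true"

    import torch
    from bigdl.llm.transformers import AutoModelForCausalLM

    model = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Llama-2-7b-hf",  # placeholder checkpoint
        load_in_low_bit="bf16",      # assumed spelling for the BF16 path
        torch_dtype=torch.bfloat16,
        optimize_model=True,
    )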
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
Yishuo Wang · d720554d43 · 2024-01-29 09:23:57 +08:00
  simplify quantize kv cache api (#10011)

Yina Chen · a3322e2a6c · 2024-01-26 18:29:46 +08:00
  add fp8 e5 to use_xmx (#10015)

Qiyuan Gong · 9e18ea187f · 2024-01-26 17:30:08 +08:00
  [LLM] Avoid KV Cache OOM when seq len is larger than 1 (#10006)
  * Avoid OOM during multi-round streaming chat with kv cache
  * For llama-like kv cache, i.e. [bs, n_head, seq_len, head_dim], use is_enough_kv_cache_room_4_31.
  * Other models need to compare kv cache size with kv_len.
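
For readers unfamiliar with the room check named in the entry above, a standalone sketch of the idea for llama-style caches; recovering the allocated capacity from the stride assumes the cache is a narrow() view into a larger pre-allocated buffer, which may differ from the exact bigdl-llm helper:

    import torch

    def has_enough_kv_room(past_key_value, needed_len):
        # past_key_value[0]: key cache shaped [bs, n_head, kv_len, head_dim]
        if past_key_value is None:
            return False
        key_cache = past_key_value[0]
        kv_len, head_dim = key_cache.size(2), key_cache.size(3)
        # stride(1) exposes the backing buffer's true per-head capacity
        allocated_tokens = key_cache.stride(1) // head_dim
        return allocated_tokens >= kv_len + needed_len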
Ruonan Wang · a00efa0564 · 2024-01-26 11:50:38 +08:00
  LLM: add mlp & qkv fusion for FP16 Llama-7B (#9932)
  * add mlp fusion for llama
  * add mm_qkv_out
  * fix style
  * update
  * meet code review
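
The qkv-fusion idea above is model-independent: stack the three projection weights once at load time and replace three GEMMs with one. A self-contained illustration of the technique, not the bigdl-llm FP16 kernel itself:

    import torch

    hidden = 1024
    wq, wk, wv = (torch.randn(hidden, hidden) for _ in range(3))
    w_qkv = torch.cat([wq, wk, wv], dim=0)   # fused once at load time

    x = torch.randn(1, 7, hidden)            # [batch, seq, hidden]
    qkv = x @ w_qkv.t()                      # one GEMM instead of three
    q, k, v = qkv.split(hidden, dim=-1)
    assert torch.allclose(q, x @ wq.t(), atol=1e-3)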
Wang, Jian4 · 98ea3459e5 · 2024-01-26 10:59:48 +08:00
  LLM: Fix llama draft_model dtype error (#10005)
  * fix llama draft_model dtype error
  * update

Yishuo Wang · aae1870096 · 2024-01-26 10:15:01 +08:00
  fix qwen kv cache length (#9998)

Yishuo Wang · 24b34b6e46 · 2024-01-25 17:48:11 +08:00
  change xmx condition (#10000)

Yishuo Wang · bf65548d29 · 2024-01-25 16:55:59 +08:00
  Add quantize kv cache support for chatglm2/3 (#9996)

Wang, Jian4 · 9bff84e6fd · 2024-01-25 11:20:27 +08:00
  LLM: Convert draft_model kv_cache from bf16 to fp32 (#9964)
  * convert bf16 to fp32
  * change when init
  * init first and cut off after
  * init and exchange
  * update python type
  * fix bug
  * update
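
The conversion in the entry above touches every layer's (key, value) pair. A minimal sketch of the dtype walk over an HF-style past_key_values tuple; where exactly bigdl-llm performs it inside its speculative-decoding path is not shown in the log:

    import torch

    def kv_cache_to_fp32(past_key_values):
        # past_key_values: tuple of (key, value) per layer,
        # each [bs, n_head, seq_len, head_dim] (assumed llama-style layout)
        return tuple(
            (k.to(torch.float32), v.to(torch.float32))
            for k, v in past_key_values
        )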
Yina Chen · 27338540c3 · 2024-01-25 10:40:41 +08:00
  Fix repetition_penalty not activated issue (#9989)
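
For context on what the fix above re-activates: the standard repetition penalty (the HF-transformers formulation) divides positive logits and multiplies negative ones for every token already generated. A self-contained sketch:

    import torch

    def apply_repetition_penalty(logits, generated_ids, penalty=1.1):
        # logits: [bs, vocab]; generated_ids: [bs, n_prev]
        score = logits.gather(-1, generated_ids)
        score = torch.where(score > 0, score / penalty, score * penalty)
        return logits.scatter(-1, generated_ids, score)

    logits = torch.randn(1, 32000)
    logits = apply_repetition_penalty(logits, torch.tensor([[42, 1337]]))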
Yuwen Hu · b27e5a27b9 · 2024-01-24 18:15:39 +08:00
  Remove the check for meta device in _replace_with_low_bit_linear (#9984)

Yina Chen · b176cad75a · 2024-01-24 16:40:16 +08:00
  LLM: Add baichuan2 gpu spec example (#9973)
  * add baichuan2 gpu spec example
  * update readme & example
  * remove print
  * fix typo
  * meet comments
  * revert
  * update

Chen, Zhentao · e0db44dcb6 · 2024-01-24 13:20:46 +08:00
  fix unexpected keyword argument 'device' (#9982)
  * add device for chatglm3 only
  * add comment for this change
  * fix style

Yuwen Hu · 8d28aa8e2b · 2024-01-23 18:51:11 +08:00
  [LLM] Fix the model.device problem when cpu_embedding=True (#9971)
  * Overwrite the device attribute for CPUPinnedParam
  * Expose cpu_embedding=True for Linux users
  * Fix python style
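
A usage sketch for the cpu_embedding option the entry above fixes: the embedding table stays in pinned CPU memory while the rest of the model runs on the Intel GPU. The model id is a placeholder:

    from bigdl.llm.transformers import AutoModelForCausalLM

    model = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Llama-2-7b-chat-hf",  # placeholder model id
        load_in_4bit=True,
        cpu_embedding=True,               # the flag exposed in #9971
    ).to("xpu")
    print(model.device)                   # the attribute #9971 makes consistent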
Yishuo Wang · f82782cd3b · 2024-01-23 17:24:53 +08:00
  fix starcoder (#9975)

Yishuo Wang · 2c8a9aaf0d · 2024-01-23 16:34:05 +08:00
  fix qwen causal mask when quantize_kv_cache=True (#9968)

Yina Chen · 36c665667d · 2024-01-23 15:57:28 +08:00
  Add logits processor & qwen eos stop in speculative decoding (#9963)
  * add logits processor & qwen eos
  * support transformers 4.31
  * fix style
  Co-authored-by: rnwang04 <ruonan1.wang@intel.com>

Xin Qiu · da4687c917 · 2024-01-23 15:53:32 +08:00
  fix fp16 (#9970)

Ruonan Wang · 27b19106f3 · 2024-01-23 12:54:19 +08:00
  LLM: add readme for speculative decoding gpu examples (#9961)
  * add readme
  * meet code review

Chen, Zhentao · 39219b7e9a · 2024-01-23 11:00:49 +08:00
  add default device meta when lcmu enabled (#9941)

Xin Qiu · dacf680294 · 2024-01-23 10:37:56 +08:00
  add fused rotary pos emb for qwen (#9956)
  * add fused rotary pos emb for qwen
  * update
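
The fused kernel above folds the rotary-embedding math into the attention path; for reference, the plain (unfused) rotary position embedding it replaces, in the rotate-half convention used by llama/qwen-style code:

    import torch

    def rotate_half(x):
        x1, x2 = x.chunk(2, dim=-1)
        return torch.cat((-x2, x1), dim=-1)

    def apply_rope(q, k, cos, sin):
        # q, k: [bs, n_head, seq, head_dim]; cos, sin: [seq, head_dim]
        return q * cos + rotate_half(q) * sin, k * cos + rotate_half(k) * sin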
Ruonan Wang · 7b1d9ad7c0 · 2024-01-23 09:28:23 +08:00
  LLM: limit esimd sdp usage for k_len < 8 (#9959)
  * update
  * fix

Ruonan Wang · 3e601f9a5d · 2024-01-22 19:14:56 +08:00
  LLM: Support speculative decoding in bigdl-llm (#9951)
  * first commit
  * fix error, add llama example
  * hidden print
  * update api usage
  * change to api v3
  * meet code review, fix style
  * add reference, fix style
  * fix first token time
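
For orientation, the general shape of the technique #9951 introduces: a cheap draft model proposes a few tokens, the target model verifies them in a single forward pass, and the longest agreeing prefix is accepted. A greedy-acceptance sketch assuming HF-style .generate()/.logits models and batch size 1; the real bigdl-llm API and its acceptance rule differ:

    import torch

    @torch.no_grad()
    def speculative_step(target, draft, input_ids, k=4):
        n = input_ids.shape[1]
        out = draft.generate(input_ids, max_new_tokens=k, do_sample=False)
        proposal = out[:, n:]                                # k draft tokens
        logits = target(torch.cat([input_ids, proposal], dim=1)).logits
        verify = logits[:, n - 1:-1].argmax(-1)              # target's greedy picks
        agree = int((verify == proposal).int().cumprod(-1).sum())
        accepted = proposal[:, :agree]
        bonus = verify[:, agree:agree + 1]                   # free token at first mismatch
        return torch.cat([input_ids, accepted, bonus], dim=1)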
Heyang Sun · fb91c97fe8 · 2024-01-22 09:11:44 +08:00
  support for Baichuan/Baichuan2 13B Chat running speculative decoding (#9921)
  * fix style

Xin Qiu · 97f0cd8975 · 2024-01-19 17:31:13 +08:00
  optimize Decilm 7b (#9922)
  * optimize deci
  * update
  * decilm attention forward

Wang, Jian4 · bcaeb05272 · 2024-01-19 16:54:59 +08:00
  Update optimize qwen (#9943)
  * update for n tokens input
  * fix dtype
  * update

Ruonan Wang · bf37b3a670 · 2024-01-19 14:10:22 +08:00
  LLM: optimize CPU speculative decoding of chatglm3 (#9928)
  * update
  * fix style
  * meet code review

Shaojun Liu · 967714bac8 · 2024-01-19 11:13:15 +08:00
  gguf memory optimization for mixtral (#9939)

Lilac09 · 7032a2ad73 · 2024-01-19 09:14:39 +08:00
  Optimize gguf load memory for mistral (#9923)
  * optimize gguf load for mistral
  * fix output of gguf mistral
  * reset

Shaojun Liu · 9a46f019d7 · 2024-01-19 09:11:02 +08:00
  gguf memory optimization for baichuan (#9937)

Guancheng Fu · 2e1448f08e · 2024-01-18 21:33:36 +08:00
  [Serving] Add vllm_worker to fastchat serving framework (#9934)
  * add worker
  * finish
  * add license
  * add more comments

Yishuo Wang · 7bbb98abb6 · 2024-01-18 16:22:12 +08:00
  Disable fused layer norm when using XMX to fix mpt UT (#9933)

Wang, Jian4 · 1fc9dfa265 · 2024-01-18 15:56:29 +08:00
  LLM: Update for Qwen n tokens inputs (#9931)
  * update for n tokens inputs
  * update style

Heyang Sun · 5184f400f9 · 2024-01-18 14:11:27 +08:00
  Fix Mixtral GGUF Wrong Output Issue (#9930)
  * fix style

Yishuo Wang · 453df868c9 · 2024-01-18 10:16:29 +08:00
  add rwkv v5 attention kernel (#9927)

Ruonan Wang · 054952f82f · 2024-01-18 09:28:10 +08:00
  LLM: Fix rope of chatglm3 to support speculative decoding on CPU (#9926)

Ziteng Zhang · 18cd1f1432 · 2024-01-17 18:08:35 +08:00
  [LLM] Solve the problem of calling bmm operator in BF16Linear (#9924)

Yina Chen · 98b86f83d4 · 2024-01-17 15:51:38 +08:00
  Support fast rope for training (#9745)
  * init
  * fix style
  * add test and fix
  * address comment
  * update
  * merge upstream main

Ruonan Wang · 427f75000b · 2024-01-17 13:37:28 +08:00
  LLM: fix sdp of chatglm3 (#9917)
  * fix

Yishuo Wang · 94767da7cf · 2024-01-17 09:27:41 +08:00
  optimize rwkv v4 first token performance (#9912)

Shaojun Liu · b909c5c9c2 · 2024-01-16 18:54:39 +08:00
  GGUF load memory optimization (#9913)
  * block-wise
  * convert linear for module
  * revert
  * Fix PEP8 checks Error
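
The GGUF memory entries above (#9913, and #9939/#9937/#9923 earlier) share one trick: load block-wise, so only one full-precision tensor is resident at a time before it is converted down. A schematic with stand-in helpers; read_tensor and to_low_bit are hypothetical, and the fp16 downcast merely stands in for the real low-bit conversion:

    import torch

    def read_tensor(numel):
        # stand-in for the real gguf reader: one fp32 tensor at a time
        return torch.zeros(numel, dtype=torch.float32)

    def to_low_bit(t):
        # stand-in for quantization; a downcast just to show the shrink
        return t.to(torch.float16)

    def load_gguf_blockwise(tensor_entries):
        converted = {}
        for name, numel in tensor_entries:
            full = read_tensor(numel)
            converted[name] = to_low_bit(full)
            del full                 # peak memory stays one tensor deep
        return converted

    weights = load_gguf_blockwise([("w1", 1024), ("w2", 2048)])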
Xin Qiu · dee32f7d15 · 2024-01-16 16:54:08 +08:00
  copy fused rms norm's result to avoid <unk> (#9909)

Ruonan Wang · 8d7326ae03 · 2024-01-16 11:29:13 +08:00
  LLM: fix chatglm3 sdp to support speculative decoding (#9900)
  * fix chatglm3
  * fix
  * update
  * meet code review

Guancheng Fu · 9f34da7cdb · 2024-01-15 15:42:15 +08:00
  Update PVC XMX condition (#9901)
  * update pvc xmx condition
  * update condition

Yishuo Wang · 6637860ddf · 2024-01-12 19:51:48 +08:00
  change xmx condition (#9896)

Ruonan Wang · d9cf55bce9 · 2024-01-11 18:01:59 +08:00
  LLM: fix MLP check of mixtral (#9891)

Ziteng Zhang · 4af88a67b9 · 2024-01-11 16:45:21 +08:00
  support chatglm3 with bf16 (#9888)
  * support chatglm3 with bigdl-bf16

Yuwen Hu · 0aef35a965 · 2024-01-11 14:37:16 +08:00
  [LLM] Improve LLM doc regarding windows gpu related info (#9880)
  * Improve runtime configuration for windows
  * Add python 310/311 supports for wheel downloading
  * Add troubleshooting for windows gpu
  * Remove manual ipex import due to auto importer
  * Add info regarding cpu_embedding=True on iGPU
  * More info for Windows users
  * Small updates to API docs
  * Remove tip for loading from saved optimize_model for now
  * Update win info for multi-intel gpus selection
  * Small fix

Ruonan Wang · 53531ae4ee · 2024-01-10 17:50:00 +08:00
  LLM: support qkv fusion for fp8e5 (#9878)
  * update
  * add mistral
  * meet code review

Lilac09 · cb32b985ec · 2024-01-10 15:38:42 +08:00
  add mistral and chatglm support to vllm (#9879)

Ruonan Wang · 3e05c9e11b · 2024-01-09 18:10:01 +08:00
  LLM: update esimd sdp kernel (#9871)

Yishuo Wang · 36496d60ac · 2024-01-09 13:24:02 +08:00
  only use quantize kv cache on MTL (#9862)

ZehuaCao · 146076bdb5 · 2024-01-09 13:07:32 +08:00
  Support llm-awq backend (#9856)
  * Support for LLM-AWQ Backend
  * Update README.md
  * Add awqconfig
  * modify init
  * support llm-awq
  * fix AwqBackendPackingMethod not found error
  * fix style
  Co-authored-by: Uxito-Ada <414416158@qq.com>
  Co-authored-by: Heyang Sun <60865256+Uxito-Ada@users.noreply.github.com>
  Co-authored-by: cyita <yitastudy@gmail.com>

Ruonan Wang · fea6f16057 · 2024-01-09 09:56:32 +08:00
  LLM: add mlp fusion for fp8e5 and update related check (#9860)
  * update mlp fusion
  * fix style
  * update

Jiao Wang · 3b6372ab12 · 2024-01-08 00:32:23 -08:00
  Fix Llama transformers 4.36 support (#9852)
  * support 4.36
  * style
  * update
  * fix merge

Chen, Zhentao · 1b585b0d40 · 2024-01-08 15:53:57 +08:00
  set fp8 default as e5m2 (#9859)

Ruonan Wang · dc995006cc · 2024-01-08 09:51:34 +08:00
  LLM: add flash attention for mistral / mixtral (#9846)
  * add flash attention for mistral
  * update
  * add flash attn for mixtral
  * fix style
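
Flash attention, as added for mistral/mixtral above, fuses the softmax(QK^T)V computation so the full attention matrix is never materialized. Stock PyTorch 2.x exposes the same idea; shown only to illustrate the computation being fused, not the kernel bigdl-llm actually dispatches to:

    import torch
    import torch.nn.functional as F

    q = torch.randn(1, 32, 128, 128)   # [bs, n_head, seq, head_dim]
    k = torch.randn(1, 32, 128, 128)
    v = torch.randn(1, 32, 128, 128)
    out = F.scaled_dot_product_attention(q, k, v, is_causal=True)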
Yishuo Wang · afaa871144 · 2024-01-08 09:28:20 +08:00
  [LLM] support quantize kv cache to fp8 (#9812)

Jiao Wang · 248ae7fad2 · 2024-01-05 11:30:18 -08:00
  Llama optimize_model to support transformers 4.36 (#9818)
  * support 4.36
  * style
  * update

Ruonan Wang · a60bda3324 · 2024-01-05 16:44:10 +08:00
  LLM: update check for deepspeed (#9838)

Ruonan Wang · 16433dd959 · 2024-01-05 13:49:37 +08:00
  LLM: fix first token judgement of flash attention (#9841)
  * fix flash attention
  * meet code review
  * fix

Yina Chen · f919f5792a · 2024-01-05 12:38:57 +08:00
  fix kv cache out of bound (#9827)

Ruonan Wang · 5df31db773 · 2024-01-05 10:52:05 +08:00
  LLM: fix accuracy issue of chatglm3 (#9830)
  * add attn mask for first token
  * change attn calculation
  * fix
  * fix style

Xiangyu Tian · 38c05be1c0 · 2024-01-04 15:34:42 +08:00
  [LLM] Fix dtype mismatch in Baichuan2-13b (#9834)

Ziteng Zhang · 05b681fa85 · 2024-01-04 13:33:29 +08:00
  [LLM] IPEX auto importer set on by default (#9832)
  * Set BIGDL_IMPORT_IPEX default to True
  * Remove import intel_extension_for_pytorch as ipex from GPU example
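
With #9832 above, the auto importer is on by default, so GPU examples no longer need a manual "import intel_extension_for_pytorch as ipex". Opting back out for debugging looks roughly like this; how the value is parsed is an assumption:

    import os
    os.environ["BIGDL_IMPORT_IPEX"] = "false"   # default is now "true"
    import bigdl.llm   # with the default, this pulls in IPEX automatically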
Wang, Jian4 · 4ceefc9b18 · 2024-01-04 11:23:16 +08:00
  LLM: Support bitsandbytes config on qlora finetune (#9715)
  * test support bitsandbytesconfig
  * update style
  * update cpu example
  * update example
  * update readme
  * update unit test
  * use bfloat16
  * update logic
  * use int4
  * set default bnb_4bit_use_double_quant
  * update model.py
  * support lora example
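
A sketch of the bitsandbytes-style configuration the entry above accepts for QLoRA fine-tuning. The parameter set mirrors transformers' BitsAndBytesConfig; that bigdl-llm consumes it through quantization_config, and the model id, are assumptions here:

    import torch
    from transformers import BitsAndBytesConfig
    from bigdl.llm.transformers import AutoModelForCausalLM

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,      # the default #9715 sets
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    model = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Llama-2-7b-hf",          # placeholder model id
        quantization_config=bnb_config,
    )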
Ruonan Wang · 20e9742fa0 · 2024-01-03 16:15:55 +08:00
  LLM: fix chatglm3 issue (#9820)
  * fix chatglm3 issue
  * small update

Wang, Jian4 · a54cd767b1 · 2024-01-03 14:49:02 +08:00
  LLM: Add gguf falcon (#9801)
  * init falcon
  * update convert.py
  * update style

Qiyuan Gong · f0f9d45eac · 2023-12-28 15:23:58 +08:00
  [LLM] IPEX import support bigdl-core-xe-21 (#9769)
  * Add support for bigdl-core-xe-21.

Guancheng Fu · 5857a38321 · 2023-12-28 14:41:47 +08:00
  [vLLM] Add option to adjust KV_CACHE_ALLOC_BLOCK_LENGTH (#9782)
  * add option kv_cache_block
  * change var name
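
KV_CACHE_ALLOC_BLOCK_LENGTH above controls how many token slots each cache growth step reserves: larger blocks mean fewer reallocations but more memory headroom. A standalone illustration of that trade-off; the 256 default and the env-var plumbing are assumptions, not taken from the commit:

    import os
    import torch

    BLOCK = int(os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256))

    def grow_cache(cache, new_len):
        # cache: [bs, n_head, capacity, head_dim]
        bs, heads, cap, hd = cache.shape
        if new_len <= cap:
            return cache
        new_cap = cap + ((new_len - cap + BLOCK - 1) // BLOCK) * BLOCK
        bigger = torch.empty(bs, heads, new_cap, hd,
                             dtype=cache.dtype, device=cache.device)
        bigger[:, :, :cap] = cache
        return bigger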
Ruonan Wang · 99bddd3ab4 · 2023-12-28 13:30:13 +08:00
  LLM: better FP16 support for Intel GPUs (#9791)
  * initial support
  * fix
  * fix style
  * limit esimd usage condition
  * refactor code
  * small fix
  * meet code review

Yishuo Wang · 7d9f6c6efc · 2023-12-28 09:23:44 +08:00
  fix cpuinfo error (#9793)

Wang, Jian4 · 7ed9538b9f · 2023-12-28 09:22:39 +08:00
  LLM: support gguf mpt (#9773)
  * add gguf mpt
  * update

Cengguang Zhang · d299f108d0 · 2023-12-28 09:11:59 +08:00
  update falcon attention forward. (#9796)

Kai Huang · 689889482c · 2023-12-26 19:51:25 +08:00
  Reduce max_cache_pos to reduce Baichuan2-13B memory (#9694)
  * optimize baichuan2 memory
  * fix
  * style
  * fp16 mask
  * disable fp16
  * fix style
  * empty cache
  * revert empty cache

Xiangyu Tian · 0ea842231e · 2023-12-26 16:03:57 +08:00
  [LLM] vLLM: Add api_server entrypoint (#9783)
  * Add vllm.entrypoints.api_server for benchmark_serving.py in vllm.
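
A client-side sketch against the api_server entrypoint above; the /generate route and payload follow upstream vLLM's api_server, which this port mirrors for benchmark_serving.py, and host/port are placeholders:

    import requests

    resp = requests.post(
        "http://localhost:8000/generate",
        json={"prompt": "San Francisco is a", "max_tokens": 32, "temperature": 0.0},
    )
    print(resp.json())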
Ruonan Wang · 11d883301b · 2023-12-26 09:41:27 +08:00
  LLM: fix wrong batch output caused by flash attention (#9780)
  * fix
  * meet code review
  * move batch size check to the beginning
  * move qlen check inside function

Heyang Sun · 66e286a73d · 2023-12-25 16:08:09 +08:00
  Support for Mixtral AWQ (#9775)
  * Update README.md
  * Update awq_config.py

Ruonan Wang · 1917bbe626 · 2023-12-25 14:49:30 +08:00
  LLM: fix BF16Linear related training & inference issue (#9755)
  * fix bf16 related issue
  * update based on comment & add arc lora script
  * update readme
  * force to bf16
  * fix style
  * move check input dtype into function
  * update convert
  * meet code review
  * update merged model to support new training_mode api
  * fix typo

Xiangyu Tian · 30dab36f76 · 2023-12-25 14:17:06 +08:00
  [LLM] vLLM: Fix kv cache init (#9771)

Yina Chen · 449b387125 · 2023-12-25 14:04:28 +08:00
  Support relora in bigdl-llm (#9687)
  * init
  * fix style
  * support resume & update readme
  * update
  * remove important
  * add training mode
  * meet comments
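
The core ReLoRA move behind the entry above: periodically fold the low-rank adapters into the frozen base weights and restart them, so a sequence of cheap low-rank updates composes into a higher-rank change. Schematic tensors only; bigdl-llm's actual training_mode API differs:

    import torch

    def relora_restart(base_weight, lora_A, lora_B, scaling):
        with torch.no_grad():
            base_weight += scaling * (lora_B @ lora_A)   # merge the update
            torch.nn.init.kaiming_uniform_(lora_A)       # re-init A
            lora_B.zero_()                               # restart B at zero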
Ziteng Zhang · 986f65cea9 · 2023-12-25 11:31:14 +08:00
  [LLM] Add trust_remote_code for local renamed model in bigdl_llm_model.py (#9762)

Guancheng Fu · daf536fb2d · 2023-12-25 10:29:31 +08:00
  vLLM: Apply attention optimizations for selective batching (#9758)
  * fuse_rope for prefill
  * apply kv_cache optimizations
  * apply fast_decoding_path
  * Re-enable kv_cache optimizations for prefill
  * reduce KV_CACHE_ALLOC_BLOCK for selective_batching

Qiyuan Gong · 4c487313f2 · 2023-12-22 16:38:24 +08:00
  Revert "[LLM] IPEX auto importer turn on by default for XPU (#9730)" (#9759)
  * This reverts commit 0284801fbd.

Qiyuan Gong · 0284801fbd · 2023-12-22 16:20:32 +08:00
  [LLM] IPEX auto importer turn on by default for XPU (#9730)
  * Set BIGDL_IMPORT_IPEX default to true, i.e., auto import IPEX for XPU.
  * Remove import intel_extension_for_pytorch as ipex from GPU example.
  * Add support for bigdl-core-xe-21.

Guancheng Fu · fdf93c9267 · 2023-12-22 13:45:46 +08:00
  Implement selective batching for vLLM (#9659)
  * add control to load hf model
  * finish initial version of selective_batching
  * Remove print statement
  * Apply yang's optimization
  * check kv_cache passed in; TODO: add fast decoding path
  * temp solution: not batching prefill requests
  * a version that works for prefill batching
  * Solid version: remove redundant functions
  * add option to enable selective_batching
  * remove logic for using transformer models
  * enable argument VLLM_ENABLE_SELECTIVE_BATCHING
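
Selective batching above ships behind a feature flag; setting it before engine start-up is the obvious pattern, though exactly where the port reads the variable is not shown in the log:

    import os
    os.environ["VLLM_ENABLE_SELECTIVE_BATCHING"] = "true"
    # ...then construct the LLM engine / start the server as usual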
Ruonan Wang · 2f36769208 · 2023-12-22 11:05:39 +08:00
  LLM: bigdl-llm lora support & lora example (#9740)
  * lora support and single card example
  * support multi-card, refactor code
  * fix model id and style
  * remove torch patch, add two new classes for bf16, update example
  * change to training_mode
  * add more info in help
  * fix style, update readme
  * fix ut
  * Handling compatibility issues with default LoraConfig

SONG Ge · ba0b939579 · 2023-12-22 09:59:27 +08:00
  [LLM] Support transformers-v4.36.0 on mistral model (#9744)
  * add support transformers-v4.36.0 on mistral model
  * python/llm/src/bigdl/llm/transformers/models/mistral.py
  * make the redundant implementation as utils
  * fix code style
  * update with utils enough_kv_room

Xin Qiu · e36111e713 · 2023-12-22 09:26:35 +08:00
  mixtral fused qkv and rope (#9724)
  * fix and clean
  * fix style
  * update
  * fix

Jiao Wang · e4f6e43675 · 2023-12-21 14:41:51 -08:00
  safetensor to false (#9728)

Yishuo Wang · 426660b88e · 2023-12-21 17:53:29 +08:00
  simplify qwen attention (#9747)

Wang, Jian4 · 984697afe2 · 2023-12-21 14:06:25 +08:00
  LLM: Add bloom gguf support (#9734)
  * init
  * update bloom add merges
  * update readme
  * update for llama error
  * update

Heyang Sun · df775cf316 · 2023-12-21 11:25:05 +08:00
  fix python style (#9742)
  * fix python style
  * fix

Xin Qiu · 6c3e698bf1 · 2023-12-21 10:11:37 +08:00
  mistral decoding_fast_path and fused mlp (#9714)
  * mistral decoding_fast_path and fused mlp
  * meet code review

Heyang Sun · d157f623b6 · 2023-12-21 10:03:23 +08:00
  Load Mixtral gguf in a block-wise way (#9725)
  * refine

Zhao Changmin · 4bda975a3e · 2023-12-21 09:48:58 +08:00
  LLM: Align lowbit model config (#9735)

Wang, Jian4 · e1e921f425 · 2023-12-21 09:33:40 +08:00
  LLM: gguf other model using dtype (#9729)

Yishuo Wang · 13ea6330bd · 2023-12-20 17:34:34 +08:00
  optimize qwen rope (#9737)

Ziteng Zhang · 4c032a433e · 2023-12-20 16:52:43 +08:00
  [LLM] Add glibc checker (#9624)
  * Add glibc checker
  * Add env BIGDL_GLIBC_CHECK to control glibc checker. The default is false, i.e., don't check.
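
Enabling the opt-in check from the entry above; per the commit message the default is false, i.e. no check is performed:

    import os
    os.environ["BIGDL_GLIBC_CHECK"] = "true"
    import bigdl.llm   # import-time check of the system glibc version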
Yina Chen · cd652a1710 · 2023-12-20 16:26:17 +08:00
  Support fp8 e5m2 on arc (#9711)
  * init
  * fix style
  * update