Yuwen Hu 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								f8d1adc573 
								
							 
						 
						
							
							
								
								Fix Llama 3.2 & 3.1 on LNL ( #12196 )  
							
							 
							
							
							
						 
						
							2024-10-14 17:39:20 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Zijie Li 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								7d80db710e 
								
							 
						 
						
							
							
								
								Add benchmark_util for transformers >= 4.44.0 ( #12171 )  
							
							 
							
							... 
							
							
							
							* Create benchmark_util_4_45.py
* Update __init__.py
* Update lint-python
* Update benchmark_util_4_45.py
* Update benchmark_util_4_45.py
* Create benchmark_util_4_44.py 
							
						 
						
							2024-10-14 15:40:12 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Ruonan Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								310f18c8af 
								
							 
						 
						
							
							
								
								update NPU pipeline generate ( #12182 )  
							
							 
							
							... 
							
							
							
							* update
* fix style 
							
						 
						
							2024-10-11 17:39:20 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Ruonan Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								4d93bb81fe 
								
							 
						 
						
							
							
								
								Initial support of NPU level0 Model ( #12177 )  
							
							 
							
							... 
							
							
							
							* first commit to support load dll and init llm pipeline
* add init generate
* fix style
* small updates
* fix style and check tokens number 
							
						 
						
							2024-10-11 09:45:53 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yuwen Hu 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								890662610b 
								
							 
						 
						
							
							
								
								Fix auto importer for LNL release ( #12175 )  
							
							 
							
							
							
						 
						
							2024-10-10 15:17:43 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yishuo Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								535bee5381 
								
							 
						 
						
							
							
								
								fix qwen2 vl again ( #12174 )  
							
							 
							
							
							
						 
						
							2024-10-10 13:50:01 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yishuo Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								78d253165d 
								
							 
						 
						
							
							
								
								optimize qwen2 vl perf again ( #12167 )  
							
							 
							
							
							
						 
						
							2024-10-09 16:43:48 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yishuo Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								644af2a76e 
								
							 
						 
						
							
							
								
								add basic llama 3.2 vision support ( #12163 )  
							
							 
							
							
							
						 
						
							2024-10-08 10:46:48 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yishuo Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								669ff1a97b 
								
							 
						 
						
							
							
								
								fix sd1.5 ( #12129 )  
							
							 
							
							
							
						 
						
							2024-09-26 17:15:16 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yishuo Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								a266528719 
								
							 
						 
						
							
							
								
								optimize llama 3.2 rope ( #12128 )  
							
							 
							
							
							
						 
						
							2024-09-26 16:08:10 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yishuo Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								584c3489e7 
								
							 
						 
						
							
							
								
								add basic support for llama3.2 ( #12125 )  
							
							 
							
							
							
						 
						
							2024-09-26 15:46:19 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yishuo Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								66f419f8b7 
								
							 
						 
						
							
							
								
								fix qwen2 vl ( #12126 )  
							
							 
							
							
							
						 
						
							2024-09-26 15:44:02 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yishuo Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								77af9bc5fa 
								
							 
						 
						
							
							
								
								support passing None to low_bit in optimize_model ( #12121 )  
							
							 
							
							
							
						 
						
							2024-09-26 11:09:35 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yishuo Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								47e0b83cbf 
								
							 
						 
						
							
							
								
								optimize sd 1.5 ( #12119 )  
							
							 
							
							
							
						 
						
							2024-09-25 15:45:13 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yishuo Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								5d63aef60b 
								
							 
						 
						
							
							
								
								optimize qwen2 vl again ( #12109 )  
							
							 
							
							
							
						 
						
							2024-09-23 13:22:01 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Ruonan Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								03bd01c99c 
								
							 
						 
						
							
							
								
								optimize npu qwen2 ( #12107 )  
							
							 
							
							
							
						 
						
							2024-09-20 19:46:16 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yishuo Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								9239fd4f12 
								
							 
						 
						
							
							
								
								add basic support and optimization for qwen2-vl ( #12104 )  
							
							 
							
							
							
						 
						
							2024-09-20 17:23:06 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yuwen Hu 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								828fa01ad3 
								
							 
						 
						
							
							
								
								[NPU] Add mixed_precision for Qwen2 7B ( #12098 )  
							
							 
							
							... 
							
							
							
							* Add mix_precision argument to control whether use INT8 lm_head for Qwen2-7B-Instruct
* Small fix
* Fixed on load low bit with mixed precision
* Small fix
* Update example accordingly
* Update for default prompt
* Update base on comments
* Final fix 
							
						 
						
							2024-09-20 16:36:21 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Ruonan Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								09b8c80d9d 
								
							 
						 
						
							
							
								
								update code for NPU qwen2 ( #12094 )  
							
							 
							
							... 
							
							
							
							* update code
* fix 
							
						 
						
							2024-09-20 15:58:32 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yishuo Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								54b973c744 
								
							 
						 
						
							
							
								
								fix ipex_llm import in transformers 4.45 ( #12099 )  
							
							 
							
							
							
						 
						
							2024-09-20 15:24:59 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yuwen Hu 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								f7fb3c896c 
								
							 
						 
						
							
							
								
								Update lm_head optimization for Qwen2 7B ( #12090 )  
							
							 
							
							
							
						 
						
							2024-09-18 17:02:02 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Wang, Jian4 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								40e463c66b 
								
							 
						 
						
							
							
								
								Enable vllm load gptq model ( #12083 )  
							
							 
							
							... 
							
							
							
							* enable vllm load gptq model
* update
* update
* update
* update style 
							
						 
						
							2024-09-18 14:41:00 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Ruonan Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								081af41def 
								
							 
						 
						
							
							
								
								[NPU] Optimize Qwen2 lm_head to use INT4 ( #12072 )  
							
							 
							
							... 
							
							
							
							* temp save
* update
* fix
* fix
* Split lm_head into 7 parts & remove int8 for lm_head when sym_int4
* Simlify and add condition to code
* Small fix
* refactor some code
* fix style
* fix style
* fix style
* fix
* fix
* temp sav e
* refactor
* fix style
* further refactor
* simplify code
* meet code review
* fix style
---------
Co-authored-by: Yuwen Hu <yuwen.hu@intel.com> 
							
						 
						
							2024-09-14 15:26:46 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Ch1y0q 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								b4b8c3e495 
								
							 
						 
						
							
							
								
								add lowbit_path for generate.py, fix npu_model ( #12077 )  
							
							 
							
							... 
							
							
							
							* add `lowbit_path` for `generate.py`, fix `npu_model`
* update `README.md` 
							
						 
						
							2024-09-13 17:28:05 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Wang, Jian4 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								d703e4f127 
								
							 
						 
						
							
							
								
								Enable vllm multimodal minicpm-v-2-6 ( #12074 )  
							
							 
							
							... 
							
							
							
							* enable minicpm-v-2-6
* add image_url readme 
							
						 
						
							2024-09-13 13:28:35 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Jinhe 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								4ca330da15 
								
							 
						 
						
							
							
								
								Fix NPU load error message and add minicpm npu lowbit feat ( #12064 )  
							
							 
							
							... 
							
							
							
							* fix npu_model raise sym_int4 error
* add load_lowbit
* remove print&perf 
							
						 
						
							2024-09-11 16:56:35 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Ruonan Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								a0c73c26d8 
								
							 
						 
						
							
							
								
								clean NPU code ( #12060 )  
							
							 
							
							... 
							
							
							
							* clean code
* remove time.perf_counter() 
							
						 
						
							2024-09-11 15:10:35 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Wang, Jian4 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								c75f3dd874 
								
							 
						 
						
							
							
								
								vllm no padding glm4 to avoid nan error ( #12062 )  
							
							 
							
							... 
							
							
							
							* no padding glm4
* add codegeex 
							
						 
						
							2024-09-11 13:44:40 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Wang, Jian4 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								30a8680645 
								
							 
						 
						
							
							
								
								Update for vllm one card padding ( #12058 )  
							
							 
							
							
							
						 
						
							2024-09-11 10:52:55 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yishuo Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								d8c044e79d 
								
							 
						 
						
							
							
								
								optimize minicpm3 kv cache ( #12052 )  
							
							 
							
							
							
						 
						
							2024-09-10 16:51:21 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Wang, Jian4 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								5d3ab16a80 
								
							 
						 
						
							
							
								
								Add vllm glm and baichuan padding ( #12053 )  
							
							 
							
							
							
						 
						
							2024-09-10 15:57:28 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Guancheng Fu 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								69c8d36f16 
								
							 
						 
						
							
							
								
								Switching from vLLM v0.3.3 to vLLM 0.5.4 ( #12042 )  
							
							 
							
							... 
							
							
							
							* Enable single card sync engine
* enable ipex-llm optimizations for vllm
* enable optimizations for lm_head
* Fix chatglm multi-reference problem
* Remove duplicate layer
* LLM: Update vLLM to v0.5.4 (#11746 )
* Enable single card sync engine
* enable ipex-llm optimizations for vllm
* enable optimizations for lm_head
* Fix chatglm multi-reference problem
* update 0.5.4 api_server
* add dockerfile
* fix
* fix
* refine
* fix
---------
Co-authored-by: gc-fu <guancheng.fu@intel.com>
* Add vllm-0.5.4 Dockerfile (#11838 )
* Update BIGDL_LLM_SDP_IGNORE_MASK in start-vllm-service.sh (#11957 )
* Fix vLLM not convert issues (#11817 ) (#11918 )
* Fix not convert issues
* refine
Co-authored-by: Guancheng Fu <110874468+gc-fu@users.noreply.github.com>
* Fix glm4-9b-chat nan error on vllm 0.5.4 (#11969 )
* init
* update mlp forward
* fix minicpm error in vllm 0.5.4
* fix dependabot alerts (#12008 )
* Update 0.5.4 dockerfile (#12021 )
* Add vllm awq loading logic (#11987 )
* [ADD] Add vllm awq loading logic
* [FIX] fix the module.linear_method path
* [FIX] fix quant_config path error
* Enable Qwen padding mlp to 256 to support batch_forward (#12030 )
* Enable padding mlp
* padding to 256
* update style
* Install 27191 runtime in 0.5.4 docker image (#12040 )
* fix rebase error
* fix rebase error
* vLLM: format for 0.5.4 rebase (#12043 )
* format
* Update model_convert.py
* Fix serving docker related modifications (#12046 )
* Fix undesired modifications (#12048 )
* fix
* Refine offline_inference arguments
---------
Co-authored-by: Xiangyu Tian <109123695+xiangyuT@users.noreply.github.com>
Co-authored-by: Jun Wang <thoughts.times@gmail.com>
Co-authored-by: Wang, Jian4 <61138589+hzjane@users.noreply.github.com>
Co-authored-by: liu-shaojun <johnssalyn@outlook.com>
Co-authored-by: Shaojun Liu <61072813+liu-shaojun@users.noreply.github.com> 
							
						 
						
							2024-09-10 15:37:43 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Ruonan Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								dc4af02b2a 
								
							 
						 
						
							
							
								
								Fix qwen2 1.5B NPU load error ( #12049 )  
							
							 
							
							
							
						 
						
							2024-09-10 14:41:18 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yishuo Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								abc370728c 
								
							 
						 
						
							
							
								
								optimize minicpm3 again ( #12047 )  
							
							 
							
							
							
						 
						
							2024-09-10 14:19:57 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Ch1y0q 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								f0061a9916 
								
							 
						 
						
							
							
								
								remove local import os to fix Baichuan NPU load issue ( #12044 )  
							
							 
							
							
							
						 
						
							2024-09-10 14:13:24 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Ruonan Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								640998edea 
								
							 
						 
						
							
							
								
								update inter_pp of qwen2 ( #12041 )  
							
							 
							
							
							
						 
						
							2024-09-10 10:34:17 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yishuo Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								048b4590aa 
								
							 
						 
						
							
							
								
								add basic minicpm3 optimization ( #12039 )  
							
							 
							
							
							
						 
						
							2024-09-09 17:25:08 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yishuo Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								6cedb601e4 
								
							 
						 
						
							
							
								
								remove some useless code ( #12035 )  
							
							 
							
							
							
						 
						
							2024-09-06 17:51:08 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									binbin Deng 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								d2e1b9aaff 
								
							 
						 
						
							
							
								
								Add input padding during prefill for qwen2-7b ( #12033 )  
							
							 
							
							
							
						 
						
							2024-09-06 16:39:59 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Ruonan Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								0d04531ae0 
								
							 
						 
						
							
							
								
								update NPU readme of Qwen2 ( #12032 )  
							
							 
							
							... 
							
							
							
							* update readme
* update broadcast 
							
						 
						
							2024-09-06 15:02:39 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yang Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								58555bd9de 
								
							 
						 
						
							
							
								
								Optimize broadcast for npu llama ( #12028 )  
							
							 
							
							
							
						 
						
							2024-09-06 13:28:20 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									binbin Deng 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								845e5dc89e 
								
							 
						 
						
							
							
								
								Support lm_head of minicpm-2b on NPU ( #12019 )  
							
							 
							
							
							
						 
						
							2024-09-05 16:19:22 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Guoqiong Song 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								8803242f5c 
								
							 
						 
						
							
							
								
								fix llama on cpu ( #12018 )  
							
							 
							
							
							
						 
						
							2024-09-04 19:17:54 -07:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Wang, Jian4 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								b3b2cd64b4 
								
							 
						 
						
							
							
								
								Support lightweight-serving glm-4v-9b  ( #11994 )  
							
							 
							
							... 
							
							
							
							* enable glm-4v-9b serving
* update readme
* update for no image input 
							
						 
						
							2024-09-05 09:25:08 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Wang, Jian4 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								2b993ad479 
								
							 
						 
						
							
							
								
								vllm update for glm-4 model automatic not_convert ( #12003 )  
							
							 
							
							
							
						 
						
							2024-09-04 13:50:32 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Ruonan Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								9eaff5e47d 
								
							 
						 
						
							
							
								
								add save &  load support for NPU optimized model ( #11999 )  
							
							 
							
							... 
							
							
							
							* add save &  load support
* fix style 
							
						 
						
							2024-09-03 20:53:22 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yuwen Hu 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								6eb55653ba 
								
							 
						 
						
							
							
								
								Performance mode strategy update for input_embeds input ( #11997 )  
							
							 
							
							
							
						 
						
							2024-09-03 17:46:16 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									binbin Deng 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								01099f08ee 
								
							 
						 
						
							
							
								
								Revert prefill logic of qwen2-7b ( #11992 )  
							
							 
							
							
							
						 
						
							2024-09-03 14:45:01 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yuwen Hu 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								659d15defc 
								
							 
						 
						
							
							
								
								Fix wrong attention mask and garbage output for inputs_embeds inputs during lookup generation ( #11989 )  
							
							 
							
							... 
							
							
							
							* Fix garbage output for input_embeds inputs during lookup generation
* Fix on sliding windows
* Simplify code 
							
						 
						
							2024-09-02 19:09:12 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									binbin Deng 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								2f3d1bd0ec 
								
							 
						 
						
							
							
								
								hotfix qwen2-7b weight setting ( #11991 )  
							
							 
							
							
							
						 
						
							2024-09-02 18:11:08 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									binbin Deng 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								a40ea7038d 
								
							 
						 
						
							
							
								
								Fix AttributeError of qwen2-1.5B ( #11990 )  
							
							 
							
							
							
						 
						
							2024-09-02 17:55:10 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yang Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								c48817bd43 
								
							 
						 
						
							
							
								
								Support Qwen2-7b MLP in int4 and transpose_value_cache=True ( #11968 )  
							
							 
							
							
							
						 
						
							2024-09-02 14:37:44 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Ruonan Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								573c20bae6 
								
							 
						 
						
							
							
								
								fix npu lm_head cpu condition ( #11976 )  
							
							 
							
							... 
							
							
							
							* fix
* fix
* fix
* fix stype
* fix style
* fix style 
							
						 
						
							2024-08-30 17:11:26 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Ruonan Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								60aa1a2c0f 
								
							 
						 
						
							
							
								
								Initial NPU support for MiniCPM-V-2_6 ( #11966 )  
							
							 
							
							... 
							
							
							
							* initial pr
* update npu model
* fix
* fix kv cache type
* fix
* small fix
* fix style
* fix model id
* change inter_pp=4
* address comment
* fix
* fix style
* fix
* rebase 
							
						 
						
							2024-08-30 16:34:35 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									SONG Ge 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								158289d205 
								
							 
						 
						
							
							
								
								[NPU] Add initial support for minicpm-llama-v2.5 ( #11962 )  
							
							 
							
							... 
							
							
							
							* add initial support for minicpm-llama-v2.5
* update impl
* add minicpm-llama3-v2.5 example 
							
						 
						
							2024-08-30 16:00:33 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									binbin Deng 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								cd077881f1 
								
							 
						 
						
							
							
								
								Disable lm head ( #11972 )  
							
							 
							
							
							
						 
						
							2024-08-30 11:05:18 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Wang, Jian4 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								7d103417b8 
								
							 
						 
						
							
							
								
								Fix glm4-9b-chat nan error on vllm 0.3.3 ( #11970 )  
							
							 
							
							... 
							
							
							
							* fix nan value
* update 
							
						 
						
							2024-08-30 09:50:18 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yang Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								fbf088f61e 
								
							 
						 
						
							
							
								
								remove obselete npu code ( #11967 )  
							
							 
							
							
							
						 
						
							2024-08-29 14:16:44 -07:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yuwen Hu 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								a9e485eb1b 
								
							 
						 
						
							
							
								
								Support MiniCPM-V-2_6 multi-modal benchmarking with latency text streamer ( #11963 )  
							
							 
							
							... 
							
							
							
							* Support MiniCPM-V-2_6 multi-modal benchmarking with latency text streamer
* Style fixes 
							
						 
						
							2024-08-29 19:22:09 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yina Chen 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								882f4a5ff7 
								
							 
						 
						
							
							
								
								Add lnl npu driver recommend version and enable cpu_lm_head on llama3 ( #11952 )  
							
							 
							
							... 
							
							
							
							* update lnl npu driver version and enable cpu_lm_head on llama3
* update
* fix style
* typo
* address comments
* update
* add qwen2-7b 
							
						 
						
							2024-08-29 15:01:18 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									binbin Deng 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								71f03dcc39 
								
							 
						 
						
							
							
								
								Support qwen2-7b with fused decoderlayer optimization on NPU ( #11912 )  
							
							 
							
							
							
						 
						
							2024-08-29 13:34:20 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Jiao Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								63ac5f64bb 
								
							 
						 
						
							
							
								
								Refactor NPU baichuan multiple-process ( #11945 )  
							
							 
							
							... 
							
							
							
							* update
* add baichuan mp
* clean
* refactor
* merge
* style
* update
* update 
							
						 
						
							2024-08-28 11:33:40 -07:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									SONG Ge 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								5ca7390082 
								
							 
						 
						
							
							
								
								[NPU] Add minicpm-2b support for npu multi-processing ( #11949 )  
							
							 
							
							... 
							
							
							
							* add minicpm-2b support
* update example for minicpm-2b
* add LNL NPU driver requirement in readme 
							
						 
						
							2024-08-28 18:08:49 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yishuo Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								0fbb10259a 
								
							 
						 
						
							
							
								
								use sdp_causal to reduce internvl2-4b memory usage if set environment variable ( #11953 )  
							
							 
							
							
							
						 
						
							2024-08-28 17:35:05 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Guancheng Fu 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								0a7bd274e2 
								
							 
						 
						
							
							
								
								Add vllm awq loading logic ( #11950 )  
							
							 
							
							... 
							
							
							
							* add vllm awq loading logic
* fix
* refine 
							
						 
						
							2024-08-28 16:46:18 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yina Chen 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								b38fb67bec 
								
							 
						 
						
							
							
								
								[NPU] lm head to cpu ( #11943 )  
							
							 
							
							... 
							
							
							
							* lm head to cpu
* qwen2
* mv logic and add param to disable cpu_lm_head
* use env and lm_head opt to mp file
* fix
* update
* remove print 
							
						 
						
							2024-08-28 16:34:07 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									binbin Deng 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								bec00e2015 
								
							 
						 
						
							
							
								
								Improve baichuan2 NPU performance ( #11942 )  
							
							 
							
							
							
						 
						
							2024-08-27 18:37:08 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Zijie Li 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								90f692937d 
								
							 
						 
						
							
							
								
								Update npu baichuan2 ( #11939 )  
							
							 
							
							
							
						 
						
							2024-08-27 16:56:26 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Jiao Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								b4b6ddf73c 
								
							 
						 
						
							
							
								
								NPU Baichuan2 Multi- Process example ( #11928 )  
							
							 
							
							
							
						 
						
							2024-08-27 15:25:49 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									SONG Ge 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								e211a5b076 
								
							 
						 
						
							
							
								
								update minicpm to meet latest refactor ( #11937 )  
							
							 
							
							
							
						 
						
							2024-08-27 15:08:01 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									binbin Deng 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								7c8c9a0670 
								
							 
						 
						
							
							
								
								Update benchmark script for NPU ( #11932 )  
							
							 
							
							
							
						 
						
							2024-08-27 14:41:14 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Zijie Li 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								6c3eb1e1e8 
								
							 
						 
						
							
							
								
								refactor from_pretrained API for NPU ( #11927 )  
							
							 
							
							
							
						 
						
							2024-08-27 09:50:30 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Xiangyu Tian 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								7ca557aada 
								
							 
						 
						
							
							
								
								LLM: Fix vLLM CPU convert error ( #11926 )  
							
							 
							
							
							
						 
						
							2024-08-27 09:22:19 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yuwen Hu 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								c1d07bc626 
								
							 
						 
						
							
							
								
								Support streaming for lookup generation ( #11922 )  
							
							 
							
							... 
							
							
							
							* Support streaming for lookup generation
* Small update
* Style fixes
* Add origin generate full back for batch inference and beam search; support input length threshold judgement for directly input with input_ids
* Fix lookup stream generate with eos token
* Small fixes
* Small fix
* index fix
* Small fix 
							
						 
						
							2024-08-26 19:33:31 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									SONG Ge 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								019f725d4d 
								
							 
						 
						
							
							
								
								[NPU] Add support for running mp minicpm model on npu ( #11909 )  
							
							 
							
							... 
							
							
							
							* add initial support for npu minicpm mp
* fix minicpm-1b abnormal output error 
							
						 
						
							2024-08-26 17:52:55 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yuwen Hu 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								24c279e0ae 
								
							 
						 
						
							
							
								
								Update IPEX_LLM_PERFORMANCE_MODE with input length threshold ( #11908 )  
							
							 
							
							... 
							
							
							
							* Update IPEX_LLM_PERFORMANCE_MODE with input length threshold
* Update based on comments. And and judgement for inputs_embeds
* Fix for benchmarking purposes
* Update based on comments
* Small fix 
							
						 
						
							2024-08-23 20:49:15 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									binbin Deng 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								303a090a6b 
								
							 
						 
						
							
							
								
								Add lm_head optimization on NPU ( #11903 )  
							
							 
							
							
							
						 
						
							2024-08-23 15:51:07 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yina Chen 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								23631cd357 
								
							 
						 
						
							
							
								
								disable lm_head opt for baichuan2-13b ( #11905 )  
							
							 
							
							
							
						 
						
							2024-08-23 15:39:47 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									hxsz1997 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								650e6e6ce4 
								
							 
						 
						
							
							
								
								Merge pull request  #11891  from hxsz1997/baichuan2-compresskv  
							
							 
							
							... 
							
							
							
							Add compress_kv for Baichuan2 
							
						 
						
							2024-08-23 06:09:58 +03:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Ruonan Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								4a61f7d20d 
								
							 
						 
						
							
							
								
								update mlp of llama ( #11897 )  
							
							 
							
							... 
							
							
							
							* update mlp of llama
* relax threshold of  mlp test
* revert code 
							
						 
						
							2024-08-22 20:34:53 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yuwen Hu 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								420ce7d164 
								
							 
						 
						
							
							
								
								Fix non-stop at eos token problem for lookup generation ( #11896 )  
							
							 
							
							... 
							
							
							
							* Fix non-stop by eos_token_id problem for lookup
* Small fix
* Add judgement when generation_config.eos_token_id is None
* Fix based on comments 
							
						 
						
							2024-08-22 18:55:59 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Huang, Xinshengzi 
								
							 
						 
						
							
							
							
							
								
							
							
								4cf03d6212 
								
							 
						 
						
							
							
								
								update baichuan-7b  
							
							 
							
							
							
						 
						
							2024-08-22 18:16:33 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Guancheng Fu 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								278b191dc1 
								
							 
						 
						
							
							
								
								Fix optimize lm head error ( #11899 )  
							
							 
							
							
							
						 
						
							2024-08-22 17:45:26 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Wang, Jian4 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								5c4ed00593 
								
							 
						 
						
							
							
								
								Add lightweight-serving whisper asr example ( #11847 )  
							
							 
							
							... 
							
							
							
							* add asr init
* update for pp
* update style
* update readme
* update reamde 
							
						 
						
							2024-08-22 15:46:28 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Huang, Xinshengzi 
								
							 
						 
						
							
							
							
							
								
							
							
								eb1e65f8a9 
								
							 
						 
						
							
							
								
								add comment  
							
							 
							
							
							
						 
						
							2024-08-22 15:14:47 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Huang, Xinshengzi 
								
							 
						 
						
							
							
							
							
								
							
							
								a2be3d7501 
								
							 
						 
						
							
							
								
								add comment of compress kv in attention forward  
							
							 
							
							
							
						 
						
							2024-08-22 15:11:55 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Huang, Xinshengzi 
								
							 
						 
						
							
							
							
							
								
							
							
								ce7de77085 
								
							 
						 
						
							
							
								
								add comment of change in model forward  
							
							 
							
							
							
						 
						
							2024-08-22 14:29:27 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Huang, Xinshengzi 
								
							 
						 
						
							
							
							
							
								
							
							
								42398a0045 
								
							 
						 
						
							
							
								
								add comment  
							
							 
							
							
							
						 
						
							2024-08-22 13:17:13 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Huang, Xinshengzi 
								
							 
						 
						
							
							
							
							
								
							
							
								48a827aa07 
								
							 
						 
						
							
							
								
								fix typos  
							
							 
							
							
							
						 
						
							2024-08-22 11:35:47 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Huang, Xinshengzi 
								
							 
						 
						
							
							
							
							
								
							
							
								8a5df93de2 
								
							 
						 
						
							
							
								
								fix typos  
							
							 
							
							
							
						 
						
							2024-08-22 11:33:07 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Huang, Xinshengzi 
								
							 
						 
						
							
							
							
							
								
							
							
								01ed397e7a 
								
							 
						 
						
							
							
								
								fix typos  
							
							 
							
							
							
						 
						
							2024-08-22 11:31:25 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Huang, Xinshengzi 
								
							 
						 
						
							
							
							
							
								
							
							
								c6ed1c412d 
								
							 
						 
						
							
							
								
								fix typos  
							
							 
							
							
							
						 
						
							2024-08-22 11:26:49 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Huang, Xinshengzi 
								
							 
						 
						
							
							
							
							
								
							
							
								2a0aa9271b 
								
							 
						 
						
							
							
								
								fix typos  
							
							 
							
							
							
						 
						
							2024-08-22 11:23:22 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Huang, Xinshengzi 
								
							 
						 
						
							
							
							
							
								
							
							
								4adadddbbc 
								
							 
						 
						
							
							
								
								fix typos  
							
							 
							
							
							
						 
						
							2024-08-22 11:12:23 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Huang, Xinshengzi 
								
							 
						 
						
							
							
							
							
								
							
							
								6a5ca17afc 
								
							 
						 
						
							
							
								
								fix typoes  
							
							 
							
							
							
						 
						
							2024-08-22 11:09:58 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									binbin Deng 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								72a7bf624b 
								
							 
						 
						
							
							
								
								Support qwen2-1.5b with fused decoderlayer optimization on NPU ( #11888 )  
							
							 
							
							
							
						 
						
							2024-08-22 11:09:12 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Huang, Xinshengzi 
								
							 
						 
						
							
							
							
							
								
							
							
								6bb9035788 
								
							 
						 
						
							
							
								
								fix typos  
							
							 
							
							
							
						 
						
							2024-08-22 11:08:48 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Huang, Xinshengzi 
								
							 
						 
						
							
							
							
							
								
							
							
								86248b0505 
								
							 
						 
						
							
							
								
								add compress_kv for baichuan2  
							
							 
							
							
							
						 
						
							2024-08-22 10:59:08 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yina Chen 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								cc27321441 
								
							 
						 
						
							
							
								
								support chatglm4 in lookup ( #11855 )  
							
							 
							
							
							
						 
						
							2024-08-21 15:53:17 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yina Chen 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								0236de3ac2 
								
							 
						 
						
							
							
								
								set IPEX_LLM_LAST_LM_HEAD=1 as default ( #11885 )  
							
							 
							
							
							
						 
						
							2024-08-21 15:06:12 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yang Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								209d42ab79 
								
							 
						 
						
							
							
								
								Refactor npu mp to make it easier to integrate new models ( #11873 )  
							
							 
							
							... 
							
							
							
							* Refactor npu mp to make it easier to integrate new models
* fix style
* move layer functions to base 
							
						 
						
							2024-08-20 20:58:47 -07:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Guancheng Fu 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								537c0d2767 
								
							 
						 
						
							
							
								
								fix vllm qwen2 models ( #11879 )  
							
							 
							
							
							
						 
						
							2024-08-21 11:05:24 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yishuo Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								bd1e490d62 
								
							 
						 
						
							
							
								
								fix phi3 ( #11878 )  
							
							 
							
							
							
						 
						
							2024-08-21 10:31:41 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yang Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								bdaeee1d63 
								
							 
						 
						
							
							
								
								Fix run_decoders bug ( #11871 )  
							
							 
							
							
							
						 
						
							2024-08-20 12:04:59 -07:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yina Chen 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								c3c058373f 
								
							 
						 
						
							
							
								
								Update compresskv model forward type logic ( #11868 )  
							
							 
							
							... 
							
							
							
							* update
* fix 
							
						 
						
							2024-08-20 18:11:37 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yishuo Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								d4ee0a89f3 
								
							 
						 
						
							
							
								
								optimize phi3 memory usage ( #11867 )  
							
							 
							
							
							
						 
						
							2024-08-20 17:32:51 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yishuo Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								2946420e14 
								
							 
						 
						
							
							
								
								add minicpmv 2.6 load_low_bit workaround ( #11856 )  
							
							 
							
							
							
						 
						
							2024-08-20 11:16:02 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yang Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								99b05ba1dc 
								
							 
						 
						
							
							
								
								separate prefill into a process ( #11787 )  
							
							 
							
							... 
							
							
							
							* seperate prefill into a process
* using model.share_memory()
* might work
* worked
* use long prompt
* refactor
* cleanup
* fix bug
* clean up
* changable inter and intra process stages
* refactor
* add max output len
* fix npu_model changes that may cause generate down
* fix npu_model generate import error
* fix generare forward error
---------
Co-authored-by: sgwhat <ge.song@intel.com> 
							
						 
						
							2024-08-19 17:53:36 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yishuo Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								9490781aec 
								
							 
						 
						
							
							
								
								optimize phi3 memory usage again ( #11848 )  
							
							 
							
							
							
						 
						
							2024-08-19 17:26:59 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yina Chen 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								3cd4e87168 
								
							 
						 
						
							
							
								
								Support compress KV with quantize KV ( #11812 )  
							
							 
							
							... 
							
							
							
							* update llama
* support llama 4.41
* fix style
* support minicpm
* support qwen2
* support minicpm & update
* support chatglm4
* support chatglm
* remove print
* add DynamicCompressFp8Cache & support qwen
* support llama
* support minicpm phi3
* update chatglm2/4
* small fix & support qwen 4.42
* remove print 
							
						 
						
							2024-08-19 15:32:32 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Zhao Changmin 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								6841a9ac8f 
								
							 
						 
						
							
							
								
								fix load low bit com dtype ( #11832 )  
							
							 
							
							
							
						 
						
							2024-08-19 13:43:19 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yuwen Hu 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								96796f95cb 
								
							 
						 
						
							
							
								
								Update all-in-one benchmark prompts for continuation task & lookup update for minicpmv ( #11827 )  
							
							 
							
							... 
							
							
							
							* Update all-in-one benchmark prompts for continuation task
* Small fix
* Add pure-text benchmark support for minicpm-v-2_6
* Support lookahead for model.llm generate of minicpmv
* Add prompt reference
* Small update
* Small fix 
							
						 
						
							2024-08-16 17:16:35 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yishuo Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								e966e85df8 
								
							 
						 
						
							
							
								
								force lm_head optimization in any model if set environment variable ( #11830 )  
							
							 
							
							
							
						 
						
							2024-08-16 16:48:45 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yishuo Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								17a0beb21f 
								
							 
						 
						
							
							
								
								optimize qwen2-audio again ( #11825 )  
							
							 
							
							
							
						 
						
							2024-08-16 11:11:35 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yuwen Hu 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								9e9086cc2a 
								
							 
						 
						
							
							
								
								Update IPEX_LLM_PERFORMANCE_MODE ( #11823 )  
							
							 
							
							
							
						 
						
							2024-08-16 09:48:36 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Wang, Jian4 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								5a80fd2633 
								
							 
						 
						
							
							
								
								Fix lightweight-serving no streaming resp on mtl ( #11822 )  
							
							 
							
							
							
						 
						
							2024-08-16 09:43:03 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Guancheng Fu 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								e70ae0638e 
								
							 
						 
						
							
							
								
								Fix vLLM not convert issues ( #11817 )  
							
							 
							
							... 
							
							
							
							* Fix not convert issues
* refine 
							
						 
						
							2024-08-15 19:04:05 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yishuo Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								750d4ad5dc 
								
							 
						 
						
							
							
								
								fix minicpm-v-2 fp16 ( #11819 )  
							
							 
							
							
							
						 
						
							2024-08-15 18:34:40 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yishuo Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								828ab16537 
								
							 
						 
						
							
							
								
								fix phi3 and minicpmv cpu ( #11818 )  
							
							 
							
							
							
						 
						
							2024-08-15 17:43:29 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yishuo Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								4e178f0c5d 
								
							 
						 
						
							
							
								
								rewrite minicpmv optimization ( #11816 )  
							
							 
							
							
							
						 
						
							2024-08-15 17:27:12 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yishuo Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								07b7f13982 
								
							 
						 
						
							
							
								
								support and optimize qwen2-audio ( #11809 )  
							
							 
							
							
							
						 
						
							2024-08-15 14:59:04 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yishuo Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								9a93808fc5 
								
							 
						 
						
							
							
								
								fix and optimize minicpm v 2 ( #11799 )  
							
							 
							
							
							
						 
						
							2024-08-14 17:27:23 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yishuo Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								3d6cfa291d 
								
							 
						 
						
							
							
								
								optimize minicpm v 2.5 ( #11793 )  
							
							 
							
							
							
						 
						
							2024-08-14 16:07:24 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yuwen Hu 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								356281cb80 
								
							 
						 
						
							
							
								
								Further all-in-one benchmark update continuation task ( #11784 )  
							
							 
							
							... 
							
							
							
							* Further update prompt for continuation task, and disable lookup candidate update strategy on MTL
* style fix 
							
						 
						
							2024-08-14 14:39:34 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Ruonan Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								43cca3be27 
								
							 
						 
						
							
							
								
								fix gemma2 runtime error caused by sliding window ( #11788 )  
							
							 
							
							... 
							
							
							
							* fix runtime error
* revert workflow 
							
						 
						
							2024-08-14 10:43:33 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yang Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								51bcac1229 
								
							 
						 
						
							
							
								
								follow up on experimental support of fused decoder layer for llama2 ( #11785 )  
							
							 
							
							... 
							
							
							
							* clean up and support transpose value cache
* refine
* fix style
* fix style 
							
						 
						
							2024-08-13 18:53:55 -07:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yishuo Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								cb79dcda93 
								
							 
						 
						
							
							
								
								refactor llama convert to fix minicpm-v 2.5 optimization ( #11783 )  
							
							 
							
							
							
						 
						
							2024-08-14 09:29:57 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yina Chen 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								7cd6ec9723 
								
							 
						 
						
							
							
								
								MiniCPM-V support compresskv ( #11779 )  
							
							 
							
							... 
							
							
							
							* fix check error
* fix other models
* remove print 
							
						 
						
							2024-08-13 19:03:40 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Qiyuan Gong 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								3998de14f0 
								
							 
						 
						
							
							
								
								Fix mistral forward_qkv in q4_0 ( #11781 )  
							
							 
							
							... 
							
							
							
							* Fix mistral forward_qkv without self.rotary_emb.base in q4_0.
* Replace apply_rotary_pos_emb_no_cache_xpu with rotary_half_inplaced.
* Revert https://github.com/intel-analytics/ipex-llm/pull/11765  
							
						 
						
							2024-08-13 16:48:19 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Heyang Sun 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								70c828b87c 
								
							 
						 
						
							
							
								
								deepspeed zero3 QLoRA finetuning ( #11625 )  
							
							 
							
							... 
							
							
							
							* deepspeed zero3 QLoRA finetuning
* Update convert.py
* Update low_bit_linear.py
* Update utils.py
* Update qlora_finetune_llama2_13b_arch_2_card.sh
* Update low_bit_linear.py
* Update alpaca_qlora_finetuning.py
* Update low_bit_linear.py
* Update utils.py
* Update convert.py
* Update alpaca_qlora_finetuning.py
* Update alpaca_qlora_finetuning.py
* Update low_bit_linear.py
* Update deepspeed_zero3.json
* Update qlora_finetune_llama2_13b_arch_2_card.sh
* Update low_bit_linear.py
* Update low_bit_linear.py
* Update utils.py
* fix style
* fix style
* Update alpaca_qlora_finetuning.py
* Update qlora_finetune_llama2_13b_arch_2_card.sh
* Update convert.py
* Update low_bit_linear.py
* Update model.py
* Update alpaca_qlora_finetuning.py
* Update low_bit_linear.py
* Update low_bit_linear.py
* Update low_bit_linear.py 
							
						 
						
							2024-08-13 16:15:29 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yishuo Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								a184b120c9 
								
							 
						 
						
							
							
								
								fix minicpm-v 2.5 ( #11780 )  
							
							 
							
							
							
						 
						
							2024-08-13 16:14:00 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Qiyuan Gong 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								a88c132e54 
								
							 
						 
						
							
							
								
								Reduce Mistral softmax memory only in low memory mode ( #11775 )  
							
							 
							
							... 
							
							
							
							* Reduce Mistral softmax memory only in low memory mode 
							
						 
						
							2024-08-13 14:50:54 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yishuo Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								aa861df066 
								
							 
						 
						
							
							
								
								use new fp32 softmax kernel ( #11776 )  
							
							 
							
							
							
						 
						
							2024-08-13 14:48:11 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									binbin Deng 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								23d3acdc77 
								
							 
						 
						
							
							
								
								Add experimental support of fused decoder layer for llama2 ( #11768 )  
							
							 
							
							
							
						 
						
							2024-08-13 14:41:36 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yishuo Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								a1eb793f70 
								
							 
						 
						
							
							
								
								optimize minicpm v 2_6 first token perf ( #11770 )  
							
							 
							
							
							
						 
						
							2024-08-13 09:51:18 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yina Chen 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								841dbcdf3a 
								
							 
						 
						
							
							
								
								Fix compresskv with lookahead issue ( #11767 )  
							
							 
							
							... 
							
							
							
							* fix compresskv + lookahead attn_mask qwen2
* support llama chatglm
* support mistral & chatglm
* address comments
* revert run.py 
							
						 
						
							2024-08-12 18:53:55 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Xu, Shuo 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								1b05caba2b 
								
							 
						 
						
							
							
								
								Set mistral fuse rope to false except fp6 & fp16 ( #11765 )  
							
							 
							
							... 
							
							
							
							* set mistral fuse rope to false except fp6 & fp16
* lint
* lint
---------
Co-authored-by: ATMxsp01 <shou.xu@intel.com> 
							
						 
						
							2024-08-12 17:25:07 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Ruonan Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								8db34057b4 
								
							 
						 
						
							
							
								
								optimize lookahead init time ( #11769 )  
							
							 
							
							
							
						 
						
							2024-08-12 17:19:12 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yishuo Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								57d177738d 
								
							 
						 
						
							
							
								
								optimize minicpm-v-2_6 repetition penalty ( #11763 )  
							
							 
							
							
							
						 
						
							2024-08-12 14:10:10 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Wang, Jian4 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								245dba0abc 
								
							 
						 
						
							
							
								
								Fix lightweight-serving codegeex error ( #11759 )  
							
							 
							
							
							
						 
						
							2024-08-12 10:35:37 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Ruonan Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								66fe2ee464 
								
							 
						 
						
							
							
								
								initial support of IPEX_LLM_PERFORMANCE_MODE  ( #11754 )  
							
							 
							
							... 
							
							
							
							* add perf mode
* update
* fix style 
							
						 
						
							2024-08-09 19:04:09 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yina Chen 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								4b9c57cc60 
								
							 
						 
						
							
							
								
								Support compress kv with lookahead ( #11752 )  
							
							 
							
							... 
							
							
							
							* support compress kv with lookahead
* enough kv miss param 
							
						 
						
							2024-08-09 17:39:57 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yishuo Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								93455aac09 
								
							 
						 
						
							
							
								
								fix minicpm V 2.6 repeat output ( #11753 )  
							
							 
							
							
							
						 
						
							2024-08-09 17:39:24 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Ruonan Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								7e917d6cfb 
								
							 
						 
						
							
							
								
								fix gptq of llama ( #11749 )  
							
							 
							
							... 
							
							
							
							* fix gptq of llama
* small fix 
							
						 
						
							2024-08-09 16:39:25 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yina Chen 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								dd46c141bd 
								
							 
						 
						
							
							
								
								Phi3 support compresskv ( #11733 )  
							
							 
							
							... 
							
							
							
							* phi3 support compresskv
* fix phi3 mtl error
* fix conflict with quant kv
* fix abnormal on mtl
* fix style
* use slide windows size to compress kv
* support sliding window
* fix style
* fix style
* temp: partial support quant kv
* support quant kv with compress kv, todo: model check
* temp
* fix style
* fix style
* remove prepare
* address comment
* default -> 1.8k 
							
						 
						
							2024-08-09 15:43:43 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Qiyuan Gong 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								d8808cc2e3 
								
							 
						 
						
							
							
								
								Mistral apply_rotary_pos_emb_no_cache_xpu use rope_theta from config ( #11747 )  
							
							 
							
							... 
							
							
							
							mistral-7B-instruct-v0.2 and mistral-7B-instruct-v0.1 use different rope_theta (0.2 is 1e6, 0.1 is 1e4). Pass self.config.rope_theta to apply_rotary_pos_emb_no_cache_xpu to avoid output difference. 
							
						 
						
							2024-08-09 10:35:51 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Xiangyu Tian 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								044e486480 
								
							 
						 
						
							
							
								
								Fix vLLM CPU /chat endpoint ( #11748 )  
							
							 
							
							
							
						 
						
							2024-08-09 10:33:52 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yishuo Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								54cc9353db 
								
							 
						 
						
							
							
								
								support and optimize minicpm-v-2_6 ( #11738 )  
							
							 
							
							
							
						 
						
							2024-08-07 18:21:16 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Yina Chen 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								e956e71fc1 
								
							 
						 
						
							
							
								
								fix conflict with quant kv ( #11737 )  
							
							 
							
							
							
						 
						
							2024-08-07 18:10:30 +08:00  
						
						
							 
							
							
								 
							 
							
						 
					 
				
					
						
							
								
								
									 
									Ruonan Wang 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								00a5574c8a 
								
							 
						 
						
							
							
								
								Use merge_qkv to replace fused_qkv for llama2 ( #11727 )  
							
							 
							
							... 
							
							
							
							* update 4.38
* support new versions
* update
* fix style
* fix style
* update rope
* temp test sdpa
* fix style
* fix cpu ut 
							
						 
						
							2024-08-07 18:04:01 +08:00