feat: change oneccl to internal (#12296)

* feat: change oneccl
* fix: restore llama-70b
* fix: remove tab
* fix: remove extra blank
* small fix
* add comments
* fix: add a blank space

This commit is contained in:
    parent 6f22133efc
    commit 29400e2e75

6 changed files with 13 additions and 7 deletions

@@ -20,7 +20,8 @@ conda activate llm
 # below command will install intel_extension_for_pytorch==2.1.10+xpu as default
 pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
 pip install transformers==4.37.0
-pip install oneccl_bind_pt==2.1.100 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
+wget https://sourceforge.net/projects/oneccl-wks/files/2024.0.0.5.1-release/oneccl_wks_installer_2024.0.0.5.1.sh
+bash oneccl_wks_installer_2024.0.0.5.1.sh
 # configures OneAPI environment variables
 source /opt/intel/oneapi/setvars.sh
 pip install git+https://github.com/microsoft/DeepSpeed.git@ed8aed5
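
This README hunk swaps the pip-installed oneccl_bind_pt wheel for a standalone oneCCL installer. A minimal sketch of the updated flow, assuming the installer places its environment script at /opt/intel/1ccl-wks (the path the run scripts below source):

    # Fetch and run the standalone oneCCL workspace installer
    # (replaces `pip install oneccl_bind_pt==2.1.100 ...`).
    wget https://sourceforge.net/projects/oneccl-wks/files/2024.0.0.5.1-release/oneccl_wks_installer_2024.0.0.5.1.sh
    bash oneccl_wks_installer_2024.0.0.5.1.sh

    # At runtime, source the oneAPI base environment first, then the
    # workspace oneCCL environment it installed (assumed install prefix).
    source /opt/intel/oneapi/setvars.sh
    source /opt/intel/1ccl-wks/setvars.sh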

@@ -104,7 +104,8 @@ if __name__ == '__main__':
     deepspeed.comm.comm.cdb = None
     from deepspeed.comm.comm import init_distributed
     init_distributed()
-
+    from ipex_llm.utils import BenchmarkWrapper
+    model = BenchmarkWrapper(model)
     print(model)
 
     # Load tokenizer

@@ -135,7 +136,7 @@ if __name__ == '__main__':
             actual_output_len = output.shape[1] - input_ids.shape[1]
             output_str = tokenizer.decode(output[0], skip_special_tokens=True)
             avg_time = (end - st) / actual_output_len * 1000
-            print(f'Inference time of generating {actual_output_len} tokens: {end-st} s, average token latency is {avg_time} ms/token.')
+            print(f'Inference time of generating {actual_output_len} tokens: {end-st} s, first token cost {model.first_cost} s, rest tokens average cost {model.rest_cost_mean} s')
             print('-'*20, 'Prompt', '-'*20)
             print(prompt)
             print('-'*20, 'Output', '-'*20)
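
The two Python hunks above wrap the model in BenchmarkWrapper so the example can report first-token and rest-token latency separately instead of a single averaged figure. A simplified single-process sketch of that pattern (not the DeepSpeed AutoTP path this example actually runs; the checkpoint path, prompt, and generation length below are placeholders):

    # Sketch only: assumes ipex-llm with XPU support and a local LLaMA-style checkpoint.
    import torch
    import intel_extension_for_pytorch as ipex  # registers the "xpu" device
    from transformers import AutoTokenizer
    from ipex_llm.transformers import AutoModelForCausalLM
    from ipex_llm.utils import BenchmarkWrapper

    model_path = "meta-llama/Llama-2-7b-hf"  # placeholder checkpoint
    model = AutoModelForCausalLM.from_pretrained(model_path, load_in_4bit=True).to("xpu")
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # Wrap the model so per-token timings are recorded around generate().
    model = BenchmarkWrapper(model)

    prompt = "What is AI?"
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to("xpu")
    with torch.inference_mode():
        output = model.generate(input_ids, max_new_tokens=32)

    actual_output_len = output.shape[1] - input_ids.shape[1]
    print(f"Generated {actual_output_len} tokens: "
          f"first token cost {model.first_cost} s, "
          f"rest tokens average cost {model.rest_cost_mean} s")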

@@ -24,7 +24,8 @@ export CCL_ZE_IPC_EXCHANGE=sockets
 export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so:${LD_PRELOAD}
 basekit_root=/opt/intel/oneapi
 source $basekit_root/setvars.sh --force
-source $basekit_root/ccl/latest/env/vars.sh --force
+# source $basekit_root/ccl/latest/env/vars.sh --force   deprecate oneccl_bind_pt and use internal oneccl for better performance
+source /opt/intel/1ccl-wks/setvars.sh
 
 export OMP_NUM_THREADS=$((56/$NUM_GPUS))
 export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
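
This hunk and the three run-script hunks that follow make the same change: stop sourcing the CCL vars shipped with the oneAPI Base Toolkit and source the workspace oneCCL environment instead. Pieced together from these hunks, the common setup block looks roughly like this (a sketch; exact line order varies per script):

    export CCL_ZE_IPC_EXCHANGE=sockets
    export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so:${LD_PRELOAD}

    basekit_root=/opt/intel/oneapi
    source $basekit_root/setvars.sh --force
    # The Base Toolkit CCL vars are no longer sourced; the internal oneCCL is used instead:
    source /opt/intel/1ccl-wks/setvars.sh

    NUM_GPUS=2                                # number of GPUs used
    export OMP_NUM_THREADS=$((56/$NUM_GPUS))  # split 56 CPU threads across the GPUs
    export USE_XETLA=OFF
    export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2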

@@ -22,7 +22,8 @@ export CCL_ZE_IPC_EXCHANGE=sockets
 export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so:${LD_PRELOAD}
 basekit_root=/opt/intel/oneapi
 source $basekit_root/setvars.sh --force
-source $basekit_root/ccl/latest/env/vars.sh --force
+# source $basekit_root/ccl/latest/env/vars.sh --force   deprecate oneccl_bind_pt and use internal oneccl for better performance
+source /opt/intel/1ccl-wks/setvars.sh
 
 NUM_GPUS=2 # number of used GPU
 export USE_XETLA=OFF

@@ -25,7 +25,8 @@ export CCL_ZE_IPC_EXCHANGE=sockets
 export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so:${LD_PRELOAD}
 basekit_root=/opt/intel/oneapi
 source $basekit_root/setvars.sh --force
-source $basekit_root/ccl/latest/env/vars.sh --force
+# source $basekit_root/ccl/latest/env/vars.sh --force   deprecate oneccl_bind_pt and use internal oneccl for better performance
+source /opt/intel/1ccl-wks/setvars.sh
 
 NUM_GPUS=2 # number of used GPU
 export USE_XETLA=OFF

@@ -22,7 +22,8 @@ export CCL_ZE_IPC_EXCHANGE=sockets
 export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so:${LD_PRELOAD}
 basekit_root=/opt/intel/oneapi
 source $basekit_root/setvars.sh --force
-source $basekit_root/ccl/latest/env/vars.sh --force
+# source $basekit_root/ccl/latest/env/vars.sh --force   deprecate oneccl_bind_pt and use internal oneccl for better performance
+source /opt/intel/1ccl-wks/setvars.sh
 
 NUM_GPUS=2 # number of used GPU
 export USE_XETLA=OFF