Refine continuation get input_str (#11652)
* Remove duplicate code in continuation get input_str. * Avoid infinite loop in all-in-one due to test_length not in the list.
This commit is contained in:
		
							parent
							
								
									2fbd375a94
								
							
						
					
					
						commit
						0c6e0b86c0
					
				
					 1 changed files with 35 additions and 150 deletions
				
			
		| 
						 | 
				
			
			@ -74,6 +74,19 @@ def run_model_in_thread(model, in_out, tokenizer, result, warm_up, num_beams, in
 | 
			
		|||
                result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time,
 | 
			
		||||
                                        actual_in_len, actual_out_len, load_time, model.peak_memory])
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def get_continuation_input_str(in_len):
 | 
			
		||||
    # in_len.txt maybe shorter than we need,
 | 
			
		||||
    # use much longer context to make sure input length
 | 
			
		||||
    test_length = min(in_len*2, 8192)
 | 
			
		||||
    while test_length not in [32, 256, 1024, 2048, 8192] and test_length < 8192:
 | 
			
		||||
        test_length = test_length * 2
 | 
			
		||||
    # Force the test_length to be 8192, such that we can use 8192.txt
 | 
			
		||||
    if test_length > 8192:
 | 
			
		||||
        test_length = 8192
 | 
			
		||||
    return open(f"prompt/continuation/{test_length}.txt", 'r').read()
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def preprocess_prompt(tokenizer, in_len, task):
 | 
			
		||||
    if task == 'summarize':
 | 
			
		||||
        if in_len == 512:
 | 
			
		||||
| 
						 | 
				
			
			@ -218,7 +231,7 @@ def run_native_int4(repo_id,
 | 
			
		|||
        in_out_len = in_out.split("-")
 | 
			
		||||
        in_len = int(in_out_len[0])
 | 
			
		||||
        out_len = int(in_out_len[1])
 | 
			
		||||
        input_str = open(f"prompt/continuation/{in_len}.txt", 'r').read()
 | 
			
		||||
        input_str = get_continuation_input_str(in_len)
 | 
			
		||||
        # As different tokenizer has different encodings,
 | 
			
		||||
        # slice the input_ids to ensure the prompt length is required length.
 | 
			
		||||
        n_ctx = in_len + out_len if in_len + out_len > 512 else 512
 | 
			
		||||
| 
						 | 
				
			
			@ -274,13 +287,7 @@ def run_transformer_int4(repo_id,
 | 
			
		|||
            in_out_len = in_out.split("-")
 | 
			
		||||
            in_len = int(in_out_len[0])
 | 
			
		||||
            out_len = int(in_out_len[1])
 | 
			
		||||
            # As different tokenizer has different encodings,
 | 
			
		||||
            # in_len.txt maybe shorter than we need,
 | 
			
		||||
            # use much longer context to make sure input length
 | 
			
		||||
            test_length = min(in_len*2, 8192)
 | 
			
		||||
            while test_length not in [32, 256, 1024, 2048, 8192]:
 | 
			
		||||
                test_length = test_length * 2
 | 
			
		||||
            input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
 | 
			
		||||
            input_str = get_continuation_input_str(in_len)
 | 
			
		||||
            # As different tokenizer has different encodings,
 | 
			
		||||
            # slice the input_ids to ensure the prompt length is required length.
 | 
			
		||||
            input_ids = tokenizer.encode(input_str, return_tensors="pt")
 | 
			
		||||
| 
						 | 
				
			
			@ -339,13 +346,7 @@ def run_pytorch_autocast_bf16(repo_id,
 | 
			
		|||
            in_out_len = in_out.split("-")
 | 
			
		||||
            in_len = int(in_out_len[0])
 | 
			
		||||
            out_len = int(in_out_len[1])
 | 
			
		||||
            # As different tokenizer has different encodings,
 | 
			
		||||
            # in_len.txt maybe shorter than we need,
 | 
			
		||||
            # use much longer context to make sure input length
 | 
			
		||||
            test_length = min(in_len*2, 8192)
 | 
			
		||||
            while test_length not in [32, 256, 1024, 2048, 8192]:
 | 
			
		||||
                test_length = test_length * 2
 | 
			
		||||
            input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
 | 
			
		||||
            input_str = get_continuation_input_str(in_len)
 | 
			
		||||
            # As different tokenizer has different encodings,
 | 
			
		||||
            # slice the input_ids to ensure the prompt length is required length.
 | 
			
		||||
            input_ids = tokenizer.encode(input_str, return_tensors="pt")
 | 
			
		||||
| 
						 | 
				
			
			@ -410,13 +411,7 @@ def run_optimize_model(repo_id,
 | 
			
		|||
            in_out_len = in_out.split("-")
 | 
			
		||||
            in_len = int(in_out_len[0])
 | 
			
		||||
            out_len = int(in_out_len[1])
 | 
			
		||||
            # As different tokenizer has different encodings,
 | 
			
		||||
            # in_len.txt maybe shorter than we need,
 | 
			
		||||
            # use much longer context to make sure input length
 | 
			
		||||
            test_length = min(in_len*2, 8192)
 | 
			
		||||
            while test_length not in [32, 256, 1024, 2048, 8192]:
 | 
			
		||||
                test_length = test_length * 2
 | 
			
		||||
            input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
 | 
			
		||||
            input_str = get_continuation_input_str(in_len)
 | 
			
		||||
            # As different tokenizer has different encodings,
 | 
			
		||||
            # slice the input_ids to ensure the prompt length is required length.
 | 
			
		||||
            input_ids = tokenizer.encode(input_str, return_tensors="pt")
 | 
			
		||||
| 
						 | 
				
			
			@ -525,15 +520,7 @@ def run_transformer_int4_gpu(repo_id,
 | 
			
		|||
            in_len = int(in_out_len[0])
 | 
			
		||||
            out_len = int(in_out_len[1])
 | 
			
		||||
            if task == 'continuation':
 | 
			
		||||
                # As different tokenizer has different encodings,
 | 
			
		||||
                # in_len.txt maybe shorter than we need,
 | 
			
		||||
                # use much longer context to make sure input length
 | 
			
		||||
                test_length = min(in_len*2, 8192)
 | 
			
		||||
                while test_length not in [32, 256, 1024, 2048, 8192] and test_length < 8192:
 | 
			
		||||
                    test_length = test_length * 2
 | 
			
		||||
                # For the sequence length not in [32, 256, 1024, 2048, 8192], it will be truncated from 8192.txt.
 | 
			
		||||
                test_length = min(test_length, 8192)
 | 
			
		||||
                input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
 | 
			
		||||
                input_str = get_continuation_input_str(in_len)
 | 
			
		||||
                # As different tokenizer has different encodings,
 | 
			
		||||
                # slice the input_ids to ensure the prompt length is required length.
 | 
			
		||||
                input_ids = tokenizer.encode(input_str, return_tensors="pt")
 | 
			
		||||
| 
						 | 
				
			
			@ -615,13 +602,7 @@ def transformers_int4_npu_win(repo_id,
 | 
			
		|||
            in_out_len = in_out.split("-")
 | 
			
		||||
            in_len = int(in_out_len[0])
 | 
			
		||||
            out_len = int(in_out_len[1])
 | 
			
		||||
            # As different tokenizer has different encodings,
 | 
			
		||||
            # in_len.txt maybe shorter than we need,
 | 
			
		||||
            # use much longer context to make sure input length
 | 
			
		||||
            test_length = min(in_len*2, 8192)
 | 
			
		||||
            while test_length not in [32, 256, 1024, 2048, 8192]:
 | 
			
		||||
                test_length = test_length * 2
 | 
			
		||||
            input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
 | 
			
		||||
            input_str = get_continuation_input_str(in_len)
 | 
			
		||||
            # As different tokenizer has different encodings,
 | 
			
		||||
            # slice the input_ids to ensure the prompt length is required length.
 | 
			
		||||
            input_ids = tokenizer.encode(input_str, return_tensors="pt")
 | 
			
		||||
| 
						 | 
				
			
			@ -692,13 +673,7 @@ def run_optimize_model_gpu(repo_id,
 | 
			
		|||
            in_out_len = in_out.split("-")
 | 
			
		||||
            in_len = int(in_out_len[0])
 | 
			
		||||
            out_len = int(in_out_len[1])
 | 
			
		||||
            # As different tokenizer has different encodings,
 | 
			
		||||
            # in_len.txt maybe shorter than we need,
 | 
			
		||||
            # use much longer context to make sure input length
 | 
			
		||||
            test_length = min(in_len*2, 8192)
 | 
			
		||||
            while test_length not in [32, 256, 1024, 2048, 8192]:
 | 
			
		||||
                test_length = test_length * 2
 | 
			
		||||
            input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
 | 
			
		||||
            input_str = get_continuation_input_str(in_len)
 | 
			
		||||
            # As different tokenizer has different encodings,
 | 
			
		||||
            # slice the input_ids to ensure the prompt length is required length.
 | 
			
		||||
            input_ids = tokenizer.encode(input_str, return_tensors="pt")
 | 
			
		||||
| 
						 | 
				
			
			@ -763,13 +738,7 @@ def run_ipex_fp16_gpu(repo_id,
 | 
			
		|||
            in_out_len = in_out.split("-")
 | 
			
		||||
            in_len = int(in_out_len[0])
 | 
			
		||||
            out_len = int(in_out_len[1])
 | 
			
		||||
            # As different tokenizer has different encodings,
 | 
			
		||||
            # in_len.txt maybe shorter than we need,
 | 
			
		||||
            # use much longer context to make sure input length
 | 
			
		||||
            test_length = min(in_len*2, 8192)
 | 
			
		||||
            while test_length not in [32, 256, 1024, 2048, 8192]:
 | 
			
		||||
                test_length = test_length * 2
 | 
			
		||||
            input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
 | 
			
		||||
            input_str = get_continuation_input_str(in_len)
 | 
			
		||||
            # As different tokenizer has different encodings,
 | 
			
		||||
            # slice the input_ids to ensure the prompt length is required length.
 | 
			
		||||
            input_ids = tokenizer.encode(input_str, return_tensors="pt")
 | 
			
		||||
| 
						 | 
				
			
			@ -840,13 +809,7 @@ def run_bigdl_fp16_gpu(repo_id,
 | 
			
		|||
            in_out_len = in_out.split("-")
 | 
			
		||||
            in_len = int(in_out_len[0])
 | 
			
		||||
            out_len = int(in_out_len[1])
 | 
			
		||||
            # As different tokenizer has different encodings,
 | 
			
		||||
            # in_len.txt maybe shorter than we need,
 | 
			
		||||
            # use much longer context to make sure input length
 | 
			
		||||
            test_length = min(in_len*2, 8192)
 | 
			
		||||
            while test_length not in [32, 256, 1024, 2048, 8192]:
 | 
			
		||||
                test_length = test_length * 2
 | 
			
		||||
            input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
 | 
			
		||||
            input_str = get_continuation_input_str(in_len)
 | 
			
		||||
            # As different tokenizer has different encodings,
 | 
			
		||||
            # slice the input_ids to ensure the prompt length is required length.
 | 
			
		||||
            input_ids = tokenizer.encode(input_str, return_tensors="pt")
 | 
			
		||||
| 
						 | 
				
			
			@ -932,13 +895,7 @@ def run_deepspeed_transformer_int4_cpu(repo_id,
 | 
			
		|||
            in_out_len = in_out.split("-")
 | 
			
		||||
            in_len = int(in_out_len[0])
 | 
			
		||||
            out_len = int(in_out_len[1])
 | 
			
		||||
            # As different tokenizer has different encodings,
 | 
			
		||||
            # in_len.txt maybe shorter than we need,
 | 
			
		||||
            # use much longer context to make sure input length
 | 
			
		||||
            test_length = min(in_len*2, 8192)
 | 
			
		||||
            while test_length not in [32, 256, 1024, 2048, 8192]:
 | 
			
		||||
                test_length = test_length * 2
 | 
			
		||||
            input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
 | 
			
		||||
            input_str = get_continuation_input_str(in_len)
 | 
			
		||||
            # As different tokenizer has different encodings,
 | 
			
		||||
            # slice the input_ids to ensure the prompt length is required length.
 | 
			
		||||
            input_ids = tokenizer.encode(input_str, return_tensors="pt")
 | 
			
		||||
| 
						 | 
				
			
			@ -1031,13 +988,7 @@ def run_transformer_int4_gpu_win(repo_id,
 | 
			
		|||
                in_out_len = in_out.split("-")
 | 
			
		||||
                in_len = int(in_out_len[0])
 | 
			
		||||
                out_len = int(in_out_len[1])
 | 
			
		||||
                # As different tokenizer has different encodings,
 | 
			
		||||
                # in_len.txt maybe shorter than we need,
 | 
			
		||||
                # use much longer context to make sure input length
 | 
			
		||||
                test_length = min(in_len*2, 8192)
 | 
			
		||||
                while test_length not in [32, 256, 1024, 2048, 8192]:
 | 
			
		||||
                    test_length = test_length * 2
 | 
			
		||||
                input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
 | 
			
		||||
                input_str = get_continuation_input_str(in_len)
 | 
			
		||||
                # As different tokenizer has different encodings,
 | 
			
		||||
                # slice the input_ids to ensure the prompt length is required length.
 | 
			
		||||
                input_ids = tokenizer.encode(input_str, return_tensors="pt")
 | 
			
		||||
| 
						 | 
				
			
			@ -1154,13 +1105,7 @@ def run_transformer_int4_fp16_gpu_win(repo_id,
 | 
			
		|||
                in_out_len = in_out.split("-")
 | 
			
		||||
                in_len = int(in_out_len[0])
 | 
			
		||||
                out_len = int(in_out_len[1])
 | 
			
		||||
                # As different tokenizer has different encodings,
 | 
			
		||||
                # in_len.txt maybe shorter than we need,
 | 
			
		||||
                # use much longer context to make sure input length
 | 
			
		||||
                test_length = min(in_len*2, 8192)
 | 
			
		||||
                while test_length not in [32, 256, 1024, 2048, 8192]:
 | 
			
		||||
                    test_length = test_length * 2
 | 
			
		||||
                input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
 | 
			
		||||
                input_str = get_continuation_input_str(in_len)
 | 
			
		||||
                # As different tokenizer has different encodings,
 | 
			
		||||
                # slice the input_ids to ensure the prompt length is required length.
 | 
			
		||||
                input_ids = tokenizer.encode(input_str, return_tensors="pt")
 | 
			
		||||
| 
						 | 
				
			
			@ -1270,13 +1215,7 @@ def run_transformer_int4_loadlowbit_gpu_win(repo_id,
 | 
			
		|||
                in_out_len = in_out.split("-")
 | 
			
		||||
                in_len = int(in_out_len[0])
 | 
			
		||||
                out_len = int(in_out_len[1])
 | 
			
		||||
                # As different tokenizer has different encodings,
 | 
			
		||||
                # in_len.txt maybe shorter than we need,
 | 
			
		||||
                # use much longer context to make sure input length
 | 
			
		||||
                test_length = min(in_len*2, 8192)
 | 
			
		||||
                while test_length not in [32, 256, 1024, 2048, 8192]:
 | 
			
		||||
                    test_length = test_length * 2
 | 
			
		||||
                input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
 | 
			
		||||
                input_str = get_continuation_input_str(in_len)
 | 
			
		||||
                # As different tokenizer has different encodings,
 | 
			
		||||
                # slice the input_ids to ensure the prompt length is required length.
 | 
			
		||||
                input_ids = tokenizer.encode(input_str, return_tensors="pt")
 | 
			
		||||
| 
						 | 
				
			
			@ -1386,13 +1325,7 @@ def run_transformer_int4_fp16_loadlowbit_gpu_win(repo_id,
 | 
			
		|||
                in_out_len = in_out.split("-")
 | 
			
		||||
                in_len = int(in_out_len[0])
 | 
			
		||||
                out_len = int(in_out_len[1])
 | 
			
		||||
                # As different tokenizer has different encodings,
 | 
			
		||||
                # in_len.txt maybe shorter than we need,
 | 
			
		||||
                # use much longer context to make sure input length
 | 
			
		||||
                test_length = min(in_len*2, 8192)
 | 
			
		||||
                while test_length not in [32, 256, 1024, 2048, 8192]:
 | 
			
		||||
                    test_length = test_length * 2
 | 
			
		||||
                input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
 | 
			
		||||
                input_str = get_continuation_input_str(in_len)
 | 
			
		||||
                # As different tokenizer has different encodings,
 | 
			
		||||
                # slice the input_ids to ensure the prompt length is required length.
 | 
			
		||||
                input_ids = tokenizer.encode(input_str, return_tensors="pt")
 | 
			
		||||
| 
						 | 
				
			
			@ -1475,13 +1408,7 @@ def run_transformer_autocast_bf16( repo_id,
 | 
			
		|||
            in_out_len = in_out.split("-")
 | 
			
		||||
            in_len = int(in_out_len[0])
 | 
			
		||||
            out_len = int(in_out_len[1])
 | 
			
		||||
            # As different tokenizer has different encodings,
 | 
			
		||||
            # in_len.txt maybe shorter than we need,
 | 
			
		||||
            # use much longer context to make sure input length
 | 
			
		||||
            test_length = min(in_len*2, 8192)
 | 
			
		||||
            while test_length not in [32, 256, 1024, 2048, 8192]:
 | 
			
		||||
                test_length = test_length * 2
 | 
			
		||||
            input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
 | 
			
		||||
            input_str = get_continuation_input_str(in_len)
 | 
			
		||||
            # As different tokenizer has different encodings,
 | 
			
		||||
            # slice the input_ids to ensure the prompt length is required length.
 | 
			
		||||
            input_ids = tokenizer.encode(input_str, return_tensors="pt")
 | 
			
		||||
| 
						 | 
				
			
			@ -1546,13 +1473,7 @@ def run_bigdl_ipex_bf16(repo_id,
 | 
			
		|||
            in_out_len = in_out.split("-")
 | 
			
		||||
            in_len = int(in_out_len[0])
 | 
			
		||||
            out_len = int(in_out_len[1])
 | 
			
		||||
            # As different tokenizer has different encodings,
 | 
			
		||||
            # in_len.txt maybe shorter than we need,
 | 
			
		||||
            # use much longer context to make sure input length
 | 
			
		||||
            test_length = min(in_len*2, 8192)
 | 
			
		||||
            while test_length not in [32, 256, 1024, 2048, 8192]:
 | 
			
		||||
                test_length = test_length * 2
 | 
			
		||||
            input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
 | 
			
		||||
            input_str = get_continuation_input_str(in_len)
 | 
			
		||||
            # As different tokenizer has different encodings,
 | 
			
		||||
            # slice the input_ids to ensure the prompt length is required length.
 | 
			
		||||
            input_ids = tokenizer.encode(input_str, return_tensors="pt")
 | 
			
		||||
| 
						 | 
				
			
			@ -1616,13 +1537,7 @@ def run_bigdl_ipex_int4(repo_id,
 | 
			
		|||
            in_out_len = in_out.split("-")
 | 
			
		||||
            in_len = int(in_out_len[0])
 | 
			
		||||
            out_len = int(in_out_len[1])
 | 
			
		||||
            # As different tokenizer has different encodings,
 | 
			
		||||
            # in_len.txt maybe shorter than we need,
 | 
			
		||||
            # use much longer context to make sure input length
 | 
			
		||||
            test_length = min(in_len*2, 8192)
 | 
			
		||||
            while test_length not in [32, 256, 1024, 2048, 8192]:
 | 
			
		||||
                test_length = test_length * 2
 | 
			
		||||
            input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
 | 
			
		||||
            input_str = get_continuation_input_str(in_len)
 | 
			
		||||
            # As different tokenizer has different encodings,
 | 
			
		||||
            # slice the input_ids to ensure the prompt length is required length.
 | 
			
		||||
            input_ids = tokenizer.encode(input_str, return_tensors="pt")
 | 
			
		||||
| 
						 | 
				
			
			@ -1686,13 +1601,7 @@ def run_bigdl_ipex_int8(repo_id,
 | 
			
		|||
            in_out_len = in_out.split("-")
 | 
			
		||||
            in_len = int(in_out_len[0])
 | 
			
		||||
            out_len = int(in_out_len[1])
 | 
			
		||||
            # As different tokenizer has different encodings,
 | 
			
		||||
            # in_len.txt maybe shorter than we need,
 | 
			
		||||
            # use much longer context to make sure input length
 | 
			
		||||
            test_length = min(in_len*2, 8192)
 | 
			
		||||
            while test_length not in [32, 256, 1024, 2048, 8192]:
 | 
			
		||||
                test_length = test_length * 2
 | 
			
		||||
            input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
 | 
			
		||||
            input_str = get_continuation_input_str(in_len)
 | 
			
		||||
            # As different tokenizer has different encodings,
 | 
			
		||||
            # slice the input_ids to ensure the prompt length is required length.
 | 
			
		||||
            input_ids = tokenizer.encode(input_str, return_tensors="pt")
 | 
			
		||||
| 
						 | 
				
			
			@ -1796,13 +1705,7 @@ def run_deepspeed_optimize_model_gpu(repo_id,
 | 
			
		|||
            in_out_len = in_out.split("-")
 | 
			
		||||
            in_len = int(in_out_len[0])
 | 
			
		||||
            out_len = int(in_out_len[1])
 | 
			
		||||
            # As different tokenizer has different encodings,
 | 
			
		||||
            # in_len.txt maybe shorter than we need,
 | 
			
		||||
            # use much longer context to make sure input length
 | 
			
		||||
            test_length = min(in_len*2, 8192)
 | 
			
		||||
            while test_length not in [32, 256, 1024, 2048, 8192] and test_length < 8192:
 | 
			
		||||
                test_length = test_length * 2
 | 
			
		||||
            input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
 | 
			
		||||
            input_str = get_continuation_input_str(in_len)
 | 
			
		||||
            # As different tokenizer has different encodings,
 | 
			
		||||
            # slice the input_ids to ensure the prompt length is required length.
 | 
			
		||||
            input_ids = tokenizer.encode(input_str, return_tensors="pt")
 | 
			
		||||
| 
						 | 
				
			
			@ -1873,13 +1776,7 @@ def run_speculative_cpu(repo_id,
 | 
			
		|||
            in_out_len = in_out.split("-")
 | 
			
		||||
            in_len = int(in_out_len[0])
 | 
			
		||||
            out_len = int(in_out_len[1])
 | 
			
		||||
            # As different tokenizer has different encodings,
 | 
			
		||||
            # in_len.txt maybe shorter than we need,
 | 
			
		||||
            # use much longer context to make sure input length
 | 
			
		||||
            test_length = min(in_len*2, 8192)
 | 
			
		||||
            while test_length not in [32, 256, 1024, 2048, 8192]:
 | 
			
		||||
                test_length = test_length * 2
 | 
			
		||||
            input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
 | 
			
		||||
            input_str = get_continuation_input_str(in_len)
 | 
			
		||||
            # As different tokenizer has different encodings,
 | 
			
		||||
            # slice the input_ids to ensure the prompt length is required length.
 | 
			
		||||
            input_ids = tokenizer.encode(input_str, return_tensors="pt")
 | 
			
		||||
| 
						 | 
				
			
			@ -1951,13 +1848,7 @@ def run_speculative_gpu(repo_id,
 | 
			
		|||
            in_out_len = in_out.split("-")
 | 
			
		||||
            in_len = int(in_out_len[0])
 | 
			
		||||
            out_len = int(in_out_len[1])
 | 
			
		||||
            # As different tokenizer has different encodings,
 | 
			
		||||
            # in_len.txt maybe shorter than we need,
 | 
			
		||||
            # use much longer context to make sure input length
 | 
			
		||||
            test_length = min(in_len*2, 8192)
 | 
			
		||||
            while test_length not in [32, 256, 1024, 2048, 8192]:
 | 
			
		||||
                test_length = test_length * 2
 | 
			
		||||
            input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
 | 
			
		||||
            input_str = get_continuation_input_str(in_len)
 | 
			
		||||
            # As different tokenizer has different encodings,
 | 
			
		||||
            # slice the input_ids to ensure the prompt length is required length.
 | 
			
		||||
            input_ids = tokenizer.encode(input_str, return_tensors="pt")
 | 
			
		||||
| 
						 | 
				
			
			@ -2038,13 +1929,7 @@ def run_pipeline_parallel_gpu(repo_id,
 | 
			
		|||
            in_out_len = in_out.split("-")
 | 
			
		||||
            in_len = int(in_out_len[0])
 | 
			
		||||
            out_len = int(in_out_len[1])
 | 
			
		||||
            # As different tokenizer has different encodings,
 | 
			
		||||
            # in_len.txt maybe shorter than we need,
 | 
			
		||||
            # use much longer context to make sure input length
 | 
			
		||||
            test_length = min(in_len*2, 8192)
 | 
			
		||||
            while test_length not in [32, 256, 1024, 2048, 8192]:
 | 
			
		||||
                test_length = test_length * 2
 | 
			
		||||
            input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
 | 
			
		||||
            input_str = get_continuation_input_str(in_len)
 | 
			
		||||
            # As different tokenizer has different encodings,
 | 
			
		||||
            # slice the input_ids to ensure the prompt length is required length.
 | 
			
		||||
            input_ids = tokenizer.encode(input_str, return_tensors="pt")
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in a new issue