Rename bigdl/llm to ipex_llm

* rm python/llm/src/bigdl
* from bigdl.llm to from ipex_llm
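The last bullet is an import-level rename: code that previously imported from bigdl.llm should now import from ipex_llm. A minimal, hypothetical sketch of that change, using the optimize_model helper referenced in the diff below (the old bigdl.llm path is assumed from the commit message and is not shown in this diff):

    # Hypothetical before/after for the package rename (Python).
    # Old import path, assumed from the commit message:
    #   from bigdl.llm.optimize import optimize_model
    # New import path, as referenced in the diff below:
    from ipex_llm.optimize import optimize_model

    # optimize_model wraps an existing PyTorch module, e.g.:
    #   generator.model = optimize_model(generator.model)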
diff --git a/README.md b/README.md
index 91e1719..1f6f26d 100755
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # Llama 2
 
-We are unlocking the power of large language models. Our latest version of Llama is now accessible to individuals, creators, researchers and businesses of all sizes so that they can experiment, innovate and scale their ideas responsibly.
+We are unlocking the power of large language models. Our latest version of Llama is now accessible to individuals, creators, researchers and businesses of all sizes so that they can experiment, innovate and scale their ideas responsibly. 
 
 This release includes model weights and starting code for pretrained and fine-tuned Llama language models — ranging from 7B to 70B parameters.
 
@@ -58,8 +58,6 @@ torchrun --nproc_per_node 1 example_chat_completion.py \
 - Adjust the `max_seq_len` and `max_batch_size` parameters as needed.
 - This example runs the [example_chat_completion.py](example_chat_completion.py) found in this repository but you can change that to a different .py file.
 
-It is also possible to test models without CUDA. For example, to run models on CPU, add an extra command line option `--backend cpu` to following examples. Number of threads can be set using the environment variable `NUM_THREADS`.
-
 ## Inference
 
 Different models require different model-parallel (MP) values:
@@ -116,7 +114,7 @@ See [MODEL_CARD.md](MODEL_CARD.md).
 
 ## License
 
-Our model and weights are licensed for both researchers and commercial entities, upholding the principles of openness. Our mission is to empower individuals, and industry through this opportunity, while fostering an environment of discovery and ethical AI advancements.
+Our model and weights are licensed for both researchers and commercial entities, upholding the principles of openness. Our mission is to empower individuals, and industry through this opportunity, while fostering an environment of discovery and ethical AI advancements. 
 
 See the [LICENSE](LICENSE) file, as well as our accompanying [Acceptable Use Policy](USE_POLICY.md)
 
diff --git a/example_chat_completion.py b/example_chat_completion.py
index acedf44..df4e5d6 100644
--- a/example_chat_completion.py
+++ b/example_chat_completion.py
@@ -7,13 +7,10 @@ import fire
 
 from llama import Llama, Dialog
 
-from ipex_llm.optimize import optimize_model
-
 
 def main(
     ckpt_dir: str,
     tokenizer_path: str,
-    backend: str = 'cuda',
     temperature: float = 0.6,
     top_p: float = 0.9,
     max_seq_len: int = 512,
@@ -39,12 +36,9 @@ def main(
         ckpt_dir=ckpt_dir,
         tokenizer_path=tokenizer_path,
         max_seq_len=max_seq_len,
-        backend=backend,
         max_batch_size=max_batch_size,
     )
 
-    generator.model = optimize_model(generator.model)
-
     dialogs: List[Dialog] = [
         [{"role": "user", "content": "what is the recipe of mayonnaise?"}],
         [
diff --git a/example_text_completion.py b/example_text_completion.py
index 1f63bb0..0d60b9c 100755
--- a/example_text_completion.py
+++ b/example_text_completion.py
@@ -6,12 +6,9 @@ import fire
 from llama import Llama
 from typing import List
 
-from ipex_llm.optimize import optimize_model
-
 def main(
     ckpt_dir: str,
     tokenizer_path: str,
-    backend: str = 'cuda',
     temperature: float = 0.6,
     top_p: float = 0.9,
     max_seq_len: int = 128,
@@ -36,12 +33,9 @@ def main(
         ckpt_dir=ckpt_dir,
         tokenizer_path=tokenizer_path,
         max_seq_len=max_seq_len,
-        backend=backend,
         max_batch_size=max_batch_size,
     )
 
-    generator.model = optimize_model(generator.model)
-
     prompts: List[str] = [
         # For these prompts, the expected answer is the natural continuation of the prompt
         "I believe the meaning of life is",
@@ -49,11 +43,11 @@ def main(
         """A brief message congratulating the team on the launch:
 
         Hi everyone,
-
+        
         I just """,
         # Few shot prompt (providing a few examples before asking model to complete more);
         """Translate English to French:
-
+        
         sea otter => loutre de mer
         peppermint => menthe poivrée
         plush girafe => girafe peluche
diff --git a/llama/generation.py b/llama/generation.py
index df68aca..5f8faf9 100755
--- a/llama/generation.py
+++ b/llama/generation.py
@@ -55,7 +55,6 @@ class Llama:
         tokenizer_path: str,
         max_seq_len: int,
         max_batch_size: int,
-        backend: str,
         model_parallel_size: Optional[int] = None,
         seed: int = 1,
     ) -> "Llama":
@@ -82,41 +81,22 @@ class Llama:
             and loads the pre-trained model and tokenizer.
 
         """
-        if model_parallel_size is None:
-            model_parallel_size = int(os.environ.get("WORLD_SIZE", 1))
-
-        device = backend
-
-        if backend == 'cuda':
-            if not torch.distributed.is_initialized():
-                torch.distributed.init_process_group("nccl")
-            if not model_parallel_is_initialized():
-                initialize_model_parallel(model_parallel_size)
-            local_rank = int(os.environ.get("LOCAL_RANK", 0))
-            torch.cuda.set_device(local_rank)
-            if local_rank > 0:
-                sys.stdout = open(os.devnull, "w")
-            torch.set_default_tensor_type(torch.cuda.HalfTensor)
-        else:
-            torch.distributed.init_process_group("gloo")
-
+        if not torch.distributed.is_initialized():
+            torch.distributed.init_process_group("nccl")
+        if not model_parallel_is_initialized():
+            if model_parallel_size is None:
+                model_parallel_size = int(os.environ.get("WORLD_SIZE", 1))
             initialize_model_parallel(model_parallel_size)
 
-            if backend == 'directml':
-                import torch_directml
-                torch.set_default_tensor_type(torch_directml.torch.HalfTensor)
-                device = torch_directml.device()
-            elif backend == 'cpu':
-                # Note: some operations such as "addmm_impl_cpu_" are not implemented for 'Half' at present
-                # torch.set_default_tensor_type(torch.HalfTensor)
-                n_threads = int(os.environ.get("NUM_THREADS", 0))
-                if n_threads > 0:
-                    torch.set_num_threads(n_threads)
-                pass
+        local_rank = int(os.environ.get("LOCAL_RANK", 0))
+        torch.cuda.set_device(local_rank)
 
         # seed must be the same in all processes
         torch.manual_seed(seed)
 
+        if local_rank > 0:
+            sys.stdout = open(os.devnull, "w")
+
         start_time = time.time()
         checkpoints = sorted(Path(ckpt_dir).glob("*.pth"))
         assert len(checkpoints) > 0, f"no checkpoint files found in {ckpt_dir}"
@@ -129,13 +109,13 @@ class Llama:
             params = json.loads(f.read())
 
         model_args: ModelArgs = ModelArgs(
-            device=device,
             max_seq_len=max_seq_len,
             max_batch_size=max_batch_size,
             **params,
         )
         tokenizer = Tokenizer(model_path=tokenizer_path)
         model_args.vocab_size = tokenizer.n_words
+        torch.set_default_tensor_type(torch.cuda.HalfTensor)
         model = Transformer(model_args)
         model.load_state_dict(checkpoint, strict=False)
         print(f"Loaded in {time.time() - start_time:.2f} seconds")
@@ -145,7 +125,6 @@ class Llama:
     def __init__(self, model: Transformer, tokenizer: Tokenizer):
         self.model = model
         self.tokenizer = tokenizer
-        self.device = model.device
 
     @torch.inference_mode()
     def generate(
@@ -186,14 +165,14 @@ class Llama:
         total_len = min(params.max_seq_len, max_gen_len + max_prompt_len)
 
         pad_id = self.tokenizer.pad_id
-        tokens = torch.full((bsz, total_len), pad_id, dtype=torch.long, device=self.device)
+        tokens = torch.full((bsz, total_len), pad_id, dtype=torch.long, device="cuda")
         for k, t in enumerate(prompt_tokens):
-            tokens[k, : len(t)] = torch.tensor(t, dtype=torch.long, device=self.device)
+            tokens[k, : len(t)] = torch.tensor(t, dtype=torch.long, device="cuda")
         if logprobs:
             token_logprobs = torch.zeros_like(tokens, dtype=torch.float)
 
         prev_pos = 0
-        eos_reached = torch.tensor([False] * bsz, device=self.device)
+        eos_reached = torch.tensor([False] * bsz, device="cuda")
         input_text_mask = tokens != pad_id
         if min_prompt_len == total_len:
             logits = self.model.forward(tokens, prev_pos)
diff --git a/llama/model.py b/llama/model.py
index 8646d31..770526d 100755
--- a/llama/model.py
+++ b/llama/model.py
@@ -9,28 +9,15 @@ import fairscale.nn.model_parallel.initialize as fs_init
 import torch
 import torch.nn.functional as F
 from fairscale.nn.model_parallel.layers import (
-    # ColumnParallelLinear,
+    ColumnParallelLinear,
     ParallelEmbedding,
-    # RowParallelLinear,
+    RowParallelLinear,
 )
 from torch import nn
 
 
-def ColumnParallelLinear(in_features: int, out_features: int, bias: bool = True, *args, **kwargs):
-    return torch.nn.Linear(in_features=in_features,
-                           out_features=out_features,
-                           bias=bias)
-
-
-def RowParallelLinear(in_features: int, out_features: int, bias: bool = True, *args, **kwargs):
-    return torch.nn.Linear(in_features=in_features,
-                           out_features=out_features,
-                           bias=bias)
-
-
 @dataclass
 class ModelArgs:
-    device: object
     dim: int = 4096
     n_layers: int = 32
     n_heads: int = 32
@@ -216,7 +203,6 @@ class Attention(nn.Module):
         self.n_local_kv_heads = self.n_kv_heads // model_parallel_size
         self.n_rep = self.n_local_heads // self.n_local_kv_heads
         self.head_dim = args.dim // args.n_heads
-        self.device = args.device
 
         self.wq = ColumnParallelLinear(
             args.dim,
@@ -254,7 +240,7 @@ class Attention(nn.Module):
                 self.n_local_kv_heads,
                 self.head_dim,
             )
-        ).to(self.device)
+        ).cuda()
         self.cache_v = torch.zeros(
             (
                 args.max_batch_size,
@@ -262,7 +248,7 @@ class Attention(nn.Module):
                 self.n_local_kv_heads,
                 self.head_dim,
             )
-        ).to(self.device)
+        ).cuda()
 
     def forward(
         self,
@@ -447,7 +433,6 @@ class Transformer(nn.Module):
         self.params = params
         self.vocab_size = params.vocab_size
         self.n_layers = params.n_layers
-        self.device = params.device
 
         self.tok_embeddings = ParallelEmbedding(
             params.vocab_size, params.dim, init_method=lambda x: x