Rename bigdl/llm to ipex_llm

* rm python/llm/src/bigdl
* from bigdl.llm to from ipex_llm
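The last bullet is an import-level rename: code that previously imported from bigdl.llm should now import from ipex_llm. A minimal, hypothetical sketch of that change, using the optimize_model helper referenced in the diff below (the old bigdl.llm path is assumed from the commit message and is not shown in this diff):

    # Hypothetical before/after for the package rename (Python).
    # Old import path, assumed from the commit message:
    #   from bigdl.llm.optimize import optimize_model
    # New import path, as referenced in the diff below:
    from ipex_llm.optimize import optimize_model

    # optimize_model wraps an existing PyTorch module, e.g.:
    #   generator.model = optimize_model(generator.model)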
diff --git a/README.md b/README.md
index 91e1719..1f6f26d 100755
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # Llama 2
 
-We are unlocking the power of large language models. Our latest version of Llama is now accessible to individuals, creators, researchers and businesses of all sizes so that they can experiment, innovate and scale their ideas responsibly.
+We are unlocking the power of large language models. Our latest version of Llama is now accessible to individuals, creators, researchers and businesses of all sizes so that they can experiment, innovate and scale their ideas responsibly. 
 
 This release includes model weights and starting code for pretrained and fine-tuned Llama language models — ranging from 7B to 70B parameters.
 
@@ -58,8 +58,6 @@ torchrun --nproc_per_node 1 example_chat_completion.py \
 - Adjust the `max_seq_len` and `max_batch_size` parameters as needed.
 - This example runs the [example_chat_completion.py](example_chat_completion.py) found in this repository but you can change that to a different .py file.
 
-It is also possible to test models without CUDA. For example, to run models on CPU, add an extra command line option `--backend cpu` to following examples. Number of threads can be set using the environment variable `NUM_THREADS`.
-
 ## Inference
 
 Different models require different model-parallel (MP) values:
@@ -116,7 +114,7 @@ See [MODEL_CARD.md](MODEL_CARD.md).
 
 ## License
 
-Our model and weights are licensed for both researchers and commercial entities, upholding the principles of openness. Our mission is to empower individuals, and industry through this opportunity, while fostering an environment of discovery and ethical AI advancements.
+Our model and weights are licensed for both researchers and commercial entities, upholding the principles of openness. Our mission is to empower individuals, and industry through this opportunity, while fostering an environment of discovery and ethical AI advancements. 
 
 See the [LICENSE](LICENSE) file, as well as our accompanying [Acceptable Use Policy](USE_POLICY.md)
 
diff --git a/example_chat_completion.py b/example_chat_completion.py
index acedf44..df4e5d6 100644
--- a/example_chat_completion.py
+++ b/example_chat_completion.py
@@ -7,13 +7,10 @@ import fire
 
 from llama import Llama, Dialog
 
-from ipex_llm.optimize import optimize_model
-
 
 def main(
     ckpt_dir: str,
     tokenizer_path: str,
-    backend: str = 'cuda',
     temperature: float = 0.6,
     top_p: float = 0.9,
     max_seq_len: int = 512,
@@ -39,12 +36,9 @@ def main(
         ckpt_dir=ckpt_dir,
         tokenizer_path=tokenizer_path,
         max_seq_len=max_seq_len,
-        backend=backend,
         max_batch_size=max_batch_size,
     )
 
-    generator.model = optimize_model(generator.model)
-
     dialogs: List[Dialog] = [
         [{"role": "user", "content": "what is the recipe of mayonnaise?"}],
         [
diff --git a/example_text_completion.py b/example_text_completion.py
index 1f63bb0..0d60b9c 100755
--- a/example_text_completion.py
+++ b/example_text_completion.py
@@ -6,12 +6,9 @@ import fire
 from llama import Llama
 from typing import List
 
-from ipex_llm.optimize import optimize_model
-
 def main(
     ckpt_dir: str,
     tokenizer_path: str,
-    backend: str = 'cuda',
     temperature: float = 0.6,
     top_p: float = 0.9,
     max_seq_len: int = 128,
@@ -36,12 +33,9 @@ def main(
         ckpt_dir=ckpt_dir,
         tokenizer_path=tokenizer_path,
         max_seq_len=max_seq_len,
-        backend=backend,
         max_batch_size=max_batch_size,
     )
 
-    generator.model = optimize_model(generator.model)
-
     prompts: List[str] = [
         # For these prompts, the expected answer is the natural continuation of the prompt
         "I believe the meaning of life is",
@@ -49,11 +43,11 @@ def main(
         """A brief message congratulating the team on the launch:
 
         Hi everyone,
-
+        
         I just """,
         # Few shot prompt (providing a few examples before asking model to complete more);
         """Translate English to French:
-
+        
         sea otter => loutre de mer
         peppermint => menthe poivrée
         plush girafe => girafe peluche
diff --git a/llama/generation.py b/llama/generation.py
index df68aca..5f8faf9 100755
--- a/llama/generation.py
+++ b/llama/generation.py
@@ -55,7 +55,6 @@ class Llama:
         tokenizer_path: str,
         max_seq_len: int,
         max_batch_size: int,
-        backend: str,
         model_parallel_size: Optional[int] = None,
         seed: int = 1,
     ) -> "Llama":
@@ -82,41 +81,22 @@ class Llama:
             and loads the pre-trained model and tokenizer.
 
         """
-        if model_parallel_size is None:
-            model_parallel_size = int(os.environ.get("WORLD_SIZE", 1))
-
-        device = backend
-
-        if backend == 'cuda':
-            if not torch.distributed.is_initialized():
-                torch.distributed.init_process_group("nccl")
-            if not model_parallel_is_initialized():
-                initialize_model_parallel(model_parallel_size)
-            local_rank = int(os.environ.get("LOCAL_RANK", 0))
-            torch.cuda.set_device(local_rank)
-            if local_rank > 0:
-                sys.stdout = open(os.devnull, "w")
-            torch.set_default_tensor_type(torch.cuda.HalfTensor)
-        else:
-            torch.distributed.init_process_group("gloo")
-
+        if not torch.distributed.is_initialized():
+            torch.distributed.init_process_group("nccl")
+        if not model_parallel_is_initialized():
+            if model_parallel_size is None:
+                model_parallel_size = int(os.environ.get("WORLD_SIZE", 1))
             initialize_model_parallel(model_parallel_size)
 
-            if backend == 'directml':
-                import torch_directml
-                torch.set_default_tensor_type(torch_directml.torch.HalfTensor)
-                device = torch_directml.device()
-            elif backend == 'cpu':
-                # Note: some operations such as "addmm_impl_cpu_" are not implemented for 'Half' at present
-                # torch.set_default_tensor_type(torch.HalfTensor)
-                n_threads = int(os.environ.get("NUM_THREADS", 0))
-                if n_threads > 0:
-                    torch.set_num_threads(n_threads)
-                pass
+        local_rank = int(os.environ.get("LOCAL_RANK", 0))
+        torch.cuda.set_device(local_rank)
 
         # seed must be the same in all processes
         torch.manual_seed(seed)
 
+        if local_rank > 0:
+            sys.stdout = open(os.devnull, "w")
+
         start_time = time.time()
         checkpoints = sorted(Path(ckpt_dir).glob("*.pth"))
         assert len(checkpoints) > 0, f"no checkpoint files found in {ckpt_dir}"
@@ -129,13 +109,13 @@ class Llama:
             params = json.loads(f.read())
 
         model_args: ModelArgs = ModelArgs(
-            device=device,
             max_seq_len=max_seq_len,
             max_batch_size=max_batch_size,
             **params,
         )
         tokenizer = Tokenizer(model_path=tokenizer_path)
         model_args.vocab_size = tokenizer.n_words
+        torch.set_default_tensor_type(torch.cuda.HalfTensor)
         model = Transformer(model_args)
         model.load_state_dict(checkpoint, strict=False)
         print(f"Loaded in {time.time() - start_time:.2f} seconds")
@@ -145,7 +125,6 @@ class Llama:
     def __init__(self, model: Transformer, tokenizer: Tokenizer):
         self.model = model
         self.tokenizer = tokenizer
-        self.device = model.device
 
     @torch.inference_mode()
     def generate(
@@ -186,14 +165,14 @@ class Llama:
         total_len = min(params.max_seq_len, max_gen_len + max_prompt_len)
 
         pad_id = self.tokenizer.pad_id
-        tokens = torch.full((bsz, total_len), pad_id, dtype=torch.long, device=self.device)
+        tokens = torch.full((bsz, total_len), pad_id, dtype=torch.long, device="cuda")
         for k, t in enumerate(prompt_tokens):
-            tokens[k, : len(t)] = torch.tensor(t, dtype=torch.long, device=self.device)
+            tokens[k, : len(t)] = torch.tensor(t, dtype=torch.long, device="cuda")
         if logprobs:
             token_logprobs = torch.zeros_like(tokens, dtype=torch.float)
 
         prev_pos = 0
-        eos_reached = torch.tensor([False] * bsz, device=self.device)
+        eos_reached = torch.tensor([False] * bsz, device="cuda")
         input_text_mask = tokens != pad_id
         if min_prompt_len == total_len:
             logits = self.model.forward(tokens, prev_pos)
diff --git a/llama/model.py b/llama/model.py
index 8646d31..770526d 100755
--- a/llama/model.py
+++ b/llama/model.py
@@ -9,28 +9,15 @@ import fairscale.nn.model_parallel.initialize as fs_init
 import torch
 import torch.nn.functional as F
 from fairscale.nn.model_parallel.layers import (
-    # ColumnParallelLinear,
+    ColumnParallelLinear,
     ParallelEmbedding,
-    # RowParallelLinear,
+    RowParallelLinear,
 )
 from torch import nn
 
 
-def ColumnParallelLinear(in_features: int, out_features: int, bias: bool = True, *args, **kwargs):
-    return torch.nn.Linear(in_features=in_features,
-                           out_features=out_features,
-                           bias=bias)
-
-
-def RowParallelLinear(in_features: int, out_features: int, bias: bool = True, *args, **kwargs):
-    return torch.nn.Linear(in_features=in_features,
-                           out_features=out_features,
-                           bias=bias)
-
-
 @dataclass
 class ModelArgs:
-    device: object
     dim: int = 4096
     n_layers: int = 32
     n_heads: int = 32
@@ -216,7 +203,6 @@ class Attention(nn.Module):
         self.n_local_kv_heads = self.n_kv_heads // model_parallel_size
         self.n_rep = self.n_local_heads // self.n_local_kv_heads
         self.head_dim = args.dim // args.n_heads
-        self.device = args.device
 
         self.wq = ColumnParallelLinear(
             args.dim,
@@ -254,7 +240,7 @@ class Attention(nn.Module):
                 self.n_local_kv_heads,
                 self.head_dim,
             )
-        ).to(self.device)
+        ).cuda()
         self.cache_v = torch.zeros(
             (
                 args.max_batch_size,
@@ -262,7 +248,7 @@ class Attention(nn.Module):
                 self.n_local_kv_heads,
                 self.head_dim,
             )
-        ).to(self.device)
+        ).cuda()
 
     def forward(
         self,
@@ -447,7 +433,6 @@ class Transformer(nn.Module):
         self.params = params
         self.vocab_size = params.vocab_size
         self.n_layers = params.n_layers
-        self.device = params.device
 
         self.tok_embeddings = ParallelEmbedding(
             params.vocab_size, params.dim, init_method=lambda x: x