ipex-llm/python/llm/example/CPU/PyTorch-Models/Model/meta-llama/cpu.patch
Wang, Jian4 · 9df70d95eb · Refactor bigdl.llm to ipex_llm (#24) · 2024-03-22 15:41:21 +08:00
* Rename bigdl/llm to ipex_llm
* rm python/llm/src/bigdl
* from bigdl.llm to from ipex_llm

diff --git a/README.md b/README.md
index 91e1719..1f6f26d 100755
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
# Llama 2
-We are unlocking the power of large language models. Our latest version of Llama is now accessible to individuals, creators, researchers and businesses of all sizes so that they can experiment, innovate and scale their ideas responsibly.
+We are unlocking the power of large language models. Our latest version of Llama is now accessible to individuals, creators, researchers and businesses of all sizes so that they can experiment, innovate and scale their ideas responsibly.
This release includes model weights and starting code for pretrained and fine-tuned Llama language models — ranging from 7B to 70B parameters.
@@ -58,8 +58,6 @@ torchrun --nproc_per_node 1 example_chat_completion.py \
- Adjust the `max_seq_len` and `max_batch_size` parameters as needed.
- This example runs the [example_chat_completion.py](example_chat_completion.py) found in this repository but you can change that to a different .py file.
-It is also possible to test models without CUDA. For example, to run models on CPU, add an extra command line option `--backend cpu` to following examples. Number of threads can be set using the environment variable `NUM_THREADS`.
-
## Inference
Different models require different model-parallel (MP) values:
@@ -116,7 +114,7 @@ See [MODEL_CARD.md](MODEL_CARD.md).
## License
-Our model and weights are licensed for both researchers and commercial entities, upholding the principles of openness. Our mission is to empower individuals, and industry through this opportunity, while fostering an environment of discovery and ethical AI advancements.
+Our model and weights are licensed for both researchers and commercial entities, upholding the principles of openness. Our mission is to empower individuals, and industry through this opportunity, while fostering an environment of discovery and ethical AI advancements.
See the [LICENSE](LICENSE) file, as well as our accompanying [Acceptable Use Policy](USE_POLICY.md)
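
Note on the `--backend` flag referenced above: the example scripts expose `main()` through python-fire, so a keyword argument with a default becomes a command line option. A minimal, standalone sketch of that pattern (assuming the usual `fire.Fire(main)` entry point; the script and argument values below are illustrative, not the example script itself):

    # sketch_backend_flag.py -- illustrative only (hypothetical file name)
    import fire

    def main(ckpt_dir: str, tokenizer_path: str, backend: str = "cuda"):
        # Passing `--backend cpu` on the command line overrides the "cuda" default.
        print(f"would load {ckpt_dir} with {tokenizer_path} on backend={backend}")

    if __name__ == "__main__":
        fire.Fire(main)
        # e.g. python sketch_backend_flag.py --ckpt_dir llama-2-7b \
        #          --tokenizer_path tokenizer.model --backend cpu
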
diff --git a/example_chat_completion.py b/example_chat_completion.py
index acedf44..df4e5d6 100644
--- a/example_chat_completion.py
+++ b/example_chat_completion.py
@@ -7,13 +7,10 @@ import fire
from llama import Llama, Dialog
-from ipex_llm.optimize import optimize_model
-
def main(
ckpt_dir: str,
tokenizer_path: str,
- backend: str = 'cuda',
temperature: float = 0.6,
top_p: float = 0.9,
max_seq_len: int = 512,
@@ -39,12 +36,9 @@ def main(
ckpt_dir=ckpt_dir,
tokenizer_path=tokenizer_path,
max_seq_len=max_seq_len,
- backend=backend,
max_batch_size=max_batch_size,
)
- generator.model = optimize_model(generator.model)
-
dialogs: List[Dialog] = [
[{"role": "user", "content": "what is the recipe of mayonnaise?"}],
[
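
The substantive change in example_chat_completion.py is the pair of ipex_llm lines shown with a leading "-" above: the model returned by `Llama.build(...)` is passed through `optimize_model` before any generation (the same two-line change appears in example_text_completion.py below). A minimal sketch of that pattern, assuming ipex-llm is installed and using a small stand-in module instead of the real Llama Transformer:

    import torch
    from ipex_llm.optimize import optimize_model

    class TinyModel(torch.nn.Module):
        # Stand-in for generator.model; the real example calls
        # generator.model = optimize_model(generator.model) after Llama.build().
        def __init__(self):
            super().__init__()
            self.proj = torch.nn.Linear(64, 64, bias=False)

        def forward(self, x):
            return self.proj(x)

    model = TinyModel()
    # Applies ipex-llm low-bit (weight-only) optimizations to the module's layers.
    model = optimize_model(model)
    with torch.inference_mode():
        print(model(torch.randn(1, 64)).shape)
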
diff --git a/example_text_completion.py b/example_text_completion.py
index 1f63bb0..0d60b9c 100755
--- a/example_text_completion.py
+++ b/example_text_completion.py
@@ -6,12 +6,9 @@ import fire
from llama import Llama
from typing import List
-from ipex_llm.optimize import optimize_model
-
def main(
ckpt_dir: str,
tokenizer_path: str,
- backend: str = 'cuda',
temperature: float = 0.6,
top_p: float = 0.9,
max_seq_len: int = 128,
@@ -36,12 +33,9 @@ def main(
ckpt_dir=ckpt_dir,
tokenizer_path=tokenizer_path,
max_seq_len=max_seq_len,
- backend=backend,
max_batch_size=max_batch_size,
)
- generator.model = optimize_model(generator.model)
-
prompts: List[str] = [
# For these prompts, the expected answer is the natural continuation of the prompt
"I believe the meaning of life is",
@@ -49,11 +43,11 @@ def main(
"""A brief message congratulating the team on the launch:
Hi everyone,
-
+
I just """,
# Few shot prompt (providing a few examples before asking model to complete more);
"""Translate English to French:
-
+
sea otter => loutre de mer
peppermint => menthe poivrée
plush girafe => girafe peluche
diff --git a/llama/generation.py b/llama/generation.py
index df68aca..5f8faf9 100755
--- a/llama/generation.py
+++ b/llama/generation.py
@@ -55,7 +55,6 @@ class Llama:
tokenizer_path: str,
max_seq_len: int,
max_batch_size: int,
- backend: str,
model_parallel_size: Optional[int] = None,
seed: int = 1,
) -> "Llama":
@@ -82,41 +81,22 @@ class Llama:
and loads the pre-trained model and tokenizer.
"""
- if model_parallel_size is None:
- model_parallel_size = int(os.environ.get("WORLD_SIZE", 1))
-
- device = backend
-
- if backend == 'cuda':
- if not torch.distributed.is_initialized():
- torch.distributed.init_process_group("nccl")
- if not model_parallel_is_initialized():
- initialize_model_parallel(model_parallel_size)
- local_rank = int(os.environ.get("LOCAL_RANK", 0))
- torch.cuda.set_device(local_rank)
- if local_rank > 0:
- sys.stdout = open(os.devnull, "w")
- torch.set_default_tensor_type(torch.cuda.HalfTensor)
- else:
- torch.distributed.init_process_group("gloo")
-
+ if not torch.distributed.is_initialized():
+ torch.distributed.init_process_group("nccl")
+ if not model_parallel_is_initialized():
+ if model_parallel_size is None:
+ model_parallel_size = int(os.environ.get("WORLD_SIZE", 1))
initialize_model_parallel(model_parallel_size)
- if backend == 'directml':
- import torch_directml
- torch.set_default_tensor_type(torch_directml.torch.HalfTensor)
- device = torch_directml.device()
- elif backend == 'cpu':
- # Note: some operations such as "addmm_impl_cpu_" are not implemented for 'Half' at present
- # torch.set_default_tensor_type(torch.HalfTensor)
- n_threads = int(os.environ.get("NUM_THREADS", 0))
- if n_threads > 0:
- torch.set_num_threads(n_threads)
- pass
+ local_rank = int(os.environ.get("LOCAL_RANK", 0))
+ torch.cuda.set_device(local_rank)
# seed must be the same in all processes
torch.manual_seed(seed)
+ if local_rank > 0:
+ sys.stdout = open(os.devnull, "w")
+
start_time = time.time()
checkpoints = sorted(Path(ckpt_dir).glob("*.pth"))
assert len(checkpoints) > 0, f"no checkpoint files found in {ckpt_dir}"
@@ -129,13 +109,13 @@ class Llama:
params = json.loads(f.read())
model_args: ModelArgs = ModelArgs(
- device=device,
max_seq_len=max_seq_len,
max_batch_size=max_batch_size,
**params,
)
tokenizer = Tokenizer(model_path=tokenizer_path)
model_args.vocab_size = tokenizer.n_words
+ torch.set_default_tensor_type(torch.cuda.HalfTensor)
model = Transformer(model_args)
model.load_state_dict(checkpoint, strict=False)
print(f"Loaded in {time.time() - start_time:.2f} seconds")
@@ -145,7 +125,6 @@ class Llama:
def __init__(self, model: Transformer, tokenizer: Tokenizer):
self.model = model
self.tokenizer = tokenizer
- self.device = model.device
@torch.inference_mode()
def generate(
@@ -186,14 +165,14 @@ class Llama:
total_len = min(params.max_seq_len, max_gen_len + max_prompt_len)
pad_id = self.tokenizer.pad_id
- tokens = torch.full((bsz, total_len), pad_id, dtype=torch.long, device=self.device)
+ tokens = torch.full((bsz, total_len), pad_id, dtype=torch.long, device="cuda")
for k, t in enumerate(prompt_tokens):
- tokens[k, : len(t)] = torch.tensor(t, dtype=torch.long, device=self.device)
+ tokens[k, : len(t)] = torch.tensor(t, dtype=torch.long, device="cuda")
if logprobs:
token_logprobs = torch.zeros_like(tokens, dtype=torch.float)
prev_pos = 0
- eos_reached = torch.tensor([False] * bsz, device=self.device)
+ eos_reached = torch.tensor([False] * bsz, device="cuda")
input_text_mask = tokens != pad_id
if min_prompt_len == total_len:
logits = self.model.forward(tokens, prev_pos)
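
On the backend-aware side of this generation.py hunk, Llama.build() branches on `backend` instead of assuming CUDA: NCCL plus half-precision defaults for `cuda`, gloo for everything else, an optional torch_directml device for `directml`, and a NUM_THREADS override for `cpu`. A standalone sketch of that dispatch, using only the calls visible in the diff (the real method additionally initializes fairscale model parallelism and is meant to be launched via torchrun, which provides the rendezvous environment variables):

    import os
    import torch

    def configure_backend(backend: str = "cpu") -> str:
        device = backend
        if backend == "cuda":
            if not torch.distributed.is_initialized():
                torch.distributed.init_process_group("nccl")
            torch.set_default_tensor_type(torch.cuda.HalfTensor)
        else:
            # CPU (and DirectML) runs fall back to the gloo backend.
            if not torch.distributed.is_initialized():
                torch.distributed.init_process_group("gloo")
            if backend == "cpu":
                # Half precision is skipped on CPU: ops such as addmm_impl_cpu_
                # are not implemented for 'Half'. NUM_THREADS caps the thread pool.
                n_threads = int(os.environ.get("NUM_THREADS", 0))
                if n_threads > 0:
                    torch.set_num_threads(n_threads)
            # The 'directml' branch (see the diff above) imports torch_directml and
            # returns torch_directml.device() instead of the backend string.
        return device
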
diff --git a/llama/model.py b/llama/model.py
index 8646d31..770526d 100755
--- a/llama/model.py
+++ b/llama/model.py
@@ -9,28 +9,15 @@ import fairscale.nn.model_parallel.initialize as fs_init
import torch
import torch.nn.functional as F
from fairscale.nn.model_parallel.layers import (
- # ColumnParallelLinear,
+ ColumnParallelLinear,
ParallelEmbedding,
- # RowParallelLinear,
+ RowParallelLinear,
)
from torch import nn
-def ColumnParallelLinear(in_features: int, out_features: int, bias: bool = True, *args, **kwargs):
- return torch.nn.Linear(in_features=in_features,
- out_features=out_features,
- bias=bias)
-
-
-def RowParallelLinear(in_features: int, out_features: int, bias: bool = True, *args, **kwargs):
- return torch.nn.Linear(in_features=in_features,
- out_features=out_features,
- bias=bias)
-
-
@dataclass
class ModelArgs:
- device: object
dim: int = 4096
n_layers: int = 32
n_heads: int = 32
@@ -216,7 +203,6 @@ class Attention(nn.Module):
self.n_local_kv_heads = self.n_kv_heads // model_parallel_size
self.n_rep = self.n_local_heads // self.n_local_kv_heads
self.head_dim = args.dim // args.n_heads
- self.device = args.device
self.wq = ColumnParallelLinear(
args.dim,
@@ -254,7 +240,7 @@ class Attention(nn.Module):
self.n_local_kv_heads,
self.head_dim,
)
- ).to(self.device)
+ ).cuda()
self.cache_v = torch.zeros(
(
args.max_batch_size,
@@ -262,7 +248,7 @@ class Attention(nn.Module):
self.n_local_kv_heads,
self.head_dim,
)
- ).to(self.device)
+ ).cuda()
def forward(
self,
@@ -447,7 +433,6 @@ class Transformer(nn.Module):
self.params = params
self.vocab_size = params.vocab_size
self.n_layers = params.n_layers
- self.device = params.device
self.tok_embeddings = ParallelEmbedding(
params.vocab_size, params.dim, init_method=lambda x: x
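
Finally, the model.py hunks show why no GPU-specific initialization is needed on the CPU path: the backend-aware side replaces fairscale's ColumnParallelLinear and RowParallelLinear with plain torch.nn.Linear shims (single process, no tensor parallelism) and allocates the attention KV caches with `.to(self.device)` instead of `.cuda()`. The shims, restated as a self-contained sketch matching the "-" lines above:

    import torch

    def ColumnParallelLinear(in_features: int, out_features: int, bias: bool = True, *args, **kwargs):
        # Extra fairscale-only arguments (init_method, gather_output, ...) are accepted and ignored.
        return torch.nn.Linear(in_features=in_features, out_features=out_features, bias=bias)

    def RowParallelLinear(in_features: int, out_features: int, bias: bool = True, *args, **kwargs):
        # Same collapse for the row-parallel variant.
        return torch.nn.Linear(in_features=in_features, out_features=out_features, bias=bias)
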