diff --git a/README.md b/README.md
index 91e1719..1f6f26d 100755
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # Llama 2
 
-We are unlocking the power of large language models. Our latest version of Llama is now accessible to individuals, creators, researchers and businesses of all sizes so that they can experiment, innovate and scale their ideas responsibly.
+We are unlocking the power of large language models. Our latest version of Llama is now accessible to individuals, creators, researchers and businesses of all sizes so that they can experiment, innovate and scale their ideas responsibly. 
 
 This release includes model weights and starting code for pretrained and fine-tuned Llama language models — ranging from 7B to 70B parameters.
 
@@ -58,8 +58,6 @@ torchrun --nproc_per_node 1 example_chat_completion.py \
 - Adjust the `max_seq_len` and `max_batch_size` parameters as needed.
 - This example runs the [example_chat_completion.py](example_chat_completion.py) found in this repository but you can change that to a different .py file.
 
-It is also possible to test models without CUDA. For example, to run models on CPU, add an extra command line option `--backend cpu` to following examples. Number of threads can be set using the environment variable `NUM_THREADS`.
-
 ## Inference
 
 Different models require different model-parallel (MP) values:
@@ -116,7 +114,7 @@ See [MODEL_CARD.md](MODEL_CARD.md).
 
 ## License
 
-Our model and weights are licensed for both researchers and commercial entities, upholding the principles of openness. Our mission is to empower individuals, and industry through this opportunity, while fostering an environment of discovery and ethical AI advancements.
+Our model and weights are licensed for both researchers and commercial entities, upholding the principles of openness. Our mission is to empower individuals, and industry through this opportunity, while fostering an environment of discovery and ethical AI advancements. 
 See the [LICENSE](LICENSE) file, as well as our accompanying [Acceptable Use Policy](USE_POLICY.md)
 
diff --git a/example_chat_completion.py b/example_chat_completion.py
index acedf44..df4e5d6 100644
--- a/example_chat_completion.py
+++ b/example_chat_completion.py
@@ -7,13 +7,10 @@ import fire
 
 from llama import Llama, Dialog
 
-from bigdl.llm.optimize import optimize_model
-
 
 def main(
     ckpt_dir: str,
     tokenizer_path: str,
-    backend: str = 'cuda',
     temperature: float = 0.6,
     top_p: float = 0.9,
     max_seq_len: int = 512,
@@ -39,12 +36,9 @@ def main(
         ckpt_dir=ckpt_dir,
         tokenizer_path=tokenizer_path,
         max_seq_len=max_seq_len,
-        backend=backend,
         max_batch_size=max_batch_size,
     )
 
-    generator.model = optimize_model(generator.model)
-
     dialogs: List[Dialog] = [
         [{"role": "user", "content": "what is the recipe of mayonnaise?"}],
         [
diff --git a/example_text_completion.py b/example_text_completion.py
index 1f63bb0..0d60b9c 100755
--- a/example_text_completion.py
+++ b/example_text_completion.py
@@ -6,12 +6,9 @@ import fire
 from llama import Llama
 from typing import List
 
-from bigdl.llm.optimize import optimize_model
-
 def main(
     ckpt_dir: str,
     tokenizer_path: str,
-    backend: str = 'cuda',
     temperature: float = 0.6,
     top_p: float = 0.9,
     max_seq_len: int = 128,
@@ -36,12 +33,9 @@ def main(
         ckpt_dir=ckpt_dir,
         tokenizer_path=tokenizer_path,
         max_seq_len=max_seq_len,
-        backend=backend,
         max_batch_size=max_batch_size,
     )
 
-    generator.model = optimize_model(generator.model)
-
     prompts: List[str] = [
         # For these prompts, the expected answer is the natural continuation of the prompt
         "I believe the meaning of life is",
@@ -49,11 +43,11 @@ def main(
         """A brief message congratulating the team on the launch:
 
         Hi everyone,
-
+        
         I just """,
         # Few shot prompt (providing a few examples before asking model to complete more);
         """Translate English to French:
-
+        
         sea otter => loutre de mer
         peppermint => menthe poivrée
         plush girafe => girafe peluche
diff --git a/llama/generation.py b/llama/generation.py
index df68aca..5f8faf9 100755
--- a/llama/generation.py
+++ b/llama/generation.py
@@ -55,7 +55,6 @@ class Llama:
         tokenizer_path: str,
         max_seq_len: int,
         max_batch_size: int,
-        backend: str,
         model_parallel_size: Optional[int] = None,
         seed: int = 1,
     ) -> "Llama":
@@ -82,41 +81,22 @@ class Llama:
             and loads the pre-trained model and tokenizer.
 
""" - if model_parallel_size is None: - model_parallel_size = int(os.environ.get("WORLD_SIZE", 1)) - - device = backend - - if backend == 'cuda': - if not torch.distributed.is_initialized(): - torch.distributed.init_process_group("nccl") - if not model_parallel_is_initialized(): - initialize_model_parallel(model_parallel_size) - local_rank = int(os.environ.get("LOCAL_RANK", 0)) - torch.cuda.set_device(local_rank) - if local_rank > 0: - sys.stdout = open(os.devnull, "w") - torch.set_default_tensor_type(torch.cuda.HalfTensor) - else: - torch.distributed.init_process_group("gloo") - + if not torch.distributed.is_initialized(): + torch.distributed.init_process_group("nccl") + if not model_parallel_is_initialized(): + if model_parallel_size is None: + model_parallel_size = int(os.environ.get("WORLD_SIZE", 1)) initialize_model_parallel(model_parallel_size) - if backend == 'directml': - import torch_directml - torch.set_default_tensor_type(torch_directml.torch.HalfTensor) - device = torch_directml.device() - elif backend == 'cpu': - # Note: some operations such as "addmm_impl_cpu_" are not implemented for 'Half' at present - # torch.set_default_tensor_type(torch.HalfTensor) - n_threads = int(os.environ.get("NUM_THREADS", 0)) - if n_threads > 0: - torch.set_num_threads(n_threads) - pass + local_rank = int(os.environ.get("LOCAL_RANK", 0)) + torch.cuda.set_device(local_rank) # seed must be the same in all processes torch.manual_seed(seed) + if local_rank > 0: + sys.stdout = open(os.devnull, "w") + start_time = time.time() checkpoints = sorted(Path(ckpt_dir).glob("*.pth")) assert len(checkpoints) > 0, f"no checkpoint files found in {ckpt_dir}" @@ -129,13 +109,13 @@ class Llama: params = json.loads(f.read()) model_args: ModelArgs = ModelArgs( - device=device, max_seq_len=max_seq_len, max_batch_size=max_batch_size, **params, ) tokenizer = Tokenizer(model_path=tokenizer_path) model_args.vocab_size = tokenizer.n_words + torch.set_default_tensor_type(torch.cuda.HalfTensor) model = Transformer(model_args) model.load_state_dict(checkpoint, strict=False) print(f"Loaded in {time.time() - start_time:.2f} seconds") @@ -145,7 +125,6 @@ class Llama: def __init__(self, model: Transformer, tokenizer: Tokenizer): self.model = model self.tokenizer = tokenizer - self.device = model.device @torch.inference_mode() def generate( @@ -186,14 +165,14 @@ class Llama: total_len = min(params.max_seq_len, max_gen_len + max_prompt_len) pad_id = self.tokenizer.pad_id - tokens = torch.full((bsz, total_len), pad_id, dtype=torch.long, device=self.device) + tokens = torch.full((bsz, total_len), pad_id, dtype=torch.long, device="cuda") for k, t in enumerate(prompt_tokens): - tokens[k, : len(t)] = torch.tensor(t, dtype=torch.long, device=self.device) + tokens[k, : len(t)] = torch.tensor(t, dtype=torch.long, device="cuda") if logprobs: token_logprobs = torch.zeros_like(tokens, dtype=torch.float) prev_pos = 0 - eos_reached = torch.tensor([False] * bsz, device=self.device) + eos_reached = torch.tensor([False] * bsz, device="cuda") input_text_mask = tokens != pad_id if min_prompt_len == total_len: logits = self.model.forward(tokens, prev_pos) diff --git a/llama/model.py b/llama/model.py index 8646d31..770526d 100755 --- a/llama/model.py +++ b/llama/model.py @@ -9,28 +9,15 @@ import fairscale.nn.model_parallel.initialize as fs_init import torch import torch.nn.functional as F from fairscale.nn.model_parallel.layers import ( - # ColumnParallelLinear, + ColumnParallelLinear, ParallelEmbedding, - # RowParallelLinear, + RowParallelLinear, ) 
 from torch import nn
 
 
-def ColumnParallelLinear(in_features: int, out_features: int, bias: bool = True, *args, **kwargs):
-    return torch.nn.Linear(in_features=in_features,
-                           out_features=out_features,
-                           bias=bias)
-
-
-def RowParallelLinear(in_features: int, out_features: int, bias: bool = True, *args, **kwargs):
-    return torch.nn.Linear(in_features=in_features,
-                           out_features=out_features,
-                           bias=bias)
-
-
 @dataclass
 class ModelArgs:
-    device: object
     dim: int = 4096
     n_layers: int = 32
     n_heads: int = 32
@@ -216,7 +203,6 @@ class Attention(nn.Module):
         self.n_local_kv_heads = self.n_kv_heads // model_parallel_size
         self.n_rep = self.n_local_heads // self.n_local_kv_heads
         self.head_dim = args.dim // args.n_heads
-        self.device = args.device
 
         self.wq = ColumnParallelLinear(
             args.dim,
@@ -254,7 +240,7 @@ class Attention(nn.Module):
                 self.n_local_kv_heads,
                 self.head_dim,
             )
-        ).to(self.device)
+        ).cuda()
         self.cache_v = torch.zeros(
             (
                 args.max_batch_size,
@@ -262,7 +248,7 @@ class Attention(nn.Module):
                 self.n_local_kv_heads,
                 self.head_dim,
             )
-        ).to(self.device)
+        ).cuda()
 
     def forward(
         self,
@@ -447,7 +433,6 @@ class Transformer(nn.Module):
         self.params = params
         self.vocab_size = params.vocab_size
         self.n_layers = params.n_layers
-        self.device = params.device
 
         self.tok_embeddings = ParallelEmbedding(
             params.vocab_size, params.dim, init_method=lambda x: x
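
For reference, the CUDA-only startup sequence this patch restores in `Llama.build` (llama/generation.py) can be read as a standalone sketch. The `setup_cuda_distributed` helper name below is illustrative only and does not exist in the repository; the individual calls mirror the `+` lines of the patch, and the sketch assumes the process is launched with `torchrun`, which sets `WORLD_SIZE` and `LOCAL_RANK`.

```python
# Minimal sketch of the CUDA-only initialization path restored by this patch.
import os
import sys

import torch
from fairscale.nn.model_parallel.initialize import (
    initialize_model_parallel,
    model_parallel_is_initialized,
)


def setup_cuda_distributed(model_parallel_size=None, seed=1):
    # One NCCL process group per torchrun worker.
    if not torch.distributed.is_initialized():
        torch.distributed.init_process_group("nccl")
    # Model-parallel group sized by WORLD_SIZE unless given explicitly.
    if not model_parallel_is_initialized():
        if model_parallel_size is None:
            model_parallel_size = int(os.environ.get("WORLD_SIZE", 1))
        initialize_model_parallel(model_parallel_size)

    # Pin this rank to its GPU.
    local_rank = int(os.environ.get("LOCAL_RANK", 0))
    torch.cuda.set_device(local_rank)

    # Seed must be the same in all processes.
    torch.manual_seed(seed)

    # Only rank 0 keeps stdout; the other ranks stay quiet.
    if local_rank > 0:
        sys.stdout = open(os.devnull, "w")

    # New tensors (e.g. model weights) default to fp16 on the GPU.
    torch.set_default_tensor_type(torch.cuda.HalfTensor)
    return local_rank
```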