From d3b9a48ba995089feaf5bfb5a21f8370bddfe5de Mon Sep 17 00:00:00 2001 From: xiezipeng-ML Date: Thu, 4 Jan 2024 22:12:53 +0800 Subject: [PATCH 1/9] support adapter finetune --- projects/Llama/adapter/adapter_config.py | 64 ++ projects/Llama/adapter/adapter_model.py | 717 +++++++++++++++++++++++ projects/Llama/adapter/adapter_sft.py | 105 ++++ projects/Llama/adapter/dataset.py | 65 ++ projects/Llama/adapter/train_net.py | 115 ++++ projects/Llama/configs/llama_config.py | 1 - projects/Llama/dataset.py | 15 +- projects/Llama/llama.py | 23 +- projects/Llama/utils/llama_loader.py | 3 - 9 files changed, 1085 insertions(+), 23 deletions(-) create mode 100644 projects/Llama/adapter/adapter_config.py create mode 100644 projects/Llama/adapter/adapter_model.py create mode 100644 projects/Llama/adapter/adapter_sft.py create mode 100644 projects/Llama/adapter/dataset.py create mode 100644 projects/Llama/adapter/train_net.py diff --git a/projects/Llama/adapter/adapter_config.py b/projects/Llama/adapter/adapter_config.py new file mode 100644 index 000000000..9732fad47 --- /dev/null +++ b/projects/Llama/adapter/adapter_config.py @@ -0,0 +1,64 @@ +from omegaconf import DictConfig, OmegaConf + +from libai.config import LazyCall +from projects.Llama.adapter.adapter_model import LlamaForCausalLM +from projects.Llama.tokenizer import LlamaTokenizer +from configs.common.train import train + + +cfg = dict( + # Model + hidden_act="silu", + hidden_size=4096, + initializer_range=0.02, + intermediate_size=11008, + max_position_embeddings=4096, + num_attention_heads=32, + hidden_layers=32, + pretraining_tp=1, + rms_norm_eps=1e-05, + rope_scaling=None, + tie_word_embeddings=False, + vocab_size=32000, + use_scaled_init_for_output_weights=False, + scale_mask_softmax_fusion=False, + amp_enabled=True, + # Inference + is_encoder_decoder=False, + max_length=256, + min_length=0, + do_sample=False, + early_stopping=False, + num_beams=1, + num_beam_groups=1, + diversity_penalty=0.0, + temperature=0.9, + top_k=50, + top_p=0.6, + typical_p=1.0, + repetition_penalty=1.0, + length_penalty=1.0, + no_repeat_ngram_size=0, + encoder_no_repeat_ngram_size=0, + num_return_sequences=1, + chunk_size_feed_forward=0, + output_scores=False, + use_cache=True, + bos_token_id=1, + eos_token_id=2, + pad_token_id=0, + # adapter + adapter_len=10, + adapter_layer=30, + # train + pretrained_model_path="/data/home/xiezipeng/hf_models/meta-llama/Llama-2-7b-hf/", +) + +cfg = DictConfig(cfg) + +model = LazyCall(LlamaForCausalLM)(cfg=cfg) +tokenization = OmegaConf.create() +tokenization.make_vocab_size_divisible_by = 1 +tokenization.tokenizer = LazyCall(LlamaTokenizer)( + pretrained_model_path="/data/home/xiezipeng/hf_models/meta-llama/Llama-2-7b-hf/tokenizer.model" +) diff --git a/projects/Llama/adapter/adapter_model.py b/projects/Llama/adapter/adapter_model.py new file mode 100644 index 000000000..0c7982be9 --- /dev/null +++ b/projects/Llama/adapter/adapter_model.py @@ -0,0 +1,717 @@ +# coding=utf-8 +# Copyright 2021 The OneFlow Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from typing import Tuple + +import oneflow as flow +from oneflow import nn +import oneflow.nn.functional as F + +from libai.config import configurable +from libai.inference.generator.generation_utils import Generator +from libai.layers import Linear, RMSLayerNorm, VocabEmbedding, Embedding +from libai.layers.attention import AttnMaskType +from libai.models.utils import init_method_normal, scaled_init_method_normal +from libai.utils import distributed as dist + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return flow.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids): + cos = cos[position_ids].unsqueeze(1) + sin = sin[position_ids].unsqueeze(1) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class RotaryEmbedding(nn.Module): + def __init__(self, dim, max_position_embeddings=2048): + super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + + def forward(self, x, seq_len=None, cos_cached=None, sin_cached=None): + if seq_len > self.max_position_embeddings: + raise ValueError( + f"The maximum supported length is {self.max_position_embeddings}, " + f"and the current length is{seq_len}." + ) + + return ( + cos_cached[:seq_len].to_global(placement=x.placement), + sin_cached[:seq_len].to_global(placement=x.placement), + ) + + +class MLP(nn.Module): + def __init__( + self, + hidden_size, + intermediate_size, + init_method=nn.init.xavier_normal_, + output_layer_init_method=None, + *, + layer_idx=0, + ): + super().__init__() + + if output_layer_init_method is None: + output_layer_init_method = init_method + + self.gate_proj = Linear( + hidden_size, + intermediate_size, + bias=False, + parallel="col", + init_method=init_method, + layer_idx=layer_idx, + ) + + self.up_proj = Linear( + hidden_size, + intermediate_size, + bias=False, + parallel="col", + init_method=init_method, + layer_idx=layer_idx, + ) + + self.down_proj = Linear( + intermediate_size, + hidden_size, + bias=False, + parallel="row", + init_method=output_layer_init_method, + layer_idx=layer_idx, + ) + + self.activation_func = nn.SiLU() + + def forward(self, hidden_states): + gate_out = self.activation_func(self.gate_proj(hidden_states)) + up_out = self.up_proj(hidden_states) + output = self.down_proj(gate_out * up_out) + return output + + +class MultiheadAttention(nn.Module): + def __init__( + self, + hidden_size, + num_attention_heads, + max_position_embeddings, + init_method=nn.init.xavier_normal_, + output_layer_init_method=None, + scale_mask_softmax_fusion=False, + attn_mask_type=AttnMaskType.padding, + *, + layer_idx=0, + ): + super().__init__() + self.hidden_size = hidden_size + if output_layer_init_method is None: + output_layer_init_method = init_method + + self.num_heads = num_attention_heads + self.head_size = hidden_size // num_attention_heads + self.attn_mask_type = attn_mask_type + + self.norm_factor = 1.0 / math.sqrt(float(self.head_size)) + + self.scale_mask_softmax_fusion = scale_mask_softmax_fusion + + self.query_key_value = Linear( + self.hidden_size, + self.hidden_size * 3, + bias=False, + parallel="col", + init_method=init_method, + layer_idx=layer_idx, + ) + + self.o_proj = Linear( + self.hidden_size, + self.hidden_size, + bias=False, + parallel="row", + 
init_method=output_layer_init_method, + layer_idx=layer_idx, + ) + + self.coeff = None + + rotary_dim = self.head_size + self.rotary_embed = RotaryEmbedding( + dim=rotary_dim, + max_position_embeddings=max_position_embeddings, + ) + + self.gate = flow.nn.Parameter( + flow.zeros( + 1, self.num_heads, 1, 1, + placement=dist.get_layer_placement(layer_idx), + sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]), + ) + ) + + def forward( + self, + hidden_states: flow.Tensor, + encoder_states: flow.Tensor = None, + attention_mask: flow.Tensor = None, + position_ids=None, + past_key_value: Tuple[flow.Tensor, flow.Tensor] = None, + cos_cached: flow.Tensor = None, + sin_cached: flow.Tensor = None, + use_cache: bool = False, + adapter = None, + ): + if encoder_states is not None: + encoder_states = encoder_states.to_global(placement=hidden_states.placement) + + if attention_mask is not None: + attention_mask = attention_mask.to_global(placement=hidden_states.placement) + + if adapter is not None: + adapter = adapter.to_global(placement=hidden_states.placement) + + bsz, tgt_len = hidden_states.size()[:2] + + query_key_value = self.query_key_value(hidden_states) + query_key_value = query_key_value.view(bsz, -1, self.num_heads, 3 * self.head_size) + query_key_value = query_key_value.permute( + 0, 2, 1, 3 + ) # [bsz, num_heads, src_len, 3 * head_size] + query, key, value = flow.chunk(query_key_value, chunks=3, dim=-1) + + kv_seq_len = key.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-2] + cos, sin = self.rotary_embed( + value, seq_len=kv_seq_len, cos_cached=cos_cached, sin_cached=sin_cached + ) + query, key = apply_rotary_pos_emb(query, key, cos, sin, position_ids) + + # [1, adapter_len, 4096] + if adapter is not None: + adapter_len = adapter.shape[1] + adapter_qkv = self.query_key_value(adapter) + adapter_qkv = adapter_qkv.view(1, -1, self.num_heads, 3 * self.head_size) + adapter_qkv = adapter_qkv.permute( + 0, 2, 1, 3 + ) # [1, num_heads, src_len, 3 * head_size] + _, adapter_key, adapter_value = flow.chunk(adapter_qkv, chunks=3, dim=-1) + adapter_key = adapter_key.repeat(bsz, 1, 1, 1) + adapter_value = adapter_value.repeat(bsz, 1, 1, 1) + key = flow.cat([adapter_key, key], dim=2) + value = flow.cat([adapter_value, value], dim=2) + extra_mask = flow.zeros( + bsz, 1, tgt_len, adapter_len, + sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]), + placement=attention_mask.placement + ) + attention_mask = flow.cat([extra_mask, attention_mask], dim=-1) + + if past_key_value is not None: + past_key, past_value = past_key_value + key = flow.cat((past_key.type_as(key), key), dim=2) + value = flow.cat((past_value.type_as(value), value), dim=2) + + # query, key, value: [S(0), S(1)], shape: [bsz, num_heads, seq_length, head_size] + if use_cache: + past_key_value = (key, value) + + # [bsz, num_heads, tgt_len, src_len] with [S(0), S(1)] + attention_scores = flow.matmul(query, key, transpose_b=True, alpha=self.norm_factor) + attention_weights = attention_scores + attention_mask + + if adapter is not None: + attention_weights = flow.cat( + [ + self.gate.tanh().half() * F.softmax(attention_weights[:, :, :, :adapter_len].float(), dim=-1).to(query.dtype), + F.softmax(attention_weights[:, :, :, adapter_len:].float(), dim=-1).to(query.dtype), + ], + dim=-1, + ) + else: + attention_weights = flow.softmax(attention_weights, dim=-1) + # Context shape: [bsz, num_heads, tgt_len, head_size] with [S(0), S(1)] + context = flow.matmul(attention_weights, value) + + # Change 
shape: [bsz, num_heads, tgt_len, head_size] -> [bsz, tgt_len, num_heads, head_size] + context = context.transpose(1, 2) + output = self.o_proj(context.flatten(2)) + + if use_cache: + output = (output, past_key_value) + + return output + + +class CasualMask(nn.Module): + def __init__(self, max_positions=1024, dtype=flow.float16, *, layer_idx=0): + super().__init__() + self.dtype = dtype + self.mask = flow.full( + (max_positions, max_positions), + flow.finfo(dtype).min, + placement=dist.get_layer_placement(layer_idx), + sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]), + ) + mask_cond = flow.arange( + self.mask.size(-1), + placement=dist.get_layer_placement(layer_idx), + sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]), + ) + self.mask.masked_fill_(mask_cond < (mask_cond + 1).view(self.mask.size(-1), 1), 0) + self.mask = self.mask.to(dtype) + + def forward(self, input_ids, past_length=0, attention_mask=None, input_dtype=None): + bsz, tgt_len = input_ids.size() + casual_mask = self.mask[:tgt_len, :tgt_len] + if past_length > 0: + # in case past_key_values are used, we need to add a prefix ones mask to casual mask + casual_mask = flow.cat( + [flow.ones(tgt_len, past_length, dtype=self.dtype), casual_mask], dim=-1 + ) + casual_mask = ( + casual_mask.unsqueeze(0).unsqueeze(1).expand(bsz, 1, tgt_len, tgt_len + past_length) + ) + casual_mask = casual_mask.to_global(sbp=input_ids.sbp) + if attention_mask is not None: + bsz, src_len = attention_mask.size() + attention_mask = ( + attention_mask[:, None, None, :] + .expand(bsz, 1, tgt_len, src_len) + .to(casual_mask.dtype) + ) + attention_mask = attention_mask.to_global(placement=casual_mask.placement) + casual_mask = casual_mask + attention_mask + if input_dtype is not None: + casual_mask = casual_mask.to(input_dtype) + return casual_mask + + +class LlamaDecoderLayer(nn.Module): + def __init__( + self, + hidden_size, + intermediate_size, + num_attention_heads, + is_decoder=False, + rms_norm_eps=1e-5, + max_position_embeddings=None, + init_method=nn.init.xavier_normal_, + output_layer_init_method=None, + scale_mask_softmax_fusion=False, + attn_mask_type=AttnMaskType.padding, + *, + layer_idx=0, + ): + super().__init__() + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_attention_heads = num_attention_heads + self.rms_norm_eps = rms_norm_eps + self.max_position_embeddings = max_position_embeddings + self.attn_mask_type = attn_mask_type + + self.layer_idx = layer_idx + self.is_decoder = is_decoder + + self.scale_mask_softmax_fusion = scale_mask_softmax_fusion + + self.init_method = init_method + if output_layer_init_method is None: + output_layer_init_method = init_method + self.output_layer_init_method = output_layer_init_method + + self.input_layernorm = RMSLayerNorm( + self.hidden_size, eps=self.rms_norm_eps, layer_idx=self.layer_idx + ) + + self.self_attn = self.build_attention() + self.post_attention_layernorm = RMSLayerNorm( + self.hidden_size, eps=self.rms_norm_eps, layer_idx=self.layer_idx + ) + + self.mlp = MLP( + self.hidden_size, + self.intermediate_size, + self.init_method, + output_layer_init_method=self.output_layer_init_method, + layer_idx=self.layer_idx, + ) + + def forward( + self, + hidden_states, + attention_mask=None, + past_key_value=None, + cos_cached=None, + sin_cached=None, + use_cache=False, + adapter=None, + ): + hidden_states = hidden_states.to_global(placement=dist.get_layer_placement(self.layer_idx)) + + # hidden_states shape: (batch_size, seq_length, 
hidden_size) + if attention_mask is not None: + attention_mask = attention_mask.to_global( + placement=dist.get_layer_placement(self.layer_idx) + ) + + if past_key_value is not None: + if self.is_decoder: + assert len(past_key_value) == 4 + self_attn_past_key_value = past_key_value[:2] + else: + self_attn_past_key_value = past_key_value + else: + self_attn_past_key_value = None + + layernorm_output = self.input_layernorm(hidden_states) + attention_output = self.self_attn( + layernorm_output, + attention_mask=attention_mask, + past_key_value=self_attn_past_key_value, + cos_cached=cos_cached, + sin_cached=sin_cached, + use_cache=use_cache, + adapter=adapter, + ) + + if use_cache: + attention_output, presents = attention_output + + hidden_states = hidden_states + attention_output + + layernorm_output = self.post_attention_layernorm(hidden_states) + + mlp_output = self.mlp(layernorm_output) + + output = hidden_states + mlp_output + + if use_cache: + output = (output, presents) + return output + + def build_attention(self): + return MultiheadAttention( + self.hidden_size, + self.num_attention_heads, + self.max_position_embeddings, + init_method=self.init_method, + output_layer_init_method=self.output_layer_init_method, + scale_mask_softmax_fusion=self.scale_mask_softmax_fusion, + attn_mask_type=self.attn_mask_type, + layer_idx=self.layer_idx, + ) + + +class LlamaModel(nn.Module): + def __init__( + self, + hidden_layers, + vocab_size, + hidden_size, + intermediate_size, + num_attention_heads, + max_position_embeddings=1024, + rms_norm_eps=1e-5, + initializer_range=0.02, + use_scaled_init_for_output_weights=True, + scale_mask_softmax_fusion=False, + amp_enabled=False, + cfg=None, + ): + super().__init__() + self.cfg = cfg + init_method = init_method_normal(sigma=initializer_range) + if use_scaled_init_for_output_weights: + output_layer_init_method = scaled_init_method_normal(initializer_range, hidden_layers) + else: + output_layer_init_method = init_method + + self.embed_tokens = VocabEmbedding( + vocab_size, hidden_size, init_method=init_method, amp_enabled=amp_enabled + ) + self.layers = nn.ModuleList( + [ + LlamaDecoderLayer( + hidden_size, + intermediate_size, + num_attention_heads, + rms_norm_eps=rms_norm_eps, + max_position_embeddings=max_position_embeddings, + init_method=init_method, + output_layer_init_method=output_layer_init_method, + scale_mask_softmax_fusion=scale_mask_softmax_fusion, + attn_mask_type=AttnMaskType.causal, + layer_idx=i, + ) + for i in range(hidden_layers) + ] + ) + self.norm = RMSLayerNorm(hidden_size, eps=rms_norm_eps, layer_idx=-1) + + self.adapter_query = Embedding(cfg.adapter_len * cfg.adapter_layer, hidden_size, amp_enabled=amp_enabled) + + self._set_cos_sin_cache( + rotary_dim=hidden_size // num_attention_heads, + seq_len=max_position_embeddings, + dtype=flow.float32, + layer_idx=0, + ) + + def _set_cos_sin_cache(self, rotary_dim, seq_len, base=10000, dtype=None, layer_idx=0): + position = flow.arange( + 0, + rotary_dim, + 2, + dtype=dtype, + sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]), + placement=dist.get_layer_placement(layer_idx), + ) + inv_freq = 1.0 / (base ** (position / rotary_dim)) + + t = flow.arange( + seq_len, + dtype=inv_freq.dtype, + sbp=inv_freq.sbp, + placement=inv_freq.placement, + ) + + freqs = flow.einsum("i,j->ij", t, inv_freq) + emb = flow.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype)) + self.register_buffer("sin_cached", emb.sin().to(dtype)) + + def forward( + self, + input_ids, + 
attention_mask=None, + past_key_values=None, + use_cache=False, + set_cache=None, + ): + with flow.no_grad(): + if use_cache: + presents = [] + input_ids = input_ids.to_global(placement=dist.get_layer_placement(0)) + hidden_states = self.embed_tokens(input_ids) + + for layer, past_key_value in zip(self.layers[:-self.cfg.adapter_len], past_key_values[:-self.cfg.adapter_len]): + hidden_states = layer( + hidden_states=hidden_states, + attention_mask=attention_mask, + past_key_value=past_key_value, + cos_cached=self.cos_cached, + sin_cached=self.sin_cached, + use_cache=False, + adapter=None, + ) + if use_cache: + hidden_states, present = hidden_states + presents.append(present) + + adapter_index = 0 + # [num_adapter_layer, 1, adapter_len, 4096] + adapter = self.adapter_query.weight.reshape(-1, self.cfg.adapter_len, 4096).unsqueeze(1) + for layer, past_key_value in zip(self.layers[-self.cfg.adapter_len:], past_key_values[-self.cfg.adapter_len:]): + hidden_states = layer( + hidden_states=hidden_states, + attention_mask=attention_mask, + past_key_value=past_key_value, + cos_cached=self.cos_cached, + sin_cached=self.sin_cached, + use_cache=False, + adapter=adapter[adapter_index] # [1, adapter_len, 4096] + ) + adapter_index += 1 + if use_cache: + hidden_states, present = hidden_states + presents.append(present) + + hidden_states = self.norm(hidden_states) + + if use_cache: + set_cache(presents) + + return hidden_states + + +class CrossEntropyLoss(nn.Module): + def forward(self, logits: flow.Tensor, target: flow.Tensor): + assert logits.ndim == 3 + assert target.ndim == 2 + assert logits.shape[0:2] == target.shape + + target = target.to_global(placement=logits.placement) + target = target * (target >= 0) + + lm_loss = flow._C.cross_entropy( + logits.view(-1, logits.shape[-1]), + target.view(-1), + ignore_index=0 + ) + return lm_loss + + +class SFTLoss(nn.Module): + def __init__(self) -> None: + super().__init__() + self.lm_loss = CrossEntropyLoss() + + def forward(self, logits, lm_labels): + lm_loss = self.lm_loss(logits, lm_labels) + lm_loss = lm_loss.mean() + return {"lm_loss": lm_loss} + + +class LlamaForCausalLM(nn.Module, Generator): + @configurable + def __init__( + self, + hidden_layers, + vocab_size, + hidden_size, + intermediate_size, + num_attention_heads, + max_position_embeddings=1024, + rms_norm_eps=1e-5, + initializer_range=0.02, + use_scaled_init_for_output_weights=True, + scale_mask_softmax_fusion=False, + amp_enabled=False, + cfg=None, + ): + super().__init__() + self.cfg = cfg + self.model = LlamaModel( + hidden_layers=hidden_layers, + vocab_size=vocab_size, + hidden_size=hidden_size, + intermediate_size=intermediate_size, + num_attention_heads=num_attention_heads, + max_position_embeddings=max_position_embeddings, + rms_norm_eps=rms_norm_eps, + initializer_range=initializer_range, + use_scaled_init_for_output_weights=use_scaled_init_for_output_weights, + scale_mask_softmax_fusion=scale_mask_softmax_fusion, + amp_enabled=amp_enabled, + cfg=cfg, + ) + self.casual_mask = CasualMask(max_position_embeddings, layer_idx=0) + self.lm_head = Linear(hidden_size, vocab_size, bias=False, layer_idx=-1) + self.loss_func = SFTLoss() + + self.past_key_values = [None] * hidden_layers + self.past_length = 0 + + def forward(self, input_ids, attention_mask=None, labels=None, use_cache=False): + input_ids = input_ids.to_global(placement=dist.get_layer_placement(0)) + attention_mask = ( + attention_mask.to_global(placement=dist.get_layer_placement(0)) + if attention_mask is not None + else 
attention_mask + ) + labels = ( + labels.to_global(placement=dist.get_layer_placement(0)) + if labels is not None + else labels + ) + + if use_cache and self.past_key_values[0] is not None: + self.past_length = self.past_key_values[0][0].size(-2) + else: + self.past_length = 0 + + mask = self.casual_mask( + input_ids, + past_length=self.past_length, + attention_mask=attention_mask, + input_dtype=self.lm_head.weight.dtype, + ) + + output = self.model( + input_ids, + attention_mask=mask, + past_key_values=self.past_key_values, + use_cache=use_cache, + set_cache=self.set_cache, + ) + + logits = self.lm_head(output) + + if labels is not None: + lm_loss = self.loss_func(logits, labels) + return lm_loss + else: + return {"logits": logits} + + def set_cache(self, past_key_values): + self.past_length = 0 if past_key_values is None else past_key_values[0][0].shape[2] + + if past_key_values is None: + past_key_values = [None] * self.cfg.hidden_layers + + assert len(past_key_values) == self.cfg.hidden_layers, ( + f"past_key_values's length {len(past_key_values)} doesn't match " + f"num_layers:' {self.cfg.hidden_layers}" + ) + + def prepare_inputs_for_generation(self, input_ids: flow.Tensor, **kwargs): + if "attention_mask" in kwargs: + attention_mask = kwargs.pop("attention_mask").float() + attention_mask = attention_mask - 1 + attention_mask.masked_fill_(attention_mask == -1, flow.finfo(flow.float32).min) + return {"input_ids": input_ids, "attention_mask": attention_mask} + + @classmethod + def from_config(cls, cfg): + return { + "hidden_layers": cfg.hidden_layers, + "vocab_size": cfg.vocab_size, + "hidden_size": cfg.hidden_size, + "intermediate_size": cfg.intermediate_size, + "num_attention_heads": cfg.num_attention_heads, + "max_position_embeddings": cfg.max_position_embeddings, + "rms_norm_eps": cfg.rms_norm_eps, + "initializer_range": cfg.initializer_range, + "use_scaled_init_for_output_weights": cfg.use_scaled_init_for_output_weights, + "scale_mask_softmax_fusion": cfg.scale_mask_softmax_fusion, + "amp_enabled": cfg.amp_enabled, + "cfg": cfg, + } + + @staticmethod + def set_activation_checkpoint(model): + for module_block in model.modules(): + # Old API in OneFlow 0.8 + if hasattr(module_block, "origin"): + if isinstance(module_block.origin, LlamaDecoderLayer): + module_block.config.activation_checkpointing = True + else: + if isinstance(module_block.to(nn.Module), LlamaDecoderLayer): + module_block.to(nn.graph.GraphModule).activation_checkpointing = True diff --git a/projects/Llama/adapter/adapter_sft.py b/projects/Llama/adapter/adapter_sft.py new file mode 100644 index 000000000..1c5ad5822 --- /dev/null +++ b/projects/Llama/adapter/adapter_sft.py @@ -0,0 +1,105 @@ +import os +from omegaconf import OmegaConf + +from libai.config import LazyCall +from libai.evaluation import PPLEvaluator +from libai.scheduler import WarmupExponentialLR +from libai.data.build import build_nlp_test_loader, build_nlp_train_loader + +from configs.common.train import train +from configs.common.models.graph import graph +from configs.common.optim import optim + +from projects.Llama.adapter.adapter_config import cfg +from projects.Llama.adapter.dataset import AlpacaDataset +from projects.Llama.tokenizer import LlamaTokenizer +from projects.Llama.adapter.adapter_model import LlamaForCausalLM + + +# Hyperparameters +weight_decay = 0.1 +learning_rate = 2e-5 +max_input_length = 512 +dataset_path = "/data/home/xiezipeng/datasets/alpaca_data.json" +pretrained_model_path = 
"/data/home/xiezipeng/hf_models/meta-llama/Llama-2-7b-hf/" + +# graph & optim +graph["enabled"] = False +optim.update( + dict( + lr=learning_rate, + weight_decay=weight_decay, + ) +) + +# tokenize +tokenization = OmegaConf.create() +tokenization.make_vocab_size_divisible_by = 1 +tokenization.tokenizer = LazyCall(LlamaTokenizer)( + pretrained_model_path=os.path.join(pretrained_model_path, "tokenizer.model") +) + +# model +cfg.use_cache = False +model = LazyCall(LlamaForCausalLM)(cfg=cfg) + +# datasets +dataloader = OmegaConf.create() +dataloader.train = LazyCall(build_nlp_train_loader)( + dataset=[ + LazyCall(AlpacaDataset)( + path=dataset_path, + tokenizer=tokenization.tokenizer, + max_len=max_input_length, + partition="train" + ) + ], +) +dataloader.test = [ + LazyCall(build_nlp_test_loader)( + dataset=LazyCall(AlpacaDataset)( + path=dataset_path, + tokenizer=tokenization.tokenizer, + max_len=max_input_length, + partition="test" + ), + ), +] + + +train.update( + dict( + output_dir="./sft_result", + train_micro_batch_size=8, + test_micro_batch_size=1, + train_epoch=5, + train_iter=1, + log_period=10, + warmup_ratio=2 / 5, + num_accumulation_steps=8, + rdma_enabled=False, + amp=dict(enabled=True), + activation_checkpoint=dict(enabled=True), + checkpointer=dict( + period=5000, + max_to_keep=20, + ), + dist=dict( + data_parallel_size=1, + tensor_parallel_size=1, + pipeline_parallel_size=8, + pipeline_num_layers=cfg.hidden_layers, + ), + evaluation=dict( + enabled=True, + evaluator=LazyCall(PPLEvaluator)(), + eval_period=100, + eval_iter=100, + ), + scheduler=LazyCall(WarmupExponentialLR)( + warmup_factor=0.0, + gamma=1.0, + warmup_method="linear", + ), + ) +) diff --git a/projects/Llama/adapter/dataset.py b/projects/Llama/adapter/dataset.py new file mode 100644 index 000000000..bbf31240e --- /dev/null +++ b/projects/Llama/adapter/dataset.py @@ -0,0 +1,65 @@ + + +import copy +import json + +import oneflow as flow +from oneflow.utils.data import Dataset + +from libai.data.structures import DistTensorData, Instance + + +PROMPT_DICT = { + "prompt_input": ( + "Below is an instruction that describes a task, paired with an input that provides further context. " + "Write a response that appropriately completes the request.\n\n" + "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:" + ), + "prompt_no_input": ( + "Below is an instruction that describes a task. 
" + "Write a response that appropriately completes the request.\n\n" + "### Instruction:\n{instruction}\n\n### Response:" + ), +} + +class AlpacaDataset(Dataset): + def __init__(self, path, tokenizer, max_len=513, partition="train"): + self.data = json.load(open(path)) + self.tokenizer = tokenizer + self.max_len = max_len + if partition == "train": + self.data = self.data + else: + self.data = self.data[:200] + + def __len__(self): + return len(self.data) + + def __getitem__(self, index): + data = self.data[index] + if data.get("input", "") == "": + prompt = PROMPT_DICT["prompt_no_input"].format_map(data) + else: + prompt = PROMPT_DICT["prompt_input"].format_map(data) + example = prompt + data["output"] + prompt = self.tokenizer.tokenize(prompt, add_bos=True, add_eos=False, device="cpu")[0] + example = self.tokenizer.tokenize(example, add_bos=True, add_eos=True, device="cpu")[0] + padding = self.max_len - example.shape[0] + if padding > 0: + example = flow.cat((example, flow.zeros(padding, dtype=flow.long) - 1)) + elif padding < 0: + example = example[: self.max_len] + labels = copy.deepcopy(example) + labels[: len(prompt)] = -1 + example_mask = example.ge(0) + label_mask = labels.ge(0) + example[~example_mask] = 0 + labels[~label_mask] = -1 + example = example[:-1] + labels = labels[1:] + example_mask = flow.where(example_mask, flow.tensor(0, dtype=flow.float), flow.tensor(-float('inf'))) + example_mask = example_mask[:-1] + return Instance( + input_ids=DistTensorData(example), + labels=DistTensorData(labels), + ) diff --git a/projects/Llama/adapter/train_net.py b/projects/Llama/adapter/train_net.py new file mode 100644 index 000000000..0c44dff92 --- /dev/null +++ b/projects/Llama/adapter/train_net.py @@ -0,0 +1,115 @@ +# coding=utf-8 +# Copyright 2021 The OneFlow Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import os +import random +import sys + +import numpy as np +import oneflow as flow + +import libai.utils.distributed as dist +from libai.config import LazyConfig, default_argument_parser, try_get_key +from libai.engine import DefaultTrainer, default_setup +from libai.utils.checkpoint import Checkpointer +from projects.Llama.utils.llama_loader import LlamaLoaderHuggerFace + +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir))) + +logger = logging.getLogger("libai." 
+ __name__) + + +def build_model(cfg): + model_loader = LlamaLoaderHuggerFace( + cfg, + cfg.cfg, + cfg.cfg.pretrained_model_path, + ) + model = model_loader.load() + + for name, param in model.named_parameters(): + if "adapter" not in name: + param.requires_grad = False + else: + param.requires_grad = True + param.data = param.data.float() + + for name, param in model.model.layers[-cfg.cfg.adapter_layer : ].named_parameters(): + if "gate" in name or "adapter" in name: + param.data = param.data.float() + param.requires_grad = True + + return model + + +class LlamaTrainer(DefaultTrainer): + @classmethod + def build_model(cls, cfg): + assert try_get_key(cfg, "model") is not None, "cfg must contain `model` namespace" + # Set model fp16 option because of embedding layer `white_identity` manual + # insert for amp training if provided. + if try_get_key(cfg.model, "cfg.amp_enabled") is not None: + cfg.model.cfg.amp_enabled = cfg.train.amp.enabled and cfg.graph.enabled + # In case some model define without cfg keyword. + elif try_get_key(cfg.model, "amp_enabled") is not None: + cfg.model.amp_enabled = cfg.train.amp.enabled and cfg.graph.enabled + model = build_model(cfg.model) + logger = logging.getLogger(__name__) + logger.info("Model:\n{}".format(model)) + model._apply(dist.convert_to_distributed_default_setting) + return model + + +def main(args): + cfg = LazyConfig.load(args.config_file) + cfg = LazyConfig.apply_overrides(cfg, args.opts) + default_setup(cfg, args) + + seed_for_rank = cfg.train.seed + flow.env.get_rank() + flow.manual_seed(seed_for_rank) + flow.cuda.manual_seed(seed_for_rank) + np.random.seed(seed_for_rank) + random.seed(seed_for_rank) + + if args.fast_dev_run: + cfg.train.train_epoch = 0 + cfg.train.train_iter = 20 + cfg.train.evaluation.eval_period = 10 + cfg.train.log_period = 1 + + if args.eval_only: + tokenizer = None + if try_get_key(cfg, "tokenization") is not None: + tokenizer = DefaultTrainer.build_tokenizer(cfg) + model = DefaultTrainer.build_model(cfg) + Checkpointer(model, save_dir=cfg.train.output_dir).resume_or_load( + cfg.train.load_weight, resume=args.resume + ) + if try_get_key(cfg, "graph.enabled", default=False): + model = DefaultTrainer.build_graph(cfg, model, is_train=False) + test_loader = DefaultTrainer.build_test_loader(cfg, tokenizer) + if len(test_loader) == 0: + logger.info("No dataset in dataloader.test, please set dataset for dataloader.test") + _ = DefaultTrainer.test(cfg, test_loader, model) + return + + trainer = LlamaTrainer(cfg) + return trainer.train() + + +if __name__ == "__main__": + args = default_argument_parser().parse_args() + main(args) diff --git a/projects/Llama/configs/llama_config.py b/projects/Llama/configs/llama_config.py index 715b96dd3..2fb3ba4de 100644 --- a/projects/Llama/configs/llama_config.py +++ b/projects/Llama/configs/llama_config.py @@ -15,7 +15,6 @@ max_position_embeddings=4096, num_attention_heads=32, hidden_layers=32, - num_key_value_heads=32, pretraining_tp=1, rms_norm_eps=1e-05, rope_scaling=None, diff --git a/projects/Llama/dataset.py b/projects/Llama/dataset.py index b500998b1..f10c04c53 100644 --- a/projects/Llama/dataset.py +++ b/projects/Llama/dataset.py @@ -1,17 +1,4 @@ -# coding=utf-8 -# Copyright 2021 The OneFlow Authors. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +"""Implementation derived from https://github.com/OpenGVLab/LLaMA-Adapter""" import random diff --git a/projects/Llama/llama.py b/projects/Llama/llama.py index c1cf44c81..8a07ccdae 100644 --- a/projects/Llama/llama.py +++ b/projects/Llama/llama.py @@ -393,7 +393,6 @@ def __init__( hidden_size, intermediate_size, num_attention_heads, - num_key_value_heads, max_position_embeddings=1024, rms_norm_eps=1e-5, initializer_range=0.02, @@ -494,10 +493,27 @@ def forward( return hidden_states +class CrossEntropyLoss(nn.Module): + def forward(self, logits: flow.Tensor, target: flow.Tensor): + assert logits.ndim == 3 + assert target.ndim == 2 + assert logits.shape[0:2] == target.shape + + target = target.to_global(placement=logits.placement) + target = target * (target >= 0) + + lm_loss = flow._C.cross_entropy( + logits.view(-1, logits.shape[-1]), + target.view(-1), + ignore_index=0 + ) + return lm_loss + + class SFTLoss(nn.Module): def __init__(self) -> None: super().__init__() - self.lm_loss = ParallelCrossEntropyLoss() + self.lm_loss = CrossEntropyLoss() def forward(self, logits, lm_labels): lm_loss = self.lm_loss(logits, lm_labels) @@ -514,7 +530,6 @@ def __init__( hidden_size, intermediate_size, num_attention_heads, - num_key_value_heads, max_position_embeddings=1024, rms_norm_eps=1e-5, initializer_range=0.02, @@ -531,7 +546,6 @@ def __init__( hidden_size=hidden_size, intermediate_size=intermediate_size, num_attention_heads=num_attention_heads, - num_key_value_heads=num_key_value_heads, max_position_embeddings=max_position_embeddings, rms_norm_eps=rms_norm_eps, initializer_range=initializer_range, @@ -613,7 +627,6 @@ def from_config(cls, cfg): "hidden_size": cfg.hidden_size, "intermediate_size": cfg.intermediate_size, "num_attention_heads": cfg.num_attention_heads, - "num_key_value_heads": cfg.num_key_value_heads, "max_position_embeddings": cfg.max_position_embeddings, "rms_norm_eps": cfg.rms_norm_eps, "initializer_range": cfg.initializer_range, diff --git a/projects/Llama/utils/llama_loader.py b/projects/Llama/utils/llama_loader.py index 59b46343a..20b9ba258 100644 --- a/projects/Llama/utils/llama_loader.py +++ b/projects/Llama/utils/llama_loader.py @@ -43,8 +43,6 @@ def _convert_state_dict(self, flow_state_dict, cfg): # Get configs num_attention_heads = cfg.get("num_attention_heads") - num_key_value_heads = cfg.get("num_key_value_heads") - assert num_attention_heads == num_key_value_heads hidden_size = cfg.get("hidden_size") head_size = int(hidden_size // num_attention_heads) @@ -83,7 +81,6 @@ def _load_config_from_json(self, config_file): self._update_cfg("hidden_layers", cfg_dict["num_hidden_layers"]) self._update_cfg("hidden_size", cfg_dict["hidden_size"]) self._update_cfg("num_attention_heads", cfg_dict["num_attention_heads"]) - self._update_cfg("num_key_value_heads", cfg_dict["num_key_value_heads"]) self._update_cfg("max_position_embeddings", cfg_dict["max_position_embeddings"]) self._update_cfg("intermediate_size", cfg_dict["intermediate_size"]) self._update_cfg("rms_norm_eps", cfg_dict["rms_norm_eps"]) From df37b00762944ea76b019e389359d1777f555501 Mon Sep 17 00:00:00 2001 From: 
xiezipeng-ML Date: Fri, 5 Jan 2024 10:35:16 +0800 Subject: [PATCH 2/9] refine --- projects/Llama/adapter/adapter_model.py | 4 ++-- projects/Llama/configs/llama_config.py | 4 ++-- projects/Llama/configs/llama_sft.py | 6 +++--- projects/Llama/utils/prepare_alpaca.py | 4 ++-- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/projects/Llama/adapter/adapter_model.py b/projects/Llama/adapter/adapter_model.py index 0c7982be9..39743f6e6 100644 --- a/projects/Llama/adapter/adapter_model.py +++ b/projects/Llama/adapter/adapter_model.py @@ -521,7 +521,7 @@ def forward( input_ids = input_ids.to_global(placement=dist.get_layer_placement(0)) hidden_states = self.embed_tokens(input_ids) - for layer, past_key_value in zip(self.layers[:-self.cfg.adapter_len], past_key_values[:-self.cfg.adapter_len]): + for layer, past_key_value in zip(self.layers[:-self.cfg.adapter_layer], past_key_values[:-self.cfg.adapter_layer]): hidden_states = layer( hidden_states=hidden_states, attention_mask=attention_mask, @@ -538,7 +538,7 @@ def forward( adapter_index = 0 # [num_adapter_layer, 1, adapter_len, 4096] adapter = self.adapter_query.weight.reshape(-1, self.cfg.adapter_len, 4096).unsqueeze(1) - for layer, past_key_value in zip(self.layers[-self.cfg.adapter_len:], past_key_values[-self.cfg.adapter_len:]): + for layer, past_key_value in zip(self.layers[-self.cfg.adapter_layer:], past_key_values[-self.cfg.adapter_layer:]): hidden_states = layer( hidden_states=hidden_states, attention_mask=attention_mask, diff --git a/projects/Llama/configs/llama_config.py b/projects/Llama/configs/llama_config.py index 2fb3ba4de..fb91625a7 100644 --- a/projects/Llama/configs/llama_config.py +++ b/projects/Llama/configs/llama_config.py @@ -48,7 +48,7 @@ eos_token_id=2, pad_token_id=0, # train - pretrained_model_path="meta-llama/Llama-2-7b-hf", + pretrained_model_path="/data/home/xiezipeng/hf_models/meta-llama/Llama-2-7b-hf/", ) cfg = DictConfig(cfg) @@ -57,5 +57,5 @@ tokenization = OmegaConf.create() tokenization.make_vocab_size_divisible_by = 1 tokenization.tokenizer = LazyCall(LlamaTokenizer)( - pretrained_model_path="Llama-2-7b-hf/tokenizer.model" + pretrained_model_path="/data/home/xiezipeng/hf_models/meta-llama/Llama-2-7b-hf/tokenizer.model" ) diff --git a/projects/Llama/configs/llama_sft.py b/projects/Llama/configs/llama_sft.py index 322c88c1a..f3e6806af 100644 --- a/projects/Llama/configs/llama_sft.py +++ b/projects/Llama/configs/llama_sft.py @@ -20,8 +20,8 @@ weight_decay = 0.1 learning_rate = 2e-5 max_input_length = 1350 -dataset_path = "alpaca_data" -pretrained_model_path = "meta-llama/Llama-2-7b-hf" +dataset_path = "/data/home/xiezipeng/libai/projects/Llama/alpaca_data" +pretrained_model_path = "/data/home/xiezipeng/hf_models/meta-llama/Llama-2-7b-hf/" # graph & optim graph["enabled"] = True @@ -72,7 +72,7 @@ train_epoch=5, train_iter=1, log_period=10, - warmup_ratio=2 / 5, + warmup_ratio=0, num_accumulation_steps=8, rdma_enabled=True, amp=dict(enabled=True), diff --git a/projects/Llama/utils/prepare_alpaca.py b/projects/Llama/utils/prepare_alpaca.py index ce2fc92b1..2efa51266 100644 --- a/projects/Llama/utils/prepare_alpaca.py +++ b/projects/Llama/utils/prepare_alpaca.py @@ -18,8 +18,8 @@ def prepare( - destination_path: Path = Path("/alpaca_data"), - checkpoint_dir: Path = Path("/Llama-2-7b-hf"), + destination_path: Path = Path("/data/home/xiezipeng/libai/projects/Llama/alpaca_data"), + checkpoint_dir: Path = Path("/data/home/xiezipeng/hf_models/meta-llama/Llama-2-7b-hf/"), test_split_fraction: float = 0.03865, # to 
get exactly 2000 test samples, seed: int = 42, mask_inputs: bool = False, # as in alpaca-lora From a96cf21cd60ad567b83eb439c84276e4e74f8ea7 Mon Sep 17 00:00:00 2001 From: xiezipeng-ML Date: Fri, 5 Jan 2024 10:36:47 +0800 Subject: [PATCH 3/9] reformat --- projects/Llama/adapter/adapter_config.py | 3 +- projects/Llama/adapter/adapter_model.py | 51 +++++++++++++++--------- projects/Llama/adapter/adapter_sft.py | 18 ++++----- projects/Llama/adapter/dataset.py | 8 ++-- projects/Llama/adapter/train_net.py | 4 +- projects/Llama/llama.py | 4 +- 6 files changed, 48 insertions(+), 40 deletions(-) diff --git a/projects/Llama/adapter/adapter_config.py b/projects/Llama/adapter/adapter_config.py index 9732fad47..37d67c741 100644 --- a/projects/Llama/adapter/adapter_config.py +++ b/projects/Llama/adapter/adapter_config.py @@ -1,10 +1,9 @@ from omegaconf import DictConfig, OmegaConf +from configs.common.train import train from libai.config import LazyCall from projects.Llama.adapter.adapter_model import LlamaForCausalLM from projects.Llama.tokenizer import LlamaTokenizer -from configs.common.train import train - cfg = dict( # Model diff --git a/projects/Llama/adapter/adapter_model.py b/projects/Llama/adapter/adapter_model.py index 39743f6e6..78a72b4fe 100644 --- a/projects/Llama/adapter/adapter_model.py +++ b/projects/Llama/adapter/adapter_model.py @@ -17,12 +17,12 @@ from typing import Tuple import oneflow as flow -from oneflow import nn import oneflow.nn.functional as F +from oneflow import nn from libai.config import configurable from libai.inference.generator.generation_utils import Generator -from libai.layers import Linear, RMSLayerNorm, VocabEmbedding, Embedding +from libai.layers import Embedding, Linear, RMSLayerNorm, VocabEmbedding from libai.layers.attention import AttnMaskType from libai.models.utils import init_method_normal, scaled_init_method_normal from libai.utils import distributed as dist @@ -168,7 +168,10 @@ def __init__( self.gate = flow.nn.Parameter( flow.zeros( - 1, self.num_heads, 1, 1, + 1, + self.num_heads, + 1, + 1, placement=dist.get_layer_placement(layer_idx), sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]), ) @@ -184,7 +187,7 @@ def forward( cos_cached: flow.Tensor = None, sin_cached: flow.Tensor = None, use_cache: bool = False, - adapter = None, + adapter=None, ): if encoder_states is not None: encoder_states = encoder_states.to_global(placement=hidden_states.placement) @@ -217,18 +220,19 @@ def forward( adapter_len = adapter.shape[1] adapter_qkv = self.query_key_value(adapter) adapter_qkv = adapter_qkv.view(1, -1, self.num_heads, 3 * self.head_size) - adapter_qkv = adapter_qkv.permute( - 0, 2, 1, 3 - ) # [1, num_heads, src_len, 3 * head_size] + adapter_qkv = adapter_qkv.permute(0, 2, 1, 3) # [1, num_heads, src_len, 3 * head_size] _, adapter_key, adapter_value = flow.chunk(adapter_qkv, chunks=3, dim=-1) adapter_key = adapter_key.repeat(bsz, 1, 1, 1) adapter_value = adapter_value.repeat(bsz, 1, 1, 1) key = flow.cat([adapter_key, key], dim=2) value = flow.cat([adapter_value, value], dim=2) extra_mask = flow.zeros( - bsz, 1, tgt_len, adapter_len, - sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]), - placement=attention_mask.placement + bsz, + 1, + tgt_len, + adapter_len, + sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]), + placement=attention_mask.placement, ) attention_mask = flow.cat([extra_mask, attention_mask], dim=-1) @@ -248,8 +252,13 @@ def forward( if adapter is not None: attention_weights = flow.cat( [ - self.gate.tanh().half() 
* F.softmax(attention_weights[:, :, :, :adapter_len].float(), dim=-1).to(query.dtype), - F.softmax(attention_weights[:, :, :, adapter_len:].float(), dim=-1).to(query.dtype), + self.gate.tanh().half() + * F.softmax(attention_weights[:, :, :, :adapter_len].float(), dim=-1).to( + query.dtype + ), + F.softmax(attention_weights[:, :, :, adapter_len:].float(), dim=-1).to( + query.dtype + ), ], dim=-1, ) @@ -475,7 +484,9 @@ def __init__( ) self.norm = RMSLayerNorm(hidden_size, eps=rms_norm_eps, layer_idx=-1) - self.adapter_query = Embedding(cfg.adapter_len * cfg.adapter_layer, hidden_size, amp_enabled=amp_enabled) + self.adapter_query = Embedding( + cfg.adapter_len * cfg.adapter_layer, hidden_size, amp_enabled=amp_enabled + ) self._set_cos_sin_cache( rotary_dim=hidden_size // num_attention_heads, @@ -521,7 +532,9 @@ def forward( input_ids = input_ids.to_global(placement=dist.get_layer_placement(0)) hidden_states = self.embed_tokens(input_ids) - for layer, past_key_value in zip(self.layers[:-self.cfg.adapter_layer], past_key_values[:-self.cfg.adapter_layer]): + for layer, past_key_value in zip( + self.layers[: -self.cfg.adapter_layer], past_key_values[: -self.cfg.adapter_layer] + ): hidden_states = layer( hidden_states=hidden_states, attention_mask=attention_mask, @@ -538,7 +551,9 @@ def forward( adapter_index = 0 # [num_adapter_layer, 1, adapter_len, 4096] adapter = self.adapter_query.weight.reshape(-1, self.cfg.adapter_len, 4096).unsqueeze(1) - for layer, past_key_value in zip(self.layers[-self.cfg.adapter_layer:], past_key_values[-self.cfg.adapter_layer:]): + for layer, past_key_value in zip( + self.layers[-self.cfg.adapter_layer :], past_key_values[-self.cfg.adapter_layer :] + ): hidden_states = layer( hidden_states=hidden_states, attention_mask=attention_mask, @@ -546,7 +561,7 @@ def forward( cos_cached=self.cos_cached, sin_cached=self.sin_cached, use_cache=False, - adapter=adapter[adapter_index] # [1, adapter_len, 4096] + adapter=adapter[adapter_index], # [1, adapter_len, 4096] ) adapter_index += 1 if use_cache: @@ -571,9 +586,7 @@ def forward(self, logits: flow.Tensor, target: flow.Tensor): target = target * (target >= 0) lm_loss = flow._C.cross_entropy( - logits.view(-1, logits.shape[-1]), - target.view(-1), - ignore_index=0 + logits.view(-1, logits.shape[-1]), target.view(-1), ignore_index=0 ) return lm_loss diff --git a/projects/Llama/adapter/adapter_sft.py b/projects/Llama/adapter/adapter_sft.py index 1c5ad5822..d91b79f58 100644 --- a/projects/Llama/adapter/adapter_sft.py +++ b/projects/Llama/adapter/adapter_sft.py @@ -1,20 +1,18 @@ import os + from omegaconf import OmegaConf +from configs.common.models.graph import graph +from configs.common.optim import optim +from configs.common.train import train from libai.config import LazyCall +from libai.data.build import build_nlp_test_loader, build_nlp_train_loader from libai.evaluation import PPLEvaluator from libai.scheduler import WarmupExponentialLR -from libai.data.build import build_nlp_test_loader, build_nlp_train_loader - -from configs.common.train import train -from configs.common.models.graph import graph -from configs.common.optim import optim - from projects.Llama.adapter.adapter_config import cfg +from projects.Llama.adapter.adapter_model import LlamaForCausalLM from projects.Llama.adapter.dataset import AlpacaDataset from projects.Llama.tokenizer import LlamaTokenizer -from projects.Llama.adapter.adapter_model import LlamaForCausalLM - # Hyperparameters weight_decay = 0.1 @@ -51,7 +49,7 @@ path=dataset_path, 
tokenizer=tokenization.tokenizer, max_len=max_input_length, - partition="train" + partition="train", ) ], ) @@ -61,7 +59,7 @@ path=dataset_path, tokenizer=tokenization.tokenizer, max_len=max_input_length, - partition="test" + partition="test", ), ), ] diff --git a/projects/Llama/adapter/dataset.py b/projects/Llama/adapter/dataset.py index bbf31240e..701a98556 100644 --- a/projects/Llama/adapter/dataset.py +++ b/projects/Llama/adapter/dataset.py @@ -1,5 +1,3 @@ - - import copy import json @@ -8,7 +6,6 @@ from libai.data.structures import DistTensorData, Instance - PROMPT_DICT = { "prompt_input": ( "Below is an instruction that describes a task, paired with an input that provides further context. " @@ -22,6 +19,7 @@ ), } + class AlpacaDataset(Dataset): def __init__(self, path, tokenizer, max_len=513, partition="train"): self.data = json.load(open(path)) @@ -57,7 +55,9 @@ def __getitem__(self, index): labels[~label_mask] = -1 example = example[:-1] labels = labels[1:] - example_mask = flow.where(example_mask, flow.tensor(0, dtype=flow.float), flow.tensor(-float('inf'))) + example_mask = flow.where( + example_mask, flow.tensor(0, dtype=flow.float), flow.tensor(-float("inf")) + ) example_mask = example_mask[:-1] return Instance( input_ids=DistTensorData(example), diff --git a/projects/Llama/adapter/train_net.py b/projects/Llama/adapter/train_net.py index 0c44dff92..327b95e6f 100644 --- a/projects/Llama/adapter/train_net.py +++ b/projects/Llama/adapter/train_net.py @@ -39,7 +39,7 @@ def build_model(cfg): cfg.cfg.pretrained_model_path, ) model = model_loader.load() - + for name, param in model.named_parameters(): if "adapter" not in name: param.requires_grad = False @@ -47,7 +47,7 @@ def build_model(cfg): param.requires_grad = True param.data = param.data.float() - for name, param in model.model.layers[-cfg.cfg.adapter_layer : ].named_parameters(): + for name, param in model.model.layers[-cfg.cfg.adapter_layer :].named_parameters(): if "gate" in name or "adapter" in name: param.data = param.data.float() param.requires_grad = True diff --git a/projects/Llama/llama.py b/projects/Llama/llama.py index 8a07ccdae..83f56b731 100644 --- a/projects/Llama/llama.py +++ b/projects/Llama/llama.py @@ -503,9 +503,7 @@ def forward(self, logits: flow.Tensor, target: flow.Tensor): target = target * (target >= 0) lm_loss = flow._C.cross_entropy( - logits.view(-1, logits.shape[-1]), - target.view(-1), - ignore_index=0 + logits.view(-1, logits.shape[-1]), target.view(-1), ignore_index=0 ) return lm_loss From 66af63c95cc87420920d1eadd393bcf2960f0ea2 Mon Sep 17 00:00:00 2001 From: xiezipeng-ML Date: Tue, 9 Jan 2024 13:14:17 +0800 Subject: [PATCH 4/9] refine --- projects/Llama/adapter/adapter_sft.py | 2 +- projects/Llama/adapter/dataset.py | 65 ---------- projects/Llama/configs/llama_config.py | 4 +- projects/Llama/configs/llama_sft.py | 34 ++--- projects/Llama/dataset.py | 60 ++++++--- projects/Llama/readme.md | 8 +- projects/Llama/utils/eval_adapter.py | 2 +- projects/Llama/utils/prepare_alpaca.py | 164 ------------------------- 8 files changed, 69 insertions(+), 270 deletions(-) delete mode 100644 projects/Llama/adapter/dataset.py delete mode 100644 projects/Llama/utils/prepare_alpaca.py diff --git a/projects/Llama/adapter/adapter_sft.py b/projects/Llama/adapter/adapter_sft.py index d91b79f58..650f606c3 100644 --- a/projects/Llama/adapter/adapter_sft.py +++ b/projects/Llama/adapter/adapter_sft.py @@ -11,7 +11,7 @@ from libai.scheduler import WarmupExponentialLR from projects.Llama.adapter.adapter_config import cfg 
from projects.Llama.adapter.adapter_model import LlamaForCausalLM -from projects.Llama.adapter.dataset import AlpacaDataset +from projects.Llama.dataset import AlpacaDataset from projects.Llama.tokenizer import LlamaTokenizer # Hyperparameters diff --git a/projects/Llama/adapter/dataset.py b/projects/Llama/adapter/dataset.py deleted file mode 100644 index 701a98556..000000000 --- a/projects/Llama/adapter/dataset.py +++ /dev/null @@ -1,65 +0,0 @@ -import copy -import json - -import oneflow as flow -from oneflow.utils.data import Dataset - -from libai.data.structures import DistTensorData, Instance - -PROMPT_DICT = { - "prompt_input": ( - "Below is an instruction that describes a task, paired with an input that provides further context. " - "Write a response that appropriately completes the request.\n\n" - "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:" - ), - "prompt_no_input": ( - "Below is an instruction that describes a task. " - "Write a response that appropriately completes the request.\n\n" - "### Instruction:\n{instruction}\n\n### Response:" - ), -} - - -class AlpacaDataset(Dataset): - def __init__(self, path, tokenizer, max_len=513, partition="train"): - self.data = json.load(open(path)) - self.tokenizer = tokenizer - self.max_len = max_len - if partition == "train": - self.data = self.data - else: - self.data = self.data[:200] - - def __len__(self): - return len(self.data) - - def __getitem__(self, index): - data = self.data[index] - if data.get("input", "") == "": - prompt = PROMPT_DICT["prompt_no_input"].format_map(data) - else: - prompt = PROMPT_DICT["prompt_input"].format_map(data) - example = prompt + data["output"] - prompt = self.tokenizer.tokenize(prompt, add_bos=True, add_eos=False, device="cpu")[0] - example = self.tokenizer.tokenize(example, add_bos=True, add_eos=True, device="cpu")[0] - padding = self.max_len - example.shape[0] - if padding > 0: - example = flow.cat((example, flow.zeros(padding, dtype=flow.long) - 1)) - elif padding < 0: - example = example[: self.max_len] - labels = copy.deepcopy(example) - labels[: len(prompt)] = -1 - example_mask = example.ge(0) - label_mask = labels.ge(0) - example[~example_mask] = 0 - labels[~label_mask] = -1 - example = example[:-1] - labels = labels[1:] - example_mask = flow.where( - example_mask, flow.tensor(0, dtype=flow.float), flow.tensor(-float("inf")) - ) - example_mask = example_mask[:-1] - return Instance( - input_ids=DistTensorData(example), - labels=DistTensorData(labels), - ) diff --git a/projects/Llama/configs/llama_config.py b/projects/Llama/configs/llama_config.py index fb91625a7..bb810c53f 100644 --- a/projects/Llama/configs/llama_config.py +++ b/projects/Llama/configs/llama_config.py @@ -48,7 +48,7 @@ eos_token_id=2, pad_token_id=0, # train - pretrained_model_path="/data/home/xiezipeng/hf_models/meta-llama/Llama-2-7b-hf/", + pretrained_model_path="Llama-2-7b-hf", ) cfg = DictConfig(cfg) @@ -57,5 +57,5 @@ tokenization = OmegaConf.create() tokenization.make_vocab_size_divisible_by = 1 tokenization.tokenizer = LazyCall(LlamaTokenizer)( - pretrained_model_path="/data/home/xiezipeng/hf_models/meta-llama/Llama-2-7b-hf/tokenizer.model" + pretrained_model_path="Llama-2-7b-hf/tokenizer.model" ) diff --git a/projects/Llama/configs/llama_sft.py b/projects/Llama/configs/llama_sft.py index f3e6806af..14ecdc281 100644 --- a/projects/Llama/configs/llama_sft.py +++ b/projects/Llama/configs/llama_sft.py @@ -18,13 +18,13 @@ # Hyperparameters weight_decay = 0.1 -learning_rate = 2e-5 -max_input_length = 1350 
-dataset_path = "/data/home/xiezipeng/libai/projects/Llama/alpaca_data" -pretrained_model_path = "/data/home/xiezipeng/hf_models/meta-llama/Llama-2-7b-hf/" +learning_rate = 5e-5 +max_input_length = 512 +dataset_path = "alpaca_data.json" +pretrained_model_path = "Llama-2-7b-hf" # graph & optim -graph["enabled"] = True +graph["enabled"] = False optim.update( dict( lr=learning_rate, @@ -47,18 +47,20 @@ dataloader.train = LazyCall(build_nlp_train_loader)( dataset=[ LazyCall(AlpacaDataset)( - path=os.path.join(dataset_path, "train"), + path=dataset_path, tokenizer=tokenization.tokenizer, max_len=max_input_length, + partition="train", ) ], ) dataloader.test = [ LazyCall(build_nlp_test_loader)( dataset=LazyCall(AlpacaDataset)( - path=os.path.join(dataset_path, "test"), + path=dataset_path, tokenizer=tokenization.tokenizer, max_len=max_input_length, + partition="test", ), ), ] @@ -67,30 +69,30 @@ train.update( dict( output_dir="./sft_result", - train_micro_batch_size=2, + train_micro_batch_size=4, test_micro_batch_size=1, - train_epoch=5, + train_epoch=1, train_iter=1, log_period=10, - warmup_ratio=0, + warmup_ratio=2/5, num_accumulation_steps=8, - rdma_enabled=True, + rdma_enabled=False, amp=dict(enabled=True), activation_checkpoint=dict(enabled=True), checkpointer=dict( - period=100, + period=1000, max_to_keep=20, ), dist=dict( - data_parallel_size=2, - tensor_parallel_size=1, - pipeline_parallel_size=4, + data_parallel_size=1, + tensor_parallel_size=8, + pipeline_parallel_size=1, pipeline_num_layers=cfg.hidden_layers, ), evaluation=dict( enabled=True, evaluator=LazyCall(PPLEvaluator)(), - eval_period=100, + eval_period=1000, eval_iter=1e5, ), scheduler=LazyCall(WarmupExponentialLR)( diff --git a/projects/Llama/dataset.py b/projects/Llama/dataset.py index f10c04c53..701a98556 100644 --- a/projects/Llama/dataset.py +++ b/projects/Llama/dataset.py @@ -1,33 +1,65 @@ -"""Implementation derived from https://github.com/OpenGVLab/LLaMA-Adapter""" - -import random +import copy +import json import oneflow as flow from oneflow.utils.data import Dataset from libai.data.structures import DistTensorData, Instance - -def pad_right(data, pad_id=0, max_len=1350): - n = max_len - data.shape[0] - return flow.cat((data, flow.full((n,), pad_id, dtype=data.dtype))) +PROMPT_DICT = { + "prompt_input": ( + "Below is an instruction that describes a task, paired with an input that provides further context. " + "Write a response that appropriately completes the request.\n\n" + "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:" + ), + "prompt_no_input": ( + "Below is an instruction that describes a task. 
" + "Write a response that appropriately completes the request.\n\n" + "### Instruction:\n{instruction}\n\n### Response:" + ), +} class AlpacaDataset(Dataset): - def __init__(self, path, tokenizer, max_len=1350): - self.data = flow.load(path) - random.shuffle(self.data) + def __init__(self, path, tokenizer, max_len=513, partition="train"): + self.data = json.load(open(path)) self.tokenizer = tokenizer self.max_len = max_len + if partition == "train": + self.data = self.data + else: + self.data = self.data[:200] def __len__(self): return len(self.data) def __getitem__(self, index): - input_ids = pad_right(self.data[index]["input_ids"], pad_id=0, max_len=self.max_len) - labels = pad_right(self.data[index]["labels"], pad_id=-1, max_len=self.max_len) - + data = self.data[index] + if data.get("input", "") == "": + prompt = PROMPT_DICT["prompt_no_input"].format_map(data) + else: + prompt = PROMPT_DICT["prompt_input"].format_map(data) + example = prompt + data["output"] + prompt = self.tokenizer.tokenize(prompt, add_bos=True, add_eos=False, device="cpu")[0] + example = self.tokenizer.tokenize(example, add_bos=True, add_eos=True, device="cpu")[0] + padding = self.max_len - example.shape[0] + if padding > 0: + example = flow.cat((example, flow.zeros(padding, dtype=flow.long) - 1)) + elif padding < 0: + example = example[: self.max_len] + labels = copy.deepcopy(example) + labels[: len(prompt)] = -1 + example_mask = example.ge(0) + label_mask = labels.ge(0) + example[~example_mask] = 0 + labels[~label_mask] = -1 + example = example[:-1] + labels = labels[1:] + example_mask = flow.where( + example_mask, flow.tensor(0, dtype=flow.float), flow.tensor(-float("inf")) + ) + example_mask = example_mask[:-1] return Instance( - input_ids=DistTensorData(input_ids), + input_ids=DistTensorData(example), labels=DistTensorData(labels), ) diff --git a/projects/Llama/readme.md b/projects/Llama/readme.md index a7ab82577..d205796b4 100644 --- a/projects/Llama/readme.md +++ b/projects/Llama/readme.md @@ -10,13 +10,7 @@ FineTuning llama2 on 8 GPUs using parallelism. ### 1. Prepare the alpaca dataset -> set the parameters in `projects/Llama/utils/prepare_alpaca.py` for prepare the datasets, such as `destination_path` and `checkpoint_dir`. - -> Get the alpaca dataset files by running: -```python3 -# path/to/libai -python projects/Llama/utils/prepare_alpaca.py -``` +> Alpaca Dataset address: https://huggingface.co/datasets/vicgalle/alpaca-gpt4 ### 2. 
Prepare your finetuning config file diff --git a/projects/Llama/utils/eval_adapter.py b/projects/Llama/utils/eval_adapter.py index 717107653..954c47df4 100644 --- a/projects/Llama/utils/eval_adapter.py +++ b/projects/Llama/utils/eval_adapter.py @@ -150,7 +150,7 @@ def run_eval_harness( parallel_config = DictConfig( dict( data_parallel_size=1, - tensor_parallel_size=1, + tensor_parallel_size=8, pipeline_parallel_size=1, pipeline_num_layers=32, device_type="cuda", diff --git a/projects/Llama/utils/prepare_alpaca.py b/projects/Llama/utils/prepare_alpaca.py deleted file mode 100644 index 2efa51266..000000000 --- a/projects/Llama/utils/prepare_alpaca.py +++ /dev/null @@ -1,164 +0,0 @@ -"""Implementation derived from https://github.com/tloen/alpaca-lora""" -import json -import math -import os -from pathlib import Path -from typing import Optional - -import oneflow as flow -import requests -from oneflow.utils.data import random_split -from tqdm import tqdm - -from libai.config import instantiate -from libai.utils.logger import setup_logger -from projects.Llama.configs.llama_config import tokenization - -logger = setup_logger() - - -def prepare( - destination_path: Path = Path("/data/home/xiezipeng/libai/projects/Llama/alpaca_data"), - checkpoint_dir: Path = Path("/data/home/xiezipeng/hf_models/meta-llama/Llama-2-7b-hf/"), - test_split_fraction: float = 0.03865, # to get exactly 2000 test samples, - seed: int = 42, - mask_inputs: bool = False, # as in alpaca-lora - data_file_name: str = "alpaca_data_cleaned_archive.json", - data_file_url: str = "https://raw.githubusercontent.com/tloen/alpaca-lora/main/alpaca_data_cleaned_archive.json", # noqa - ignore_index: int = -1, - max_seq_length: Optional[int] = None, -) -> None: - """Prepare the Alpaca dataset for instruction tuning. - - The output is a training and test dataset saved as `train.pt` and `test.pt`, - which stores the preprocessed and tokenized prompts and labels. 
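The train/test partition described here is reproducible because `random_split` is driven by a seeded generator (see the function body a few lines below; the same script is restored later in this series). A small standalone sketch of that step, with placeholder records in place of the downloaded alpaca json:

```python
import math

import oneflow as flow
from oneflow.utils.data import random_split

# Placeholder records standing in for the entries of the downloaded alpaca json.
data = [{"instruction": f"task {i}", "input": "", "output": "done"} for i in range(1000)]

test_split_fraction = 0.03865          # chosen upstream to yield roughly 2000 test samples
num_test = math.floor(test_split_fraction * len(data))
num_train = len(data) - num_test

train_set, test_set = random_split(
    data,
    [num_train, num_test],
    generator=flow.Generator().manual_seed(42),  # fixed seed -> identical split every run
)
train_set, test_set = list(train_set), list(test_set)
print(len(train_set), len(test_set))  # 962 38 on this toy list
```

Each split is then tokenized sample by sample and written out with `flow.save`, which is the format the later patches in this series read back with `flow.load`.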
- """ - if max_seq_length is None: - with open(os.path.join(checkpoint_dir, "config.json"), "r", encoding="utf-8") as file: - config = json.load(file) - max_seq_length = config["max_position_embeddings"] - - destination_path.mkdir(parents=True, exist_ok=True) - data_file_path = destination_path / data_file_name - logger.info("Loading data file...") - download_if_missing(data_file_path, data_file_url) - with open(data_file_path, "r", encoding="utf-8") as file: - data = json.load(file) - - logger.info("Loading tokenizer...") - tokenizer = instantiate(tokenization.tokenizer) - - # Partition the dataset into train and test - num_of_test_samples = math.floor(test_split_fraction * len(data)) - num_of_train_samples = len(data) - num_of_test_samples - train_set, test_set = random_split( - data, - [num_of_train_samples, num_of_test_samples], - generator=flow.Generator().manual_seed(seed), - ) - train_set, test_set = list(train_set), list(test_set) - - logger.info(f"train has {len(train_set):,} samples") - logger.info(f"test has {len(test_set):,} samples") - - logger.info("Processing train split ...") - train_set = [ - prepare_sample( - example=sample, - tokenizer=tokenizer, - max_length=max_seq_length, - mask_inputs=mask_inputs, - ignore_index=ignore_index, - ) - for sample in tqdm(train_set) - ] - flow.save(train_set, destination_path / "train") - - logger.info("Processing test split ...") - test_set = [ - prepare_sample( - example=sample, - tokenizer=tokenizer, - max_length=max_seq_length, - mask_inputs=mask_inputs, - ignore_index=ignore_index, - ) - for sample in tqdm(test_set) - ] - flow.save(test_set, destination_path / "test") - - max_length = max([i["input_ids"].shape[0] for i in train_set]) - logger.info("Max length of training dataset: {}".format(max_length)) - - -def download_if_missing(file_path: Path, file_url: str) -> None: - """Downloads the raw json data file and saves it in the given destination.""" - if file_path.exists() and file_path.stat().st_size > 0: - return - with open(file_path, "w", encoding="utf-8") as f: - f.write(requests.get(file_url).text) - - -def prepare_sample( - example: dict, tokenizer, max_length: int, mask_inputs: bool, ignore_index: int -) -> dict: - """Processes a single sample. - - Each sample in the dataset consists of: - - instruction: A string describing the task - - input: A string holding a special input value for the instruction. - This only applies to some samples, and in others this is empty. - - output: The response string - - This function processes this data to produce a prompt text and a label for - supervised training. The prompt text is formed as a single message including both - the instruction and the input. The label/target is the same message but with the - response attached. - - Finally, both the prompt and the label get tokenized. If desired, all tokens - in the label that correspond to the original input prompt get masked out (default). 
- """ - full_prompt = generate_prompt(example) - full_prompt_and_response = full_prompt + example["output"] - - encoded_full_prompt = tokenizer.tokenize( - full_prompt, max_length=max_length, device="cpu" - ).squeeze(0) - encoded_full_prompt_and_response = tokenizer.tokenize( - full_prompt_and_response, add_eos=True, max_length=max_length, device="cpu" - ).squeeze(0) - - # The labels are the full prompt with response, but with the prompt masked out - labels = encoded_full_prompt_and_response.clone() - encoded_full_prompt_and_response = encoded_full_prompt_and_response[:-1] - labels = labels[1:] - if mask_inputs: - labels[: len(encoded_full_prompt)] = ignore_index - - return { - **example, - "input_ids": encoded_full_prompt_and_response, - "input_ids_no_response": encoded_full_prompt, - "labels": labels, - } - - -def generate_prompt(example: dict) -> str: - """Generates a standardized message to prompt the model with an instruction, optional input and a - 'response' field.""" - - if example["input"]: - return ( - "Below is an instruction that describes a task, paired with an input that provides further context. " # noqa - "Write a response that appropriately completes the request.\n\n" - f"### Instruction:\n{example['instruction']}\n\n### Input:\n{example['input']}\n\n### Response:" # noqa - ) - return ( - "Below is an instruction that describes a task. " - "Write a response that appropriately completes the request.\n\n" - f"### Instruction:\n{example['instruction']}\n\n### Response:" - ) - - -if __name__ == "__main__": - prepare() From 4c796221165ec55866800425ad8e79f41741c7c0 Mon Sep 17 00:00:00 2001 From: xiezipeng-ML Date: Tue, 9 Jan 2024 17:33:38 +0800 Subject: [PATCH 5/9] refine --- projects/Llama/adapter/adapter_config.py | 4 ++-- projects/Llama/adapter/adapter_sft.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/projects/Llama/adapter/adapter_config.py b/projects/Llama/adapter/adapter_config.py index 37d67c741..b164cf1bc 100644 --- a/projects/Llama/adapter/adapter_config.py +++ b/projects/Llama/adapter/adapter_config.py @@ -50,7 +50,7 @@ adapter_len=10, adapter_layer=30, # train - pretrained_model_path="/data/home/xiezipeng/hf_models/meta-llama/Llama-2-7b-hf/", + pretrained_model_path="Llama-2-7b-hf/", ) cfg = DictConfig(cfg) @@ -59,5 +59,5 @@ tokenization = OmegaConf.create() tokenization.make_vocab_size_divisible_by = 1 tokenization.tokenizer = LazyCall(LlamaTokenizer)( - pretrained_model_path="/data/home/xiezipeng/hf_models/meta-llama/Llama-2-7b-hf/tokenizer.model" + pretrained_model_path="Llama-2-7b-hf/tokenizer.model" ) diff --git a/projects/Llama/adapter/adapter_sft.py b/projects/Llama/adapter/adapter_sft.py index 650f606c3..08bcc950f 100644 --- a/projects/Llama/adapter/adapter_sft.py +++ b/projects/Llama/adapter/adapter_sft.py @@ -18,8 +18,8 @@ weight_decay = 0.1 learning_rate = 2e-5 max_input_length = 512 -dataset_path = "/data/home/xiezipeng/datasets/alpaca_data.json" -pretrained_model_path = "/data/home/xiezipeng/hf_models/meta-llama/Llama-2-7b-hf/" +dataset_path = "alpaca_data.json" +pretrained_model_path = "Llama-2-7b-hf/" # graph & optim graph["enabled"] = False From 449401e90f24a018297d393cb3fae4c3d7998fc2 Mon Sep 17 00:00:00 2001 From: xiezipeng-ML Date: Tue, 9 Jan 2024 23:06:51 +0800 Subject: [PATCH 6/9] reformat --- projects/Llama/configs/llama_sft.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/Llama/configs/llama_sft.py b/projects/Llama/configs/llama_sft.py index 14ecdc281..f04e38c43 100644 --- 
a/projects/Llama/configs/llama_sft.py +++ b/projects/Llama/configs/llama_sft.py @@ -74,7 +74,7 @@ train_epoch=1, train_iter=1, log_period=10, - warmup_ratio=2/5, + warmup_ratio=2 / 5, num_accumulation_steps=8, rdma_enabled=False, amp=dict(enabled=True), From 4172adcfcad76af6dd0e545d9b512e0530e41215 Mon Sep 17 00:00:00 2001 From: xiezipeng-ML Date: Thu, 11 Jan 2024 14:34:43 +0800 Subject: [PATCH 7/9] refine --- projects/Llama/adapter/adapter_config.py | 2 +- projects/Llama/adapter/adapter_sft.py | 18 +-- projects/Llama/configs/llama_config.py | 4 +- projects/Llama/configs/llama_sft.py | 25 ++-- projects/Llama/dataset.py | 54 +------- projects/Llama/pipeline.py | 4 +- projects/Llama/readme.md | 13 +- projects/Llama/utils/prepare_alpaca.py | 161 +++++++++++++++++++++++ 8 files changed, 197 insertions(+), 84 deletions(-) create mode 100644 projects/Llama/utils/prepare_alpaca.py diff --git a/projects/Llama/adapter/adapter_config.py b/projects/Llama/adapter/adapter_config.py index b164cf1bc..e2e6def85 100644 --- a/projects/Llama/adapter/adapter_config.py +++ b/projects/Llama/adapter/adapter_config.py @@ -50,7 +50,7 @@ adapter_len=10, adapter_layer=30, # train - pretrained_model_path="Llama-2-7b-hf/", + pretrained_model_path="meta-llama/Llama-2-7b-hf/", ) cfg = DictConfig(cfg) diff --git a/projects/Llama/adapter/adapter_sft.py b/projects/Llama/adapter/adapter_sft.py index 08bcc950f..e95e012bb 100644 --- a/projects/Llama/adapter/adapter_sft.py +++ b/projects/Llama/adapter/adapter_sft.py @@ -18,8 +18,8 @@ weight_decay = 0.1 learning_rate = 2e-5 max_input_length = 512 -dataset_path = "alpaca_data.json" -pretrained_model_path = "Llama-2-7b-hf/" +dataset_path = "alpaca_data" +pretrained_model_path = "meta-llama/Llama-2-7b-hf" # graph & optim graph["enabled"] = False @@ -46,20 +46,14 @@ dataloader.train = LazyCall(build_nlp_train_loader)( dataset=[ LazyCall(AlpacaDataset)( - path=dataset_path, - tokenizer=tokenization.tokenizer, - max_len=max_input_length, - partition="train", + path=os.path.join(dataset_path, "train"), tokenizer=tokenization.tokenizer ) ], ) dataloader.test = [ LazyCall(build_nlp_test_loader)( dataset=LazyCall(AlpacaDataset)( - path=dataset_path, - tokenizer=tokenization.tokenizer, - max_len=max_input_length, - partition="test", + path=os.path.join(dataset_path, "test"), tokenizer=tokenization.tokenizer ), ), ] @@ -70,7 +64,7 @@ output_dir="./sft_result", train_micro_batch_size=8, test_micro_batch_size=1, - train_epoch=5, + train_epoch=3, train_iter=1, log_period=10, warmup_ratio=2 / 5, @@ -91,7 +85,7 @@ evaluation=dict( enabled=True, evaluator=LazyCall(PPLEvaluator)(), - eval_period=100, + eval_period=1000, eval_iter=100, ), scheduler=LazyCall(WarmupExponentialLR)( diff --git a/projects/Llama/configs/llama_config.py b/projects/Llama/configs/llama_config.py index bb810c53f..58b86ecd6 100644 --- a/projects/Llama/configs/llama_config.py +++ b/projects/Llama/configs/llama_config.py @@ -48,7 +48,7 @@ eos_token_id=2, pad_token_id=0, # train - pretrained_model_path="Llama-2-7b-hf", + pretrained_model_path="meta-llama/Llama-2-7b-hf", ) cfg = DictConfig(cfg) @@ -57,5 +57,5 @@ tokenization = OmegaConf.create() tokenization.make_vocab_size_divisible_by = 1 tokenization.tokenizer = LazyCall(LlamaTokenizer)( - pretrained_model_path="Llama-2-7b-hf/tokenizer.model" + pretrained_model_path="meta-llama/Llama-2-7b-hf/tokenizer.model" ) diff --git a/projects/Llama/configs/llama_sft.py b/projects/Llama/configs/llama_sft.py index f04e38c43..e767d84d7 100644 --- a/projects/Llama/configs/llama_sft.py +++ 
b/projects/Llama/configs/llama_sft.py @@ -19,9 +19,8 @@ # Hyperparameters weight_decay = 0.1 learning_rate = 5e-5 -max_input_length = 512 -dataset_path = "alpaca_data.json" -pretrained_model_path = "Llama-2-7b-hf" +dataset_path = "alpaca_data" +pretrained_model_path = "meta-llama/Llama-2-7b-hf" # graph & optim graph["enabled"] = False @@ -47,20 +46,14 @@ dataloader.train = LazyCall(build_nlp_train_loader)( dataset=[ LazyCall(AlpacaDataset)( - path=dataset_path, - tokenizer=tokenization.tokenizer, - max_len=max_input_length, - partition="train", + path=os.path.join(dataset_path, "train"), tokenizer=tokenization.tokenizer ) ], ) dataloader.test = [ LazyCall(build_nlp_test_loader)( dataset=LazyCall(AlpacaDataset)( - path=dataset_path, - tokenizer=tokenization.tokenizer, - max_len=max_input_length, - partition="test", + path=os.path.join(dataset_path, "test"), tokenizer=tokenization.tokenizer ), ), ] @@ -71,22 +64,22 @@ output_dir="./sft_result", train_micro_batch_size=4, test_micro_batch_size=1, - train_epoch=1, + train_epoch=3, train_iter=1, log_period=10, - warmup_ratio=2 / 5, + warmup_ratio=1 / 3, num_accumulation_steps=8, rdma_enabled=False, amp=dict(enabled=True), activation_checkpoint=dict(enabled=True), checkpointer=dict( - period=1000, + period=5000, max_to_keep=20, ), dist=dict( data_parallel_size=1, - tensor_parallel_size=8, - pipeline_parallel_size=1, + tensor_parallel_size=1, + pipeline_parallel_size=8, pipeline_num_layers=cfg.hidden_layers, ), evaluation=dict( diff --git a/projects/Llama/dataset.py b/projects/Llama/dataset.py index 701a98556..d78efe9fe 100644 --- a/projects/Llama/dataset.py +++ b/projects/Llama/dataset.py @@ -1,65 +1,19 @@ -import copy -import json - import oneflow as flow from oneflow.utils.data import Dataset from libai.data.structures import DistTensorData, Instance -PROMPT_DICT = { - "prompt_input": ( - "Below is an instruction that describes a task, paired with an input that provides further context. " - "Write a response that appropriately completes the request.\n\n" - "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:" - ), - "prompt_no_input": ( - "Below is an instruction that describes a task. 
" - "Write a response that appropriately completes the request.\n\n" - "### Instruction:\n{instruction}\n\n### Response:" - ), -} - class AlpacaDataset(Dataset): - def __init__(self, path, tokenizer, max_len=513, partition="train"): - self.data = json.load(open(path)) + def __init__(self, path, tokenizer): + self.data = flow.load(path) self.tokenizer = tokenizer - self.max_len = max_len - if partition == "train": - self.data = self.data - else: - self.data = self.data[:200] def __len__(self): return len(self.data) def __getitem__(self, index): - data = self.data[index] - if data.get("input", "") == "": - prompt = PROMPT_DICT["prompt_no_input"].format_map(data) - else: - prompt = PROMPT_DICT["prompt_input"].format_map(data) - example = prompt + data["output"] - prompt = self.tokenizer.tokenize(prompt, add_bos=True, add_eos=False, device="cpu")[0] - example = self.tokenizer.tokenize(example, add_bos=True, add_eos=True, device="cpu")[0] - padding = self.max_len - example.shape[0] - if padding > 0: - example = flow.cat((example, flow.zeros(padding, dtype=flow.long) - 1)) - elif padding < 0: - example = example[: self.max_len] - labels = copy.deepcopy(example) - labels[: len(prompt)] = -1 - example_mask = example.ge(0) - label_mask = labels.ge(0) - example[~example_mask] = 0 - labels[~label_mask] = -1 - example = example[:-1] - labels = labels[1:] - example_mask = flow.where( - example_mask, flow.tensor(0, dtype=flow.float), flow.tensor(-float("inf")) - ) - example_mask = example_mask[:-1] return Instance( - input_ids=DistTensorData(example), - labels=DistTensorData(labels), + input_ids=DistTensorData(self.data[index]["input_ids"]), + labels=DistTensorData(self.data[index]["labels"]), ) diff --git a/projects/Llama/pipeline.py b/projects/Llama/pipeline.py index 0b936da67..bea4a2f56 100644 --- a/projects/Llama/pipeline.py +++ b/projects/Llama/pipeline.py @@ -114,7 +114,9 @@ def postprocess(self, model_output_dict, **kwargs) -> dict: mode="libai", ) - text = ["a dog is flying on the sky", "Wikipedia is a free online", "what is beam search?"] + text = [ + "Give three tips for staying healthy.", + ] output = pipeline(inputs=text) if dist.is_main_process(): print(output) diff --git a/projects/Llama/readme.md b/projects/Llama/readme.md index d205796b4..9adb3d925 100644 --- a/projects/Llama/readme.md +++ b/projects/Llama/readme.md @@ -10,7 +10,13 @@ FineTuning llama2 on 8 GPUs using parallelism. ### 1. Prepare the alpaca dataset -> Alpaca Dataset address: https://huggingface.co/datasets/vicgalle/alpaca-gpt4 +> set the parameters in `projects/Llama/utils/prepare_alpaca.py` for prepare the datasets, such as `destination_path` and `checkpoint_dir`. + +> Get the alpaca dataset files by running: +```python3 +# path/to/libai +python projects/Llama/utils/prepare_alpaca.py +``` ### 2. Prepare your finetuning config file @@ -18,8 +24,11 @@ FineTuning llama2 on 8 GPUs using parallelism. ### 3. 
Run the following code to start SFT ```bash -# cd /path/to/libai +# full finetune bash tools/train.sh projects/Llama/train_net.py projects/Llama/configs/llama_sft.py 8 + +# adapter finetune +bash tools/train.sh projects/Llama/adapter/train_net.py projects/Llama/adapter/adapter_sft.py 8 ``` ## Evaluate diff --git a/projects/Llama/utils/prepare_alpaca.py b/projects/Llama/utils/prepare_alpaca.py new file mode 100644 index 000000000..590873404 --- /dev/null +++ b/projects/Llama/utils/prepare_alpaca.py @@ -0,0 +1,161 @@ +"""Implementation derived from https://github.com/tloen/alpaca-lora""" +import copy +import json +import math +import os +from pathlib import Path +from typing import Optional + +import oneflow as flow +import requests +from oneflow.utils.data import random_split +from tqdm import tqdm + +from libai.config import instantiate +from libai.utils.logger import setup_logger +from projects.Llama.configs.llama_config import tokenization + +logger = setup_logger() + + +def prepare( + destination_path: Path = Path("/data/home/xiezipeng/datasets/alpaca_data"), + checkpoint_dir: Path = Path("/data/home/xiezipeng/hf_models/meta-llama/Llama-2-7b-hf"), + test_split_fraction: float = 0.03865, # to get exactly 2000 test samples, + seed: int = 42, + mask_inputs: bool = False, # as in alpaca-lora + data_file_name: str = "alpaca_data_cleaned_archive.json", + data_file_url: str = "https://raw.githubusercontent.com/tloen/alpaca-lora/main/alpaca_data_cleaned_archive.json", # noqa + ignore_index: int = -1, + max_seq_length: Optional[int] = 512, +) -> None: + """Prepare the Alpaca dataset for instruction tuning. + The output is a training and test dataset saved as `train.pt` and `test.pt`, + which stores the preprocessed and tokenized prompts and labels. + """ + if max_seq_length is None: + with open(os.path.join(checkpoint_dir, "config.json"), "r", encoding="utf-8") as file: + config = json.load(file) + max_seq_length = config["max_position_embeddings"] + + destination_path.mkdir(parents=True, exist_ok=True) + data_file_path = destination_path / data_file_name + logger.info("Loading data file...") + download_if_missing(data_file_path, data_file_url) + with open(data_file_path, "r", encoding="utf-8") as file: + data = json.load(file) + + logger.info("Loading tokenizer...") + tokenizer = instantiate(tokenization.tokenizer) + + # Partition the dataset into train and test + num_of_test_samples = math.floor(test_split_fraction * len(data)) + num_of_train_samples = len(data) - num_of_test_samples + train_set, test_set = random_split( + data, + [num_of_train_samples, num_of_test_samples], + generator=flow.Generator().manual_seed(seed), + ) + train_set, test_set = list(train_set), list(test_set) + + logger.info(f"train has {len(train_set):,} samples") + logger.info(f"test has {len(test_set):,} samples") + + logger.info("Processing train split ...") + train_set = [ + prepare_sample( + example=sample, + tokenizer=tokenizer, + max_length=max_seq_length, + ) + for sample in tqdm(train_set) + ] + flow.save(train_set, destination_path / "train") + + logger.info("Processing test split ...") + test_set = [ + prepare_sample( + example=sample, + tokenizer=tokenizer, + max_length=max_seq_length, + ) + for sample in tqdm(test_set) + ] + flow.save(test_set, destination_path / "test") + + max_length = max([i["input_ids"].shape[0] for i in train_set]) + logger.info("Max length of training dataset: {}".format(max_length)) + + +def download_if_missing(file_path: Path, file_url: str) -> None: + """Downloads the raw json data 
file and saves it in the given destination.""" + if file_path.exists() and file_path.stat().st_size > 0: + return + with open(file_path, "w", encoding="utf-8") as f: + f.write(requests.get(file_url).text) + + +def prepare_sample(example: dict, tokenizer, max_length: int) -> dict: + """Processes a single sample. + Each sample in the dataset consists of: + - instruction: A string describing the task + - input: A string holding a special input value for the instruction. + This only applies to some samples, and in others this is empty. + - output: The response string + This function processes this data to produce a prompt text and a label for + supervised training. The prompt text is formed as a single message including both + the instruction and the input. The label/target is the same message but with the + response attached. + Finally, both the prompt and the label get tokenized. If desired, all tokens + in the label that correspond to the original input prompt get masked out (default). + """ + full_prompt = generate_prompt(example) + full_prompt_and_response = full_prompt + example["output"] + + prompt = tokenizer.tokenize(full_prompt, add_bos=True, add_eos=False, device="cpu")[0] + example = tokenizer.tokenize( + full_prompt_and_response, add_bos=True, add_eos=True, device="cpu" + )[0] + + padding = max_length - example.shape[0] + if padding > 0: + example = flow.cat((example, flow.zeros(padding, dtype=flow.long) - 1)) + elif padding < 0: + example = example[:max_length] + labels = copy.deepcopy(example) + labels[: len(prompt)] = -1 + example_mask = example.ge(0) + label_mask = labels.ge(0) + example[~example_mask] = 0 + labels[~label_mask] = -1 + example = example[:-1] + labels = labels[1:] + example_mask = flow.where( + example_mask, flow.tensor(0, dtype=flow.float), flow.tensor(-float("inf")) + ) + example_mask = example_mask[:-1] + return { + "input_ids": example, + "labels": labels, + } + + +def generate_prompt(example: dict) -> str: + """Generates a standardized message to prompt the model with an instruction, optional input and a + 'response' field.""" + + if example["input"]: + return ( + "Below is an instruction that describes a task, paired with an input that provides further context. " # noqa + "Write a response that appropriately completes the request.\n\n" + f"### Instruction:\n{example['instruction']}\n\n### Input:\n{example['input']}\n\n### Response:" # noqa + ) + return ( + "Below is an instruction that describes a task. 
" + "Write a response that appropriately completes the request.\n\n" + f"### Instruction:\n{example['instruction']}\n\n### Response:" + ) + + +if __name__ == "__main__": + prepare() From 25c46d5671309941d55b4c1e38816b2cbb6e0f99 Mon Sep 17 00:00:00 2001 From: xiezipeng-ML Date: Thu, 11 Jan 2024 14:40:32 +0800 Subject: [PATCH 8/9] reformat --- projects/Llama/adapter/adapter_config.py | 2 +- projects/Llama/llama.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/projects/Llama/adapter/adapter_config.py b/projects/Llama/adapter/adapter_config.py index e2e6def85..7381e64af 100644 --- a/projects/Llama/adapter/adapter_config.py +++ b/projects/Llama/adapter/adapter_config.py @@ -1,6 +1,6 @@ from omegaconf import DictConfig, OmegaConf -from configs.common.train import train +from configs.common.train import train # noqa from libai.config import LazyCall from projects.Llama.adapter.adapter_model import LlamaForCausalLM from projects.Llama.tokenizer import LlamaTokenizer diff --git a/projects/Llama/llama.py b/projects/Llama/llama.py index 83f56b731..d3198c464 100644 --- a/projects/Llama/llama.py +++ b/projects/Llama/llama.py @@ -21,7 +21,7 @@ from libai.config import configurable from libai.inference.generator.generation_utils import Generator -from libai.layers import Linear, ParallelCrossEntropyLoss, RMSLayerNorm, VocabEmbedding +from libai.layers import Linear, RMSLayerNorm, VocabEmbedding from libai.layers.attention import AttnMaskType from libai.models.utils import init_method_normal, scaled_init_method_normal from libai.utils import distributed as dist From d4a0073265f6f3fd5c96793abe5d6aa36c87b1dd Mon Sep 17 00:00:00 2001 From: xiezipeng-ML Date: Mon, 15 Jan 2024 20:53:16 +0800 Subject: [PATCH 9/9] update path --- projects/Llama/utils/prepare_alpaca.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/projects/Llama/utils/prepare_alpaca.py b/projects/Llama/utils/prepare_alpaca.py index 590873404..c21f505fb 100644 --- a/projects/Llama/utils/prepare_alpaca.py +++ b/projects/Llama/utils/prepare_alpaca.py @@ -19,8 +19,8 @@ def prepare( - destination_path: Path = Path("/data/home/xiezipeng/datasets/alpaca_data"), - checkpoint_dir: Path = Path("/data/home/xiezipeng/hf_models/meta-llama/Llama-2-7b-hf"), + destination_path: Path = Path("alpaca_data"), + checkpoint_dir: Path = Path("meta-llama/Llama-2-7b-hf"), test_split_fraction: float = 0.03865, # to get exactly 2000 test samples, seed: int = 42, mask_inputs: bool = False, # as in alpaca-lora