jirong committed
Commit ee3e701 · verified · 1 Parent(s): 225894b

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes. See the raw diff for the complete change set.
Files changed (50)
  1. .gitattributes +14 -0
  2. InternLM/__init__.py +0 -0
  3. InternLM/configs/kd_1b_to_300m.py +208 -0
  4. InternLM/configs/pretrain_300m.py +168 -0
  5. InternLM/internlm/__init__.py +10 -0
  6. InternLM/internlm/apis/__init__.py +0 -0
  7. InternLM/internlm/apis/inference.py +848 -0
  8. InternLM/internlm/core/__init__.py +9 -0
  9. InternLM/internlm/core/communication/__init__.py +32 -0
  10. InternLM/internlm/core/communication/p2p.py +582 -0
  11. InternLM/internlm/core/communication/utils.py +125 -0
  12. InternLM/internlm/core/context/__init__.py +49 -0
  13. InternLM/internlm/core/context/parallel_context.py +569 -0
  14. InternLM/internlm/core/context/process_group_initializer.py +418 -0
  15. InternLM/internlm/core/context/random.py +131 -0
  16. InternLM/internlm/core/engine.py +227 -0
  17. InternLM/internlm/core/gradient_handler.py +76 -0
  18. InternLM/internlm/core/naive_amp.py +136 -0
  19. InternLM/internlm/core/scheduler/__init__.py +14 -0
  20. InternLM/internlm/core/scheduler/base_scheduler.py +187 -0
  21. InternLM/internlm/core/scheduler/no_pipeline_scheduler.py +266 -0
  22. InternLM/internlm/core/scheduler/pipeline_scheduler.py +1363 -0
  23. InternLM/internlm/core/trainer.py +190 -0
  24. InternLM/internlm/data/__init__.py +13 -0
  25. InternLM/internlm/data/batch_sampler.py +354 -0
  26. InternLM/internlm/data/collaters.py +88 -0
  27. InternLM/internlm/data/dataset.py +56 -0
  28. InternLM/internlm/data/dummy_dataset.py +44 -0
  29. InternLM/internlm/data/packed_dataset.py +421 -0
  30. InternLM/internlm/data/single_dataset.py +117 -0
  31. InternLM/internlm/data/utils.py +46 -0
  32. InternLM/internlm/initialize/__init__.py +16 -0
  33. InternLM/internlm/initialize/initialize_tensor.py +63 -0
  34. InternLM/internlm/initialize/initialize_trainer.py +235 -0
  35. InternLM/internlm/initialize/launch.py +511 -0
  36. InternLM/internlm/initialize/legacy/__init__.py +0 -0
  37. InternLM/internlm/initialize/legacy/launch.py +40 -0
  38. InternLM/internlm/model/__init__.py +23 -0
  39. InternLM/internlm/model/embedding.py +273 -0
  40. InternLM/internlm/model/linear.py +201 -0
  41. InternLM/internlm/model/loss.py +81 -0
  42. InternLM/internlm/model/metrics.py +263 -0
  43. InternLM/internlm/model/modeling_internlm.py +524 -0
  44. InternLM/internlm/model/modeling_vit.py +527 -0
  45. InternLM/internlm/model/multi_head_attention.py +186 -0
  46. InternLM/internlm/model/muse/__init__.py +18 -0
  47. InternLM/internlm/model/muse/modeling_taming_vqgan.py +591 -0
  48. InternLM/internlm/model/muse/modeling_utils.py +1171 -0
  49. InternLM/internlm/model/norm.py +46 -0
  50. InternLM/internlm/model/utils.py +224 -0
.gitattributes CHANGED
@@ -33,3 +33,17 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ InternLM/tools/data/derain_prompt/000000_img.png filter=lfs diff=lfs merge=lfs -text
37
+ InternLM/tools/data/derain_prompt/000000_label.png filter=lfs diff=lfs merge=lfs -text
38
+ InternLM/tools/data/derain_prompt/000001_img.png filter=lfs diff=lfs merge=lfs -text
39
+ InternLM/tools/data/derain_prompt/000001_label.png filter=lfs diff=lfs merge=lfs -text
40
+ InternLM/tools/data/derain_prompt/000002_img.png filter=lfs diff=lfs merge=lfs -text
41
+ InternLM/tools/data/derain_prompt/000002_label.png filter=lfs diff=lfs merge=lfs -text
42
+ InternLM/tools/data/examples/derain_1.png filter=lfs diff=lfs merge=lfs -text
43
+ InternLM/tools/data/examples/derain_2.png filter=lfs diff=lfs merge=lfs -text
44
+ InternLM/tools/data/examples/pose_2.png filter=lfs diff=lfs merge=lfs -text
45
+ InternLM/tools/data/examples/seg_1.png filter=lfs diff=lfs merge=lfs -text
46
+ InternLM/tools/data/examples/seg_2.png filter=lfs diff=lfs merge=lfs -text
47
+ InternLM/tools/data/pose_prompt/000002_img.png filter=lfs diff=lfs merge=lfs -text
48
+ InternLM/tools/data/seg_prompt/000000_img.png filter=lfs diff=lfs merge=lfs -text
49
+ figs/DeLVM.PNG filter=lfs diff=lfs merge=lfs -text
InternLM/__init__.py ADDED
File without changes
InternLM/configs/kd_1b_to_300m.py ADDED
@@ -0,0 +1,208 @@
1
+ kd_config = dict(gt_weight=1., kd_weight=1., temperature=1)
2
+ teacher_type = "INTERNLM"
3
+
4
+ teacher_ckpt_folder = '/path/to/teacher'
5
+
6
+ VQGAN_FOLDER = '/path/to/vqgan'
7
+ T_SEQ_LEN = 2048
8
+ T_HIDDEN_SIZE = 2048
9
+ T_NUM_ATTENTION_HEAD = 16
10
+ T_MLP_RATIO = 8 / 3
11
+ T_NUM_LAYER = 22
12
+ T_VOCAB_SIZE = 8192
13
+
14
+ teacher = dict(
15
+ checkpoint=False, # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
16
+ num_attention_heads=T_NUM_ATTENTION_HEAD,
17
+ embed_split_hidden=True,
18
+ vocab_size=T_VOCAB_SIZE,
19
+ embed_grad_scale=1,
20
+ parallel_output=True,
21
+ hidden_size=T_HIDDEN_SIZE,
22
+ num_layers=T_NUM_LAYER,
23
+ mlp_ratio=T_MLP_RATIO,
24
+ apply_post_layer_norm=False,
25
+ dtype="torch.float16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
26
+ norm_type="rmsnorm",
27
+ layer_norm_epsilon=1e-5,
28
+ use_flash_attn=True,
29
+ num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
30
+ lvm_config=dict(
31
+ enable=True,
32
+ embedding_cfg=dict(
33
+ vq_model_path=VQGAN_FOLDER,
34
+ embedding_dim=T_HIDDEN_SIZE,
35
+ freeze_vq_model=True,
36
+ ),
37
+ )
38
+ )
39
+
40
+ ########################################################
41
+ JOB_NAME = "lvm_llama_kd"
42
+ DO_ALERT = False
43
+ model_type = "INTERNLM"
44
+
45
+ SEQ_LEN = 2048
46
+ HIDDEN_SIZE = 1024
47
+ NUM_ATTENTION_HEAD = 8
48
+ MLP_RATIO = 8 / 3
49
+ NUM_LAYER = 22
50
+ VOCAB_SIZE = 8192
51
+
52
+ MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
53
+ SAVE_CKPT_FOLDER = "local:/path_to_save/"
54
+ LOAD_CKPT_FOLDER = "local:/path_to_load/"
55
+
56
+ CHECKPOINT_EVERY = 10000
57
+ ckpt = dict(
58
+ enable_save_ckpt=True, # set True to enable ckpt save.
59
+ save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
60
+ # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["all"], ckpt_type="normal"),
61
+ # load_ckpt_folder="local:llm_ckpts/",
62
+ # 'load_ckpt_info' setting guide:
63
+ # 1. the 'path' indicates the ckpt path,
64
+ # 2. the 'content' means what states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
65
+ # 3. the 'ckpt_type' means the type of checkpoint to be loaded; now only the 'normal' type is supported.
66
+ # load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
67
+ checkpoint_every=CHECKPOINT_EVERY,
68
+ async_upload=True, # async ckpt upload. (only work for boto3 ckpt)
69
+ async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
70
+ # oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
71
+ oss_snapshot_freq=0,
72
+ )
73
+
74
+ TRAIN_FOLDER = "/path/to/dataset"
75
+ VALID_FOLDER = "/path/to/dataset"
76
+ data = dict(
77
+ seq_len=SEQ_LEN,
78
+ # micro_num means the number of micro_batch contained in one gradient update
79
+ micro_num=1,
80
+ # packed_length = micro_bsz * SEQ_LEN
81
+ micro_bsz=16,
82
+ # defaults to the value of micro_num
83
+ valid_micro_num=1,
84
+ # defaults to 0, means disable evaluate
85
+ valid_every=0,
86
+ pack_sample_into_one=False,
87
+ train_one_epoch=False,
88
+ total_steps=40000,
89
+ skip_batches="",
90
+ rampup_batch_size="",
91
+ # Datasets with less than 50 rows will be discarded
92
+ min_length=50,
93
+ train_folder=TRAIN_FOLDER,
94
+ valid_folder=None,
95
+ empty_cache_and_diag_interval=10000,
96
+ diag_outlier_ratio=1.1,
97
+ )
98
+
99
+ grad_scaler = dict(
100
+ fp16=dict(
101
+ # the initial loss scale, defaults to 2**16
102
+ initial_scale=2**16,
103
+ # the minimum loss scale, defaults to None
104
+ min_scale=1,
105
+ # the number of steps to increase loss scale when no overflow occurs
106
+ growth_interval=1000,
107
+ ),
108
+ # the multiplication factor for increasing loss scale, defaults to 2
109
+ growth_factor=2,
110
+ # the multiplication factor for decreasing loss scale, defaults to 0.5
111
+ backoff_factor=0.5,
112
+ # the maximum loss scale, defaults to None
113
+ max_scale=2**24,
114
+ # the number of overflows before decreasing loss scale, defaults to 2
115
+ hysteresis=2,
116
+ )
117
+
118
+ hybrid_zero_optimizer = dict(
119
+ # Enable low_level_optimizer overlap_communication
120
+ overlap_sync_grad=True,
121
+ overlap_sync_param=True,
122
+ # bucket size for nccl communication params
123
+ reduce_bucket_size=512 * 1024 * 1024,
124
+ # grad clipping
125
+ clip_grad_norm=1.0,
126
+ )
127
+
128
+ loss = dict(
129
+ label_smoothing=0,
130
+ )
131
+
132
+ adam = dict(
133
+ lr=1.5e-4,
134
+ adam_beta1=0.9,
135
+ adam_beta2=0.95,
136
+ adam_beta2_c=0,
137
+ adam_eps=1e-8,
138
+ weight_decay=0.1,
139
+ )
140
+
141
+ lr_scheduler = dict(
142
+ total_steps=data["total_steps"],
143
+ init_steps=0, # optimizer_warmup_step
144
+ warmup_ratio=0.0056,
145
+ eta_min=1.5e-5,
146
+ last_epoch=-1,
147
+ )
148
+
149
+ beta2_scheduler = dict(
150
+ init_beta2=adam["adam_beta2"],
151
+ c=adam["adam_beta2_c"],
152
+ cur_iter=-1,
153
+ )
154
+
155
+ model = dict(
156
+ checkpoint=False, # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
157
+ num_attention_heads=NUM_ATTENTION_HEAD,
158
+ embed_split_hidden=True,
159
+ vocab_size=VOCAB_SIZE,
160
+ embed_grad_scale=1,
161
+ parallel_output=True,
162
+ hidden_size=HIDDEN_SIZE,
163
+ num_layers=NUM_LAYER,
164
+ mlp_ratio=MLP_RATIO,
165
+ apply_post_layer_norm=False,
166
+ dtype="torch.float16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
167
+ norm_type="rmsnorm",
168
+ layer_norm_epsilon=1e-5,
169
+ use_flash_attn=True,
170
+ num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
171
+ lvm_config=dict(
172
+ enable=True,
173
+ embedding_cfg=dict(
174
+ vq_model_path='/cache/ckpt/vqgan-f16-8192-laion/',
175
+ embedding_dim=HIDDEN_SIZE,
176
+ freeze_vq_model=True,
177
+ ),
178
+ )
179
+ )
180
+ """
181
+ zero1 parallel:
182
+ 1. if zero1 <= 0, The size of the zero process group is equal to the size of the dp process group,
183
+ so parameters will be divided within the range of dp.
184
+ 2. if zero1 == 1, zero is not used, and all dp groups retain the full amount of model parameters.
185
+ 3. zero1 > 1 and zero1 <= dp world size, the world size of zero is a subset of dp world size.
186
+ For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
187
+ pipeline parallel (dict):
188
+ 1. size: int, the size of pipeline parallel.
189
+ 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler.
190
+ tensor parallel: tensor parallel size, usually the number of GPUs per node.
191
+ """
192
+ parallel = dict(
193
+ zero1=8,
194
+ pipeline=dict(size=1, interleaved_overlap=True),
195
+ sequence_parallel=False,
196
+ )
197
+
198
+ cudnn_deterministic = False
199
+ cudnn_benchmark = False
200
+
201
+ monitor = dict(
202
+ # feishu alert configs
203
+ alert=dict(
204
+ enable_feishu_alert=DO_ALERT,
205
+ feishu_alert_address=None, # feishu webhook to send alert message
206
+ light_monitor_address=None, # light_monitor address to send heartbeat
207
+ ),
208
+ )
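Note on the kd_config keys at the top of this file: the distillation loss itself is implemented elsewhere in the repo and is not part of this 50-file view, so the following is only an illustrative sketch of how a `gt_weight` / `kd_weight` / `temperature` triple is conventionally consumed: a weighted sum of the hard-label cross-entropy and a temperature-scaled KL term between student and teacher logits. The function name and signature below are hypothetical.

```python
import torch.nn.functional as F

def kd_loss(student_logits, teacher_logits, labels,
            gt_weight=1.0, kd_weight=1.0, temperature=1.0):
    """Illustrative sketch only, not the repo's own implementation.

    Combines ground-truth cross-entropy with a soft-target KL term,
    mirroring kd_config = dict(gt_weight=1., kd_weight=1., temperature=1).
    """
    vocab = student_logits.size(-1)
    # Hard-label term: standard next-token cross-entropy against the labels.
    ce = F.cross_entropy(student_logits.view(-1, vocab), labels.view(-1))
    # Soft-label term: KL between temperature-softened distributions,
    # scaled by T**2 so its gradient magnitude stays comparable to the CE term.
    t = temperature
    kl = F.kl_div(
        F.log_softmax(student_logits / t, dim=-1),
        F.softmax(teacher_logits / t, dim=-1),
        reduction="batchmean",
    ) * (t * t)
    return gt_weight * ce + kd_weight * kl
```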
InternLM/configs/pretrain_300m.py ADDED
@@ -0,0 +1,168 @@
1
+ JOB_NAME = "lvm_llama"
2
+ DO_ALERT = False
3
+ model_type = "INTERNLM"
4
+
5
+ SEQ_LEN = 2048
6
+ HIDDEN_SIZE = 1024
7
+ NUM_ATTENTION_HEAD = 8
8
+ MLP_RATIO = 8 / 3
9
+ NUM_LAYER = 22
10
+ VOCAB_SIZE = 8192
11
+
12
+ MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
13
+ SAVE_CKPT_FOLDER = "local:/path_to_save/"
14
+ LOAD_CKPT_FOLDER = "local:/path_to_load/"
15
+
16
+ CHECKPOINT_EVERY = 10000
17
+ ckpt = dict(
18
+ enable_save_ckpt=True, # set True to enable ckpt save.
19
+ save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
20
+ # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["all"], ckpt_type="normal"),
21
+ # load_ckpt_folder="local:llm_ckpts/",
22
+ # 'load_ckpt_info' setting guide:
23
+ # 1. the 'path' indicates the ckpt path,
24
+ # 2. the 'content' means what states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
25
+ # 3. the 'ckpt_type' means the type of checkpoint to be loaded; now only the 'normal' type is supported.
26
+ # load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
27
+ checkpoint_every=CHECKPOINT_EVERY,
28
+ async_upload=True, # async ckpt upload. (only work for boto3 ckpt)
29
+ async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
30
+ # oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
31
+ oss_snapshot_freq=0,
32
+ )
33
+
34
+ TRAIN_FOLDER = "/path/to/dataset"
35
+ VALID_FOLDER = "/path/to/dataset"
36
+ data = dict(
37
+ seq_len=SEQ_LEN,
38
+ # micro_num means the number of micro_batch contained in one gradient update
39
+ micro_num=1,
40
+ # packed_length = micro_bsz * SEQ_LEN
41
+ micro_bsz=16,
42
+ # defaults to the value of micro_num
43
+ valid_micro_num=1,
44
+ # defaults to 0, means disable evaluate
45
+ valid_every=0,
46
+ pack_sample_into_one=False,
47
+ train_one_epoch=False,
48
+ total_steps=40000,
49
+ skip_batches="",
50
+ rampup_batch_size="",
51
+ # Datasets with less than 50 rows will be discarded
52
+ min_length=50,
53
+ train_folder=TRAIN_FOLDER,
54
+ valid_folder=None,
55
+ empty_cache_and_diag_interval=10000,
56
+ diag_outlier_ratio=1.1,
57
+ )
58
+
59
+ grad_scaler = dict(
60
+ fp16=dict(
61
+ # the initial loss scale, defaults to 2**16
62
+ initial_scale=2**16,
63
+ # the minimum loss scale, defaults to None
64
+ min_scale=1,
65
+ # the number of steps to increase loss scale when no overflow occurs
66
+ growth_interval=1000,
67
+ ),
68
+ # the multiplication factor for increasing loss scale, defaults to 2
69
+ growth_factor=2,
70
+ # the multiplication factor for decreasing loss scale, defaults to 0.5
71
+ backoff_factor=0.5,
72
+ # the maximum loss scale, defaults to None
73
+ max_scale=2**24,
74
+ # the number of overflows before decreasing loss scale, defaults to 2
75
+ hysteresis=2,
76
+ )
77
+
78
+ hybrid_zero_optimizer = dict(
79
+ # Enable low_level_optimizer overlap_communication
80
+ overlap_sync_grad=True,
81
+ overlap_sync_param=True,
82
+ # bucket size for nccl communication params
83
+ reduce_bucket_size=512 * 1024 * 1024,
84
+ # grad clipping
85
+ clip_grad_norm=1.0,
86
+ )
87
+
88
+ loss = dict(
89
+ label_smoothing=0,
90
+ )
91
+
92
+ adam = dict(
93
+ lr=1.5e-4,
94
+ adam_beta1=0.9,
95
+ adam_beta2=0.95,
96
+ adam_beta2_c=0,
97
+ adam_eps=1e-8,
98
+ weight_decay=0.1,
99
+ )
100
+
101
+ lr_scheduler = dict(
102
+ total_steps=data["total_steps"],
103
+ init_steps=0, # optimizer_warmup_step
104
+ warmup_ratio=0.0056,
105
+ eta_min=1.5e-5,
106
+ last_epoch=-1,
107
+ )
108
+
109
+ beta2_scheduler = dict(
110
+ init_beta2=adam["adam_beta2"],
111
+ c=adam["adam_beta2_c"],
112
+ cur_iter=-1,
113
+ )
114
+
115
+ model = dict(
116
+ checkpoint=False, # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
117
+ num_attention_heads=NUM_ATTENTION_HEAD,
118
+ embed_split_hidden=True,
119
+ vocab_size=VOCAB_SIZE,
120
+ embed_grad_scale=1,
121
+ parallel_output=True,
122
+ hidden_size=HIDDEN_SIZE,
123
+ num_layers=NUM_LAYER,
124
+ mlp_ratio=MLP_RATIO,
125
+ apply_post_layer_norm=False,
126
+ dtype="torch.float16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
127
+ norm_type="rmsnorm",
128
+ layer_norm_epsilon=1e-5,
129
+ use_flash_attn=True,
130
+ num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
131
+ lvm_config=dict(
132
+ enable=True,
133
+ embedding_cfg=dict(
134
+ vq_model_path='/cache/ckpt/vqgan-f16-8192-laion/',
135
+ embedding_dim=HIDDEN_SIZE,
136
+ freeze_vq_model=True,
137
+ ),
138
+ )
139
+ )
140
+ """
141
+ zero1 parallel:
142
+ 1. if zero1 <= 0, The size of the zero process group is equal to the size of the dp process group,
143
+ so parameters will be divided within the range of dp.
144
+ 2. if zero1 == 1, zero is not used, and all dp groups retain the full amount of model parameters.
145
+ 3. zero1 > 1 and zero1 <= dp world size, the world size of zero is a subset of dp world size.
146
+ For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
147
+ pipeline parallel (dict):
148
+ 1. size: int, the size of pipeline parallel.
149
+ 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler.
150
+ tensor parallel: tensor parallel size, usually the number of GPUs per node.
151
+ """
152
+ parallel = dict(
153
+ zero1=8,
154
+ pipeline=dict(size=1, interleaved_overlap=True),
155
+ sequence_parallel=False,
156
+ )
157
+
158
+ cudnn_deterministic = False
159
+ cudnn_benchmark = False
160
+
161
+ monitor = dict(
162
+ # feishu alert configs
163
+ alert=dict(
164
+ enable_feishu_alert=DO_ALERT,
165
+ feishu_alert_address=None, # feishu webhook to send alert message
166
+ light_monitor_address=None, # light_monitor address to send heartbeat
167
+ ),
168
+ )
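Note: as a quick sanity check on the data settings shared by both configs, the comments define packed_length = micro_bsz * SEQ_LEN and one gradient update per micro_num micro-batches, so the implied token throughput can be worked out directly. A minimal sketch; the data-parallel world size is an assumption here, since it is determined at launch time rather than in this file:

```python
# Values taken from the config above.
SEQ_LEN, micro_bsz, micro_num, total_steps = 2048, 16, 1, 40000

packed_length = micro_bsz * SEQ_LEN                   # 32,768 tokens per micro-batch
tokens_per_step_per_rank = micro_num * packed_length  # tokens per gradient update, per rank

dp_world_size = 8  # hypothetical launch with 8 data-parallel ranks
global_tokens_per_step = tokens_per_step_per_rank * dp_world_size
total_training_tokens = global_tokens_per_step * total_steps

print(f"{packed_length=} {global_tokens_per_step=} {total_training_tokens=:,}")
# packed_length=32768 global_tokens_per_step=262144 total_training_tokens=10,485,760,000
```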
InternLM/internlm/__init__.py ADDED
@@ -0,0 +1,10 @@
1
+ from .initialize.initialize_trainer import initialize_trainer, initialize_kd_trainer
2
+ from .initialize.launch import get_default_parser, launch_from_slurm, launch_from_torch
3
+
4
+ __all__ = [
5
+ "get_default_parser",
6
+ "initialize_kd_trainer",
7
+ "initialize_trainer",
8
+ "launch_from_slurm",
9
+ "launch_from_torch",
10
+ ]
InternLM/internlm/apis/__init__.py ADDED
File without changes
InternLM/internlm/apis/inference.py ADDED
@@ -0,0 +1,848 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ import torch
5
+ import torch.nn.functional as F
6
+ from torch import nn
7
+
8
+ __all__ = ["SequenceGenerator"]
9
+
10
+
11
+ class InferenceParams:
12
+ """
13
+ Intermediate cache objects for inference
14
+ """
15
+
16
+ def __init__(
17
+ self,
18
+ max_sequence_len,
19
+ max_batch_size,
20
+ sequence_len_offset=0,
21
+ batch_size_offset=0,
22
+ key_value_memory_dict: dict = None,
23
+ lengths_per_sample=None,
24
+ attention_mask=None,
25
+ ) -> None:
26
+
27
+ self.max_sequence_len: int = max_sequence_len
28
+ self.max_batch_size: int = max_batch_size
29
+ self.sequence_len_offset: int = sequence_len_offset
30
+ self.batch_size_offset: int = batch_size_offset
31
+ if key_value_memory_dict is None:
32
+ key_value_memory_dict = {}
33
+ self.key_value_memory_dict: dict = key_value_memory_dict
34
+ self.fused_ft_kernel: bool = False
35
+ self.lengths_per_sample = lengths_per_sample
36
+ self.attention_mask = attention_mask
37
+
38
+ def reorder_state(self, indices):
39
+ if self.lengths_per_sample is not None:
40
+ self.lengths_per_sample = self.lengths_per_sample.index_select(index=indices, dim=0)
41
+ for key, value in list(self.key_value_memory_dict.items()):
42
+ value = value.index_select(index=indices, dim=0)
43
+ self.key_value_memory_dict[key] = value
44
+
45
+
46
+ def _get_model_device(model):
47
+ """
48
+ Obtain the device of an nn.Module model.
49
+
50
+ Args:
51
+ model: nn.Module
52
+
53
+ Return: torch.device, or None if the model has no parameters.
54
+ """
55
+ assert isinstance(model, nn.Module)
56
+
57
+ parameters = list(model.parameters())
58
+ if len(parameters) == 0:
59
+ return None
60
+ else:
61
+ return parameters[0].device
62
+
63
+
64
+ class SequenceGenerator:
65
+ """
66
+ Sequence Generator.
67
+ """
68
+
69
+ def __init__(self, decoder, eos_token_id, pad_token_id, bos_token_id):
70
+ self.decoder = decoder
71
+ self.eos_token_id = eos_token_id
72
+ self.pad_token_id = pad_token_id
73
+ self.bos_token_id = bos_token_id
74
+
75
+ @torch.no_grad()
76
+ def generate(
77
+ self,
78
+ tokens: "torch.LongTensor" = None,
79
+ num_return_sequences=1,
80
+ max_length: int = 20,
81
+ num_beams: int = 1,
82
+ do_sample: bool = True,
83
+ temperature: float = 1.0,
84
+ top_k: int = 50,
85
+ top_p: float = 1.0,
86
+ repetition_penalty: float = 1,
87
+ length_penalty: float = 1.0,
88
+ ):
89
+ """
90
+ Args:
91
+ tokens: the beginning tokens whose shape is [bsz, length]. If None, the default `bos_token` will be
92
+ added to conduct generation.
93
+ num_return_sequences: number of returned sequences.
94
+ max_length: the max length of generated sequence.
95
+ num_beams: the size of beam search.
96
+ do_sample: whether using sample.
97
+ temperature: it's meaningful when do_sample is True.
98
+ top_k: sampling from top_k.
99
+ top_p: sampling from top_p tokens(nucleus sampling).
100
+
101
+ Return:
102
+ the token sequence whose shape is [bsz, num_return_sequences, max_length]. If eos_token_id is not None,
103
+ the ending of each sequence must be eos_token_id.
104
+ """
105
+ assert num_return_sequences <= num_beams, f"num_return_sequences ({num_return_sequences}) must be <= num_beams ({num_beams})."
106
+ if do_sample:
107
+ return sample_generate(
108
+ self.decoder,
109
+ tokens=tokens,
110
+ max_length=max_length,
111
+ num_beams=num_beams,
112
+ num_return_sequences=num_return_sequences,
113
+ temperature=temperature,
114
+ top_k=top_k,
115
+ top_p=top_p,
116
+ eos_token_id=self.eos_token_id, # the ending token id
117
+ pad_token_id=self.pad_token_id,
118
+ repetition_penalty=repetition_penalty, # the penalty degree for repetition tokens
119
+ length_penalty=length_penalty, # the penalty for length. if it > 1, then encourages long sequence.
120
+ # Otherwise, encourages short sequence.
121
+ bos_token_id=self.bos_token_id,
122
+ )
123
+ else:
124
+ return greedy_generate(
125
+ self.decoder,
126
+ tokens=tokens,
127
+ max_length=max_length,
128
+ num_beams=num_beams,
129
+ num_return_sequences=num_return_sequences,
130
+ eos_token_id=self.eos_token_id,
131
+ pad_token_id=self.pad_token_id,
132
+ repetition_penalty=repetition_penalty,
133
+ length_penalty=length_penalty,
134
+ bos_token_id=self.bos_token_id,
135
+ )
136
+
137
+
138
+ @torch.no_grad()
139
+ def greedy_generate(
140
+ decoder,
141
+ tokens=None,
142
+ max_length=20,
143
+ num_beams=1,
144
+ num_return_sequences=1,
145
+ eos_token_id=None,
146
+ pad_token_id=0,
147
+ repetition_penalty=1,
148
+ length_penalty=1.0,
149
+ bos_token_id=1,
150
+ feat_mask=None,
151
+ ffn_mask=None,
152
+ layer_mask=None,
153
+ ):
154
+ """
155
+ Search sequence greedily.
156
+
157
+ Args:
158
+ decoder: the Decoder object.
159
+ tokens: the shape is [batch size, length]. If None, generation begins with bos_token_id.
160
+ max_length: the max length for generated sequence.
161
+ num_beams: the size of beam to decode.
162
+ eos_token_id: the ending token id. If None, the decode length is max_length.
163
+ pad_token_id: the token id of pad.
164
+ repetition_penalty: the penalty degree for repetition tokens
165
+ length_penalty: the penalty for length.
166
+
167
+ """
168
+ if num_beams == 1:
169
+ token_ids = _no_beam_search_generate(
170
+ decoder,
171
+ tokens=tokens,
172
+ max_length=max_length,
173
+ temperature=1,
174
+ top_k=50,
175
+ top_p=1,
176
+ eos_token_id=eos_token_id,
177
+ do_sample=False,
178
+ repetition_penalty=repetition_penalty,
179
+ length_penalty=length_penalty,
180
+ pad_token_id=pad_token_id,
181
+ bos_token_id=bos_token_id,
182
+ feat_mask=feat_mask,
183
+ ffn_mask=ffn_mask,
184
+ layer_mask=layer_mask,
185
+ )
186
+ else:
187
+ token_ids = _beam_search_generate(
188
+ decoder,
189
+ tokens=tokens,
190
+ max_length=max_length,
191
+ num_beams=num_beams,
192
+ num_return_sequences=num_return_sequences,
193
+ temperature=1,
194
+ top_k=50,
195
+ top_p=1,
196
+ eos_token_id=eos_token_id,
197
+ do_sample=False,
198
+ repetition_penalty=repetition_penalty,
199
+ length_penalty=length_penalty,
200
+ pad_token_id=pad_token_id,
201
+ bos_token_id=bos_token_id,
202
+ feat_mask=feat_mask,
203
+ ffn_mask=ffn_mask,
204
+ layer_mask=layer_mask,
205
+ )
206
+
207
+ return token_ids
208
+
209
+
210
+ @torch.no_grad()
211
+ def sample_generate(
212
+ decoder,
213
+ tokens,
214
+ max_length=20,
215
+ num_beams=1,
216
+ num_return_sequences=1,
217
+ temperature=1.0,
218
+ top_k=50,
219
+ top_p=1.0,
220
+ eos_token_id=None,
221
+ pad_token_id=0,
222
+ repetition_penalty=1.0,
223
+ length_penalty=1.0,
224
+ bos_token_id=1,
225
+ ):
226
+ """
227
+ generate sequence in sampling way.
228
+
229
+ Args:
230
+ decoder: the Decoder object.
231
+ tokens: the shape is [batch size, length]. If None, generation begins with bos_token_id.
232
+ max_length: the max length for generated sequence.
233
+ num_beams: the size of beam to decode.
234
+ num_return_sequences: number of returned sequence.
235
+ temperature: annealing magnitude during sampling.
236
+ top_k: sampling from top_k. (Default: 50)
237
+ top_p: sampling from top_p tokens(nucleus sampling). (Default: 1.0)
238
+ eos_token_id: the ending token id. If None, the decode length is max_length.
239
+ pad_token_id: the token id of pad.
240
+ repetition_penalty: the penalty degree for repetition tokens
241
+ length_penalty: the penalty for length.
242
+
243
+ """
244
+ if num_beams == 1:
245
+ token_ids = _no_beam_search_generate(
246
+ decoder,
247
+ tokens=tokens,
248
+ max_length=max_length,
249
+ temperature=temperature,
250
+ top_k=top_k,
251
+ top_p=top_p,
252
+ eos_token_id=eos_token_id,
253
+ do_sample=True,
254
+ repetition_penalty=repetition_penalty,
255
+ length_penalty=length_penalty,
256
+ pad_token_id=pad_token_id,
257
+ bos_token_id=bos_token_id,
258
+ )
259
+ else:
260
+ token_ids = _beam_search_generate(
261
+ decoder,
262
+ tokens=tokens,
263
+ max_length=max_length,
264
+ num_beams=num_beams,
265
+ num_return_sequences=num_return_sequences,
266
+ temperature=temperature,
267
+ top_k=top_k,
268
+ top_p=top_p,
269
+ eos_token_id=eos_token_id,
270
+ do_sample=True,
271
+ repetition_penalty=repetition_penalty,
272
+ length_penalty=length_penalty,
273
+ pad_token_id=pad_token_id,
274
+ bos_token_id=bos_token_id,
275
+ )
276
+ return token_ids
277
+
278
+
279
+ @torch.no_grad()
280
+ def _no_beam_search_generate(
281
+ decoder,
282
+ tokens,
283
+ inference_params=None,
284
+ max_length=20,
285
+ temperature=1.0,
286
+ top_k=50,
287
+ top_p=1.0,
288
+ eos_token_id=None,
289
+ do_sample=True,
290
+ repetition_penalty=1.0,
291
+ length_penalty=1.0,
292
+ pad_token_id=0,
293
+ bos_token_id=1,
294
+ feat_mask=None,
295
+ ffn_mask=None,
296
+ layer_mask=None,
297
+ ):
298
+ # delete num_return_sequences=1 for lint check;
299
+ batch_size = tokens.size(0)
300
+ if eos_token_id is None:
301
+ _eos_token_id = -1
302
+ else:
303
+ _eos_token_id = eos_token_id
304
+
305
+ has_bos = torch.all(tokens[:, 0].eq(bos_token_id))
306
+ if has_bos:
307
+ bos_pos = torch.where(tokens.eq(bos_token_id), 1, 0)
308
+ bos_sum = bos_pos.cumsum(dim=-1)
309
+ bos_pos = torch.where(bos_sum.eq(bos_sum[:, -1:]), 0, 1)
310
+ to_atten_x = bos_pos[:, :, None]
311
+ to_atten_y = bos_pos[:, None, :]
312
+ # attention_mask = torch.einsum('bno,bom->bnm', to_atten_x, to_atten_y).eq(1)
313
+ else:
314
+ bos_pos = torch.where(tokens.eq(bos_token_id), 1, 0)
315
+ to_atten_x = bos_pos[:, :, None]
316
+ to_atten_y = bos_pos[:, None, :]
317
+ # attention_mask = torch.einsum('bno,bom->bnm', to_atten_x, to_atten_y).eq(1)
318
+ attention_mask = torch.logical_or(to_atten_x, to_atten_y).eq(1)
319
+ if inference_params is None:
320
+ inference_params = InferenceParams(
321
+ max_sequence_len=max_length,
322
+ max_batch_size=tokens.size(0),
323
+ sequence_len_offset=0,
324
+ batch_size_offset=0,
325
+ key_value_memory_dict=None,
326
+ lengths_per_sample=None,
327
+ attention_mask=attention_mask,
328
+ )
329
+
330
+ if layer_mask is None:
331
+ if feat_mask is None and ffn_mask is None:
332
+ scores = decoder(**{"input_ids": tokens, "inference_params": inference_params})
333
+ else:
334
+ scores = decoder(
335
+ **{
336
+ "input_ids": tokens,
337
+ "inference_params": inference_params,
338
+ "feat_mask": feat_mask,
339
+ "ffn_mask": ffn_mask,
340
+ }
341
+ )
342
+ else:
343
+ scores = decoder(
344
+ **{
345
+ "input_ids": tokens,
346
+ "inference_params": inference_params,
347
+ "feat_mask": feat_mask,
348
+ "ffn_mask": ffn_mask,
349
+ "layer_mask": layer_mask,
350
+ }
351
+ )
352
+
353
+ if isinstance(scores, (list, tuple)):
354
+ scores = scores[0]
355
+ scores = scores[:, -1].float()
356
+ inference_params.sequence_len_offset += tokens.size(1)
357
+ if _eos_token_id != -1:
358
+ scores[:, _eos_token_id] = -1e12
359
+ next_tokens = scores.argmax(dim=-1, keepdim=True)
360
+ token_ids = torch.cat([tokens, next_tokens], dim=1)
361
+ cur_len = token_ids.size(1)
362
+ dones = token_ids.new_zeros(batch_size).eq(1)
363
+ # tokens = tokens[:, -1:]
364
+
365
+ real_max_length = max_length
366
+ max_lengths = tokens.new_full((tokens.size(0),), fill_value=max_length, dtype=torch.long)
367
+
368
+ while cur_len < real_max_length:
369
+ # batch_size x vocab_size
370
+ if has_bos:
371
+ bos_pos = torch.where(token_ids.eq(bos_token_id), 1, 0)
372
+ bos_sum = bos_pos.cumsum(dim=-1)
373
+ bos_pos = torch.where(bos_sum.eq(bos_sum[:, -1:]), 0, 1)
374
+ to_atten_x = bos_pos[:, :, None]
375
+ to_atten_y = bos_pos[:, None, :]
376
+ # attention_mask = torch.einsum('bno,bom->bnm', to_atten_x, to_atten_y).eq(1)
377
+ else:
378
+ bos_pos = torch.where(token_ids.eq(bos_token_id), 1, 0)
379
+ to_atten_x = bos_pos[:, :, None]
380
+ to_atten_y = bos_pos[:, None, :]
381
+ # attention_mask = torch.einsum('bno,bom->bnm', to_atten_x, to_atten_y).eq(1)
382
+ attention_mask = torch.logical_or(to_atten_x, to_atten_y).eq(1)
383
+ inference_params.attention_mask = attention_mask
384
+ if layer_mask is None:
385
+ if feat_mask is None and ffn_mask is None:
386
+ scores = decoder(**{"input_ids": token_ids[:, -1:], "inference_params": inference_params})
387
+ else:
388
+ scores = decoder(
389
+ **{
390
+ "input_ids": token_ids[:, -1:],
391
+ "inference_params": inference_params,
392
+ "feat_mask": feat_mask,
393
+ "ffn_mask": ffn_mask,
394
+ }
395
+ )
396
+ else:
397
+ scores = decoder(
398
+ **{
399
+ "input_ids": token_ids[:, -1:],
400
+ "inference_params": inference_params,
401
+ "feat_mask": feat_mask,
402
+ "ffn_mask": ffn_mask,
403
+ "layer_mask": layer_mask,
404
+ }
405
+ )
406
+
407
+ if isinstance(scores, (list, tuple)):
408
+ scores = scores[0]
409
+ scores = scores[:, -1].float()
410
+ inference_params.sequence_len_offset += 1
411
+
412
+ if repetition_penalty != 1.0:
413
+ token_scores = scores.gather(dim=1, index=token_ids)
414
+ lt_zero_mask = token_scores.lt(0).float()
415
+ ge_zero_mask = lt_zero_mask.eq(0).float()
416
+ token_scores = (
417
+ lt_zero_mask * repetition_penalty * token_scores + ge_zero_mask / repetition_penalty * token_scores
418
+ )
419
+ scores.scatter_(dim=1, index=token_ids, src=token_scores)
420
+
421
+ if eos_token_id is not None and length_penalty != 1.0:
422
+ # batch_size x vocab_size
423
+ token_scores = scores / cur_len**length_penalty
424
+ eos_mask = scores.new_ones(scores.size(1))
425
+ eos_mask[eos_token_id] = 0
426
+ eos_mask = eos_mask.unsqueeze(0).eq(1)
427
+
428
+ scores = scores.masked_scatter(eos_mask, token_scores)
429
+
430
+ if do_sample:
431
+ if temperature > 0 and temperature != 1:
432
+ scores = scores / temperature
433
+
434
+ scores = top_k_top_p_filtering(scores, top_k, top_p, min_tokens_to_keep=2)
435
+ # add 1e-12 to avoid https://github.com/pytorch/pytorch/pull/27523
436
+ probs = F.softmax(scores, dim=-1) + 1e-12
437
+
438
+ next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1) # batch_size
439
+ else:
440
+ next_tokens = torch.argmax(scores, dim=-1) # batch_size
441
+
442
+ if _eos_token_id != -1:
443
+ next_tokens = next_tokens.masked_fill(max_lengths.eq(cur_len + 1), _eos_token_id)
444
+ next_tokens = next_tokens.masked_fill(dones, pad_token_id)
445
+ tokens = next_tokens.unsqueeze(1)
446
+
447
+ token_ids = torch.cat([token_ids, tokens], dim=-1) # batch_size x max_len
448
+
449
+ end_mask = next_tokens.eq(_eos_token_id)
450
+ dones = dones.__or__(end_mask)
451
+ cur_len += 1
452
+
453
+ if dones.min() == 1:
454
+ break
455
+
456
+ # if eos_token_id is not None:
457
+ # # setting the eos at the maximum length position
458
+ # tokens.scatter(index=max_lengths[:, None], dim=1, value=eos_token_id)
459
+ # if cur_len == max_length:
460
+ # # If eos is not reached by the maximum length, forcibly replace the last word with eos
461
+ # token_ids[:, -1].masked_fill_(~dones, eos_token_id)
462
+ # TODO Here we are simply adding an extra dimension for interface compatibility, but in the future it will need to
463
+ # be able to return multiple real results
464
+ return token_ids[:, None]
465
+
466
+
467
+ @torch.no_grad()
468
+ def _beam_search_generate(
469
+ decoder,
470
+ tokens,
471
+ inference_params=None,
472
+ max_length=20,
473
+ num_beams=4,
474
+ num_return_sequences=1,
475
+ temperature=1.0,
476
+ top_k=50,
477
+ top_p=1.0,
478
+ eos_token_id=None,
479
+ do_sample=True,
480
+ repetition_penalty=1.0,
481
+ length_penalty=1.0,
482
+ pad_token_id=0,
483
+ bos_token_id=1,
484
+ feat_mask=None,
485
+ ffn_mask=None,
486
+ layer_mask=None,
487
+ ) -> torch.LongTensor:
488
+
489
+ device = _get_model_device(decoder)
490
+ batch_size = tokens.size(0)
491
+
492
+ if eos_token_id is None:
493
+ _eos_token_id = -1
494
+ else:
495
+ _eos_token_id = eos_token_id
496
+
497
+ has_bos = torch.all(tokens[:, 0].eq(bos_token_id))
498
+
499
+ if has_bos:
500
+ bos_pos = torch.where(tokens.eq(bos_token_id), 1, 0)
501
+ bos_sum = bos_pos.cumsum(dim=-1)
502
+ bos_pos = torch.where(bos_sum.eq(bos_sum[:, -1:]), 0, 1)
503
+ to_atten_x = bos_pos[:, :, None]
504
+ to_atten_y = bos_pos[:, None, :]
505
+ # attention_mask = torch.einsum('bno,bom->bnm', to_atten_x, to_atten_y).eq(1)
506
+ else:
507
+ bos_pos = torch.where(tokens.eq(bos_token_id), 1, 0)
508
+ to_atten_x = bos_pos[:, :, None]
509
+ to_atten_y = bos_pos[:, None, :]
510
+ # attention_mask = torch.einsum('bno,bom->bnm', to_atten_x, to_atten_y).eq(1)
511
+ attention_mask = torch.logical_or(to_atten_x, to_atten_y).eq(1)
512
+
513
+ if inference_params is None:
514
+ inference_params = InferenceParams(
515
+ max_sequence_len=max_length,
516
+ max_batch_size=tokens.size(0),
517
+ sequence_len_offset=0,
518
+ batch_size_offset=0,
519
+ key_value_memory_dict=None,
520
+ lengths_per_sample=None,
521
+ attention_mask=attention_mask,
522
+ )
523
+
524
+ if layer_mask is None:
525
+ if feat_mask is None and ffn_mask is None:
526
+ scores = decoder(**{"input_ids": tokens, "inference_params": inference_params})
527
+ else:
528
+ scores = decoder(
529
+ **{
530
+ "input_ids": tokens,
531
+ "inference_params": inference_params,
532
+ "feat_mask": feat_mask,
533
+ "ffn_mask": ffn_mask,
534
+ }
535
+ )
536
+ else:
537
+ scores = decoder(
538
+ **{
539
+ "input_ids": tokens,
540
+ "inference_params": inference_params,
541
+ "feat_mask": feat_mask,
542
+ "ffn_mask": ffn_mask,
543
+ "layer_mask": layer_mask,
544
+ }
545
+ )
546
+
547
+ if isinstance(scores, (list, tuple)):
548
+ scores = scores[0]
549
+ scores = scores[:, -1].float()
550
+ inference_params.sequence_len_offset += tokens.size(1)
551
+ if _eos_token_id != -1:
552
+ scores[:, _eos_token_id] = -1e12
553
+ vocab_size = scores.size(1)
554
+ assert vocab_size >= num_beams, "num_beams should not be larger than the vocabulary size."
555
+
556
+ if do_sample:
557
+ probs = F.softmax(scores, dim=-1) + 1e-12
558
+ # (batch_size, num_beams)
559
+ next_tokens = torch.multinomial(probs, num_samples=num_beams)
560
+ logits = probs.log()
561
+ # (batch_size, num_beams)
562
+ next_scores = logits.gather(dim=1, index=next_tokens)
563
+ else:
564
+ scores = F.log_softmax(scores, dim=-1) # (batch_size, vocab_size)
565
+ # obtain (batch_size, num_beams), (batch_size, num_beams)
566
+ next_scores, next_tokens = torch.topk(scores, num_beams, dim=1, largest=True, sorted=True)
567
+
568
+ indices = torch.arange(batch_size, dtype=torch.long).to(device)
569
+ indices = indices.repeat_interleave(num_beams)
570
+ inference_params.reorder_state(indices)
571
+
572
+ # batch_size * num_beams x length
573
+ tokens = tokens.index_select(dim=0, index=indices)
574
+ # generated tokens (batch_size', cur_len)
575
+ token_ids = torch.cat([tokens, next_tokens.view(-1, 1)], dim=-1)
576
+ dones = [False] * batch_size
577
+
578
+ beam_scores = next_scores.view(-1) # batch_size * num_beams
579
+
580
+ cur_len = token_ids.size(1)
581
+
582
+ real_max_length = max_length
583
+ max_lengths = tokens.new_full((tokens.size(0),), fill_value=max_length, dtype=torch.long)
584
+ hypos = [
585
+ BeamHypotheses(num_beams, real_max_length, length_penalty, early_stopping=False) for _ in range(batch_size)
586
+ ]
587
+ # 0, num_beams, 2*num_beams, ...
588
+ batch_inds_with_numbeams_interval = (torch.arange(batch_size) * num_beams).view(-1, 1).to(token_ids)
589
+
590
+ while cur_len < real_max_length:
591
+ if has_bos:
592
+ bos_pos = torch.where(token_ids.eq(bos_token_id), 1, 0)
593
+ bos_sum = bos_pos.cumsum(dim=-1)
594
+ bos_pos = torch.where(bos_sum.eq(bos_sum[:, -1:]), 0, 1)
595
+ to_atten_x = bos_pos[:, :, None]
596
+ to_atten_y = bos_pos[:, None, :]
597
+ # attention_mask = torch.einsum('bno,bom->bnm', to_atten_x, to_atten_y).eq(1)
598
+ else:
599
+ bos_pos = torch.where(token_ids.eq(bos_token_id), 1, 0)
600
+ to_atten_x = bos_pos[:, :, None]
601
+ to_atten_y = bos_pos[:, None, :]
602
+ # attention_mask = torch.einsum('bno,bom->bnm', to_atten_x, to_atten_y).eq(1)
603
+ attention_mask = torch.logical_or(to_atten_x, to_atten_y).eq(1)
604
+
605
+ inference_params.attention_mask = attention_mask
606
+ # (bsz x num_beams, vocab_size)
607
+
608
+ if layer_mask is None:
609
+ if feat_mask is None and ffn_mask is None:
610
+ scores = decoder(**{"input_ids": token_ids[:, -1:], "inference_params": inference_params})
611
+ else:
612
+ scores = decoder(
613
+ **{
614
+ "input_ids": token_ids[:, -1:],
615
+ "inference_params": inference_params,
616
+ "feat_mask": feat_mask,
617
+ "ffn_mask": ffn_mask,
618
+ }
619
+ )
620
+ else:
621
+ scores = decoder(
622
+ **{
623
+ "input_ids": token_ids[:, -1:],
624
+ "inference_params": inference_params,
625
+ "feat_mask": feat_mask,
626
+ "ffn_mask": ffn_mask,
627
+ "layer_mask": layer_mask,
628
+ }
629
+ )
630
+
631
+ if isinstance(scores, (list, tuple)):
632
+ scores = scores[0]
633
+ scores = scores[:, -1].float()
634
+ inference_params.sequence_len_offset += 1
635
+ if repetition_penalty != 1.0:
636
+ token_scores = scores.gather(dim=1, index=token_ids)
637
+ lt_zero_mask = token_scores.lt(0).float()
638
+ ge_zero_mask = lt_zero_mask.eq(0).float()
639
+ token_scores = (
640
+ lt_zero_mask * repetition_penalty * token_scores + ge_zero_mask / repetition_penalty * token_scores
641
+ )
642
+ scores.scatter_(dim=1, index=token_ids, src=token_scores)
643
+
644
+ if _eos_token_id != -1:
645
+ max_len_eos_mask = max_lengths.eq(cur_len + 1)
646
+ eos_scores = scores[:, _eos_token_id]
647
+ scores[:, _eos_token_id] = torch.where(max_len_eos_mask, eos_scores + 1e32, eos_scores)
648
+
649
+ if do_sample:
650
+ if temperature > 0 and temperature != 1:
651
+ scores = scores / temperature
652
+
653
+ scores = top_k_top_p_filtering(scores, top_k, top_p, min_tokens_to_keep=num_beams + 1)
654
+ # add 1e-12 to avoid https://github.com/pytorch/pytorch/pull/27523
655
+ probs = F.softmax(scores, dim=-1) + 1e-12
656
+
657
+ # batch_size' x (num_beams+1)
658
+ _tokens = torch.multinomial(probs, num_samples=num_beams + 1)
659
+
660
+ logits = probs.log()
661
+ # batch_size' x (num_beams+1)
662
+ _scores = logits.gather(dim=1, index=_tokens)
663
+ # batch_size' x (num_beams+1)
664
+ _scores = _scores + beam_scores[:, None]
665
+ _scores = _scores.view(batch_size, num_beams * (num_beams + 1))
666
+ next_scores, ids = _scores.topk(2 * num_beams, dim=1, largest=True, sorted=True)
667
+ _tokens = _tokens.view(batch_size, num_beams * (num_beams + 1))
668
+ # (batch_size, 2*num_beams)
669
+ next_tokens = _tokens.gather(dim=1, index=ids)
670
+ # (batch_size, 2*num_beams)
671
+ from_which_beam = torch.floor(ids.float() / (num_beams + 1)).long()
672
+ else:
673
+ # (batch_size * num_beams, vocab_size)
674
+ scores = F.log_softmax(scores, dim=-1)
675
+ # (batch_size * num_beams, vocab_size)
676
+ _scores = scores + beam_scores[:, None]
677
+ # (batch_size, num_beams*vocab_size)
678
+ _scores = _scores.view(batch_size, -1)
679
+ # (bsz, 2*num_beams)
680
+ next_scores, ids = torch.topk(_scores, 2 * num_beams, dim=1, largest=True, sorted=True)
681
+ # (batch_size, 2*num_beams)
682
+ from_which_beam = torch.floor(ids.float() / vocab_size).long()
683
+ next_tokens = ids % vocab_size # (batch_size, 2*num_beams)
684
+
685
+ # next_scores, sorted_inds = next_scores.sort(dim=-1, descending=True)
686
+ # next_tokens = next_tokens.gather(dim=1, index=sorted_inds)
687
+ # from_which_beam = from_which_beam.gather(dim=1, index=sorted_inds)
688
+
689
+ not_eos_mask = next_tokens.ne(_eos_token_id)
690
+ keep_mask = not_eos_mask.cumsum(dim=1).le(num_beams)
691
+ keep_mask = not_eos_mask.__and__(keep_mask)
692
+
693
+ _next_tokens = next_tokens.masked_select(keep_mask).view(-1, 1)
694
+ _from_which_beam = from_which_beam.masked_select(keep_mask).view(batch_size, num_beams)
695
+ _next_scores = next_scores.masked_select(keep_mask).view(batch_size, num_beams)
696
+ beam_scores = _next_scores.view(-1)
697
+
698
+ flag = True
699
+ if cur_len + 1 == real_max_length:
700
+ eos_batch_idx = torch.arange(batch_size).to(next_tokens).repeat_interleave(repeats=num_beams, dim=0)
701
+ eos_beam_ind = torch.arange(num_beams).to(token_ids).repeat(batch_size)
702
+ eos_beam_idx = from_which_beam[:, :num_beams].reshape(-1)
703
+ else:
704
+ effective_eos_mask = next_tokens[:, :num_beams].eq(_eos_token_id) # batch_size x num_beams
705
+ if effective_eos_mask.sum().gt(0):
706
+ eos_batch_idx, eos_beam_ind = effective_eos_mask.nonzero(as_tuple=True)
707
+ eos_beam_idx = eos_batch_idx * num_beams * 2 + eos_beam_ind
708
+ eos_beam_idx = from_which_beam.view(-1)[eos_beam_idx]
709
+ else:
710
+ flag = False
711
+
712
+ if flag:
713
+ _token_ids = torch.cat([token_ids, _next_tokens], dim=-1)
714
+ for batch_idx, beam_ind, beam_idx in zip(
715
+ eos_batch_idx.tolist(), eos_beam_ind.tolist(), eos_beam_idx.tolist()
716
+ ):
717
+ if not dones[batch_idx]:
718
+ score = next_scores[batch_idx, beam_ind].item()
719
+ if _eos_token_id != -1:
720
+ hypos[batch_idx].add(_token_ids[batch_idx * num_beams + beam_idx, :cur_len].clone(), score)
721
+ else:
722
+ hypos[batch_idx].add(_token_ids[batch_idx * num_beams + beam_idx].clone(), score)
723
+
724
+ reorder_inds = (batch_inds_with_numbeams_interval + _from_which_beam).view(-1)
725
+ inference_params.reorder_state(reorder_inds)
726
+ token_ids = torch.cat([token_ids.index_select(index=reorder_inds, dim=0), _next_tokens], dim=-1)
727
+
728
+ for batch_idx in range(batch_size):
729
+ dones[batch_idx] = (
730
+ dones[batch_idx]
731
+ or hypos[batch_idx].is_done(next_scores[batch_idx, 0].item())
732
+ or max_lengths[batch_idx * num_beams] == cur_len + 1
733
+ )
734
+
735
+ cur_len += 1
736
+
737
+ if all(dones):
738
+ break
739
+
740
+ # select the best hypotheses
741
+ tgt_len = token_ids.new_zeros(batch_size, num_return_sequences)
742
+ best = []
743
+
744
+ for i, hypotheses in enumerate(hypos):
745
+ # best_hyp = max(hypotheses.hyp, key=lambda x: x[0])[1]
746
+ sorted_hyp = list(sorted(hypotheses.hyp, key=lambda x: x[0], reverse=True))
747
+ _best = []
748
+ for j, hyp in zip(range(num_return_sequences), sorted_hyp):
749
+ hyp = hyp[1]
750
+ if _eos_token_id != -1:
751
+ hyp = torch.cat([hyp, token_ids.new_ones(1) * _eos_token_id])
752
+ tgt_len[i, j] = len(hyp)
753
+ _best.append(hyp)
754
+ best.append(_best)
755
+
756
+ # generate target batch
757
+ decoded = token_ids.new_zeros(batch_size, num_return_sequences, tgt_len.max().item()).fill_(pad_token_id)
758
+ for i, hypo in enumerate(best):
759
+ for j, _hypo in enumerate(hypo):
760
+ decoded[i, j, : tgt_len[i, j]] = _hypo
761
+
762
+ return decoded
763
+
764
+
765
+ class BeamHypotheses(object):
766
+ """
767
+ BeamHypotheses
768
+ """
769
+
770
+ def __init__(self, num_beams, max_length, length_penalty, early_stopping):
771
+ """Initialize n-best list of hypotheses."""
772
+ self.max_length = max_length - 1 # ignoring bos_token
773
+ self.length_penalty = length_penalty
774
+ self.early_stopping = early_stopping
775
+ self.num_beams = num_beams
776
+ self.hyp = []
777
+ self.worst_score = 1e9
778
+
779
+ def __len__(self):
780
+ """Number of hypotheses in the list."""
781
+ return len(self.hyp)
782
+
783
+ def add(self, hyp, sum_logprobs):
784
+ """Add a new hypothesis to the list."""
785
+ score = sum_logprobs / len(hyp) ** self.length_penalty
786
+ if len(self) < self.num_beams or score > self.worst_score:
787
+ self.hyp.append((score, hyp))
788
+ if len(self) > self.num_beams:
789
+ sorted_scores = sorted([(s, idx) for idx, (s, _) in enumerate(self.hyp)])
790
+ del self.hyp[sorted_scores[0][1]]
791
+ self.worst_score = sorted_scores[1][0]
792
+ else:
793
+ self.worst_score = min(score, self.worst_score)
794
+
795
+ def is_done(self, best_sum_logprobs):
796
+ """If there are enough hypotheses and that none of the hypotheses being
797
+ generated can become better than the worst one in the heap, then we are
798
+ done with this sentence."""
799
+ if len(self) < self.num_beams:
800
+ return False
801
+ elif self.early_stopping:
802
+ return True
803
+ else:
804
+ return self.worst_score >= best_sum_logprobs / self.max_length**self.length_penalty
805
+
806
+
807
+ def top_k_top_p_filtering(logits, top_k=0, top_p=1.0, filter_value=-float("Inf"), min_tokens_to_keep=1):
808
+ """
809
+ Based on the values of top_k and top_p, set the values that do not meet the criteria to the filter_value.
810
+
811
+ Args:
812
+ logits: logit value, shape is [bsz, vocab_size].
813
+ top_k: If it is greater than 0, only the probabilities of the top_k vocabulary are kept, and the rest of
814
+ the positions are set to filter_value.
815
+ top_p: according to http://arxiv.org/abs/1904.09751.
816
+ filter_value: filter value
817
+ min_tokens_to_keep: the number of tokens kept in each sample's returned distribution will not be
818
+ lower than this value.
819
+
820
+ """
821
+ if top_k > 0:
822
+ # Safety check
823
+ top_k = min(max(top_k, min_tokens_to_keep), logits.size(-1))
824
+ # Remove all tokens with a probability less than the last token of
825
+ # the top-k
826
+ indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
827
+ logits[indices_to_remove] = filter_value
828
+
829
+ if top_p < 1.0:
830
+ sorted_logits, sorted_indices = torch.sort(logits, descending=True)
831
+ cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
832
+
833
+ # Remove tokens with cumulative probability above the threshold
834
+ # (token with 0 are kept)
835
+ sorted_indices_to_remove = cumulative_probs > top_p
836
+ if min_tokens_to_keep > 1:
837
+ # Keep at least min_tokens_to_keep
838
+ # (set to min_tokens_to_keep-1 because we add the first one below)
839
+ sorted_indices_to_remove[..., :min_tokens_to_keep] = 0
840
+ # Shift the indices to the right to keep also the first token
841
+ # above the threshold
842
+ sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
843
+ sorted_indices_to_remove[..., 0] = 0
844
+
845
+ # scatter sorted tensors to original indexing
846
+ indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
847
+ logits[indices_to_remove] = filter_value
848
+ return logits
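Note: the module-level top_k_top_p_filtering helper defined at the end of inference.py can be exercised in isolation; a small usage sketch follows. The import path assumes the InternLM package root is on PYTHONPATH, and .clone() is used because the helper writes filter_value into the logits tensor in place:

```python
import torch
import torch.nn.functional as F

from internlm.apis.inference import top_k_top_p_filtering

# Toy batch of one sample over a 6-token vocabulary.
logits = torch.tensor([[2.0, 1.5, 1.0, 0.5, -1.0, -3.0]])

# top_k=3: everything below the third-highest logit is set to -inf.
top_k_only = top_k_top_p_filtering(logits.clone(), top_k=3, top_p=1.0)

# top_p=0.8: keep the smallest prefix of probability-sorted tokens whose
# cumulative probability exceeds 0.8 (here the first three tokens).
top_p_only = top_k_top_p_filtering(logits.clone(), top_k=0, top_p=0.8)

# The filtered logits are then renormalised and sampled from, as in
# _no_beam_search_generate above.
probs = F.softmax(top_p_only, dim=-1)
next_token = torch.multinomial(probs, num_samples=1)
print(top_k_only, probs, next_token, sep="\n")
```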
InternLM/internlm/core/__init__.py ADDED
@@ -0,0 +1,9 @@
1
+ from .engine import Engine
2
+ from .naive_amp import NaiveAMPModel
3
+ from .trainer import Trainer
4
+
5
+ __all__ = [
6
+ "NaiveAMPModel",
7
+ "Engine",
8
+ "Trainer",
9
+ ]
InternLM/internlm/core/communication/__init__.py ADDED
@@ -0,0 +1,32 @@
1
+ from .p2p import (
2
+ AsynCommunicator,
3
+ recv_backward,
4
+ recv_forward,
5
+ send_backward,
6
+ send_backward_and_recv_next_backward_async,
7
+ send_backward_recv_backward,
8
+ send_backward_recv_forward,
9
+ send_forward,
10
+ send_forward_and_recv_next_forward_async,
11
+ send_forward_backward_recv_forward_backward,
12
+ send_forward_recv_backward,
13
+ send_forward_recv_forward,
14
+ )
15
+ from .utils import recv_obj_meta, send_obj_meta
16
+
17
+ __all__ = [
18
+ "send_forward",
19
+ "send_forward_recv_forward",
20
+ "send_forward_backward_recv_forward_backward",
21
+ "send_backward",
22
+ "send_backward_recv_backward",
23
+ "send_backward_recv_forward",
24
+ "send_forward_recv_backward",
25
+ "recv_backward",
26
+ "recv_forward",
27
+ "send_obj_meta",
28
+ "recv_obj_meta",
29
+ "send_backward_and_recv_next_backward_async",
30
+ "send_forward_and_recv_next_forward_async",
31
+ "AsynCommunicator",
32
+ ]
InternLM/internlm/core/communication/p2p.py ADDED
@@ -0,0 +1,582 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ # adapted from https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/communication
5
+
6
+ import operator
7
+ from functools import reduce
8
+ from typing import List, Tuple, Union
9
+
10
+ import torch
11
+ import torch.distributed as dist
12
+
13
+ from internlm.core.context import ParallelMode
14
+ from internlm.core.context import global_context as gpc
15
+ from internlm.utils.common import get_current_device
16
+
17
+ from .utils import gather_split_1d_tensor, split_tensor_into_1d_equal_chunks
18
+
19
+ TensorShape = Union[torch.Size, List[int], Tuple[int]]
20
+
21
+
22
+ def _get_tensor_shape(tensor_shape: TensorShape, chunk_tensor: bool = False) -> Tuple[TensorShape, bool]:
23
+ """get the exact tensor shape when communicating and return whether the tensor is a chunk
24
+
25
+ Args:
26
+ tensor_shape (:class:`torch.Size`): shape of tensor
27
+ chunk_tensor (bool, optional): whether to chunk tensor, defaults to False
28
+
29
+ Returns:
30
+ Tuple[Union[:class:`torch.Size`, List[int], Tuple[int]], bool]: exact tensor shape, whether to chunk tensor
31
+ """
32
+ if chunk_tensor:
33
+ tensor_chunk_shape = reduce(operator.mul, tensor_shape, 1)
34
+ tensor_parallel_world_size = gpc.get_world_size(ParallelMode.TENSOR)
35
+ if tensor_chunk_shape % tensor_parallel_world_size == 0:
36
+ tensor_chunk_shape = tensor_chunk_shape // tensor_parallel_world_size
37
+ else:
38
+ tensor_chunk_shape = tensor_shape
39
+ chunk_tensor = False
40
+ else:
41
+ tensor_chunk_shape = tensor_shape
42
+ return tensor_chunk_shape, chunk_tensor
43
+
44
+
45
+ def create_recv_buffer_with_shapes(recv_shapes, dtype, scatter_gather_tensors):
46
+ if isinstance(recv_shapes, torch.Size):
47
+ recv_chunk_shape, recv_split = _get_tensor_shape(recv_shapes, scatter_gather_tensors)
48
+ buffer_recv = torch.empty(recv_chunk_shape, requires_grad=True, device=get_current_device(), dtype=dtype)
49
+ return buffer_recv, recv_split
50
+ buffer_recv = []
51
+ for recv_shape in recv_shapes:
52
+ recv_chunk_shape, recv_split = _get_tensor_shape(recv_shape, scatter_gather_tensors)
53
+ tensor_recv = torch.empty(recv_chunk_shape, requires_grad=True, device=get_current_device(), dtype=dtype)
54
+ buffer_recv.append(tensor_recv)
55
+ return buffer_recv, recv_split
56
+
57
+
58
+ def process_object_to_send(object_send, scatter_gather_tensors):
59
+ if isinstance(object_send, torch.Tensor):
60
+ send_split = _get_tensor_shape(object_send.shape, scatter_gather_tensors)[1]
61
+ if send_split:
62
+ object_send = split_tensor_into_1d_equal_chunks(object_send)
63
+ return object_send
64
+
65
+ object_send_list = []
66
+ for tensor_send in object_send:
67
+ send_split = _get_tensor_shape(tensor_send.shape, scatter_gather_tensors)[1]
68
+ if send_split:
69
+ object_send_list.append(split_tensor_into_1d_equal_chunks(tensor_send))
70
+ else:
71
+ object_send_list.append(tensor_send)
72
+ object_send = tuple(object_send_list)
73
+
74
+ return object_send
75
+
76
+
77
+ def filling_ops_queue(obj, comm_op, comm_rank, ops_queue):
78
+ if isinstance(obj, torch.Tensor):
79
+ op_to_add = dist.P2POp(comm_op, obj, comm_rank)
80
+ ops_queue.append(op_to_add)
81
+ else:
82
+ for tensor_to_comm in obj:
83
+ op_to_add = dist.P2POp(comm_op, tensor_to_comm, comm_rank)
84
+ ops_queue.append(op_to_add)
85
+
86
+
87
+ def _communicate(
88
+ object_send_next: Union[torch.Tensor, List[torch.Tensor]] = None,
89
+ object_send_prev: Union[torch.Tensor, List[torch.Tensor]] = None,
90
+ recv_prev: bool = False,
91
+ recv_next: bool = False,
92
+ recv_prev_shape: Union[torch.Size, List[torch.Size]] = None,
93
+ recv_next_shape: Union[torch.Size, List[torch.Size]] = None,
94
+ prev_rank: int = None,
95
+ next_rank: int = None,
96
+ dtype: torch.dtype = None,
97
+ scatter_gather_tensors: bool = False,
98
+ ) -> Tuple[Union[torch.Tensor, List[torch.Tensor]]]:
99
+ """
100
+ Adapted from megatron.p2p_communication.
101
+ Communicate tensors between stages. Used as helper method in other
102
+ communication methods that are used in pipeline schedule.
103
+ Takes the following arguments:
104
+ object_send_next (Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]): tensor to send to next rank
105
+ (no tensor sent if set to None).
106
+ object_send_prev (Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]): tensor to send to prev rank
107
+ (no tensor sent if set to None).
108
+ recv_prev (bool): boolean for whether tensor should be received from
109
+ previous rank.
110
+ recv_next (bool): boolean for whether tensor should be received from
111
+ next rank.
112
+ recv_prev_shape (Union[:class:`torch.Size`, List[:class:`torch.Size`]]): shape of the tensor to be received
113
+ from the previous stage, defaults to None.
114
+ recv_next_shape (Union[:class:`torch.Size`, List[:class:`torch.Size`]]): shape of the tensor to be received
115
+ from the next stage, defaults to None.
116
+ prev_rank (int): the rank of the previous pipeline stage, defaults to None.
117
+ next_rank (int): the rank of the next pipeline stage, defaults to None.
118
+ dtype (torch.dtype): data type of intermediate buffers, defaults to None
119
+ scatter_gather_tensors (bool): whether to scatter and gather tensor between pipeline stages, defaults to False
120
+
121
+ Returns:
122
+ Tuple[Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]]: returns tensor_recv_prev, tensor_recv_next
123
+ """
124
+
125
+ # Create placeholder tensors for receive in forward and backward directions
126
+ # if needed.
127
+ tensor_recv_prev = None
128
+ tensor_recv_next = None
129
+
130
+ if recv_prev:
131
+ assert recv_prev_shape is not None
132
+ tensor_recv_prev, recv_prev_split = create_recv_buffer_with_shapes(
133
+ recv_prev_shape, dtype, scatter_gather_tensors
134
+ )
135
+
136
+ if recv_next:
137
+ assert recv_next_shape is not None
138
+ tensor_recv_next, recv_next_split = create_recv_buffer_with_shapes(
139
+ recv_next_shape, dtype, scatter_gather_tensors
140
+ )
141
+
142
+ if object_send_prev is not None or recv_prev:
143
+ if prev_rank is None:
144
+ prev_rank = gpc.get_prev_global_rank(ParallelMode.PIPELINE)
145
+
146
+ if object_send_next is not None or recv_next:
147
+ if next_rank is None:
148
+ next_rank = gpc.get_next_global_rank(ParallelMode.PIPELINE)
149
+
150
+ if object_send_prev is not None:
151
+ object_send_prev = process_object_to_send(object_send_prev, scatter_gather_tensors)
152
+
153
+ if object_send_next is not None:
154
+ object_send_next = process_object_to_send(object_send_next, scatter_gather_tensors)
155
+
156
+ ops = []
157
+ if object_send_prev is not None:
158
+ filling_ops_queue(object_send_prev, dist.isend, prev_rank, ops)
159
+
160
+ if tensor_recv_prev is not None:
161
+ filling_ops_queue(tensor_recv_prev, dist.irecv, prev_rank, ops)
162
+
163
+ if tensor_recv_next is not None:
164
+ filling_ops_queue(tensor_recv_next, dist.irecv, next_rank, ops)
165
+
166
+ if object_send_next is not None:
167
+ filling_ops_queue(object_send_next, dist.isend, next_rank, ops)
168
+
169
+ if len(ops) > 0:
170
+ reqs = dist.batch_isend_irecv(ops)
171
+ for req in reqs:
172
+ req.wait()
173
+ # To protect against race condition when using batch_isend_irecv().
174
+ torch.cuda.synchronize()
175
+
176
+ if recv_prev and recv_prev_split:
177
+ if isinstance(tensor_recv_prev, torch.Tensor):
178
+ tensor_recv_prev = gather_split_1d_tensor(tensor_recv_prev).view(recv_prev_shape).requires_grad_()
179
+ else:
180
+ for index in range(len(tensor_recv_prev)):
181
+ tensor_recv_prev[index] = (
182
+ gather_split_1d_tensor(tensor_recv_prev[index]).view(recv_prev_shape[index]).requires_grad_()
183
+ )
184
+
185
+ if recv_next and recv_next_split:
186
+ if isinstance(tensor_recv_next, torch.Tensor):
187
+ tensor_recv_next = gather_split_1d_tensor(tensor_recv_next).view(recv_next_shape).requires_grad_()
188
+ else:
189
+ for index in range(len(tensor_recv_next)):
190
+ tensor_recv_next[index] = (
191
+ gather_split_1d_tensor(tensor_recv_next[index]).view(recv_next_shape[index]).requires_grad_()
192
+ )
193
+
194
+ return tensor_recv_prev, tensor_recv_next
195
+
196
+
197
+ def recv_forward(
198
+ input_tensor_shape, prev_rank=None, dtype=torch.float, scatter_gather_tensors=False
199
+ ) -> Union[torch.Tensor, List[torch.Tensor]]:
200
+ """Copy the forward output from the previous stage in pipeline as the input tensor of this stage.
201
+
202
+ Args:
203
+ input_tensor_shape (Union[:class:`torch.Size`, List[:class:`torch.Size`]]): The shape of the tensor
204
+ to be received.
205
+ prev_rank (int, optional): The rank of the source of the tensor.
206
+
207
+ Returns:
208
+ Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]: The input tensor or input tensor list.
209
+ """
210
+ input_tensor, _ = _communicate(
211
+ recv_prev=True,
212
+ recv_prev_shape=input_tensor_shape,
213
+ prev_rank=prev_rank,
214
+ dtype=dtype,
215
+ scatter_gather_tensors=scatter_gather_tensors,
216
+ )
217
+ return input_tensor
218
+
219
+
220
+ def recv_backward(
221
+ output_grad_shape, next_rank=None, dtype=torch.float, scatter_gather_tensors=False
222
+ ) -> Union[torch.Tensor, List[torch.Tensor]]:
223
+ """Copy the gradient tensor from the next stage in pipeline as the input gradient of this stage.
224
+
225
+ Args:
226
+ output_grad_shape (Union[:class:`torch.Size`, List[:class:`torch.Size`]]): The shape of the tensor
227
+ to be received.
228
+ next_rank (int, optional): The rank of the source of the tensor.
229
+
230
+ Returns:
231
+ Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]: The input gradient tensor or gradient tensor list.
232
+ """
233
+ _, output_tensor_grad = _communicate(
234
+ recv_next=True,
235
+ recv_next_shape=output_grad_shape,
236
+ next_rank=next_rank,
237
+ dtype=dtype,
238
+ scatter_gather_tensors=scatter_gather_tensors,
239
+ )
240
+ return output_tensor_grad
241
+
242
+
243
+ def send_forward(output_tensor, next_rank=None, scatter_gather_tensors=False) -> None:
244
+ """Sends the input tensor to the next stage in pipeline.
245
+
246
+ Args:
247
+ output_tensor (Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]): Tensor to be sent.
248
+ next_rank (int, optional): The rank of the recipient of the tensor.
249
+ """
250
+ _communicate(object_send_next=output_tensor, next_rank=next_rank, scatter_gather_tensors=scatter_gather_tensors)
251
+
252
+
253
+ def send_backward(input_tensor_grad, prev_rank=None, scatter_gather_tensors=False) -> None:
254
+ """Sends the gradient tensor to the previous stage in pipeline.
255
+
256
+ Args:
257
+ input_tensor_grad (Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]): Tensor to be sent
258
+ prev_rank (int, optional): The rank of the recipient of the tensor
259
+ """
260
+
261
+ _communicate(object_send_prev=input_tensor_grad, prev_rank=prev_rank, scatter_gather_tensors=scatter_gather_tensors)
262
+
263
+
264
+ def send_forward_recv_backward(
265
+ output_tensor, output_grad_shape, next_rank=None, dtype=torch.float, scatter_gather_tensors=False
266
+ ) -> Union[torch.Tensor, List[torch.Tensor]]:
267
+ """Batched communication operation. Sends the input tensor to the
268
+ next stage in pipeline, while receives the gradient tensor from the
269
+ next stage in pipeline as the input gradient tensor of this stage.
270
+
271
+ Args:
272
+ output_tensor (Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]): Tensor to be sent.
273
+ output_grad_shape (Union[:class:`torch.Size`, List[:class:`torch.Size`]]): The shape of the tensor
274
+ to be received.
275
+
276
+ Returns:
277
+ Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]: The input gradient tensor.
278
+ """
279
+ _, output_tensor_grad = _communicate(
280
+ object_send_next=output_tensor,
281
+ recv_next=output_grad_shape is not None,
282
+ recv_next_shape=output_grad_shape,
283
+ next_rank=next_rank,
284
+ dtype=dtype,
285
+ scatter_gather_tensors=scatter_gather_tensors,
286
+ )
287
+
288
+ return output_tensor_grad
289
+
290
+
291
+ def send_backward_recv_forward(
292
+ input_tensor_grad,
293
+ input_tensor_shape,
294
+ prev_rank=None,
295
+ dtype=torch.float,
296
+ scatter_gather_tensors=False,
297
+ ) -> Union[torch.Tensor, List[torch.Tensor]]:
298
+ """Batched communication operation. Sends the gradient tensor to the
299
+ previous stage in pipeline, while receives the output tensor from the
300
+ previous stage in pipeline as the input of this stage.
301
+
302
+ Args:
303
+ input_tensor_grad (Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]): Tensor to be sent.
304
+ input_tensor_shape (Union[:class:`torch.Size`, List[:class:`torch.Size`]]): The shape of the tensor
305
+ to be received.
306
+
307
+ Returns:
308
+ Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]: The input tensor.
309
+ """
310
+ input_tensor, _ = _communicate(
311
+ object_send_prev=input_tensor_grad,
312
+ recv_prev=input_tensor_shape is not None,
313
+ recv_prev_shape=input_tensor_shape,
314
+ prev_rank=prev_rank,
315
+ dtype=dtype,
316
+ scatter_gather_tensors=scatter_gather_tensors,
317
+ )
318
+
319
+ return input_tensor
320
+
321
+
322
+ def send_forward_recv_forward(
323
+ output_tensor,
324
+ input_tensor_shape,
325
+ prev_rank=None,
326
+ next_rank=None,
327
+ dtype=torch.float,
328
+ scatter_gather_tensors=False,
329
+ ) -> Union[torch.Tensor, List[torch.Tensor]]:
330
+ """Batched communication operation. Sends the input tensor to the
331
+ next stage in pipeline, while receives the output tensor from the
332
+ previous stage in pipeline as the input of this stage.
333
+
334
+ Args:
335
+ output_tensor (Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]): Tensor to be sent.
336
+ input_tensor_shape (Union[:class:`torch.Size`, List[:class:`torch.Size`]]): The shape of the tensor
337
+ to be received.
338
+
339
+ Returns:
340
+ Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]: The input tensor.
341
+ """
342
+ input_tensor, _ = _communicate(
343
+ object_send_next=output_tensor,
344
+ recv_prev=input_tensor_shape is not None,
345
+ recv_prev_shape=input_tensor_shape,
346
+ prev_rank=prev_rank,
347
+ next_rank=next_rank,
348
+ dtype=dtype,
349
+ scatter_gather_tensors=scatter_gather_tensors,
350
+ )
351
+ return input_tensor
352
+
353
+
354
+ def send_backward_recv_backward(
355
+ input_tensor_grad,
356
+ output_grad_shape,
357
+ prev_rank=None,
358
+ next_rank=None,
359
+ dtype=torch.float,
360
+ scatter_gather_tensors=False,
361
+ ) -> Union[torch.Tensor, List[torch.Tensor]]:
362
+ """Batched communication operation. Sends the gradient tensor to the
363
+ previous stage in pipeline, while receives the gradient tensor from the
364
+ next member in pipeline as the input of this stage.
365
+
366
+ Args:
367
+ input_tensor_grad (Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]): Tensor to be sent.
368
+ output_grad_shape (Union[:class:`torch.Size`, List[:class:`torch.Size`]]): The shape of the tensor
369
+ to be received.
370
+
371
+ Returns:
372
+ Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]: The input gradient tensor.
373
+ """
374
+ _, output_tensor_grad = _communicate(
375
+ object_send_prev=input_tensor_grad,
376
+ recv_next=output_grad_shape is not None,
377
+ recv_next_shape=output_grad_shape,
378
+ prev_rank=prev_rank,
379
+ next_rank=next_rank,
380
+ dtype=dtype,
381
+ scatter_gather_tensors=scatter_gather_tensors,
382
+ )
383
+ return output_tensor_grad
384
+
385
+
386
+ def send_forward_backward_recv_forward_backward(
387
+ output_tensor,
388
+ input_tensor_grad,
389
+ input_tensor_shape,
390
+ output_grad_shape,
391
+ prev_rank=None,
392
+ next_rank=None,
393
+ dtype=torch.float,
394
+ scatter_gather_tensors=False,
395
+ ) -> Tuple[Union[torch.Tensor, List[torch.Tensor]]]:
396
+ """Batched communication operation. Sends the input tensor to the next stage in pipeline and
397
+ the gradient tensor to the previous stage, while receives the input gradient tensor from the
398
+ next stage and the input tensor from the previous stage.
399
+
400
+ Args:
401
+ output_tensor (Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]): Tensor sent to the next.
402
+ input_tensor_grad (Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]): Tensor sent to the previous.
403
+ input_tensor_shape (Union[:class:`torch.Size`, List[:class:`torch.Size`]]): The shape of the tensor received
404
+ from the previous.
405
+ output_grad_shape (Union[:class:`torch.Size`, List[:class:`torch.Size`]]): The shape of the tensor received
406
+ from the next.
407
+
408
+ Returns:
409
+ Tuple(Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]], Union[:class:`torch.Tensor`,
410
+ List[:class:`torch.Tensor`]]): (the input tensor, the input gradient tensor)
411
+ """
412
+ input_tensor, output_tensor_grad = _communicate(
413
+ object_send_next=output_tensor,
414
+ object_send_prev=input_tensor_grad,
415
+ recv_prev=input_tensor_shape is not None,
416
+ recv_next=output_grad_shape is not None,
417
+ recv_prev_shape=input_tensor_shape,
418
+ recv_next_shape=output_grad_shape,
419
+ prev_rank=prev_rank,
420
+ next_rank=next_rank,
421
+ dtype=dtype,
422
+ scatter_gather_tensors=scatter_gather_tensors,
423
+ )
424
+ return input_tensor, output_tensor_grad
425
+
426
+
427
+ def send_forward_and_recv_next_forward_async(
428
+ output_tensor,
429
+ recv_prev_shape: Union[torch.Size, List[torch.Size]] = None,
430
+ dtype: torch.dtype = None,
431
+ scatter_gather_tensors=False,
432
+ ):
433
+ """send forward output to next rank and recv forward input from prev rank"""
434
+
435
+ reqs = []
436
+ tensor_recv_prev = None
437
+
438
+ # prepare send operations
439
+ if output_tensor is not None:
440
+ next_rank = gpc.get_next_global_rank(ParallelMode.PIPELINE)
441
+
442
+ output_tensor = process_object_to_send(output_tensor, scatter_gather_tensors)
443
+
444
+ if isinstance(output_tensor, torch.Tensor):
445
+ reqs.append(dist.P2POp(dist.isend, output_tensor, next_rank))
446
+ else:
447
+ for tensor_to_comm in output_tensor:
448
+ reqs.append(dist.P2POp(dist.isend, tensor_to_comm, next_rank))
449
+
450
+ # prepare receive operations
451
+ if recv_prev_shape is not None:
452
+ prev_rank = gpc.get_prev_global_rank(ParallelMode.PIPELINE)
453
+ # create receive buffer
454
+ tensor_recv_prev, recv_prev_split = create_recv_buffer_with_shapes(
455
+ recv_prev_shape, dtype, scatter_gather_tensors
456
+ )
457
+ # generate async receive operations
458
+ if isinstance(tensor_recv_prev, torch.Tensor):
459
+ reqs.append(dist.P2POp(dist.irecv, tensor_recv_prev, prev_rank))
460
+ else:
461
+ for tensor_to_comm in tensor_recv_prev:
462
+ reqs.append(dist.P2POp(dist.irecv, tensor_to_comm, prev_rank))
463
+
464
+ if len(reqs) > 0:
465
+ reqs = dist.batch_isend_irecv(reqs)
466
+
467
+ # return and do other things
468
+ yield
469
+
470
+ # check communication completed
471
+ for req in reqs:
472
+ req.wait()
473
+ # To protect against race condition when using batch_isend_irecv()
474
+ torch.cuda.synchronize()
475
+
476
+ # Process received data
477
+ if recv_prev_shape is not None and recv_prev_split:
478
+ if isinstance(tensor_recv_prev, torch.Tensor):
479
+ tensor_recv_prev = gather_split_1d_tensor(tensor_recv_prev).view(recv_prev_shape).requires_grad_()
480
+ else:
481
+ for index in range(len(tensor_recv_prev)):
482
+ tensor_recv_prev[index] = (
483
+ gather_split_1d_tensor(tensor_recv_prev[index]).view(recv_prev_shape[index]).requires_grad_()
484
+ )
485
+
486
+ yield tensor_recv_prev
487
+
488
+
489
+ def send_backward_and_recv_next_backward_async(
490
+ input_tensor,
491
+ recv_next_shape: Union[torch.Size, List[torch.Size]] = None,
492
+ dtype: torch.dtype = None,
493
+ scatter_gather_tensors=False,
494
+ ):
495
+ reqs = []
496
+ tensor_recv_next = None
497
+
498
+ # prepare send operations
499
+ if input_tensor is not None:
500
+ prev_rank = gpc.get_prev_global_rank(ParallelMode.PIPELINE)
501
+
502
+ input_tensor = process_object_to_send(input_tensor, scatter_gather_tensors)
503
+
504
+ if isinstance(input_tensor, torch.Tensor):
505
+ reqs.append(dist.P2POp(dist.isend, input_tensor, prev_rank))
506
+ else:
507
+ for tensor_to_comm in input_tensor:
508
+ reqs.append(dist.P2POp(dist.isend, tensor_to_comm, prev_rank))
509
+
510
+ # prepare receive operations
511
+ if recv_next_shape is not None:
512
+ next_rank = gpc.get_next_global_rank(ParallelMode.PIPELINE)
513
+ # create receive buffer
514
+ tensor_recv_next, recv_next_split = create_recv_buffer_with_shapes(
515
+ recv_next_shape, dtype, scatter_gather_tensors
516
+ )
517
+ # generate async receive operations
518
+ if isinstance(tensor_recv_next, torch.Tensor):
519
+ reqs.append(dist.P2POp(dist.irecv, tensor_recv_next, next_rank))
520
+ else:
521
+ for tensor_to_comm in tensor_recv_next:
522
+ reqs.append(dist.P2POp(dist.irecv, tensor_to_comm, next_rank))
523
+
524
+ if len(reqs) > 0:
525
+ reqs = dist.batch_isend_irecv(reqs)
526
+
527
+ # return and do other things
528
+ yield
529
+
530
+ # check communication completed
531
+ for req in reqs:
532
+ req.wait()
533
+ # To protect against race condition when using batch_isend_irecv()
534
+ torch.cuda.synchronize()
535
+
536
+ # Process received data
537
+ if recv_next_shape is not None and recv_next_split:
538
+ if isinstance(tensor_recv_next, torch.Tensor):
539
+ tensor_recv_next = gather_split_1d_tensor(tensor_recv_next).view(recv_next_shape).requires_grad_()
540
+ else:
541
+ for index in range(len(tensor_recv_next)):
542
+ tensor_recv_next[index] = (
543
+ gather_split_1d_tensor(tensor_recv_next[index]).view(recv_next_shape[index]).requires_grad_()
544
+ )
545
+
546
+ yield tensor_recv_next
547
+
548
+
549
+ class AsynCommunicator:
550
+ """AsynCommunicator for managing async communication."""
551
+
552
+ def __init__(
553
+ self,
554
+ tensor_to_send: Union[torch.Tensor, List[torch.Tensor]],
555
+ recv_shape: Union[torch.Size, List[torch.Size]],
556
+ dtype: torch.dtype = None,
557
+ scatter_gather_tensors=False,
558
+ forward: bool = True,
559
+ ) -> None:
560
+ self._need_receive = recv_shape is not None
561
+
562
+ if forward:
563
+ self._coroutine = send_forward_and_recv_next_forward_async(
564
+ tensor_to_send, recv_shape, dtype, scatter_gather_tensors
565
+ )
566
+ else:
567
+ self._coroutine = send_backward_and_recv_next_backward_async(
568
+ tensor_to_send, recv_shape, dtype, scatter_gather_tensors
569
+ )
570
+
571
+ @property
572
+ def need_receive(self) -> bool:
573
+ return self._need_receive
574
+
575
+ def start(self) -> None:
576
+ next(self._coroutine)
577
+
578
+ def wait_and_receive(self) -> Union[torch.Tensor, List[torch.Tensor]]:
579
+ received = next(self._coroutine)
580
+ self._coroutine.close()
581
+
582
+ return received
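Taken together, the helpers in p2p.py implement a pipeline stage's hand-offs. The snippet below is a minimal sketch of a forward hand-off, assuming a launched pipeline-parallel job in which `internlm` is importable; `input_shape` and `stage_module` are stand-in names for the activation shape (normally obtained via the meta helpers in utils.py) and the local model partition, and are not part of the uploaded code.

import torch

from internlm.core.communication.p2p import recv_forward, send_forward
from internlm.core.context import global_context as gpc

# Stand-ins for values a real schedule would already hold.
input_shape = torch.Size([1, 16, 32])   # assumed activation shape for one micro-batch
stage_module = torch.nn.Identity()      # placeholder for this stage's model partition

# Forward hand-off for one micro-batch on an intermediate pipeline stage.
if gpc.is_pipeline_first_stage():
    input_tensor = None                 # the first stage reads data instead of receiving it
else:
    input_tensor = recv_forward(input_shape)
output_tensor = stage_module(input_tensor)
if not gpc.is_pipeline_last_stage():
    send_forward(output_tensor)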
InternLM/internlm/core/communication/utils.py ADDED
@@ -0,0 +1,125 @@
1
+ # adopted from https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/communication
2
+
3
+ from typing import List, Tuple, Union
4
+
5
+ import torch
6
+ import torch.distributed as dist
7
+
8
+ from internlm.core.context import ParallelMode
9
+ from internlm.core.context import global_context as gpc
10
+ from internlm.utils.common import get_current_device
11
+
12
+ TensorShape = Union[torch.Size, List[int], Tuple[int]]
13
+
14
+
15
+ def send_meta_helper(obj, next_rank, tensor_kwargs):
16
+ send_shape = torch.tensor(obj.size(), **tensor_kwargs)
17
+ send_ndims = torch.tensor(len(obj.size()), **tensor_kwargs)
18
+ dist.send(send_ndims, next_rank)
19
+ dist.send(send_shape, next_rank)
20
+
21
+
22
+ def send_obj_meta(obj, next_rank=None):
23
+ """Sends obj meta information before sending a specific obj.
24
+ Since the recipient must know the shape of the obj in p2p communications,
25
+ meta information of the obj should be sent before communications. This function
26
+ synchronizes with :func:`recv_obj_meta`.
27
+
28
+ Args:
29
+ obj (Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]): obj to be sent.
30
+ next_rank (int): The rank of the next member in pipeline parallel group.
32
+
33
+ Returns:
34
+ None
35
+ """
36
+ if next_rank is None:
37
+ next_rank = gpc.get_next_global_rank(ParallelMode.PIPELINE)
38
+
39
+ tensor_kwargs = {"dtype": torch.long, "device": get_current_device()}
40
+ if isinstance(obj, torch.Tensor):
41
+ send_obj_nums = torch.tensor(1, **tensor_kwargs)
42
+ dist.send(send_obj_nums, next_rank)
43
+ send_meta_helper(obj, next_rank, tensor_kwargs)
44
+ else:
45
+ send_obj_nums = torch.tensor(len(obj), **tensor_kwargs)
46
+ dist.send(send_obj_nums, next_rank)
47
+ for tensor_to_send in obj:
48
+ send_meta_helper(tensor_to_send, next_rank, tensor_kwargs)
49
+
50
+
51
+ def recv_meta_helper(prev_rank, tensor_kwargs):
52
+ recv_ndims = torch.empty((), **tensor_kwargs)
53
+ dist.recv(recv_ndims, prev_rank)
54
+ recv_shape = torch.empty(recv_ndims, **tensor_kwargs)
55
+ dist.recv(recv_shape, prev_rank)
56
+ return recv_shape
57
+
58
+
59
+ def recv_obj_meta(prev_rank=None) -> torch.Size:
60
+ """Receives obj meta information before receiving a specific obj.
61
+ Since the recipient must know the shape of the obj in p2p communications,
62
+ meta information of the obj should be received before communications. This function
63
+ synchronizes with :func:`send_obj_meta`.
64
+
65
+ Args:
66
+ prev_rank (int): The rank of the source of the obj.
68
+
69
+ Returns:
70
+ Union[:class:`torch.Size`, List[:class:`torch.Size`]]: The shape of the obj to be received.
71
+ """
72
+ if prev_rank is None:
73
+ prev_rank = gpc.get_prev_global_rank(ParallelMode.PIPELINE)
74
+
75
+ tensor_kwargs = {"dtype": torch.long, "device": get_current_device()}
76
+ recv_obj_nums = torch.empty((), **tensor_kwargs)
77
+ dist.recv(recv_obj_nums, prev_rank)
78
+ if recv_obj_nums.item() == 1:
79
+ recv_shape = recv_meta_helper(prev_rank, tensor_kwargs)
80
+ obj_shape = torch.Size(recv_shape)
81
+ else:
82
+ obj_shape = []
83
+ for _ in range(recv_obj_nums.item()):
84
+ recv_shape = recv_meta_helper(prev_rank, tensor_kwargs)
85
+ obj_shape.append(torch.Size(recv_shape))
86
+
87
+ return obj_shape
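# Illustrative sketch (single process, hypothetical tensor, not part of this file): the
# message sequence the two helpers above exchange for one tensor is the object count,
# then the number of dimensions, then the shape itself, all as torch.long tensors; the
# receiver rebuilds torch.Size([2, 3, 4]) from the final message.
example_obj = torch.zeros(2, 3, 4)
meta_messages = [
    torch.tensor(1),                    # send_obj_meta: one object follows
    torch.tensor(example_obj.dim()),    # send_meta_helper: number of dimensions
    torch.tensor(example_obj.size()),   # send_meta_helper: the shape itself
]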
88
+
89
+
90
+ def split_tensor_into_1d_equal_chunks(tensor: torch.Tensor, new_buffer=False) -> torch.Tensor:
91
+ """Break a tensor into equal 1D chunks.
92
+
93
+ Args:
94
+ tensor (:class:`torch.Tensor`): Tensor to be split before communication.
95
+ new_buffer (bool, optional): Whether to use a new buffer to store sliced tensor.
96
+
97
+ Returns:
98
+ :class:`torch.Tensor`: The split tensor
99
+ """
100
+ partition_size = torch.numel(tensor) // gpc.get_world_size(ParallelMode.TENSOR)
101
+ start_index = partition_size * gpc.get_local_rank(ParallelMode.TENSOR)
102
+ end_index = start_index + partition_size
103
+ if new_buffer:
104
+ data = torch.empty(partition_size, dtype=tensor.dtype, device=torch.cuda.current_device(), requires_grad=False)
105
+ data.copy_(tensor.view(-1)[start_index:end_index])
106
+ else:
107
+ data = tensor.view(-1)[start_index:end_index]
108
+ return data
109
+
110
+
111
+ def gather_split_1d_tensor(tensor: torch.Tensor) -> torch.Tensor:
112
+ """Opposite of above function, gather values from model parallel ranks.
113
+
114
+ Args:
115
+ tensor (:class:`torch.Tensor`): Tensor to be gathered after communication.
116
+ Returns:
117
+ :class:`torch.Tensor`: The gathered tensor.
118
+ """
119
+ world_size = gpc.get_world_size(ParallelMode.TENSOR)
120
+ numel = torch.numel(tensor)
121
+ numel_gathered = world_size * numel
122
+ gathered = torch.empty(numel_gathered, dtype=tensor.dtype, device=torch.cuda.current_device(), requires_grad=False)
123
+ chunks = [gathered[i * numel : (i + 1) * numel] for i in range(world_size)]
124
+ dist.all_gather(chunks, tensor, group=gpc.get_group(ParallelMode.TENSOR))
125
+ return gathered
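The two tensor helpers at the end of this file are inverses of each other. Below is a minimal single-process sketch (simulating a tensor-parallel world size of 2, with no process groups involved) of the slicing arithmetic they rely on; it is not part of the uploaded code.

import torch

world_size = 2                               # assumed tensor-parallel world size
full = torch.arange(8, dtype=torch.float32)  # tensor to be split before communication
partition_size = full.numel() // world_size

# Each rank keeps one contiguous 1D slice, as in split_tensor_into_1d_equal_chunks.
chunks = [full.view(-1)[r * partition_size : (r + 1) * partition_size] for r in range(world_size)]

# gather_split_1d_tensor concatenates the per-rank slices back into the original buffer.
restored = torch.cat(chunks)
assert torch.equal(restored, full)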
InternLM/internlm/core/context/__init__.py ADDED
@@ -0,0 +1,49 @@
1
+ from .parallel_context import (
2
+ IS_TENSOR_PARALLEL,
3
+ Config,
4
+ ParallelContext,
5
+ global_context,
6
+ )
7
+ from .process_group_initializer import (
8
+ Initializer_Data,
9
+ Initializer_Model,
10
+ Initializer_Nettest,
11
+ Initializer_Pipeline,
12
+ Initializer_Tensor,
13
+ Initializer_Zero1,
14
+ ParallelMode,
15
+ ProcessGroupInitializer,
16
+ )
17
+ from .random import (
18
+ add_seed,
19
+ get_current_mode,
20
+ get_seeds,
21
+ get_states,
22
+ seed,
23
+ set_mode,
24
+ set_seed_states,
25
+ sync_states,
26
+ )
27
+
28
+ __all__ = [
29
+ "Config",
30
+ "IS_TENSOR_PARALLEL",
31
+ "global_context",
32
+ "ParallelContext",
33
+ "ParallelMode",
34
+ "Initializer_Tensor",
35
+ "Initializer_Pipeline",
36
+ "Initializer_Data",
37
+ "Initializer_Zero1",
38
+ "Initializer_Nettest",
39
+ "ProcessGroupInitializer",
40
+ "Initializer_Model",
41
+ "seed",
42
+ "set_mode",
43
+ "add_seed",
44
+ "get_seeds",
45
+ "get_states",
46
+ "get_current_mode",
47
+ "set_seed_states",
48
+ "sync_states",
49
+ ]
InternLM/internlm/core/context/parallel_context.py ADDED
@@ -0,0 +1,569 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ # adopted from https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context
5
+
6
+ import inspect
7
+ import random
8
+ import socket
9
+ import sys
10
+ from collections import Counter
11
+ from importlib.machinery import SourceFileLoader
12
+ from pathlib import Path
13
+ from typing import Union
14
+
15
+ import numpy as np
16
+ import torch
17
+ import torch.distributed as dist
18
+
19
+ from internlm.utils.common import SingletonMeta
20
+ from internlm.utils.logger import get_logger
21
+ from internlm.utils.timeout import LLM_NCCL_TIMEOUT
22
+
23
+ from . import process_group_initializer as pgroup_initializer
24
+ from .process_group_initializer import ParallelMode
25
+ from .random import add_seed, get_seeds, set_mode
26
+
27
+ IS_TENSOR_PARALLEL = "is_tensor_parallel"
28
+
29
+ logger = get_logger(__file__)
30
+
31
+
32
+ class Config(dict):
33
+ """This is a wrapper class for dict objects so that values of which can be
34
+ accessed as attributes.
35
+
36
+ Args:
37
+ config (dict): The dict object to be wrapped.
38
+ """
39
+
40
+ def __init__(self, config: dict = None): # pylint: disable=W0231
41
+ if config is not None:
42
+ for k, v in config.items():
43
+ self._add_item(k, v)
44
+
45
+ def __missing__(self, key):
46
+ raise KeyError(key)
47
+
48
+ def __getattr__(self, key):
49
+ try:
50
+ value = super().__getitem__(key)
51
+ return value
52
+ except KeyError:
53
+ raise AttributeError(key)
54
+
55
+ def __setattr__(self, key, value):
56
+ super().__setitem__(key, value)
57
+
58
+ def _add_item(self, key, value):
59
+ if isinstance(value, dict):
60
+ self.__setattr__(key, Config(value))
61
+ else:
62
+ self.__setattr__(key, value)
63
+
64
+ def update(self, config):
65
+ assert isinstance(config, (Config, dict)), "can only update dictionary or Config objects."
66
+ for k, v in config.items():
67
+ self._add_item(k, v)
68
+ return self
69
+
70
+ @staticmethod
71
+ def from_file(filename: str) -> object:
72
+ """Reads a python file and constructs a corresponding :class:`Config` object.
73
+
74
+ Args:
75
+ filename (str): Name of the file to construct the return object.
76
+
77
+ Returns:
78
+ :class:`Config`: A :class:`Config` object constructed with information in the file.
79
+
80
+ Raises:
81
+ AssertionError: Raises an AssertionError if the file does not exist, or the file is not .py file
82
+ """
83
+
84
+ # check config path
85
+ if isinstance(filename, str):
86
+ filepath = Path(filename).absolute()
87
+ elif isinstance(filename, Path):
88
+ filepath = filename.absolute()
89
+
90
+ assert filepath.exists(), f"{filename} is not found, please check your configuration path"
91
+
92
+ # check extension
93
+ extension = filepath.suffix
94
+ assert extension == ".py", "only .py files are supported"
95
+
96
+ # import the config as module
97
+ remove_path = False
98
+ if filepath.parent not in sys.path:
99
+ sys.path.insert(0, (filepath))
100
+ remove_path = True
101
+
102
+ module_name = filepath.stem
103
+ source_file = SourceFileLoader(fullname=str(module_name), path=str(filepath))
104
+ module = source_file.load_module() # pylint: disable=W4902,E1120,W1505
105
+
106
+ # load into config
107
+ config = Config()
108
+
109
+ for k, v in module.__dict__.items():
110
+ if k.startswith("__") or inspect.ismodule(v) or inspect.isclass(v):
111
+ continue
112
+ else:
113
+ config._add_item(k, v)
114
+
115
+ # remove module
116
+ del sys.modules[module_name]
117
+ if remove_path:
118
+ sys.path.pop(0)
119
+
120
+ return config
121
+
122
+
123
+ class ParallelContext(metaclass=SingletonMeta):
124
+ """This class provides interface functions for users to get the parallel context,
125
+ such as the global rank, the local rank, the world size, etc. of each device.
126
+
127
+ """
128
+
129
+ def __init__(self):
130
+ # distributed settings
131
+ self._global_ranks = dict()
132
+ self._local_ranks = dict()
133
+ self._world_sizes = dict()
134
+ self._groups = dict()
135
+ self._cpu_groups = dict()
136
+ self._ranks_in_group = dict()
137
+
138
+ # load config from file
139
+ self._config = None
140
+
141
+ # default parallel args, will be overwritten during process group initialization
142
+ self.world_size = 1
143
+ self.data_parallel_size = 1
144
+ self.pipeline_parallel_size = 1
145
+ self.tensor_parallel_size = 1
146
+ self.zero1_parallel_size = -1
147
+ self.nettest_parallel_size = 1
148
+ self.num_processes_on_current_node = -1
149
+ self.virtual_pipeline_parallel_size = None
150
+ self.virtual_pipeline_parallel_rank = None
151
+
152
+ @property
153
+ def config(self):
154
+ return self._config
155
+
156
+ def load_config(self, config: Union[dict, str]):
157
+ """Loads the configuration from either a dict or a file.
158
+
159
+ Args:
160
+ config (dict or str): Either a dict containing the configuration information or the filename
161
+ of a file containing the configuration information.
162
+
163
+ Raises:
164
+ TypeError: Raises a TypeError if `config` is neither a dict nor a str.
165
+ """
166
+ if isinstance(config, str):
167
+ self._config = Config.from_file(config)
168
+ elif isinstance(config, dict):
169
+ self._config = Config(config)
170
+ else:
171
+ raise TypeError("Invalid type for config, only dictionary or string is supported")
172
+
173
+ def detect_num_processes_on_current_node(self):
174
+ hostname = socket.gethostname()
175
+ hostname_list = [None for _ in range(self.get_world_size(ParallelMode.GLOBAL))]
176
+ dist.all_gather_object(hostname_list, hostname, group=self.get_group(ParallelMode.GLOBAL))
177
+ counter = Counter(hostname_list)
178
+ self.num_processes_on_current_node = counter[hostname]
179
+
180
+ @staticmethod
181
+ def _check_parallel_mode(parallel_mode: ParallelMode):
182
+ assert isinstance(
183
+ parallel_mode, ParallelMode
184
+ ), f"expected the argument parallel_mode to be of enum ParallelMode, but got {type(parallel_mode)}"
185
+
186
+ def get_global_rank(self):
187
+ """Returns the global rank of the current device.
188
+
189
+ Returns:
190
+ int: The global rank of the current device
191
+ """
192
+ return self._global_ranks[ParallelMode.GLOBAL]
193
+
194
+ def get_local_rank(self, parallel_mode: ParallelMode):
195
+ """Returns the local rank of the current device.
196
+
197
+ Args:
198
+ parallel_mode: The parallel mode for the rank.
199
+
200
+ Returns:
201
+ int: The local rank of the current device for `parallel_mode`.
202
+ """
203
+ self._check_parallel_mode(parallel_mode)
204
+ return self._local_ranks.get(parallel_mode, 0)
205
+
206
+ def get_next_global_rank(self, parallel_mode: ParallelMode):
207
+ """Returns the global rank of the next device.
208
+
209
+ Args:
210
+ parallel_mode: The parallel mode for the rank.
211
+
212
+ Returns:
213
+ int: The global rank of the next device for `parallel_mode`.
214
+ """
215
+ self._check_parallel_mode(parallel_mode)
216
+
217
+ # get rank and world size
218
+ local_rank = self.get_local_rank(parallel_mode)
219
+ world_size = self.get_world_size(parallel_mode)
220
+ ranks_in_group = self.get_ranks_in_group(parallel_mode)
221
+
222
+ return ranks_in_group[(local_rank + 1) % world_size]
223
+
224
+ def get_prev_global_rank(self, parallel_mode: ParallelMode):
225
+ """Returns the global rank of the previous device.
226
+
227
+ Args:
228
+ parallel_mode: The chosen parallel mode.
229
+
230
+ Returns:
231
+ int: The global rank of the previous device for `parallel_mode`.
232
+ """
233
+ self._check_parallel_mode(parallel_mode)
234
+
235
+ # get rank and world size
236
+ local_rank = self.get_local_rank(parallel_mode)
237
+ world_size = self.get_world_size(parallel_mode)
238
+ ranks_in_group = self.get_ranks_in_group(parallel_mode)
239
+
240
+ return ranks_in_group[(local_rank - 1) % world_size]
241
+
242
+ def is_using_dp(self):
243
+ """Returns a boolean value indicating whether the current device is initilized with
244
+ ParallelMode.DATA and its world_size is greater than 1.
245
+ """
246
+ return self.is_initialized(ParallelMode.DATA) and self.get_world_size(ParallelMode.DATA) > 1
247
+
248
+ def is_using_tp(self):
249
+ """Returns a boolean value indicating whether the current device is initilized with
250
+ ParallelMode.TENSOR and its world_size is greater than 1.
251
+ """
252
+ return self.is_initialized(ParallelMode.TENSOR) and self.get_world_size(ParallelMode.TENSOR) > 1
253
+
254
+ def is_using_pp(self):
255
+ """Returns a boolean value indicating whether the current device is initilized with
256
+ ParallelMode.PIPELINE and its world_size is greater than 1.
257
+ """
258
+ return self.is_initialized(ParallelMode.PIPELINE) and self.get_world_size(ParallelMode.PIPELINE) > 1
259
+
260
+ def is_using_sequence(self):
261
+ """Returns a boolean value indicating whether the current device is initilized with
262
+ ParallelMode.SEQUENCE and its world_size is greater than 1.
263
+ """
264
+ return False
265
+ # return gpc.is_initialized(ParallelMode.SEQUENCE) and gpc.get_world_size(ParallelMode.SEQUENCE) > 1
266
+
267
+ def is_first_rank(self, parallel_mode: ParallelMode):
268
+ """Returns a boolean value indicating whether the current device is the first one
269
+ among its group for `parallel_mode`.
270
+
271
+ Args:
272
+ parallel_mode: The chosen parallel mode.
273
+
274
+ Returns:
275
+ bool: a boolean value indicating whether the current device is the first one
276
+ among its group for `parallel_mode`.
277
+ """
278
+ rank = 0
279
+ if self.is_initialized(parallel_mode):
280
+ rank = self.get_local_rank(parallel_mode)
281
+ return rank == 0
282
+
283
+ def is_rank_for_log(self):
284
+ """Returns a boolean value indicating whether the current device should print log."""
285
+ is_log_rank = (
286
+ self.is_first_rank(ParallelMode.DATA)
287
+ and self.is_first_rank(ParallelMode.TENSOR)
288
+ and self.is_last_rank(ParallelMode.PIPELINE)
289
+ )
290
+ return is_log_rank
291
+
292
+ def is_last_rank(self, parallel_mode: ParallelMode):
293
+ """Returns a boolean value indicating whether the current device is the last one
294
+ among its group for `parallel_mode`.
295
+
296
+ Args:
297
+ parallel_mode: The chosen parallel mode.
298
+
299
+ Returns:
300
+ bool: a boolean value indicating whether the current device is the last one
301
+ among its group for `parallel_mode`.
302
+ """
303
+ rank = 0
304
+ world_size = 1
305
+ if self.is_initialized(parallel_mode):
306
+ rank = self.get_local_rank(parallel_mode)
307
+ world_size = self.get_world_size(parallel_mode)
308
+ return rank == world_size - 1
309
+
310
+ def is_pipeline_first_stage(self, ignore_virtual=False):
311
+ if not ignore_virtual:
312
+ if self.virtual_pipeline_parallel_size is not None and self.virtual_pipeline_parallel_rank != 0:
313
+ return False
314
+ return self.is_first_rank(ParallelMode.PIPELINE)
315
+
316
+ def is_pipeline_last_stage(self, ignore_virtual=False):
317
+ if not ignore_virtual:
318
+ if (
319
+ self.virtual_pipeline_parallel_size is not None
320
+ and self.virtual_pipeline_parallel_rank != self.virtual_pipeline_parallel_size - 1
321
+ ):
322
+ return False
323
+ return self.is_last_rank(ParallelMode.PIPELINE)
324
+
325
+ def get_world_size(self, parallel_mode: ParallelMode):
326
+ """Returns the world size for `parallel_mode`.
327
+
328
+ Args:
329
+ parallel_mode: The chosen parallel mode.
330
+
331
+ Returns:
332
+ int: The world size for `parallel_mode`.
333
+ """
334
+ self._check_parallel_mode(parallel_mode)
335
+ return self._world_sizes.get(parallel_mode, 1)
336
+
337
+ def get_group(self, parallel_mode: ParallelMode):
338
+ """Returns the group of the current device for `parallel_mode`.
339
+
340
+ Args:
341
+ parallel_mode: The chosen parallel mode.
342
+
343
+ Returns:
344
+ torch.distributed.ProcessGroup: The group of the current device for `parallel_mode`.
345
+ """
346
+ self._check_parallel_mode(parallel_mode)
347
+ return self._groups[parallel_mode]
348
+
349
+ def get_ranks_in_group(self, parallel_mode: ParallelMode):
350
+ """Returns the rank of the current device for `parallel_mode` in the group.
351
+
352
+ Args:
353
+ parallel_mode: The chosen parallel mode.
354
+
355
+ Returns:
356
+ int: The rank of the current device for `parallel_mode` in the group.
357
+ """
358
+ self._check_parallel_mode(parallel_mode)
359
+ return self._ranks_in_group[parallel_mode]
360
+
361
+ def get_cpu_group(self, parallel_mode: ParallelMode):
362
+ self._check_parallel_mode(parallel_mode)
363
+ return self._cpu_groups[parallel_mode]
364
+
365
+ def init_global_dist(self, rank: int, world_size: int, backend: str, host: str, port: int, use_cpu: bool = False):
366
+ """Initializes the global distributed environment
367
+
368
+ Args:
369
+ rank (int): rank for the default process group.
370
+ world_size (int): world size of the default process group.
371
+ backend (str): backend for ``torch.distributed``
372
+ host (str): the master address for distributed training.
373
+ port (int): the master port for distributed training.
374
+ use_cpu (bool): whether to set up cpu process group.
375
+ """
376
+ # initialize the default process group
377
+ init_method = f"tcp://[{host}]:{port}"
378
+ dist.init_process_group(
379
+ rank=rank,
380
+ world_size=world_size,
381
+ backend=backend,
382
+ init_method=init_method,
383
+ timeout=LLM_NCCL_TIMEOUT,
384
+ )
385
+
386
+ # None will give the default global process group for pytorch dist operations
387
+ ranks = list(range(world_size))
388
+ if use_cpu:
389
+ cpu_group = (
390
+ dist.new_group(ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT)
391
+ if dist.get_backend() != "gloo"
392
+ else None
393
+ )
394
+ else:
395
+ cpu_group = None
396
+ self._register_dist(rank, world_size, dist.GroupMember.WORLD, cpu_group, ranks, ParallelMode.GLOBAL)
397
+ self._global_ranks[ParallelMode.GLOBAL] = rank
398
+
399
+ def _register_dist(self, local_rank, world_size, process_group, cpu_group, ranks_in_group, mode):
400
+ self._check_parallel_mode(mode)
401
+ self._local_ranks[mode] = local_rank
402
+ self._world_sizes[mode] = world_size
403
+ self._groups[mode] = process_group
404
+ self._cpu_groups[mode] = cpu_group
405
+ self._ranks_in_group[mode] = ranks_in_group
406
+
407
+ def check_sanity(self):
408
+ """Checks sanity of the parallel context.
409
+
410
+ Raises:
411
+ AssertionError: Raises an AssertionError if the world size does not equal to the product
412
+ of data parallel size, pipeline parallel size and tensor parallel size.
413
+ """
414
+ dps = self.data_parallel_size
415
+ pps = self.pipeline_parallel_size
416
+ tps = self.tensor_parallel_size
417
+ ws = self.world_size
418
+ assert ws == dps * pps * tps, (
419
+ f"Expected the world size {ws} to be equal to data"
420
+ f" parallel size ({dps}) * pipeline parallel size "
421
+ f"({pps}) * tensor parallel size ({tps})"
422
+ )
423
+ assert self.zero1_parallel_size > 0
424
+ assert self.data_parallel_size % self.zero1_parallel_size == 0
425
+
426
+ def _set_parallel_size_from_config(self, config: dict, key: str, attr_name: str):
427
+ if key in config:
428
+ ele = config[key]
429
+ if isinstance(ele, int):
430
+ setattr(self, attr_name, ele)
431
+ elif isinstance(ele, dict):
432
+ setattr(self, attr_name, ele["size"])
433
+ else:
434
+ raise NotImplementedError(
435
+ f'{"Parallel configuration does not support this kind of argument, please use int or dict"}'
436
+ )
437
+
438
+ def init_parallel_groups(self):
439
+ """Initializes the parallel groups."""
440
+
441
+ # get rank and world size
442
+ rank = self.get_global_rank()
443
+ world_size = self.get_world_size(ParallelMode.GLOBAL)
444
+ self.world_size = world_size
445
+
446
+ # set parallel size as attributes for global context
447
+ parallel_config = self.config.get("parallel", None)
448
+ if parallel_config is not None:
449
+ self._set_parallel_size_from_config(parallel_config, "pipeline", "pipeline_parallel_size")
450
+ self._set_parallel_size_from_config(parallel_config, "tensor", "tensor_parallel_size")
451
+ self._set_parallel_size_from_config(parallel_config, "zero1", "zero1_parallel_size")
452
+
453
+ # the user should not set the data parallel size manually
454
+ # instead, it should be calculated based on other parallel config
455
+ self.data_parallel_size = self.world_size // (self.pipeline_parallel_size * self.tensor_parallel_size)
456
+
457
+ # the recommended nettest_parallel_size is 32 GPUs
458
+ self.nettest_parallel_size = 32
459
+
460
+ if self.zero1_parallel_size <= 0:
461
+ self.zero1_parallel_size = self.data_parallel_size
462
+
463
+ self.check_sanity()
464
+
465
+ initializer_args = [
466
+ rank,
467
+ world_size,
468
+ self.data_parallel_size,
469
+ self.pipeline_parallel_size,
470
+ self.tensor_parallel_size,
471
+ self.zero1_parallel_size,
472
+ self.nettest_parallel_size,
473
+ ]
474
+
475
+ # run initialization of different process groups
476
+ initializers = []
477
+ initializers.append(pgroup_initializer.Initializer_Data(*initializer_args))
478
+ initializers.append(pgroup_initializer.Initializer_Model(*initializer_args))
479
+ initializers.append(pgroup_initializer.Initializer_Tensor(*initializer_args))
480
+ initializers.append(pgroup_initializer.Initializer_Zero1(*initializer_args))
481
+ initializers.append(pgroup_initializer.Initializer_Nettest(*initializer_args))
482
+ if self.pipeline_parallel_size > 1:
483
+ initializers.append(pgroup_initializer.Initializer_Pipeline(*initializer_args))
484
+ for initializer in initializers:
485
+ parallel_setting = initializer.init_dist_group()
486
+ if isinstance(parallel_setting, list):
487
+ for args in parallel_setting:
488
+ self._register_dist(*args)
489
+ else:
490
+ self._register_dist(*parallel_setting)
491
+
492
+ def is_initialized(self, parallel_mode: ParallelMode):
493
+ """Returns a boolean value indicating whether `parallel_mode` is initialized
494
+ in the current system.
495
+ """
496
+ return parallel_mode in self._groups
497
+
498
+ def destroy(self):
499
+ """Destroys the current distributed parallel environment."""
500
+ for mode, group in self._groups.items():
501
+ if mode is not ParallelMode.GLOBAL:
502
+ dist.destroy_process_group(group)
503
+ # destroy global process group
504
+ dist.destroy_process_group()
505
+ self._groups.clear()
506
+
507
+ def set_device(self, device_ordinal: int = None):
508
+ """Sets distributed processes to be bound to devices.
509
+
510
+ Args:
511
+ device_ordinal (int, optional): the device id to be bound to
512
+ """
513
+ global_rank = self.get_global_rank()
514
+ if device_ordinal is None:
515
+ devices_per_node = torch.cuda.device_count()
516
+ device_ordinal = global_rank % devices_per_node
517
+
518
+ torch.cuda.set_device(device_ordinal)
519
+ logger.info(f"process rank {global_rank} is bound to host:{socket.gethostname()} device: {device_ordinal}")
520
+
521
+ def set_seed(self, seed: int, dpseed_with_tpoffset: bool = False):
522
+ """Sets seeds for all random libraries.
523
+
524
+ Args:
525
+ seed (int): seed for random states
526
+ """
527
+ pipeline_offset = self._local_ranks.get(ParallelMode.PIPELINE, 0)
528
+ global_rank = self.get_global_rank()
529
+
530
+ random.seed(seed)
531
+ np.random.seed(seed)
532
+ torch.manual_seed(seed)
533
+ assert torch.cuda.is_available()
534
+
535
+ # data parallel seed are kept the same in the same pipeline stage
536
+ dp_seed = seed
537
+ if dpseed_with_tpoffset:
538
+ dp_seed = seed + pipeline_offset * 1024
539
+ add_seed(ParallelMode.DATA, dp_seed)
540
+ add_seed(ParallelMode.DUMMY, dp_seed)
541
+
542
+ # model parallel seeds are different across ranks
543
+ if self.is_initialized(ParallelMode.TENSOR):
544
+ tp_rank = self.get_local_rank(ParallelMode.TENSOR)
545
+ tp_seed = seed + tp_rank + pipeline_offset * 1024
546
+ add_seed(ParallelMode.TENSOR, tp_seed)
547
+
548
+ # we do not set the random state mode to ParallelMode.DATA until model is built (instead, we use a dummy mode
549
+ # during model construction), this is because the random state will be different in different tensor parallel
550
+ # device of the same data parallel group. The underlying reason is that the device of tp_rank = 0 will perform
551
+ # additional random operations during the RowParallelLinear module building process.
552
+ set_mode(ParallelMode.DUMMY)
553
+
554
+ seeds = get_seeds()
555
+ seed_str = ", ".join([f"{k}: {v}" for k, v in seeds.items()])
556
+ logger.info(
557
+ f"initialized seed on rank {global_rank}, "
558
+ f"numpy: {seed}, python random: {seed}, {seed_str},"
559
+ f"the default parallel seed is {ParallelMode.DATA}."
560
+ )
561
+
562
+ def set_virtual_pipeline_parallel_size(self, size):
563
+ self.virtual_pipeline_parallel_size = size
564
+
565
+ def set_virtual_pipeline_parallel_rank(self, rank):
566
+ self.virtual_pipeline_parallel_rank = rank
567
+
568
+
569
+ global_context = ParallelContext()
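Because `Config` is a plain dict wrapper, it can be exercised without any distributed setup. The snippet below is a small sketch with illustrative values only (not part of the uploaded code), showing the attribute-style access and the int-or-dict parallel sizes that `_set_parallel_size_from_config` accepts.

from internlm.core.context import Config

cfg = Config({"parallel": {"pipeline": 1, "tensor": {"size": 2}, "zero1": -1}})

assert cfg.parallel.tensor.size == 2       # nested dicts become nested Config objects
assert cfg["parallel"]["pipeline"] == 1    # plain dictionary access still works
assert cfg.parallel.zero1 == -1            # -1 makes zero1 default to the data parallel size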
InternLM/internlm/core/context/process_group_initializer.py ADDED
@@ -0,0 +1,418 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ # adopted from https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context
5
+
6
+ import math
7
+ from abc import ABC, abstractmethod
8
+ from enum import Enum
9
+
10
+ import torch.distributed as dist
11
+
12
+ from internlm.utils.timeout import LLM_NCCL_TIMEOUT
13
+
14
+
15
+ # parallel modes
16
+ class ParallelMode(Enum):
17
+ """This is an enumeration class containing all possible parallel modes."""
18
+
19
+ GLOBAL = "global"
20
+
21
+ # common parallel
22
+ DATA = "data"
23
+
24
+ # model parallel - containing tensor and pipeline parallel groups
25
+ # this is added to facilitate amp and grad clipping in hybrid parallel
26
+ MODEL = "model"
27
+
28
+ # pipeline parallel
29
+ PIPELINE = "pipe"
30
+
31
+ # containing all ranks in tensor parallel
32
+ TENSOR = "tensor"
33
+
34
+ # zero1 parallel
35
+ ZERO1 = "zero1"
36
+
37
+ # runntime network test
38
+ NETTEST = "nettest"
39
+
40
+ # dummy mode, only used during mode construction
41
+ DUMMY = "dummy"
42
+
43
+
44
+ class ProcessGroupInitializer(ABC):
45
+ """An object, knowing the parallelism configuration, that initializes parallel groups.
46
+
47
+ Args:
48
+ rank (int): The rank of current process.
49
+ world_size (int): Size of whole communication world.
50
+ data_parallel_size (int): Size of data parallel.
51
+ pipeline_parallel_size (int): Size of pipeline parallel.
52
+ tensor_parallel_size (int): Size of tensor parallel.
53
+ zero1_parallel_size (int): Size of zero1 parallel.
54
+ """
55
+
56
+ def __init__(
57
+ self,
58
+ rank: int,
59
+ world_size: int,
60
+ data_parallel_size: int,
61
+ pipeline_parallel_size: int,
62
+ tensor_parallel_size: int,
63
+ zero1_parallel_size: int,
64
+ nettest_parallel_size: int,
65
+ ):
66
+ self.rank = rank
67
+ self.world_size = world_size
68
+ self.data_parallel_size = data_parallel_size
69
+ self.pipeline_parallel_size = pipeline_parallel_size
70
+ self.tensor_parallel_size = tensor_parallel_size
71
+ self.zero1_parallel_size = zero1_parallel_size
72
+ self.nettest_parallel_size = nettest_parallel_size
73
+ super().__init__()
74
+
75
+ @abstractmethod
76
+ def init_dist_group(self, use_cpu: bool = False):
77
+ pass
78
+
79
+
80
+ class Initializer_Data(ProcessGroupInitializer):
81
+ """A ProcessGroupInitializer for data parallelism.
82
+
83
+ Args:
84
+ rank (int): The rank of current process.
85
+ world_size (int): Size of whole communication world.
86
+ data_parallel_size (int): Size of data parallel.
87
+ pipeline_parallel_size (int): Size of pipeline parallel.
88
+ tensor_parallel_size (int): Size of tensor parallel.
89
+ zero1_parallel_size (int): Size of zero1 parallel.
90
+ """
91
+
92
+ def __init__(self, *args, **kwargs):
93
+ super().__init__(*args, **kwargs)
94
+ self.rank_num_per_dp_group = self.world_size // self.data_parallel_size
95
+
96
+ assert self.world_size % self.data_parallel_size == 0
97
+
98
+ def init_dist_group(self, use_cpu: bool = False):
99
+ """Initialize data parallel groups, and assign local_ranks and groups to each gpu.
100
+
101
+ Returns:
102
+ Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
103
+ A Data parallelism's information tuple.
104
+ """
105
+ local_rank = None
106
+ ranks_in_group = None
107
+ process_group = None
108
+ cpu_group = None
109
+ group_world_size = None
110
+ mode = ParallelMode.DATA
111
+
112
+ for i in range(self.rank_num_per_dp_group):
113
+ ranks = [i + j * self.rank_num_per_dp_group for j in range(self.data_parallel_size)]
114
+ group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT)
115
+ if use_cpu:
116
+ group_cpu = (
117
+ dist.new_group(ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT)
118
+ if dist.get_backend() != "gloo"
119
+ else group
120
+ )
121
+ else:
122
+ group_cpu = None
123
+
124
+ if self.rank in ranks:
125
+ local_rank = ranks.index(self.rank)
126
+ group_world_size = len(ranks)
127
+ process_group = group
128
+ cpu_group = group_cpu
129
+ ranks_in_group = ranks
130
+
131
+ return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode
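# Illustrative sketch (hypothetical sizes, not part of this file): with world_size = 8 and
# data_parallel_size = 4, rank_num_per_dp_group = 2, so the loop above enumerates strided
# rank groups, one data-parallel group per position inside a model-parallel block.
example_world_size, example_dp_size = 8, 4
example_stride = example_world_size // example_dp_size
example_dp_groups = [
    [i + j * example_stride for j in range(example_dp_size)]
    for i in range(example_stride)
]
# example_dp_groups == [[0, 2, 4, 6], [1, 3, 5, 7]]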
132
+
133
+
134
+ class Initializer_Model(ProcessGroupInitializer):
135
+ """A ProcessGroupInitializer for model parallelism (model parallel group contains pipeline and tensor parallel
136
+ groups).
137
+
138
+ Args:
139
+ rank (int): The rank of current process.
140
+ world_size (int): Size of whole communication world.
141
+ data_parallel_size (int): Size of data parallel.
142
+ pipeline_parallel_size (int): Size of pipeline parallel.
143
+ tensor_parallel_size (int): Size of tensor parallel.
144
+ zero1_parallel_size (int): Size of zero1 parallel.
145
+ """
146
+
147
+ def __init__(self, *args, **kwargs):
148
+ super().__init__(*args, **kwargs)
149
+ self.rank_num_per_group = self.tensor_parallel_size * self.pipeline_parallel_size
150
+ self.num_group = self.world_size // self.rank_num_per_group
151
+
152
+ assert self.world_size % self.rank_num_per_group == 0
153
+
154
+ def init_dist_group(self, use_cpu: bool = False):
155
+ """Initialize model parallel groups, and assign local_ranks and groups to each gpu.
156
+
157
+ Returns:
158
+ Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
159
+ A Model parallelism's information tuple.
160
+ """
161
+ local_rank = None
162
+ ranks_in_group = None
163
+ process_group = None
164
+ cpu_group = None
165
+ group_world_size = None
166
+ mode = ParallelMode.MODEL
167
+
168
+ for i in range(self.num_group):
169
+ ranks = [i * self.rank_num_per_group + j for j in range(self.rank_num_per_group)]
170
+ group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT)
171
+ if use_cpu:
172
+ group_cpu = (
173
+ dist.new_group(ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT)
174
+ if dist.get_backend() != "gloo"
175
+ else group
176
+ )
177
+ else:
178
+ group_cpu = None
179
+
180
+ if self.rank in ranks:
181
+ local_rank = ranks.index(self.rank)
182
+ group_world_size = len(ranks)
183
+ process_group = group
184
+ cpu_group = group_cpu
185
+ ranks_in_group = ranks
186
+
187
+ return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode
188
+
189
+
190
+ class Initializer_Pipeline(ProcessGroupInitializer):
191
+ """A ProcessGroupInitializer for pipeline parallelism.
192
+
193
+ Args:
194
+ rank (int): The rank of current process
195
+ world_size (int): Size of whole communication world
196
+ data_parallel_size (int): Size of data parallel
197
+ pipeline_parallel_size (int): Size of pipeline parallel
198
+ tensor_parallel_size (int): Size of tensor parallel
199
+ zero1_parallel_size (int): Size of zero1 parallel.
200
+ """
201
+
202
+ def __init__(self, *args, **kwargs):
203
+ super().__init__(*args, **kwargs)
204
+ self.rank_num_per_dp_group = self.world_size // self.data_parallel_size
205
+ self.pipeline_stage_size = self.rank_num_per_dp_group // self.pipeline_parallel_size
206
+
207
+ assert self.world_size % self.data_parallel_size == 0
208
+ assert self.rank_num_per_dp_group % self.pipeline_parallel_size == 0
209
+
210
+ def init_dist_group(self, use_cpu: bool = False):
211
+ """Initialize pipeline parallel groups, and assign local_ranks and groups to each gpu.
212
+
213
+ Returns:
214
+ List[Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode)]:
215
+ A Pipeline parallelism's information in list of tuples.
216
+ """
217
+ local_rank = None
218
+ ranks_in_group = None
219
+ process_group = None
220
+ cpu_group = None
221
+ group_world_size = None
222
+ mode = ParallelMode.PIPELINE
223
+
224
+ for i in range(self.data_parallel_size):
225
+ for j in range(self.pipeline_stage_size):
226
+ ranks = list(
227
+ range(
228
+ i * self.rank_num_per_dp_group + j,
229
+ (i + 1) * self.rank_num_per_dp_group,
230
+ self.pipeline_stage_size,
231
+ )
232
+ )
233
+ pipe_group_size = len(ranks)
234
+ pipe_group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT)
235
+ if use_cpu:
236
+ group_cpu = (
237
+ dist.new_group(ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT)
238
+ if dist.get_backend() != "gloo"
239
+ else pipe_group
240
+ )
241
+ else:
242
+ group_cpu = None
243
+
244
+ if self.rank in ranks:
245
+ local_rank = ranks.index(self.rank)
246
+ group_world_size = pipe_group_size
247
+ process_group = pipe_group
248
+ cpu_group = group_cpu
249
+ ranks_in_group = ranks
250
+
251
+ return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode
252
+
253
+
254
+ class Initializer_Tensor(ProcessGroupInitializer):
255
+ """A ProcessGroupInitializer for tensor parallelism.
256
+
257
+ Args:
258
+ rank (int): The rank of current process.
259
+ world_size (int): Size of whole communication world.
260
+ data_parallel_size (int): Size of data parallel.
261
+ pipeline_parallel_size (int): Size of pipeline parallel.
262
+ tensor_parallel_size (int): Size of tensor parallel.
263
+ zero1_parallel_size (int): Size of zero1 parallel.
264
+ """
265
+
266
+ def __init__(self, *args, **kwargs):
267
+ super().__init__(*args, **kwargs)
268
+ self.num_tensor_parallel_group = self.world_size // self.tensor_parallel_size
269
+
270
+ assert self.world_size % self.tensor_parallel_size == 0
271
+
272
+ def init_dist_group(self, use_cpu: bool = False):
273
+ """Initialize tensor parallel groups, and assign local_ranks and groups to each gpu.
274
+
275
+ Returns:
276
+ Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
277
+ A Tensor parallelism's information tuple.
278
+ """
279
+ local_rank = None
280
+ ranks_in_group = None
281
+ process_group = None
282
+ cpu_group = None
283
+ group_world_size = None
284
+ mode = ParallelMode.TENSOR
285
+
286
+ for i in range(self.num_tensor_parallel_group):
287
+ ranks = [i * self.tensor_parallel_size + j for j in range(self.tensor_parallel_size)]
288
+ group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT)
289
+ if use_cpu:
290
+ group_cpu = (
291
+ dist.new_group(ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT)
292
+ if dist.get_backend() != "gloo"
293
+ else group
294
+ )
295
+ else:
296
+ group_cpu = None
297
+
298
+ if self.rank in ranks:
299
+ local_rank = ranks.index(self.rank)
300
+ group_world_size = len(ranks)
301
+ process_group = group
302
+ cpu_group = group_cpu
303
+ ranks_in_group = ranks
304
+
305
+ return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode
306
+
307
+
308
+ class Initializer_Zero1(ProcessGroupInitializer):
309
+ """A ProcessGroupInitializer for zero-1 parallelism.
310
+
311
+ Args:
312
+ rank (int): The rank of current process.
313
+ world_size (int): Size of whole communication world.
314
+ data_parallel_size (int): Size of data parallel.
315
+ pipeline_parallel_size (int): Size of pipeline parallel.
316
+ tensor_parallel_size (int): Size of tensor parallel.
317
+ zero1_parallel_size (int): Size of zero-1 parallel.
318
+ """
319
+
320
+ def __init__(self, *args, **kwargs):
321
+ super().__init__(*args, **kwargs)
322
+ self.rank_num_per_dp_group = self.world_size // self.data_parallel_size
323
+ self.num_zero1_parallel_group = self.data_parallel_size // self.zero1_parallel_size
324
+
325
+ assert self.world_size % self.data_parallel_size == 0
326
+ assert self.world_size % self.zero1_parallel_size == 0
327
+
328
+ def init_dist_group(self, use_cpu: bool = False):
329
+ """Initialize zero1 parallel groups, and assign local_ranks and groups to each gpu.
330
+
331
+ Returns:
332
+ Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
333
+ A zero1 parallelism's information tuple.
334
+ """
335
+ local_rank = None
336
+ ranks_in_group = None
337
+ process_group = None
338
+ cpu_group = None
339
+ group_world_size = None
340
+ mode = ParallelMode.ZERO1
341
+
342
+ for i in range(self.rank_num_per_dp_group):
343
+ for j in range(self.num_zero1_parallel_group):
344
+ ranks = [
345
+ i + (j * self.zero1_parallel_size + k) * self.rank_num_per_dp_group
346
+ for k in range(self.zero1_parallel_size)
347
+ ]
348
+ group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT)
349
+ if use_cpu:
350
+ group_cpu = (
351
+ dist.new_group(ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT)
352
+ if dist.get_backend() != "gloo"
353
+ else group
354
+ )
355
+ else:
356
+ group_cpu = None
357
+
358
+ if self.rank in ranks:
359
+ local_rank = ranks.index(self.rank)
360
+ group_world_size = len(ranks)
361
+ process_group = group
362
+ cpu_group = group_cpu
363
+ ranks_in_group = ranks
364
+
365
+ return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode
366
+
367
+
368
+ class Initializer_Nettest(ProcessGroupInitializer):
369
+ """A ProcessGroupInitializer for network test, especially for NCCL.
370
+
371
+ Args:
372
+ rank (int): The rank of current process.
373
+ world_size (int): Size of whole communication world.
374
+ nettest_parallel_size (int): Size of a network test group.
375
+ """
376
+
377
+ def __init__(self, *args, **kwargs):
378
+ super().__init__(*args, **kwargs)
379
+ self.num_nettest_group = math.ceil(self.world_size / self.nettest_parallel_size)
380
+
381
+ def init_dist_group(self, use_cpu: bool = False):
382
+ """Initialize network test groups, and assign local_ranks and groups to each gpu.
383
+
384
+ Returns:
385
+ Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
386
+ A network test group's information tuple.
387
+ """
388
+ local_rank = None
389
+ ranks_in_group = None
390
+ process_group = None
391
+ cpu_group = None
392
+ group_world_size = None
393
+ mode = ParallelMode.NETTEST
394
+
395
+ for i in range(self.num_nettest_group):
396
+ ranks = []
397
+ for j in range(self.nettest_parallel_size):
398
+ rank = i * self.nettest_parallel_size + j
399
+ if rank < self.world_size:
400
+ ranks.append(rank)
401
+ group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT)
402
+ if use_cpu:
403
+ group_cpu = (
404
+ dist.new_group(ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT)
405
+ if dist.get_backend() != "gloo"
406
+ else group
407
+ )
408
+ else:
409
+ group_cpu = None
410
+
411
+ if self.rank in ranks:
412
+ local_rank = ranks.index(self.rank)
413
+ group_world_size = len(ranks)
414
+ process_group = group
415
+ cpu_group = group_cpu
416
+ ranks_in_group = ranks
417
+
418
+ return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode
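To make the grouping arithmetic above concrete, here is a minimal standalone sketch that mirrors how `Initializer_Tensor` partitions a hypothetical 8-rank world into tensor-parallel groups; the sizes are illustrative assumptions and no process groups are actually created.

```python
# Reproduces the rank-partitioning arithmetic of Initializer_Tensor for an
# assumed 8-rank job with tensor_parallel_size=2 (illustrative values only).
world_size = 8
tensor_parallel_size = 2
num_tensor_parallel_group = world_size // tensor_parallel_size

tensor_groups = [
    [i * tensor_parallel_size + j for j in range(tensor_parallel_size)]
    for i in range(num_tensor_parallel_group)
]
print(tensor_groups)  # [[0, 1], [2, 3], [4, 5], [6, 7]]

# Each rank finds its local rank inside its own group, mirroring the
# `if self.rank in ranks` branch of init_dist_group.
rank = 5
for ranks in tensor_groups:
    if rank in ranks:
        print(ranks.index(rank), ranks)  # 1 [4, 5]
```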
InternLM/internlm/core/context/random.py ADDED
@@ -0,0 +1,131 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+ # adopted from https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context
4
+
5
+ from contextlib import contextmanager
6
+
7
+ import torch
8
+ import torch.cuda
9
+ from torch import Tensor
10
+
11
+ from .process_group_initializer import ParallelMode
12
+
13
+
14
+ class SeedManager:
15
+ """This class is a manager of all random seeds involved in the system."""
16
+
17
+ def __init__(self):
18
+ self._current_mode = None
19
+ self._seeds = {}
20
+ self._seed_states = {}
21
+
22
+ @property
23
+ def current_mode(self):
24
+ return self._current_mode
25
+
26
+ @property
27
+ def seeds(self):
28
+ return self._seeds
29
+
30
+ @property
31
+ def seed_states(self):
32
+ return self._seed_states
33
+
34
+ def set_state(self, parallel_mode: ParallelMode, state: Tensor):
35
+ """Sets the state of the seed manager for `parallel_mode`."""
36
+ assert parallel_mode in self._seed_states, f"{parallel_mode} not found in seed manager"
37
+ self._seed_states[parallel_mode] = state
38
+
39
+ def set_mode(self, parallel_mode: ParallelMode):
40
+ """Sets the current mode of the seed manager."""
41
+ if self.current_mode:
42
+ # save state for current mode
43
+ self._seed_states[self._current_mode] = torch.cuda.get_rng_state()
44
+
45
+ # set new state for new mode
46
+ self._current_mode = parallel_mode
47
+ torch.cuda.set_rng_state(self._seed_states[parallel_mode])
48
+
49
+ def add_seed(self, parallel_mode: ParallelMode, seed: int, overwrite: bool = False):
50
+ """Adds a seed to the seed manager for `parallel_mode`."""
51
+ assert isinstance(parallel_mode, ParallelMode), "Invalid ParallelMode"
52
+ if not overwrite:
53
+ assert parallel_mode not in self._seed_states, f"Seed for {parallel_mode} exists"
54
+ elif parallel_mode in self._seed_states:
55
+ print(f"Warning: {parallel_mode} seed overwritten.", flush=True)
56
+
57
+ current_state = torch.cuda.get_rng_state()
58
+ torch.cuda.manual_seed(seed)
59
+ self._seed_states[parallel_mode] = torch.cuda.get_rng_state()
60
+ self._seeds[parallel_mode] = seed
61
+ torch.cuda.set_rng_state(current_state)
62
+
63
+ def reset(self):
64
+ self._current_mode = None
65
+ self._seeds = {}
66
+ self._seed_states = {}
67
+
68
+
69
+ _SEED_MANAGER = SeedManager()
70
+
71
+
72
+ def get_seeds():
73
+ """Returns the seeds of the seed manager.
74
+ Returns:
75
+ dict: The seeds of the seed manager.
76
+ """
77
+ return _SEED_MANAGER.seeds
78
+
79
+
80
+ def get_states(copy=False):
81
+ """Returns the seed states of the seed manager.
82
+ Returns:
83
+ dict: The seed states of the seed manager.
84
+ """
85
+ states = _SEED_MANAGER.seed_states
86
+ if copy:
87
+ new_states = dict()
88
+ for parallel_mode, state in states.items():
89
+ new_states[parallel_mode] = state.clone()
90
+ return new_states
91
+ else:
92
+ return _SEED_MANAGER.seed_states
93
+
94
+
95
+ def get_current_mode():
96
+ """Returns the current mode of the seed manager.
97
+ Returns:
98
+ :class:`ParallelMode`: The current mode of the seed manager.
99
+ """
100
+ return _SEED_MANAGER.current_mode
101
+
102
+
103
+ def add_seed(parallel_mode: ParallelMode, seed: int, overwrite: bool = False):
104
+ """Adds a seed to the seed manager for `parallel_mode`."""
105
+ _SEED_MANAGER.add_seed(parallel_mode, seed, overwrite)
106
+
107
+
108
+ def set_mode(parallel_mode: ParallelMode):
109
+ """Sets the current mode of the seed manager."""
110
+ _SEED_MANAGER.set_mode(parallel_mode)
111
+
112
+
113
+ def set_seed_states(parallel_mode: ParallelMode, state: Tensor):
114
+ """Sets the state of the seed manager for `parallel_mode`."""
115
+ _SEED_MANAGER.set_state(parallel_mode, state)
116
+
117
+
118
+ def sync_states():
119
+ current_mode = get_current_mode()
120
+ current_states = torch.cuda.get_rng_state()
121
+ set_seed_states(current_mode, current_states)
122
+
123
+
124
+ @contextmanager
125
+ def seed(parallel_mode: ParallelMode):
126
+ """A context for seed switch"""
127
+ current_mode = _SEED_MANAGER.current_mode
128
+ try:
129
+ yield _SEED_MANAGER.set_mode(parallel_mode)
130
+ finally:
131
+ _SEED_MANAGER.set_mode(current_mode)
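A hedged usage sketch for the seed utilities defined above; it assumes a CUDA device is available, since `SeedManager` stores per-mode `torch.cuda` RNG states, and the seed values are arbitrary.

```python
import torch

from internlm.core.context import ParallelMode
from internlm.core.context.random import add_seed, seed, set_mode

if torch.cuda.is_available():
    # register independent RNG streams for two parallel modes (arbitrary seeds)
    add_seed(ParallelMode.DATA, 1024)
    add_seed(ParallelMode.TENSOR, 2048)
    set_mode(ParallelMode.DATA)

    # randomness drawn under the tensor-parallel seed; the data-parallel RNG
    # state is restored automatically when the context exits
    with seed(ParallelMode.TENSOR):
        noise = torch.rand(4, device="cuda")
```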
InternLM/internlm/core/engine.py ADDED
@@ -0,0 +1,227 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ # adopted from https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/engine
5
+
6
+ from typing import List, Optional
7
+
8
+ import torch
9
+ from torch.nn import Module
10
+ from torch.nn.modules.loss import _Loss
11
+ from torch.optim.lr_scheduler import _LRScheduler
12
+
13
+ from internlm.core.gradient_handler import BaseGradientHandler
14
+ from internlm.solver.beta2_scheduler import Beta2Scheduler
15
+ from internlm.solver.optimizer.hybrid_zero_optim import BaseOptimizer
16
+ from internlm.utils.common import get_batch_size, move_to_device
17
+
18
+
19
+ class Engine:
20
+ """
21
+ The Engine class is responsible for managing the training and evaluation process of a neural network model.
22
+ It handles the forward and backward passes, parameter updates, gradient handling, and mode switching between
23
+ training and evaluation.
24
+
25
+ Args:
26
+ model (torch.nn.Module): The neural network model to be trained or evaluated.
27
+ optimizer (BaseOptimizer): The optimizer used for updating the parameters of the model.
28
+ lr_scheduler (torch.optim.lr_scheduler._LRScheduler, optional): The learning rate scheduler for the optimizer.
29
+ Default is None.
30
+ beta2_scheduler (internlm.solver.beta2_scheduler.Beta2Scheduler, optional): The beta2 scheduler for the
31
+ optimizer. Default is None.
32
+ criterion (torch.nn.modules.loss._Loss, optional): The loss function used for calculating the loss during
33
+ training. Default is None.
34
+ gradient_handlers (List[BaseGradientHandler], optional): A list of gradient handlers used in the backward pass.
35
+ Default is None.
36
+ clip_grad_norm (float, optional): The norm value for gradient clipping. Default is 0.0.
37
+
38
+ Examples:
39
+ >>> # define model, criterion, optimizer, lr_scheduler, train_dataloader for your training
40
+ >>> model = ...
41
+ >>> criterion = ...
42
+ >>> optimizer = ...
43
+ >>> train_dataloader = ...
44
+ >>> engine, _, _, _ = internlm.initialize_engine(model, optimizer, criterion)
45
+ >>> engine.train()
46
+ >>> for inputs, labels in train_dataloader:
47
+ >>> # set gradients to zero
48
+ >>> engine.zero_grad()
49
+ >>> # run forward pass
50
+ >>> outputs = engine(inputs)
51
+ >>> # compute loss value and run backward pass
52
+ >>> loss = engine.criterion(outputs, labels)
53
+ >>> engine.backward(loss)
54
+ >>> # update parameters
55
+ >>> engine.step()
56
+ """
57
+
58
+ def __init__(
59
+ self,
60
+ model: Module,
61
+ optimizer: BaseOptimizer,
62
+ lr_scheduler: Optional[_LRScheduler] = None,
63
+ beta2_scheduler: Optional[Beta2Scheduler] = None,
64
+ criterion: Optional[_Loss] = None,
65
+ gradient_handlers: Optional[List[BaseGradientHandler]] = None,
66
+ clip_grad_norm: float = 0.0,
67
+ ):
68
+ self._model = model
69
+ self._optimizer = optimizer
70
+ self._lr_scheduler = lr_scheduler
71
+ self._beta2_scheduler = beta2_scheduler
72
+ self._criterion = criterion
73
+ self._clip_grad_norm = clip_grad_norm
74
+
75
+ # state
76
+ self.training = True # default
77
+
78
+ # build gradient handler
79
+ self._gradient_handlers = gradient_handlers if gradient_handlers else []
80
+
81
+ @property
82
+ def model(self):
83
+ """Returns the model attached to the engine."""
84
+ return self._model
85
+
86
+ @property
87
+ def optimizer(self):
88
+ """Returns the optimizer attached to the engine."""
89
+ return self._optimizer
90
+
91
+ @property
92
+ def criterion(self):
93
+ """Returns the criterion (loss function) attached to the engine."""
94
+ return self._criterion
95
+
96
+ def _all_reduce_gradients(self):
97
+ """Handles all-reduce operations of gradients across different parallel groups."""
98
+ for handler in self._gradient_handlers:
99
+ handler.handle_gradient()
100
+
101
+ def zero_grad(self):
102
+ """Sets the gradient of all parameters in the model to zero."""
103
+ self.optimizer.zero_grad()
104
+
105
+ def step(self):
106
+ """
107
+ Executes the parameter update step. This includes all-reduce operations of gradients, gradient clipping,
108
+ and parameter update. If successful, it also steps the learning rate scheduler and beta2 scheduler
109
+ if they exist.
110
+
111
+ Returns:
112
+ success (bool): Whether the parameter update was successful.
113
+ grad_norm (float): The norm of the gradient after clipping.
114
+ """
115
+ self._all_reduce_gradients()
116
+ self.optimizer.clip_grad_norm(self.model, self._clip_grad_norm)
117
+
118
+ success, grad_norm = self.optimizer.step()
119
+
120
+ if success and self._lr_scheduler is not None:
121
+ self._lr_scheduler.step()
122
+
123
+ if success and self._beta2_scheduler is not None:
124
+ self._beta2_scheduler.step()
125
+
126
+ return success, grad_norm
127
+
128
+ def train(self):
129
+ """Sets the model to training mode."""
130
+ self.training = True
131
+ self._model.train()
132
+
133
+ def eval(self):
134
+ """Sets the model to evaluation mode."""
135
+ self.training = False
136
+ self._model.eval()
137
+
138
+ def backward(self, loss: torch.Tensor):
139
+ """
140
+ Starts the backward propagation given the loss value computed by a loss function.
141
+
142
+ Args:
143
+ loss (torch.Tensor): The loss value computed by a loss function.
144
+ """
145
+ return self.optimizer.backward(loss)
146
+
147
+ def backward_by_grad(self, tensor, grad):
148
+ """
149
+ Starts the backward propagation given the gradient of the output tensor.
150
+
151
+ Args:
152
+ tensor (torch.Tensor): The output tensor.
153
+ grad (torch.Tensor): The gradient passed back to the output tensor.
154
+ """
155
+ return self.optimizer.backward_by_grad(tensor, grad)
156
+
157
+ def __call__(self, *args, **kwargs):
158
+ """
159
+ Runs the forward step for the model.
160
+
161
+ Returns:
162
+ torch.Tensor: The output of the model.
163
+ """
164
+ return self.model(*args, **kwargs)
165
+
166
+ def load_batch(self, data_iter, to_gpu=True):
167
+ """
168
+ Loads a batch from the data iterator and returns it together with its batch size;
168
+ when `to_gpu` is True, the batch is moved to the same device as the model.
170
+
171
+ Args:
172
+ data_iter (Iterable): The data iterator from which to get a batch of data, obtained by calling
173
+ iter(dataloader).
174
+ to_gpu (bool, optional): Whether the data should be moved to the GPU. Default is True.
175
+
176
+ Returns:
177
+ Tuple (Any, int): A tuple of (batch_data, batch_size).
178
+ """
179
+ if data_iter is None:
180
+ raise RuntimeError("Dataloader is not defined.")
181
+ try:
182
+ batch_data = next(data_iter)
183
+ except TypeError:
184
+ batch_data = data_iter
185
+
186
+ if to_gpu:
187
+ batch_data = move_to_device(batch_data)
188
+ batch_size = get_batch_size(batch_data)
189
+
190
+ return batch_data, batch_size
191
+
192
+
193
+ class KDEngine(Engine):
194
+ def __init__(
195
+ self,
196
+ model: Module,
197
+ teacher: Module,
198
+ optimizer: BaseOptimizer,
199
+ lr_scheduler: Optional[_LRScheduler] = None,
200
+ beta2_scheduler: Optional[Beta2Scheduler] = None,
201
+ criterion: Optional[_Loss] = None,
202
+ kd_criterion: Optional[_Loss] = None,
203
+ gradient_handlers: Optional[List[BaseGradientHandler]] = None,
204
+ clip_grad_norm: float = 0.0,
205
+ ):
206
+ self._teacher = teacher
207
+ self._kd_criterion = kd_criterion
208
+
209
+ super().__init__(
210
+ model=model,
211
+ optimizer=optimizer,
212
+ lr_scheduler=lr_scheduler,
213
+ beta2_scheduler=beta2_scheduler,
214
+ criterion=criterion,
215
+ gradient_handlers=gradient_handlers,
216
+ clip_grad_norm=clip_grad_norm,
217
+ )
218
+
219
+ @property
220
+ def teacher(self):
221
+ """Returns the model attached to the engine."""
222
+ """Returns the teacher model attached to the engine."""
223
+
224
+ @property
225
+ def kd_criterion(self):
226
+ """Returns the knowledge-distillation criterion attached to the engine."""
227
+ return self._kd_criterion
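The KD criterion itself is defined elsewhere in the repo; as a hedged illustration, the sketch below shows one possible `kd_criterion` with the call signature that `KDNonPipelineScheduler` uses later, `kd_criterion(student_output, teacher_output, label)`. It is a plain temperature-scaled KL loss, not the repository's actual implementation.

```python
import torch
import torch.nn.functional as F
from torch import nn


class SoftKDLoss(nn.Module):
    """Illustrative distillation loss: KL between temperature-softened logits."""

    def __init__(self, temperature: float = 2.0):
        super().__init__()
        self.temperature = temperature

    def forward(self, student_logits, teacher_logits, label=None):  # label unused here
        t = self.temperature
        return F.kl_div(
            F.log_softmax(student_logits / t, dim=-1),
            F.softmax(teacher_logits / t, dim=-1),
            reduction="batchmean",
        ) * (t * t)


# toy check on random logits
print(SoftKDLoss()(torch.randn(4, 10), torch.randn(4, 10)).item())
```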
InternLM/internlm/core/gradient_handler.py ADDED
@@ -0,0 +1,76 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ from abc import ABC, abstractmethod
5
+ from collections import defaultdict
6
+
7
+ import torch
8
+ import torch.distributed as dist
9
+ from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
10
+
11
+ from internlm.core.context import global_context as gpc
12
+
13
+
14
+ class BaseGradientHandler(ABC):
15
+ """A basic helper class to handle all-reduce operations of gradients across different parallel groups
16
+ before optimization.
17
+
18
+ Args:
19
+ model (Module): Model where the gradients accumulate.
20
+ optimizer (Optimizer): Optimizer for updating the parameters.
21
+ """
22
+
23
+ def __init__(self, model, optimizer):
24
+ self._model = model
25
+ self._optimizer = optimizer
26
+
27
+ @abstractmethod
28
+ def handle_gradient(self):
29
+ """A method to accumulate gradients across different parallel groups. Users should
30
+ write their own functions or just use the functions in pre-defined subclasses.
31
+ """
32
+ pass
33
+
34
+
35
+ class PipelineSharedModuleGradientHandler(BaseGradientHandler):
36
+ """A helper class to handle all-reduce operations in sub parallel groups.
37
+ A all-reduce collective communication will be operated in
38
+ :func:`handle_gradient` among all sub pipeline parallel groups.
39
+ For better performance, it bucketizes the gradients of all parameters that are
40
+ the same type to improve the efficiency of communication.
41
+
42
+ Args:
43
+ model (Module): Model where the gradients accumulate.
44
+ optimizer (Optimizer): Optimizer for updating the parameters.
45
+ """
46
+
47
+ def handle_gradient(self):
48
+ """A method running a all-reduce operation in sub pipeline parallel groups."""
49
+ if gpc.pipeline_parallel_size > 1:
50
+ # bucketize and all-reduce
51
+ buckets = defaultdict(lambda: defaultdict(list))
52
+ # Pack the buckets.
53
+ for param in self._model.parameters():
54
+ group = getattr(param, "pipeline_shared_module_pg", None)
55
+ if (
56
+ param.requires_grad
57
+ and group is not None
58
+ and (
59
+ (hasattr(param, "colo_attr") and not param.colo_attr.saved_grad.is_null())
60
+ or param.grad is not None
61
+ )
62
+ ):
63
+ tp = param.data.type()
64
+ buckets[group][tp].append(param)
65
+
66
+ # For each bucket, all-reduce and copy all-reduced grads.
67
+ for group, group_buckets in buckets.items():
68
+ for tp, bucket in group_buckets.items():
69
+ grads = [
70
+ param.colo_attr.grad_payload if hasattr(param, "colo_attr") else param.grad.data
71
+ for param in bucket
72
+ ]
73
+ coalesced = _flatten_dense_tensors(grads).to(torch.cuda.current_device())
74
+ dist.all_reduce(coalesced, op=dist.ReduceOp.SUM, group=group)
75
+ for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
76
+ buf.copy_(synced)
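A single-process sketch of the coalesce / copy-back pattern used in `handle_gradient` above; the `dist.all_reduce` call is replaced by an in-place scale because it would require an initialized process group.

```python
import torch
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors

grads = [torch.ones(3), torch.full((2, 2), 2.0)]

coalesced = _flatten_dense_tensors(grads)  # one contiguous buffer for the bucket
coalesced.mul_(0.5)                        # stand-in for dist.all_reduce(...)

# copy the synced values back into the original gradient tensors
for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
    buf.copy_(synced)

print(grads[0])  # tensor([0.5000, 0.5000, 0.5000])
```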
InternLM/internlm/core/naive_amp.py ADDED
@@ -0,0 +1,136 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ # adopted from https://github.com/hpcaitech/ColossalAI/tree/main/colossalai/amp
5
+
6
+ from typing import Any
7
+
8
+ import torch
9
+ import torch.distributed as dist
10
+ from torch import Tensor, nn
11
+ from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
12
+ from torch.distributed import ReduceOp
13
+
14
+ from internlm.core.context import ParallelMode
15
+ from internlm.core.context.parallel_context import global_context as gpc
16
+
17
+
18
+ class NaiveAMPModel(nn.Module):
19
+ """
20
+ This is a wrapper class for a model that automatically casts the model, its inputs, and outputs into fp16.
21
+ It also provides options to cast the output back to fp32 and to synchronize buffers.
22
+
23
+ Args:
24
+ model (torch.nn.Module): The model to be wrapped and cast into fp16.
25
+ output_to_fp32 (bool, optional): If True, the output of this module is cast into fp32. Defaults to True.
26
+ parallel_mode (:class:`internlm.core.context.ParallelMode`): The parallel group mode used in this module.
27
+ Defaults to ``ParallelMode.DATA``.
28
+ sync_buffer (bool, optional): If True, the buffers are synchronized. Defaults to True.
29
+ """
30
+
31
+ def __init__(
32
+ self,
33
+ model: nn.Module,
34
+ output_to_fp32: bool = True,
35
+ parallel_mode: ParallelMode = ParallelMode.DATA,
36
+ sync_buffer: bool = True,
37
+ dtype=torch.float16,
38
+ ):
39
+ super().__init__()
40
+ self.model = model.to(dtype)
41
+ self._output_to_fp32 = output_to_fp32
42
+ self._sync_buf = sync_buffer
43
+ self.dtype = dtype
44
+
45
+ if gpc.is_initialized(parallel_mode) and gpc.get_world_size(parallel_mode) > 1:
46
+ self._process_group = gpc.get_group(parallel_mode)
47
+ self._world_size = gpc.get_world_size(parallel_mode)
48
+ else:
49
+ self._process_group = None
50
+ self._world_size = 1
51
+ self._sync_buf = False
52
+ self._first_eval_run = False
53
+
54
+ @property
55
+ def sync_buffer(self):
56
+ """Returns the current state of the buffer synchronization."""
57
+ return self._sync_buf
58
+
59
+ @sync_buffer.setter
60
+ def sync_buffer(self, state: bool):
61
+ """Sets the state of the buffer synchronization."""
62
+ self._sync_buf = state
63
+
64
+ def _convert_to_fp16(self, input_: Any):
65
+ """Converts the input to fp16 if it is a Tensor of dtype float32."""
66
+ if isinstance(input_, Tensor) and input_.dtype == torch.float32:
67
+ input_ = input_.to(self.dtype)
68
+ return input_
69
+
70
+ def _convert_to_fp32(self, input_: Any):
71
+ """Converts the input to fp32 if it is a Tensor of dtype float16."""
72
+ if isinstance(input_, Tensor) and input_.dtype == torch.float16:
73
+ input_ = input_.float()
74
+ return input_
75
+
76
+ def convert_to_fp32(self, out):
77
+ """Converts the output to fp32"""
78
+ if isinstance(out, Tensor):
79
+ out = self._convert_to_fp32(out)
80
+ elif isinstance(out, (tuple, list)):
81
+ out = [self._convert_to_fp32(val) for val in out]
82
+ elif isinstance(out, dict):
83
+ out = {key: self._convert_to_fp32(val) for key, val in out.items()}
84
+
85
+ return out
86
+
87
+ def _reduce_module_buffer(self):
88
+ """
89
+ All-reduces the buffers (e.g., running stats of batch normalization) across
90
+ data parallel ranks so that all the ranks will produce consistent results
91
+ when given the same input.
92
+ """
93
+ buf_list = []
94
+
95
+ # find valid buffers
96
+ for buf in self.model.buffers():
97
+ if buf is not None:
98
+ buf_list.append(buf)
99
+
100
+ # reduce buffers across data parallel ranks
101
+ if buf_list:
102
+ coalesced_buf = _flatten_dense_tensors(buf_list)
103
+ coalesced_buf.div_(self._world_size)
104
+ dist.all_reduce(coalesced_buf, op=ReduceOp.SUM, group=self._process_group)
105
+ unflattened_buf_list = _unflatten_dense_tensors(coalesced_buf, buf_list)
106
+ for old, new in zip(buf_list, unflattened_buf_list):
107
+ old.copy_(new)
108
+
109
+ def eval(self):
110
+ """Sets the model to evaluation mode. Buffers are only synchronized in the first eval iteration."""
111
+ self.model.eval()
112
+ self._first_eval_run = True
113
+
114
+ def forward(self, *args, **kwargs):
115
+ """
116
+ Performs a forward pass on the model. Buffers are synchronized before the forward pass.
117
+ The inputs are converted to fp16 and the outputs are optionally converted back to fp32.
118
+ """
119
+ if (self.training or self._first_eval_run) and self._sync_buf:
120
+ with torch.no_grad():
121
+ self._reduce_module_buffer()
122
+
123
+ if self._first_eval_run:
124
+ self._first_eval_run = False
125
+
126
+ if args:
127
+ args = [self._convert_to_fp16(arg) for arg in args]
128
+ if kwargs:
129
+ for k, v in kwargs.items():
130
+ kwargs[k] = self._convert_to_fp16(v)
131
+
132
+ out = self.model(*args, **kwargs)
133
+
134
+ if self._output_to_fp32:
135
+ out = self.convert_to_fp32(out)
136
+ return out
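A standalone sketch of the cast-in / cast-out behaviour that `NaiveAMPModel` wraps around a model (fp32 inputs cast to the working dtype, outputs cast back to fp32); it skips the buffer synchronization, which needs an initialized parallel context, and the dtype choice below is an assumption for the example.

```python
import torch
from torch import nn

use_cuda = torch.cuda.is_available()
dtype = torch.float16 if use_cuda else torch.bfloat16
device = "cuda" if use_cuda else "cpu"

model = nn.Linear(8, 4).to(device=device, dtype=dtype)

x = torch.randn(2, 8, device=device)  # fp32 input
out = model(x.to(dtype))              # forward runs in reduced precision
out = out.float()                     # cast back, as output_to_fp32=True does
print(out.dtype)                      # torch.float32
```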
InternLM/internlm/core/scheduler/__init__.py ADDED
@@ -0,0 +1,14 @@
1
+ from .base_scheduler import BaseScheduler, SchedulerHook, SchedulerMetricHook
2
+ from .no_pipeline_scheduler import NonPipelineScheduler, KDNonPipelineScheduler
3
+ from .pipeline_scheduler import InterleavedPipelineScheduler, PipelineScheduler, KDPipelineScheduler
4
+
5
+ __all__ = [
6
+ "BaseScheduler",
7
+ "NonPipelineScheduler",
8
+ "KDNonPipelineScheduler",
9
+ "InterleavedPipelineScheduler",
10
+ "PipelineScheduler",
11
+ "KDPipelineScheduler",
12
+ "SchedulerHook",
13
+ "SchedulerMetricHook",
14
+ ]
InternLM/internlm/core/scheduler/base_scheduler.py ADDED
@@ -0,0 +1,187 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ # adopted from https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/engine
5
+
6
+ from abc import ABC, abstractmethod
7
+ from typing import Any, Callable, Iterable, Optional
8
+
9
+ import torch
10
+
11
+ from internlm.core.engine import Engine
12
+ from internlm.utils.megatron_timers import megatron_timer as timer
13
+
14
+
15
+ class BaseScheduler(ABC):
16
+ """A basic helper class to control the process of training or evaluation.
17
+ It mainly composes of forward_backward_step for gradient backward and
18
+ optimizer_step for parameters update.
19
+ For the convenience to enable FP16, we aggregate all codes that contain the
20
+ control of FP16 in class schedule.
21
+
22
+ Args:
23
+ data_process_func (Callable, optional): The preprocessing function which receives a batch of data and arranges
24
+ them into data and label.
25
+ """
26
+
27
+ def __init__(self, data_process_func: Callable = None):
28
+ self.data_process_func = data_process_func
29
+
30
+ @abstractmethod
31
+ def pre_processing(self, engine: Engine):
32
+ """To perform actions before running the schedule.
33
+
34
+ Args:
35
+ engine (internlm.core.Engine): InternLM engine for training and inference.
36
+ """
37
+ pass
38
+
39
+ def _load_micro_batch(self, data, label, offset, micro_bsz):
40
+ assert isinstance(data, dict) and isinstance(label, torch.Tensor)
41
+ micro_batch_data = {k: v[offset : offset + micro_bsz] for k, v in data.items()}
42
+ micro_batch_label = label[offset : offset + micro_bsz]
43
+
44
+ return micro_batch_data, micro_batch_label
45
+
46
+ @abstractmethod
47
+ def forward_backward_step(
48
+ self,
49
+ engine: Engine,
50
+ data_iter: Iterable,
51
+ forward_only: bool,
52
+ return_loss: bool = True,
53
+ return_output_label: bool = True,
54
+ ):
55
+ """The process function over a batch of dataset for training or evaluation.
56
+
57
+ Args:
58
+ engine (internlm.core.Engine): InternLM engine for training and inference.
59
+ data_iter (Iterable): Data iterator from which get a batch of data, obtained by calling iter(dataloader).
60
+ forward_only (bool): If True, the process won't include backward.
61
+ return_loss (bool, optional): If False, the loss won't be returned.
62
+ return_output_label (bool, optional): If False, the output and label won't be returned.
63
+ """
64
+ pass
65
+
66
+ @staticmethod
67
+ def _call_engine(engine: Engine, inputs: Any):
68
+ """Calls the engine with the given inputs.
69
+
70
+ Args:
71
+ engine (internlm.core.Engine): InternLM engine for training and inference.
72
+ inputs (Any): The inputs to the engine, can be of type torch.Tensor, list, tuple, or dict.
73
+ """
74
+ if isinstance(inputs, torch.Tensor):
75
+ return engine(inputs)
76
+ elif isinstance(inputs, (list, tuple)):
77
+ return engine(*inputs)
78
+ elif isinstance(inputs, dict):
79
+ return engine(**inputs)
80
+ else:
81
+ raise TypeError(
82
+ f"Expected engine inputs to be of type torch.Tensor, list, tuple, or dict, but got {type(inputs)}"
83
+ )
84
+
85
+ @staticmethod
86
+ def _call_engine_criterion(criterion, outputs: Any, labels: Any):
87
+ """Calls the engine's criterion with the given outputs and labels.
88
+
89
+ Args:
90
+ engine (internlm.core.Engine): InternLM engine for training and inference.
91
+ outputs (Any): The outputs from the model, can be of type torch.Tensor, list, tuple, or dict.
92
+ labels (Any): The labels for the outputs, can be of type torch.Tensor, list, tuple, or dict.
93
+ """
94
+ assert isinstance(
95
+ outputs, (torch.Tensor, list, tuple, dict)
96
+ ), f"Expect output of model is (torch.Tensor, list, tuple), got {type(outputs)}"
97
+ if isinstance(outputs, torch.Tensor):
98
+ outputs = (outputs,)
99
+ if isinstance(labels, torch.Tensor):
100
+ labels = (labels,)
101
+
102
+ if isinstance(outputs, (tuple, list)) and isinstance(labels, (tuple, list)):
103
+ return criterion(*outputs, *labels)
104
+ elif isinstance(outputs, (tuple, list)) and isinstance(labels, dict):
105
+ return criterion(*outputs, **labels)
106
+ elif isinstance(outputs, dict) and isinstance(labels, dict):
107
+ return criterion(**outputs, **labels)
108
+ elif isinstance(outputs, dict) and isinstance(labels, (list, tuple)):
109
+ raise ValueError(f"Expected labels to be a dict when the model outputs are dict, but got {type(labels)}")
110
+ else:
111
+ raise TypeError(
112
+ f"Expected model outputs and labels to be of type torch.Tensor ' \
113
+ '(which is auto-converted to tuple), list, tuple, or dict, ' \
114
+ 'but got {type(outputs)} (model outputs) and {type(labels)} (labels)"
115
+ )
116
+
117
+
118
+ class SchedulerHook(ABC):
119
+ """
120
+ Scheduler Hook.
121
+ """
122
+
123
+ @abstractmethod
124
+ def before_forward(self, scheduler, inputs) -> None:
125
+ """Actions before forward"""
126
+
127
+ @abstractmethod
128
+ def after_forward(self, scheduler, outputs) -> None:
129
+ """Actions after forward"""
130
+
131
+ @abstractmethod
132
+ def before_criterion(self, scheduler, outputs, label) -> None:
133
+ """Actions before criterion"""
134
+
135
+ @abstractmethod
136
+ def after_criterion(self, scheduler, loss) -> None:
137
+ """Actions after criterion"""
138
+
139
+ @abstractmethod
140
+ def before_backward(self, scheduler, outputs, outputs_grad) -> None:
141
+ """Actions before backward"""
142
+
143
+ @abstractmethod
144
+ def after_backward(self, scheduler, inputs_grad) -> None:
145
+ """Actions after backward"""
146
+
147
+ @abstractmethod
148
+ def post_helper_func(self, scheduler, outputs, label) -> None:
149
+ """A post helper function"""
150
+
151
+
152
+ class SchedulerMetricHook(SchedulerHook):
153
+ """
154
+ Scheduler Metric Hook.
155
+ """
156
+
157
+ def __init__(self, metric: Optional[Callable] = None, skip: bool = False) -> None:
158
+ self._post_func = metric
159
+ self._skip = skip
160
+
161
+ def before_forward(self, scheduler, inputs) -> None:
162
+ if not self._skip:
163
+ timer("fwd").start()
164
+
165
+ def after_forward(self, scheduler, outputs) -> None:
166
+ if not self._skip:
167
+ timer("fwd").stop()
168
+
169
+ def before_criterion(self, scheduler, outputs, label) -> None:
170
+ if not self._skip:
171
+ timer("cal_loss").start()
172
+
173
+ def after_criterion(self, scheduler, loss) -> None:
174
+ if not self._skip:
175
+ timer("cal_loss").stop()
176
+
177
+ def before_backward(self, scheduler, outputs, outputs_grad) -> None:
178
+ if not self._skip:
179
+ timer("bwd").start()
180
+
181
+ def after_backward(self, scheduler, inputs_grad) -> None:
182
+ if not self._skip:
183
+ timer("bwd").stop()
184
+
185
+ def post_helper_func(self, scheduler, outputs, label) -> None:
186
+ if self._post_func is not None:
187
+ self._post_func(outputs, label)
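As a hedged example of the hook interface above, the sketch below implements a custom `SchedulerHook` that simply counts forward and backward micro-steps; the schedulers call these methods through `_call_hooks` at the corresponding points.

```python
from internlm.core.scheduler import SchedulerHook


class CountingHook(SchedulerHook):
    """Toy hook: counts how many forward/backward micro-steps were run."""

    def __init__(self):
        self.num_forward = 0
        self.num_backward = 0

    def before_forward(self, scheduler, inputs) -> None:
        pass

    def after_forward(self, scheduler, outputs) -> None:
        self.num_forward += 1

    def before_criterion(self, scheduler, outputs, label) -> None:
        pass

    def after_criterion(self, scheduler, loss) -> None:
        pass

    def before_backward(self, scheduler, outputs, outputs_grad) -> None:
        pass

    def after_backward(self, scheduler, inputs_grad) -> None:
        self.num_backward += 1

    def post_helper_func(self, scheduler, outputs, label) -> None:
        pass
```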
InternLM/internlm/core/scheduler/no_pipeline_scheduler.py ADDED
@@ -0,0 +1,266 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ # adopted from https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/engine
5
+
6
+ from typing import Any, Callable, Iterable, List, Optional
7
+
8
+ import torch
9
+
10
+ from internlm.core.context import global_context as gpc
11
+ from internlm.core.engine import Engine, KDEngine
12
+ from internlm.utils.common import conditional_context
13
+ from internlm.utils.timeout import llm_timeout
14
+ from collections import defaultdict
15
+ from .base_scheduler import BaseScheduler, SchedulerHook
16
+
17
+
18
+ class NonPipelineScheduler(BaseScheduler):
19
+ """A helper schedule class for no pipeline parallelism running environment.
20
+ During one process, it loads a batch of dataset and feeds it to the model.
21
+ After getting the output and calculating the loss, it will use :meth:`step`
22
+ to update the parameters if it is in training mode.
23
+
24
+ Args:
25
+ data_process_func (Callable, optional): The preprocessing function which receives a batch of data
26
+ and returns a tuple in the form of (data, label), and it will be executed in load_batch.
27
+ gradient_accumulation_size (int, optional): the number of gradient accumulation steps, 1 to disable
28
+ gradient accumulation.
29
+
30
+ Examples:
31
+ >>> # this shows an example of a customized data_process_func
32
+ >>> def data_process_func(dataloader_output):
33
+ >>> item1, item2, item3 = dataloader_output
34
+ >>> data = (item1, item2)
35
+ >>> label = item3
36
+ >>> return data, label
37
+ """
38
+
39
+ def __init__(
40
+ self,
41
+ data_process_func: Callable = None,
42
+ gradient_accumulation_size: int = 1,
43
+ scheduler_hooks: Optional[List[SchedulerHook]] = None,
44
+ ):
45
+ self._grad_accum_size = gradient_accumulation_size
46
+ self._grad_accum_offset = 0
47
+
48
+ self._hooks = scheduler_hooks
49
+
50
+ super().__init__(data_process_func)
51
+
52
+ def pre_processing(self, engine: Engine):
53
+ """Performs actions before running the schedule.
54
+
55
+ Args:
56
+ engine (internlm.core.Engine): InternLM engine for training and inference.
57
+ """
58
+ pass
59
+
60
+ def _call_hooks(self, func_name: str, *args, **kwargs) -> None:
61
+ for hook in self._hooks:
62
+ getattr(hook, func_name)(self, *args, **kwargs)
63
+
64
+ def _load_accum_batch(self, data: Any, label: Any):
65
+ """Loads a batch of data and label for gradient accumulation.
66
+
67
+ Args:
68
+ data (Any): The data to be loaded.
69
+ label (Any): The label to be loaded.
70
+ """
71
+
72
+ _data, _label = self._load_micro_batch(
73
+ data=data, label=label, offset=self._grad_accum_offset, micro_bsz=self._grad_accum_batch_size
74
+ )
75
+ self._grad_accum_offset += self._grad_accum_batch_size
76
+
77
+ if self.data_process_func:
78
+ _data["input_ids"] = self.data_process_func(_data["input_ids"], _data["cu_seqlens"])
79
+ _label = self.data_process_func(_label, _data["cu_seqlens"])
80
+ _data.pop("cu_seqlens")
81
+ _data.pop("indexes")
82
+
83
+ return _data, _label
84
+
85
+ def _train_one_batch(
86
+ self,
87
+ data: Any,
88
+ label: Any,
89
+ engine: Engine,
90
+ forward_only: bool = False,
91
+ return_loss: bool = True,
92
+ scale_loss: int = 1,
93
+ ):
94
+ """Trains one batch of data.
95
+
96
+ Args:
97
+ data (Any): The data to be trained.
98
+ label (Any): The label for the data.
99
+ engine (internlm.core.Engine): InternLM engine for training and inference.
100
+ forward_only (bool, optional): If True, the model is run for the forward pass, else back propagation will
101
+ be executed.
102
+ return_loss (bool, optional): Loss will be returned if True.
103
+ scale_loss (int, optional): The scale factor for the loss.
104
+ """
105
+
106
+ # forward
107
+ with conditional_context(torch.no_grad(), enable=forward_only):
108
+ self._call_hooks("before_forward", data)
109
+ output = self._call_engine(engine, data)
110
+ self._call_hooks("after_forward", output)
111
+
112
+ self._call_hooks("post_helper_func", output, label)
113
+
114
+ if return_loss:
115
+ self._call_hooks("before_criterion", output, label)
116
+ loss = self._call_engine_criterion(engine.criterion, output, label)
117
+ self._call_hooks("after_criterion", loss)
118
+ loss /= scale_loss
119
+
120
+ # backward
121
+ if not forward_only:
122
+ self._call_hooks("before_backward", None, None)
123
+ engine.backward(loss)
124
+ self._call_hooks("after_backward", None)
125
+
126
+ if not return_loss:
127
+ loss = None
128
+
129
+ return output, dict(loss=loss)
130
+
131
+ @llm_timeout(func_name="nopp_forward_backward_step")
132
+ def forward_backward_step(
133
+ self,
134
+ engine: Engine,
135
+ data_iter: Iterable,
136
+ forward_only: bool = False,
137
+ return_loss: bool = True,
138
+ return_output_label: bool = True,
139
+ ):
140
+ """The process function that loads a batch of data and feeds it to the model.
142
+ The returned labels and loss will be None if :attr:`return_loss` is False.
142
+
143
+ Args:
144
+ engine (internlm.core.Engine): InternLM engine for training and inference.
145
+ data_iter (Iterable): Dataloader in the form of an iterator, obtained by calling iter(dataloader).
146
+ forward_only (bool, optional):
147
+ If True, the model is run for the forward pass, else back propagation will be executed.
148
+ return_loss (bool, optional): Loss will be returned if True.
149
+ return_output_label (bool, optional): Output and label will be returned if True.
150
+
151
+ Returns:
152
+ Tuple[:class:`torch.Tensor`]: A tuple of (output, label, loss), loss and label could be None.
153
+ """
154
+ assert (
155
+ forward_only or return_loss
156
+ ), "The argument 'return_loss' has to be True when 'forward_only' is False, but got False."
157
+
158
+ batch_data, batch_size = engine.load_batch(data_iter)
159
+
160
+ assert (
161
+ batch_size % self._grad_accum_size == 0
162
+ ), f"batch_size:{batch_size} must be an integer multiple of gradient accumulation steps:{self._grad_accum_size}"
163
+ self._grad_accum_batch_size = batch_size // self._grad_accum_size
164
+
165
+ data, label = batch_data
166
+
167
+ loss = defaultdict(int) if return_loss else None
168
+ outputs = []
169
+ labels = []
170
+
171
+ # reset accumulation microbatch offset
172
+ self._grad_accum_offset = 0
173
+
174
+ for _current_accum_step in range(self._grad_accum_size):
175
+ if _current_accum_step == self._grad_accum_size - 1:
176
+ engine.optimizer.skip_grad_reduce = False
177
+ else:
178
+ engine.optimizer.skip_grad_reduce = True
179
+
180
+ _data, _label = self._load_accum_batch(data, label)
181
+
182
+ _output, _loss = self._train_one_batch(
183
+ _data, _label, engine, forward_only, return_loss, self._grad_accum_size
184
+ )
185
+
186
+ if return_loss:
187
+ for k in _loss:
188
+ loss[k] += _loss[k]
189
+ if return_output_label:
190
+ outputs.append(_output)
191
+ labels.append(_label)
192
+
193
+ if not return_output_label:
194
+ outputs, labels = None, None
195
+
196
+ return outputs, labels, loss
197
+
198
+
199
+ class KDNonPipelineScheduler(NonPipelineScheduler):
200
+
201
+ def __init__(
202
+ self,
203
+ data_process_func: Callable = None,
204
+ gradient_accumulation_size: int = 1,
205
+ scheduler_hooks: Optional[List[SchedulerHook]] = None,
206
+ ):
207
+ super().__init__(
208
+ data_process_func=data_process_func,
209
+ gradient_accumulation_size=gradient_accumulation_size,
210
+ scheduler_hooks=scheduler_hooks,
211
+ )
212
+
213
+ def _train_one_batch(
214
+ self,
215
+ data: Any,
216
+ label: Any,
217
+ engine: KDEngine,
218
+ forward_only: bool = False,
219
+ return_loss: bool = True,
220
+ scale_loss: int = 1,
221
+ ):
222
+ """Trains one batch of data.
223
+
224
+ Args:
225
+ data (Any): The data to be trained.
226
+ label (Any): The label for the data.
227
+ engine (internlm.core.Engine): InternLM engine for training and inference.
228
+ forward_only (bool, optional): If True, the model is run for the forward pass, else back propagation will
229
+ be executed.
230
+ return_loss (bool, optional): Loss will be returned if True.
231
+ scale_loss (int, optional): The scale factor for the loss.
232
+ """
233
+
234
+ # forward
235
+ with conditional_context(torch.no_grad(), enable=forward_only):
236
+ self._call_hooks("before_forward", data)
237
+ output = self._call_engine(engine, data)
238
+ self._call_hooks("after_forward", output)
239
+
240
+ self._call_hooks("post_helper_func", output, label)
241
+
242
+ if return_loss:
243
+ self._call_hooks("before_criterion", output, label)
244
+ loss_gt = gpc.config.kd_config['gt_weight'] * self._call_engine_criterion(engine.criterion, output, label)
245
+
246
+ with torch.no_grad():
247
+ engine.teacher.eval()
248
+ output_t = self._call_engine(engine.teacher, data)
249
+
250
+ loss_kd = gpc.config.kd_config['kd_weight'] * self._call_engine_criterion(engine.kd_criterion, output, (output_t, label))
251
+
252
+ self._call_hooks("after_criterion", loss_gt + loss_kd)
253
+ loss_gt /= scale_loss
254
+ loss_kd /= scale_loss
255
+
256
+ # backward
257
+ if not forward_only:
258
+ self._call_hooks("before_backward", None, None)
259
+ engine.backward(loss_gt+loss_kd)
260
+ self._call_hooks("after_backward", None)
261
+
262
+ if not return_loss:
263
+ loss_gt = None
264
+ loss_kd = None
265
+
266
+ return output, dict(loss_gt=loss_gt, loss_kd=loss_kd)
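A small arithmetic sketch of how `KDNonPipelineScheduler._train_one_batch` combines the two losses each micro-step: the weights come from `gpc.config.kd_config` (the values below are illustrative assumptions) and each micro-loss is divided by the gradient-accumulation size before `engine.backward`.

```python
import torch

gt_weight, kd_weight = 1.0, 0.5   # assumed kd_config["gt_weight"] / ["kd_weight"] values
grad_accum_size = 4

ce_loss = torch.tensor(2.0)       # stand-in for engine.criterion(output, label)
kd_loss = torch.tensor(0.8)       # stand-in for engine.kd_criterion(output, output_t, label)

loss_gt = gt_weight * ce_loss / grad_accum_size
loss_kd = kd_weight * kd_loss / grad_accum_size
total = loss_gt + loss_kd         # what engine.backward receives for this micro-step
print(total.item())               # 0.6 (up to float rounding)
```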
InternLM/internlm/core/scheduler/pipeline_scheduler.py ADDED
@@ -0,0 +1,1363 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ # adopted from https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/engine
5
+
6
+ from contextlib import contextmanager
7
+ from typing import Callable, List, Optional, Tuple, Union
8
+
9
+ import torch.cuda
10
+
11
+ import internlm.core.communication as comm
12
+ from internlm.core.context import ParallelMode
13
+ from internlm.core.context import global_context as gpc
14
+ from internlm.core.engine import Engine
15
+ from internlm.core.naive_amp import NaiveAMPModel
16
+ from internlm.utils.common import get_current_device, move_to_device
17
+ from internlm.utils.logger import get_logger
18
+ from internlm.utils.timeout import llm_timeout
19
+ from collections import defaultdict
20
+ from .base_scheduler import BaseScheduler, SchedulerHook
21
+
22
+ logger = get_logger(__file__)
23
+
24
+
25
+ def get_tensor_shape():
26
+ if hasattr(gpc.config, "TENSOR_SHAPE"):
27
+ return gpc.config.TENSOR_SHAPE
28
+
29
+ if not gpc.is_initialized(ParallelMode.PIPELINE):
30
+ return None
31
+
32
+ if hasattr(gpc.config, "SEQ_LEN") and hasattr(gpc.config.data, "micro_bsz") and hasattr(gpc.config, "HIDDEN_SIZE"):
33
+ if gpc.config.model.use_flash_attn:
34
+ if gpc.config.parallel.sequence_parallel:
35
+ sequence_world_size = gpc.get_world_size(ParallelMode.TENSOR)
36
+ tensor_shape = (
37
+ gpc.config.SEQ_LEN * gpc.config.data["micro_bsz"] // sequence_world_size,
38
+ gpc.config.HIDDEN_SIZE,
39
+ )
40
+ else:
41
+ tensor_shape = (
42
+ gpc.config.SEQ_LEN * gpc.config.data["micro_bsz"],
43
+ gpc.config.HIDDEN_SIZE,
44
+ )
45
+ else:
46
+ tensor_shape = (
47
+ gpc.config.data["micro_bsz"],
48
+ gpc.config.SEQ_LEN,
49
+ gpc.config.HIDDEN_SIZE,
50
+ )
51
+ return tensor_shape
52
+ else:
53
+ return None
54
+
55
+
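For reference, the flash-attention + sequence-parallel branch of `get_tensor_shape` reduces to the arithmetic below; the config values are assumptions chosen only to illustrate the resulting shape.

```python
# Assumed config values for illustration only.
SEQ_LEN, micro_bsz, HIDDEN_SIZE = 2048, 2, 1024
sequence_world_size = 4  # stand-in for gpc.get_world_size(ParallelMode.TENSOR)

tensor_shape = (SEQ_LEN * micro_bsz // sequence_world_size, HIDDEN_SIZE)
print(tensor_shape)  # (1024, 1024)
```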
56
+ def pack_return_tensors(return_tensors):
57
+ output, label = tuple(zip(*return_tensors))
58
+ if isinstance(output[0], torch.Tensor):
59
+ output = torch.cat(output, dim=0)
60
+ elif isinstance(output[0], (list, tuple)):
61
+ output = tuple(torch.cat(tensors, dim=0) for tensors in zip(*output))
62
+ else:
63
+ raise TypeError("Output of model must be tensor or list/tuple of tensors")
64
+ if isinstance(label[0], torch.Tensor):
65
+ label = torch.cat(label, dim=0)
66
+ else:
67
+ merged_label = {k: [] for k in label[0].keys()}
68
+ for d in label:
69
+ for k, v in d.items():
70
+ merged_label[k].append(v)
71
+ label = {k: torch.cat(v, dim=0) for k, v in merged_label.items()}
72
+ return output, label
73
+
74
+
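A toy check of `pack_return_tensors` defined just above: it concatenates per-microbatch `(output, label)` pairs back into one batch along dim 0. It assumes the InternLM package and its dependencies are importable so the function can be imported.

```python
import torch

from internlm.core.scheduler.pipeline_scheduler import pack_return_tensors

micro1 = (torch.zeros(2, 4), {"labels": torch.zeros(2, dtype=torch.long)})
micro2 = (torch.ones(2, 4), {"labels": torch.ones(2, dtype=torch.long)})

output, label = pack_return_tensors([micro1, micro2])
print(output.shape, label["labels"].shape)  # torch.Size([4, 4]) torch.Size([4])
```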
75
+ @contextmanager
76
+ def switch_virtual_pipeline_parallel_rank(rank):
77
+ prev_rank = gpc.virtual_pipeline_parallel_rank
78
+ try:
79
+ gpc.set_virtual_pipeline_parallel_rank(rank)
80
+ yield
81
+ finally:
82
+ gpc.set_virtual_pipeline_parallel_rank(prev_rank)
83
+
84
+
85
+ @contextmanager
86
+ def switch_optimizer_grad_sync_skip_mode(optimizer, skip: bool = True):
87
+ prev_mode = optimizer.skip_grad_reduce
88
+ try:
89
+ optimizer.skip_grad_reduce = skip
90
+ yield
91
+ finally:
92
+ optimizer.skip_grad_reduce = prev_mode
93
+
94
+
95
+ class PipelineScheduler(BaseScheduler):
96
+ """
97
+ A helper schedule class for pipeline parallelism running environment.
98
+ It uses the non-interleaved 1F1B strategy. Other properties are similar to
99
+ :class:`NonPipelineScheduler`.
100
+
101
+ Args:
102
+ num_microbatches (int): The number of microbatches.
103
+ dtype (torch.dtype): Type of data. torch.float by default.
104
+ data_process_func (Callable, optional):
105
+ The post processing function which receives a micro batch of data, and it will be executed
106
+ in `load_micro_batch`.
107
+ tensor_shape (torch.Size, optional): Specified shape in pipeline communication.
108
+ scatter_gather_tensors (bool, optional):
109
+ If set to `True`, communication will be reduced over pipeline when using 1D tensor parallelization.
110
+ scheduler_hooks (Optional[List[SchedulerHook]], optional): List of scheduler hooks.
111
+ """
112
+
113
+ def __init__(
114
+ self,
115
+ num_microbatches: int,
116
+ dtype: torch.dtype = torch.float,
117
+ data_process_func: Callable = None,
118
+ tensor_shape: Union[torch.Size, List[int], Tuple[int]] = None,
119
+ scatter_gather_tensors: bool = False,
120
+ scheduler_hooks: Optional[List[SchedulerHook]] = None,
121
+ ):
122
+ assert num_microbatches > 0, f"expected num_microbatches to be larger than 0, but got {num_microbatches}"
123
+
124
+ assert not isinstance(
125
+ tensor_shape, int
126
+ ), "tensor_shape type should be one of Union[torch.Size, List[int], Tuple[int]]."
127
+
128
+ super().__init__(data_process_func=data_process_func)
129
+
130
+ self.num_microbatches = num_microbatches
131
+ self.dtype = dtype
132
+ self._hooks = scheduler_hooks
133
+
134
+ self._tensor_shape = (
135
+ tensor_shape if tensor_shape is None or isinstance(tensor_shape, torch.Size) else torch.Size(tensor_shape)
136
+ )
137
+
138
+ self.scatter_gather_tensors = (
139
+ scatter_gather_tensors
140
+ and gpc.is_initialized(ParallelMode.TENSOR)
141
+ and gpc.get_world_size(ParallelMode.TENSOR) > 1
142
+ )
143
+
144
+ if gpc.config.parallel.sequence_parallel:
145
+ self.scatter_gather_tensors = False
146
+
147
+ # cache for the batch data
148
+ self.batch_data = None
149
+
150
+ @property
151
+ def tensor_shape(self) -> torch.Size:
152
+ return self._tensor_shape
153
+
154
+ @tensor_shape.setter
155
+ def tensor_shape(self, tensor_shape: torch.Size):
156
+ self._tensor_shape = tensor_shape
157
+
158
+ def pre_processing(self, engine):
159
+ types = set()
160
+
161
+ for param in engine.model.parameters():
162
+ types.add(param.dtype)
163
+ assert len(types) == 1, f"Mixed types of parameter detected, {types}"
164
+
165
+ self.dtype = types.pop()
166
+
167
+ @staticmethod
168
+ def _call_engine(engine, data): # pylint: disable=W0237
169
+ if data is None:
170
+ return None
171
+
172
+ if isinstance(data, torch.Tensor):
173
+ return engine(data)
174
+ elif isinstance(data, (list, tuple)):
175
+ return engine(*data)
176
+ elif isinstance(data, dict):
177
+ stage_output = data.pop("stage_output", None)
178
+
179
+ if stage_output is None:
180
+ return engine(**data)
181
+ elif isinstance(stage_output, torch.Tensor):
182
+ return engine(stage_output, **data)
183
+ elif isinstance(stage_output, (tuple, list)):
184
+ return engine(*stage_output, **data)
185
+ else:
186
+ raise TypeError(
187
+ f"Expected stage_output to be of type torch.Tensor, list, or tuple, "
188
+ f"but got {type(stage_output)}"
189
+ )
190
+ else:
191
+ raise TypeError(f"Expected data to be of type torch.Tensor, list, tuple, or dict, but got {type(data)}")
192
+
193
+ def load_batch(self, engine, data_iter):
194
+ # Pipeline schedule just puts data in memory
195
+ batch_data, batch_size = engine.load_batch(data_iter, to_gpu=False)
196
+ assert batch_size % self.num_microbatches == 0, "Batch size should be divisible by the number of microbatches"
197
+
198
+ self.microbatch_offset = 0
199
+ self.batch_size = batch_size
200
+ self.batch_data, self.batch_label = batch_data
201
+ self.microbatch_size = self.batch_size // self.num_microbatches
202
+
203
+ def load_micro_batch(self):
204
+ micro_batch_data, micro_batch_label = self._load_micro_batch(
205
+ data=self.batch_data, label=self.batch_label, offset=self.microbatch_offset, micro_bsz=self.microbatch_size
206
+ )
207
+ if self.data_process_func:
208
+ micro_batch_data["input_ids"] = self.data_process_func(
209
+ micro_batch_data["input_ids"], micro_batch_data["cu_seqlens"]
210
+ )
211
+ micro_batch_label = self.data_process_func(micro_batch_label, micro_batch_data["cu_seqlens"])
212
+
213
+ micro_batch_data.pop("cu_seqlens")
214
+ micro_batch_data.pop("indexes")
215
+
216
+ micro_batch_data["label"] = micro_batch_label
217
+ self.microbatch_offset += self.microbatch_size
218
+
219
+ return move_to_device(micro_batch_data)
220
+
221
+ def _get_data_label_for_current_step(self, stage_output, micro_batch_data):
222
+ if isinstance(micro_batch_data, (tuple, list)):
223
+ if gpc.is_first_rank(ParallelMode.PIPELINE):
224
+ # for the first stage, we use the data from the
225
+ # dataloader output by default
226
+ data, label = micro_batch_data
227
+ else:
228
+ # for non-first stage, we use the output passed
229
+ # by the previous stage as the model input
230
+ data = stage_output
231
+ _, label = micro_batch_data
232
+ elif isinstance(micro_batch_data, dict):
233
+ label = micro_batch_data.pop("label", None)
234
+ data = {"stage_output": stage_output, **micro_batch_data}
235
+
236
+ return data, label
237
+
238
+ def _call_hooks(self, func_name: str, *args, **kwargs) -> None:
239
+ for hook in self._hooks:
240
+ getattr(hook, func_name)(self, *args, **kwargs)
241
+
242
+ def _get_current_microbatch_id(self, step_id: int) -> int:
243
+ """
244
+ Get the current microbatch ID based on the step ID.
245
+ In 1f1b scheduler, the microbatch ID is the same as the step ID,
246
+ but it is important to note that the step ID is calculated separately
247
+ for forward and backward passes.
248
+ """
249
+ return step_id
250
+
251
+ def _forward_step(self, engine, input_obj, return_tensors, return_output_label=True, accum_loss=None):
252
+ """
253
+ Forward step for passed-in model. If it is the first stage, the input tensor
254
+ is obtained from data_iterator, otherwise the passed-in input_obj is used.
255
+ Returns output tensor. This is a helper function and can be ignored by users.
256
+
257
+ Args:
258
+ engine (colossalai.engine.Engine): Colossalai engine for training and inference.
259
+ input_obj (Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]): Input tensor for this pipeline stage.
260
+ return_tensors (List[:class:`torch.Tensor`]): A list of tensors to return.
261
+ return_output_label (bool, optional): Whether to return output labels.
262
+ accum_loss (optional): Where the accumulated loss is stored.
263
+ Returns:
264
+ Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]: output or the loss value of the current
265
+ pipeline stage.
266
+ """
267
+ micro_batch_data = self.load_micro_batch()
268
+ data, label = self._get_data_label_for_current_step(input_obj, micro_batch_data)
269
+
270
+ self._call_hooks("before_forward", data)
271
+ output_obj = self._call_engine(engine.model, data)
272
+ self._call_hooks("after_forward", output_obj)
273
+
274
+ if gpc.is_last_rank(ParallelMode.PIPELINE):
275
+ self._call_hooks("post_helper_func", output_obj, label)
276
+ if return_output_label:
277
+ return_tensors.append((output_obj, label))
278
+ if accum_loss is not None:
279
+ self._call_hooks("before_criterion", output_obj, label)
280
+ loss = self._call_engine_criterion(engine.criterion, output_obj, label)
281
+ self._call_hooks("after_criterion", loss)
282
+
283
+ loss_reduced = loss / self.num_microbatches
284
+ accum_loss['loss'].add_(loss_reduced.detach())
285
+ output_obj = loss_reduced
286
+
287
+ return output_obj
288
+
289
+ def _backward_step(self, engine, step_id, input_obj, output_obj, output_obj_grad):
290
+ """
291
+ Backward step through the passed-in output tensor. If it is the last stage, the
292
+ output_obj_grad is None, otherwise it is the gradients with respect to stage's output tensor.
293
+ Returns the gradients with respect to the input tensor (None if first stage).
294
+ This is a helper function and can be ignored by users.
295
+
296
+ Args:
297
+ engine (colossalai.engine.Engine): Colossalai engine for training and inference.
298
+ step_id (int): The ID of the current step.
299
+ input_obj (Union[torch.Tensor, List[torch.Tensor]]): Input tensor for this stage.
300
+ output_obj (Union[torch.Tensor, List[torch.Tensor]]): Output tensor for this stage.
301
+ output_obj_grad (Union[torch.Tensor, List[torch.Tensor]]): Gradient of output tensor for this stage.
302
+
303
+ Returns:
304
+ Union[torch.Tensor, List[torch.Tensor]]: Gradient of input tensor.
305
+ """
306
+
307
+ # Retain the grad on the input_obj.
308
+ if input_obj is not None:
309
+ if isinstance(input_obj, torch.Tensor):
310
+ input_obj.retain_grad()
311
+ else:
312
+ for in_tensor in input_obj:
313
+ if in_tensor is not None:
314
+ in_tensor.retain_grad()
315
+
316
+ # Backward pass.
317
+
318
+ # Only the last microbatch synchronizes gradients.
319
+ skip_grad_sync = self._get_current_microbatch_id(step_id) != self.num_microbatches - 1
320
+
321
+ self._call_hooks("before_backward", output_obj, output_obj_grad)
322
+ with switch_optimizer_grad_sync_skip_mode(engine.optimizer, skip_grad_sync):
323
+ if output_obj_grad is None:
324
+ engine.backward(output_obj)
325
+ else:
326
+ engine.backward_by_grad(output_obj, output_obj_grad)
327
+
328
+ # Collect the grad of the input_obj.
329
+ input_obj_grad = None
330
+ if input_obj is not None:
331
+ if isinstance(input_obj, torch.Tensor):
332
+ input_obj_grad = input_obj.grad
333
+ else:
334
+ input_obj_grad = []
335
+ for in_tensor in input_obj:
336
+ input_obj_grad.append(in_tensor.grad)
337
+ self._call_hooks("after_backward", input_obj_grad)
338
+
339
+ return input_obj_grad
340
+
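The autograd mechanics behind `_backward_step` can be reproduced in isolation. The sketch below assumes that `engine.backward_by_grad` ultimately calls `torch.autograd.backward` with an explicit `grad_tensors` argument (an assumption for illustration; the real engine code lives elsewhere in the repo):

import torch

# Non-leaf stand-in for an activation received from the previous stage.
input_obj = torch.randn(2, 3, requires_grad=True) * 1.0
input_obj.retain_grad()                      # keep .grad on a non-leaf tensor, as in _backward_step
output_obj = input_obj * 2.0                 # stand-in for this stage's forward output

# On an intermediate stage, the next stage sends back d(loss)/d(output_obj);
# on the last stage, output_obj would be the scalar loss and this grad would be None.
output_obj_grad = torch.ones_like(output_obj)
torch.autograd.backward(output_obj, grad_tensors=output_obj_grad)

input_obj_grad = input_obj.grad              # the gradient that gets sent to the previous stage
print(input_obj_grad)                        # all 2.0 here, since d(output)/d(input) = 2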
341
+ def _forward_only_step(self, engine, return_loss=True, return_output_label=True):
342
+ """
343
+ This function performs forward only computation process. The scheduling of microbatches is similar to the
344
+ warmup phase, where each microbatch first receives the forward input from the previous stage, then performs
345
+ the forward computation, and finally passes the forward computation output to the next stage. There are two
346
+ special cases to note:
347
+ 1. The first stage of the pipeline does not need to receive forward input; its input comes from the dataloader.
348
+ 2. The last stage of the pipeline does not need to send forward output; its output is returned to the user code
349
+ for processing.
350
+
351
+ Args:
352
+ engine (colossalai.engine.Engine): internlm engine for training and inference.
353
+ return_loss (bool, optional): Whether to return the accumulated loss.
354
+ return_output_label (bool, optional): Whether to return outputs and labels.
355
+
356
+ Returns:
357
+ Tuple[Union[torch.Tensor, None], Union[torch.Tensor, None], Union[torch.Tensor, None]]:
358
+ output, label, and accumulated loss.
359
+ """
360
+
361
+ # Input, output tensors only need to be saved when doing backward passes
362
+ return_tensors = []
363
+ accum_loss_init_func = lambda: torch.zeros(1, device=get_current_device())
364
+ accum_loss = defaultdict(accum_loss_init_func) if return_loss and gpc.is_pipeline_last_stage(
365
+ ignore_virtual=True) else None
366
+
367
+ # Used for tensor meta information communication
368
+ forward_recv_shapes = self.tensor_shape
369
+ need_forward_meta = self.tensor_shape is None
370
+
371
+ # Run all forward passes.
372
+ for _ in range(self.num_microbatches):
373
+ # Receive input from the previous stage
374
+ if not gpc.is_first_rank(ParallelMode.PIPELINE):
375
+ if forward_recv_shapes is None:
376
+ forward_recv_shapes = comm.recv_obj_meta()
377
+ input_obj = comm.recv_forward(
378
+ forward_recv_shapes,
379
+ dtype=self.dtype,
380
+ scatter_gather_tensors=self.scatter_gather_tensors,
381
+ )
382
+ else:
383
+ input_obj = None
384
+
385
+ # Perform forward computation
386
+ output_obj = self._forward_step(
387
+ engine,
388
+ input_obj,
389
+ return_tensors,
390
+ return_output_label=return_output_label,
391
+ accum_loss=accum_loss,
392
+ )
393
+
394
+ if not gpc.is_last_rank(ParallelMode.PIPELINE):
395
+ if need_forward_meta:
396
+ comm.send_obj_meta(output_obj)
397
+ need_forward_meta = False # send only once.
398
+ # Send the forward computation output to the next stage
399
+ comm.send_forward(output_obj, scatter_gather_tensors=self.scatter_gather_tensors)
400
+
401
+ output, label = pack_return_tensors(return_tensors) if len(return_tensors) > 0 else (None, None)
402
+
403
+ return output, label, accum_loss
404
+
405
+ def _forward_backward_step(self, engine, return_loss=True, return_output_label=True):
406
+ """
407
+ This function schedules the forward and backward computation of microbatches in the pipeline in a 1F1B manner.
408
+ It consists of three stages: warmup, 1F1B, and cooldown.
409
+
410
+ 1. Warmup Stage:
411
+ The warmup stage performs num_warmup forward microsteps. The calculation of num_warmup is the pipeline length
412
+ minus the rank of the current pipeline stage minus 1. For each microstep, it receives data as input from the previous
413
+ stage, performs the forward computation, and then sends the result to the next stage.
414
+
415
+ 2. 1F1B Stage:
416
+ The 1F1B stage consists of pairs of forward and backward microsteps. It performs num_1f1b_micropairs iterations,
417
+ where num_1f1b_micropairs is calculated as the total number of microbatches minus the number of microbatches in
418
+ the warmup stage. In each iteration, it first performs a forward computation, sends the result to the next
419
+ stage, receives input for the backward computation, performs the backward computation, and finally sends the
420
+ result to the previous stage to receive input for the next forward computation.
421
+
422
+ 3. Cooldown Stage:
423
+ The cooldown stage performs the same number of iterations as the warmup stage. In each iteration, it receives
424
+ input for the backward computation, performs the backward computation, and finally sends the result to the
425
+ previous stage.
426
+
427
+ There are two special cases to consider:
428
+ 1. The first stage of the pipeline does not need to receive forward input or send backward output. The last
429
+ stage does not need to send forward output or receive backward input.
430
+ 2. Extra communication is inserted at the phase boundaries (for example, receiving the first forward input before entering the 1F1B phase) to bridge the gap between phases.
431
+
432
+ Args:
433
+ engine (Engine): The engine used for computation.
434
+ return_loss (bool, optional): Whether to return the accumulated loss.
435
+ return_output_label (bool, optional): Whether to return outputs and labels.
436
+
437
+ Returns:
438
+ Tuple[Union[torch.Tensor, None], Union[torch.Tensor, None], Union[torch.Tensor, None]]:
439
+ The output, label, and accumulated loss.
440
+ """
441
+
442
+ num_warmup_microsteps = (
443
+ gpc.get_world_size(ParallelMode.PIPELINE) - gpc.get_local_rank(ParallelMode.PIPELINE) - 1
444
+ )
445
+ num_warmup_microsteps = min(num_warmup_microsteps, self.num_microbatches)
446
+ num_1f1b_micropairs = self.num_microbatches - num_warmup_microsteps
447
+
448
+ # Input, output tensors only need to be saved when doing backward passes
449
+ input_objs = []
450
+ output_objs = []
451
+ return_tensors = []
452
+ accum_loss_init_func = lambda: torch.zeros(1, device=get_current_device())
453
+ accum_loss = defaultdict(accum_loss_init_func) if return_loss and gpc.is_pipeline_last_stage(
454
+ ignore_virtual=True) else None
455
+
456
+ # Used for tensor meta information communication
457
+ forward_recv_shapes = self.tensor_shape
458
+ backward_recv_shapes = None
459
+ need_forward_meta = self.tensor_shape is None
460
+
461
+ # Run warmup forward passes.
462
+ for i in range(num_warmup_microsteps):
463
+ # Receive the input from the previous stage
464
+ if not gpc.is_first_rank(ParallelMode.PIPELINE):
465
+ if forward_recv_shapes is None:
466
+ forward_recv_shapes = comm.recv_obj_meta()
467
+ input_obj = comm.recv_forward(
468
+ forward_recv_shapes,
469
+ dtype=self.dtype,
470
+ scatter_gather_tensors=self.scatter_gather_tensors,
471
+ )
472
+ else:
473
+ input_obj = None
474
+
475
+ # Perform forward computation
476
+ output_obj = self._forward_step(
477
+ engine,
478
+ input_obj,
479
+ return_tensors,
480
+ return_output_label=return_output_label,
481
+ accum_loss=accum_loss,
482
+ )
483
+
484
+ if not gpc.is_last_rank(ParallelMode.PIPELINE):
485
+ if isinstance(output_obj, torch.Tensor):
486
+ backward_recv_shapes = output_obj.shape
487
+ else:
488
+ backward_recv_shapes = [out_tensor.shape for out_tensor in output_obj]
489
+
490
+ if need_forward_meta:
491
+ comm.send_obj_meta(output_obj)
492
+ need_forward_meta = False # send only once.
493
+
494
+ # Send the output of forward computation of this pipeline stage to the next pipeline stage as input for
495
+ # forward computation
496
+ if not gpc.is_last_rank(ParallelMode.PIPELINE):
497
+ comm.send_forward(output_obj, scatter_gather_tensors=self.scatter_gather_tensors)
498
+
499
+ input_objs.append(input_obj)
500
+ output_objs.append(output_obj)
501
+
502
+ # Before running 1F1B, need to receive first forward tensor.
503
+ # If all microbatches are run in warmup / cooldown phase, then no need to
504
+ # receive this tensor here.
505
+ if num_1f1b_micropairs > 0:
506
+ if not gpc.is_first_rank(ParallelMode.PIPELINE):
507
+ if forward_recv_shapes is None:
508
+ forward_recv_shapes = comm.recv_obj_meta(forward_recv_shapes)
509
+ input_obj = comm.recv_forward(
510
+ forward_recv_shapes,
511
+ dtype=self.dtype,
512
+ scatter_gather_tensors=self.scatter_gather_tensors,
513
+ )
514
+ else:
515
+ input_obj = None
516
+
517
+ # Run 1F1B in steady state.
518
+ for i in range(num_1f1b_micropairs):
519
+ # Perform forward computation
520
+ output_obj = self._forward_step(
521
+ engine,
522
+ input_obj,
523
+ return_tensors,
524
+ return_output_label=return_output_label,
525
+ accum_loss=accum_loss,
526
+ )
527
+
528
+ if gpc.is_last_rank(ParallelMode.PIPELINE):
529
+ output_obj_grad = None
530
+ else:
531
+ output_obj_grad = comm.send_forward_recv_backward(
532
+ output_obj,
533
+ backward_recv_shapes,
534
+ dtype=self.dtype,
535
+ scatter_gather_tensors=self.scatter_gather_tensors,
536
+ )
537
+
538
+ # Add input_obj and output_obj to end of list.
539
+ input_objs.append(input_obj)
540
+ output_objs.append(output_obj)
541
+
542
+ # Pop input_obj and output_obj from the start of the list for
543
+ # the backward pass.
544
+ input_obj = input_objs.pop(0)
545
+ output_obj = output_objs.pop(0)
546
+
547
+ input_obj_grad = self._backward_step(engine, i, input_obj, output_obj, output_obj_grad)
548
+
549
+ if i == (num_1f1b_micropairs - 1):
550
+ input_obj = None
551
+ if not gpc.is_first_rank(ParallelMode.PIPELINE):
552
+ comm.send_backward(
553
+ input_obj_grad,
554
+ scatter_gather_tensors=self.scatter_gather_tensors,
555
+ )
556
+ else:
557
+ if gpc.is_first_rank(ParallelMode.PIPELINE):
558
+ input_obj = None
559
+ else:
560
+ input_obj = comm.send_backward_recv_forward(
561
+ input_obj_grad,
562
+ forward_recv_shapes,
563
+ dtype=self.dtype,
564
+ scatter_gather_tensors=self.scatter_gather_tensors,
565
+ )
566
+
567
+ # Run cooldown backward passes.
568
+ for i in range(num_warmup_microsteps):
569
+ input_obj = input_objs.pop(0)
570
+ output_obj = output_objs.pop(0)
571
+
572
+ if not gpc.is_last_rank(ParallelMode.PIPELINE):
573
+ output_obj_grad = comm.recv_backward(
574
+ backward_recv_shapes,
575
+ dtype=self.dtype,
576
+ scatter_gather_tensors=self.scatter_gather_tensors,
577
+ )
578
+ else:
579
+ output_obj_grad = None
580
+
581
+ input_obj_grad = self._backward_step(
582
+ engine, num_1f1b_micropairs + i, input_obj, output_obj, output_obj_grad
583
+ )
584
+
585
+ if not gpc.is_first_rank(ParallelMode.PIPELINE):
586
+ comm.send_backward(input_obj_grad, scatter_gather_tensors=self.scatter_gather_tensors)
587
+
588
+ output, label = pack_return_tensors(return_tensors) if len(return_tensors) > 0 else (None, None)
589
+
590
+ return output, label, accum_loss
591
+
592
+ @llm_timeout(func_name="nointerleaved_forward_backward_step")
593
+ def forward_backward_step(self, engine, data_iter, forward_only=False, return_loss=True, return_output_label=True):
594
+ """Runs non-interleaved 1F1B schedule, with communication between pipeline stages.
595
+ Returns a tuple of (output, label, loss); the loss is only available on the last pipeline stage.
596
+
597
+ Args:
598
+ engine (colossalai.engine.Engine): Colossalai engine for training and inference.
599
+ data_iter (Iterable): Dataloader in the form of an iterator, obtained by calling iter(dataloader).
600
+ forward_only (bool, optional):
601
+ Whether to run the forward step only. Default is False. If True, no backward pass will be run.
602
+ return_loss (bool, optional): Whether to return the loss value. Default is True.
603
+ return_output_label (bool, optional): If False, the output and label won't be returned.
604
+ Returns:
605
+ Tuple[:class:`torch.Tensor`]: A tuple of (output, label, loss), loss and label could be None.
606
+ """
607
+
608
+ assert (
609
+ forward_only or return_loss
610
+ ), "The argument 'return_loss' has to be True when 'forward_only' is False, but got False."
611
+
612
+ # Load data first
613
+ self.load_batch(engine, data_iter)
614
+
615
+ if forward_only:
616
+ return self._forward_only_step(engine, return_loss, return_output_label)
617
+ else:
618
+ return self._forward_backward_step(engine, return_loss, return_output_label)
619
+
620
+
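For a concrete feel of the three phases described in `_forward_backward_step` above, the following standalone arithmetic (hypothetical configuration: 4 pipeline stages, 8 microbatches) shows how the warmup, steady 1F1B, and cooldown phase sizes fall out per rank:

pp_size, num_microbatches = 4, 8

for pp_rank in range(pp_size):
    num_warmup = min(pp_size - pp_rank - 1, num_microbatches)
    num_1f1b_pairs = num_microbatches - num_warmup
    print(f"rank {pp_rank}: warmup={num_warmup}, 1F1B pairs={num_1f1b_pairs}, cooldown={num_warmup}")

# rank 0: warmup=3, 1F1B pairs=5, cooldown=3
# rank 1: warmup=2, 1F1B pairs=6, cooldown=2
# rank 2: warmup=1, 1F1B pairs=7, cooldown=1
# rank 3: warmup=0, 1F1B pairs=8, cooldown=0   (the last stage starts 1F1B immediately)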
621
+ class InterleavedPipelineScheduler(PipelineScheduler):
622
+ """
623
+ Interleaved Pipeline Scheduler.
624
+ """
625
+
626
+ def __init__(
627
+ self,
628
+ num_microbatches: int,
629
+ num_chunks: int,
630
+ dtype: torch.dtype = torch.float,
631
+ data_process_func: Callable = None,
632
+ tensor_shape: Union[torch.Size, List[int], Tuple[int]] = None,
633
+ scatter_gather_tensors: bool = False,
634
+ scheduler_hooks: Optional[List[SchedulerHook]] = None,
635
+ communication_overlap: bool = False,
636
+ ):
637
+ """A helper schedule class for pipeline parallelism running environment.
638
+ It uses the interleaved 1F1B strategy. Other properties are similar to
639
+ :class:`NonPipelineScheduler`.
640
+
641
+ Args:
642
+ num_microbatches (int): The number of microbatches.
643
+ num_chunks (int): The number of model chunks.
644
+ dtype (torch.dtype, optional): The data type of the tensors. Default is torch.float.
645
+ data_process_func (Callable, optional):
646
+ The preprocessing function which receives a batch of data, and it will be executed in `load_batch`.
647
+ tensor_shape (torch.Size, optional): Specified shape in pipeline communication.
648
+ scatter_gather_tensors (bool, optional):
649
+ If set to `True`, communication will be reduced over pipeline when using 1D tensor parallelization.
650
+ scheduler_hooks (List[SchedulerHook], optional): List of scheduler hooks. Default is None.
651
+ communication_overlap (bool, optional): Whether to enable communication overlap. Default is False.
652
+ """
653
+ assert (
654
+ num_microbatches % gpc.get_world_size(ParallelMode.PIPELINE) == 0
655
+ ), "num_microbatches must be an integer multiple of pipeline parallel world size"
656
+
657
+ assert (
658
+ isinstance(num_chunks, int) and num_chunks > 0
659
+ ), f"expected num_chunks to be an integer and larger than 0, but got {num_chunks}"
660
+
661
+ super().__init__(
662
+ num_microbatches,
663
+ dtype=dtype,
664
+ data_process_func=data_process_func,
665
+ tensor_shape=tensor_shape,
666
+ scatter_gather_tensors=scatter_gather_tensors,
667
+ scheduler_hooks=scheduler_hooks,
668
+ )
669
+
670
+ gpc.set_virtual_pipeline_parallel_size(num_chunks)
671
+ gpc.set_virtual_pipeline_parallel_rank(0)
672
+
673
+ self._num_chunks = num_chunks
674
+ self._communication_overlap = communication_overlap
675
+ # switch 1f1b loop runner function according to communication overlap
676
+ self._run_1f1b_loop = (
677
+ self._run_1f1b_loop_with_overlap if communication_overlap else self._run_1f1b_loop_without_overlap
678
+ )
679
+
680
+ # states
681
+ self._pp_size = gpc.get_world_size(ParallelMode.PIPELINE)
682
+ self._pp_rank = gpc.get_local_rank(ParallelMode.PIPELINE)
683
+
684
+ self._accum_loss = None
685
+ self._return_tensors = None
686
+ self._input_objs = [[] for _ in range(num_chunks)]
687
+ self._output_objs = [[] for _ in range(num_chunks)]
688
+ self._output_obj_grads = [[] for _ in range(num_chunks)]
689
+
690
+ self._input_obj_shapes = [self.tensor_shape for _ in range(num_chunks)]
691
+ self._output_obj_shapes = [None for _ in range(num_chunks)]
692
+ self._send_tensor_shape_flags = [self.tensor_shape is None for _ in range(num_chunks)]
693
+
694
+ @property
695
+ def tensor_shape(self) -> torch.Size:
696
+ return self._tensor_shape
697
+
698
+ @tensor_shape.setter
699
+ def tensor_shape(self, tensor_shape: torch.Size):
700
+ self._tensor_shape = tensor_shape
701
+ self._input_obj_shapes = [self._tensor_shape for _ in range(self._num_chunks)]
702
+ self._send_tensor_shape_flags = [self._tensor_shape is None for _ in range(self._num_chunks)]
703
+
704
+ def _clear_state(self) -> None:
705
+ self._accum_loss = None
706
+ self._return_tensors = None
707
+ self._input_objs = [[] for _ in range(self._num_chunks)]
708
+ self._output_objs = [[] for _ in range(self._num_chunks)]
709
+ self._output_obj_grads = [[] for _ in range(self._num_chunks)]
710
+
711
+ self._input_obj_shapes = [self.tensor_shape for _ in range(self._num_chunks)]
712
+ self._output_obj_shapes = [None for _ in range(self._num_chunks)]
713
+ self._send_tensor_shape_flags = [self.tensor_shape is None for _ in range(self._num_chunks)]
714
+
715
+ def load_batch(self, engine, data_iter):
716
+ super().load_batch(engine, data_iter)
717
+ # overwrite microbatch_offset, since every model chunk loads from the same batch and must track its own offset
718
+ self.microbatch_offset = [0 for _ in range(self._num_chunks)]
719
+
720
+ def load_micro_batch(self, model_chunk_id):
721
+ micro_batch_data, micro_batch_label = self._load_micro_batch(
722
+ data=self.batch_data,
723
+ label=self.batch_label,
724
+ offset=self.microbatch_offset[model_chunk_id],
725
+ micro_bsz=self.microbatch_size,
726
+ )
727
+ micro_batch_data["label"] = micro_batch_label
728
+ self.microbatch_offset[model_chunk_id] += self.microbatch_size
729
+ return move_to_device(micro_batch_data)
730
+
731
+ def _forward_step(self, engine, chunk_id):
732
+ """Forward step for passed-in model. If it is the first stage, the input tensor
733
+ is obtained from data_iterator, otherwise the passed-in input_obj is used.
734
+ Returns output tensor. This is a helper function and can be ignored by users.
735
+
736
+ Args:
737
+ engine (colossalai.engine.Engine): Colossalai engine for training and inference.
738
+ chunk_id (int): The id of model chunks.
739
+ Returns:
740
+ Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]: output or the loss value of the current
741
+ pipeline stage.
742
+ """
743
+ gpc.set_virtual_pipeline_parallel_rank(chunk_id)
744
+
745
+ if gpc.is_pipeline_first_stage() and len(self._input_objs[chunk_id]) == len(self._output_objs[chunk_id]):
746
+ self._input_objs[chunk_id].append(None)
747
+ input_obj = self._input_objs[chunk_id][-1]
748
+
749
+ micro_batch_data = self.load_micro_batch(chunk_id)
750
+ data, label = self._get_data_label_for_current_step(input_obj, micro_batch_data)
751
+
752
+ self._call_hooks("before_forward", data)
753
+ output_obj = self._call_engine(engine.model[chunk_id], data)
754
+ # Convert output_obj to fp32 when last model chunk of last stage
755
+ if gpc.is_pipeline_last_stage(ignore_virtual=False) and isinstance(engine.model[chunk_id], NaiveAMPModel):
756
+ output_obj = engine.model[chunk_id].convert_to_fp32(output_obj)
757
+ self._call_hooks("after_forward", output_obj)
758
+
759
+ if gpc.is_pipeline_last_stage():
760
+ self._call_hooks("post_helper_func", output_obj, label)
761
+
762
+ if self._return_tensors is not None:
763
+ self._return_tensors.append((output_obj, label))
764
+ if self._accum_loss is not None:
765
+ self._call_hooks("before_criterion", output_obj, label)
766
+ loss = self._call_engine_criterion(engine.criterion, output_obj, label)
767
+ self._call_hooks("after_criterion", loss)
768
+
769
+ loss_reduced = loss / self.num_microbatches
770
+ self._accum_loss.add_(loss_reduced.detach())
771
+ output_obj = loss_reduced
772
+
773
+ self._output_objs[chunk_id].append(output_obj)
774
+
775
+ return output_obj
776
+
777
+ def _backward_step(self, engine, chunk_id, step_id):
778
+ """
779
+ Backward step for the given model chunk. The input, output, and output gradient for this step
780
+ are popped from the queues that were filled during the corresponding forward steps.
781
+ Returns input tensor gradient. This is a helper function and can be ignored by users.
782
+
783
+ Args:
784
+ engine (colossalai.engine.Engine): Colossalai engine for training and inference.
785
+ chunk_id (int): The id of model chunks.
786
+ step_id (int): The current step id.
787
+
788
+ Returns:
789
+ Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]: input tensor gradient.
790
+ """
791
+ gpc.set_virtual_pipeline_parallel_rank(chunk_id)
792
+
793
+ if gpc.is_pipeline_last_stage() and len(self._output_obj_grads[chunk_id]) == 0:
794
+ self._output_obj_grads[chunk_id].append(None)
795
+
796
+ input_obj = self._input_objs[chunk_id].pop(0)
797
+ output_obj = self._output_objs[chunk_id].pop(0)
798
+ output_obj_grad = self._output_obj_grads[chunk_id].pop(0)
799
+
800
+ input_obj_grad = super()._backward_step(engine, step_id, input_obj, output_obj, output_obj_grad)
801
+
802
+ return input_obj_grad
803
+
804
+ def _get_chunk_by_microbatch(self, step_id: int, backward: bool = False) -> int:
805
+ """Helper method to get the model chunk ID given the iteration number."""
806
+ microbatch_id_in_group = step_id % (self._pp_size * self._num_chunks)
807
+ chunk_id = microbatch_id_in_group // self._pp_size
808
+
809
+ if backward:
810
+ chunk_id = self._num_chunks - chunk_id - 1
811
+
812
+ return chunk_id
813
+
814
+ def _get_current_microbatch_id(self, step_id: int) -> int:
815
+ # format:
816
+ # microstep_id : 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
817
+ # microbatch_id: 1 2 3 4 1 2 3 4 5 6 7 8 5 6 7 8
818
+ num_microbatch_group = step_id // (self._pp_size * self._num_chunks)
819
+ step_id_in_group = step_id % (self._pp_size * self._num_chunks)
820
+
821
+ microbatch_id = num_microbatch_group * self._pp_size + step_id_in_group % self._pp_size
822
+
823
+ return microbatch_id
824
+
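The two index helpers above can be checked in isolation. The sketch below (hypothetical pp_size=4, num_chunks=2) reproduces the microstep-to-chunk and microstep-to-microbatch mapping shown in the comment, using 0-based IDs:

pp_size, num_chunks = 4, 2

def chunk_of(step_id, backward=False):
    chunk_id = (step_id % (pp_size * num_chunks)) // pp_size
    return num_chunks - chunk_id - 1 if backward else chunk_id

def microbatch_of(step_id):
    group = step_id // (pp_size * num_chunks)
    return group * pp_size + (step_id % (pp_size * num_chunks)) % pp_size

print([chunk_of(k) for k in range(16)])       # [0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1]
print([microbatch_of(k) for k in range(16)])  # [0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7]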
825
+ def _run_warmup_loop(
826
+ self,
827
+ engine: Engine,
828
+ num_microsteps: int,
829
+ num_warmup_microsteps: int,
830
+ receive_extra_backward: bool = False,
831
+ forward_only: bool = False,
832
+ ) -> None:
833
+ """
834
+ Run the warm-up loop and prepare data for the 1F1B stage.
835
+
836
+ During the warm-up process, for each execution, it first performs a forward computation,
837
+ and then sends the computation result to the next stage.
838
+ It also receives data for the next forward computation.
839
+ Since the input for the first forward computation is not considered initially,
840
+ it needs to receive data once at the beginning.
841
+
842
+ After the warm-up is completed, we need to prepare data for the 1F1B stage.
843
+ The data preparation process should be consistent with the communication method of the 1F1B stage.
844
+
845
+ Args:
846
+ engine (Engine): The engine to run the warm-up loop.
847
+ num_microsteps (int): The total number of microsteps.
848
+ num_warmup_microsteps (int): The number of warm-up microsteps.
849
+ receive_extra_backward (bool, optional): Whether to receive extra backward input for the 1F1B stage.
850
+ Default is False.
851
+ forward_only (bool, optional): Whether to only perform forward pass. Default is False.
852
+ """
853
+ if not gpc.is_pipeline_first_stage():
854
+ if self._input_obj_shapes[0] is None:
855
+ self._input_obj_shapes[0] = comm.recv_obj_meta(self._input_obj_shapes[0])
856
+ self._input_objs[0].append(
857
+ comm.recv_forward(
858
+ self._input_obj_shapes[0],
859
+ dtype=self.dtype,
860
+ scatter_gather_tensors=self.scatter_gather_tensors,
861
+ )
862
+ )
863
+ else:
864
+ self._input_objs[0].append(None)
865
+
866
+ for k in range(num_warmup_microsteps):
867
+ chunk_id = self._get_chunk_by_microbatch(k)
868
+
869
+ output_obj = self._forward_step(engine, chunk_id)
870
+
871
+ if forward_only:
872
+ # when forward-only, no need to save tensors for a backward pass
873
+ self._input_objs[chunk_id].pop()
874
+ self._output_objs[chunk_id].pop()
875
+
876
+ if not gpc.is_pipeline_last_stage():
877
+ if isinstance(output_obj, torch.Tensor):
878
+ self._output_obj_shapes[chunk_id] = output_obj.shape
879
+ else:
880
+ self._output_obj_shapes[chunk_id] = [out_tensor.shape for out_tensor in output_obj]
881
+
882
+ if self._send_tensor_shape_flags[chunk_id]:
883
+ comm.send_obj_meta(output_obj)
884
+ self._send_tensor_shape_flags[chunk_id] = False # send only once for each chunk.
885
+
886
+ # Determine if tensor should be received from previous stage.
887
+ next_forward_chunk_id = self._get_chunk_by_microbatch(k + 1)
888
+
889
+ with switch_virtual_pipeline_parallel_rank(next_forward_chunk_id):
890
+ if not gpc.is_pipeline_first_stage() and self._input_obj_shapes[next_forward_chunk_id] is None:
891
+ self._input_obj_shapes[next_forward_chunk_id] = comm.recv_obj_meta()
892
+ if k == (num_microsteps - 1) or gpc.is_pipeline_first_stage():
893
+ input_shape = None
894
+ else:
895
+ input_shape = self._input_obj_shapes[next_forward_chunk_id]
896
+
897
+ # Don't send tensor downstream if on last stage.
898
+ if gpc.is_pipeline_last_stage():
899
+ output_obj = None
900
+
901
+ # Send and receive tensors as appropriate (send tensors computed
902
+ # in this iteration; receive tensors for next iteration).
903
+ if k != (num_warmup_microsteps - 1) or not receive_extra_backward:
904
+ # Normal warm-up communication process, or no need to prepare backward input for the 1F1B stage
905
+ input_obj = comm.send_forward_recv_forward(
906
+ output_obj,
907
+ input_shape,
908
+ dtype=self.dtype,
909
+ scatter_gather_tensors=self.scatter_gather_tensors,
910
+ )
911
+ else:
912
+ # Receive output_obj_grad for next backward, if receive_extra_backward is True.
913
+ if self._communication_overlap:
914
+ # In this case, we should handle forward and backward communication separately, consistent with the
915
+ # overlap version of the 1F1B stage
916
+ input_obj = comm.send_forward_recv_forward(
917
+ output_obj,
918
+ input_shape,
919
+ dtype=self.dtype,
920
+ scatter_gather_tensors=self.scatter_gather_tensors,
921
+ )
922
+ output_obj_grad = comm.send_backward_recv_backward(
923
+ None, # nothing to send
924
+ self._output_obj_shapes[self._num_chunks - 1],
925
+ dtype=self.dtype,
926
+ scatter_gather_tensors=self.scatter_gather_tensors,
927
+ )
928
+ self._output_obj_grads[self._num_chunks - 1].append(output_obj_grad)
929
+ else:
930
+ # In this case, we should handle forward and backward communication together, consistent with the
931
+ # non-overlap version of the 1F1B stage
932
+ input_obj, output_obj_grad = comm.send_forward_backward_recv_forward_backward(
933
+ output_obj,
934
+ None, # no backward grad to send
935
+ input_shape,
936
+ self._output_obj_shapes[self._num_chunks - 1],
937
+ dtype=self.dtype,
938
+ scatter_gather_tensors=self.scatter_gather_tensors,
939
+ )
940
+ self._output_obj_grads[self._num_chunks - 1].append(output_obj_grad)
941
+
942
+ self._input_objs[next_forward_chunk_id].append(input_obj)
943
+
944
+ def _run_1f1b_loop_with_overlap(
945
+ self,
946
+ engine: Engine,
947
+ num_warmup_microsteps: int,
948
+ num_1f1b_micropairs: int,
949
+ all_warmup_microsteps: bool = False,
950
+ ) -> None:
951
+ """
952
+ Run the 1F1B loop with overlap.
953
+
954
+ The 1F1B loop with overlap consists of the following steps:
955
+ 1. Perform the forward pass.
956
+ 2. Check if the backward input is ready.
957
+ 3. Send the forward output and receive the forward input for the next iteration.
958
+ 4. Perform the backward pass.
959
+ 5. Check if the forward input is ready.
960
+ 6. Send the backward output and receive the backward input for the next iteration.
961
+
962
+ Args:
963
+ engine (Engine): The engine to run the 1F1B loop.
964
+ num_warmup_microsteps (int): The number of warm-up microsteps.
965
+ num_1f1b_micropairs (int): The number of 1F1B micropairs.
966
+ all_warmup_microsteps (bool, optional): Whether to run all warm-up microsteps. Default is False.
967
+ """
968
+
969
+ backward_async_communicator = None
970
+
971
+ # Run 1F1B in steady state.
972
+ for k in range(num_1f1b_micropairs):
973
+ forward_microstep_id = k + num_warmup_microsteps
974
+ backward_microstep_id = k
975
+ forward_chunk_id = self._get_chunk_by_microbatch(forward_microstep_id)
976
+ backward_chunk_id = self._get_chunk_by_microbatch(backward_microstep_id, backward=True)
977
+
978
+ # 1. Forward pass.
979
+ output_obj = self._forward_step(engine, forward_chunk_id)
980
+
981
+ # 2. Check if the backward input is ready.
982
+ if backward_async_communicator is not None:
983
+ output_obj_grad = backward_async_communicator.wait_and_receive()
984
+
985
+ if backward_async_communicator.need_receive:
986
+ self._output_obj_grads[backward_chunk_id].append(output_obj_grad)
987
+
988
+ # 3. Send the forward outputs and receive the forward inputs from the previous rank.
989
+
990
+ # Check if it is the last model chunk of the last pipeline stage, no need to send forward output.
991
+ gpc.set_virtual_pipeline_parallel_rank(forward_chunk_id)
992
+ if gpc.is_pipeline_last_stage():
993
+ output_obj = None
994
+
995
+ # Check if it needs to receive the results from the previous rank.
996
+ next_forward_chunk_id = self._get_chunk_by_microbatch(forward_microstep_id + 1)
997
+ with switch_virtual_pipeline_parallel_rank(next_forward_chunk_id):
998
+ if gpc.is_pipeline_first_stage() or k == num_1f1b_micropairs - 1:
999
+ input_obj_shape = None
1000
+ else:
1001
+ input_obj_shape = self._input_obj_shapes[next_forward_chunk_id]
1002
+
1003
+ forward_async_communicator = comm.AsynCommunicator(
1004
+ output_obj,
1005
+ input_obj_shape,
1006
+ self.dtype,
1007
+ self.scatter_gather_tensors,
1008
+ forward=True,
1009
+ )
1010
+ forward_async_communicator.start()
1011
+
1012
+ # 4. Backward pass.
1013
+
1014
+ input_obj_grad = self._backward_step(engine, backward_chunk_id, backward_microstep_id)
1015
+
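+ # 5. Check if the forward input is ready.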
1016
+ input_obj = forward_async_communicator.wait_and_receive()
1017
+ if forward_async_communicator.need_receive:
1018
+ self._input_objs[next_forward_chunk_id].append(input_obj)
1019
+
1020
+ # 6. Send the backward output and receive the backward input for the next iteration.
1021
+ gpc.set_virtual_pipeline_parallel_rank(backward_chunk_id)
1022
+ if gpc.is_pipeline_first_stage():
1023
+ input_obj_grad = None
1024
+
1025
+ next_backward_chunk_id = self._get_chunk_by_microbatch(backward_microstep_id + 1, backward=True)
1026
+ with switch_virtual_pipeline_parallel_rank(next_backward_chunk_id):
1027
+ if gpc.is_pipeline_last_stage():
1028
+ output_obj_shape = None
1029
+ else:
1030
+ output_obj_shape = self._output_obj_shapes[next_backward_chunk_id]
1031
+
1032
+ backward_async_communicator = comm.AsynCommunicator(
1033
+ input_obj_grad,
1034
+ output_obj_shape,
1035
+ self.dtype,
1036
+ self.scatter_gather_tensors,
1037
+ forward=False,
1038
+ )
1039
+ backward_async_communicator.start()
1040
+
1041
+ if all_warmup_microsteps:
1042
+ if not gpc.is_pipeline_last_stage():
1043
+ self._output_obj_grads[self._num_chunks - 1].append(
1044
+ comm.recv_backward(
1045
+ self._output_obj_shapes[self._num_chunks - 1],
1046
+ dtype=self.dtype,
1047
+ scatter_gather_tensors=self.scatter_gather_tensors,
1048
+ )
1049
+ )
1050
+ else:
1051
+ self._output_obj_grads[self._num_chunks - 1].append(None)
1052
+ else:
1053
+ output_obj_grad = backward_async_communicator.wait_and_receive()
1054
+ if backward_async_communicator.need_receive:
1055
+ backward_chunk_id = self._get_chunk_by_microbatch(num_1f1b_micropairs, backward=True)
1056
+ self._output_obj_grads[backward_chunk_id].append(output_obj_grad)
1057
+
1058
+ def _run_1f1b_loop_without_overlap(
1059
+ self,
1060
+ engine: Engine,
1061
+ num_warmup_microsteps: int,
1062
+ num_1f1b_micropairs: int,
1063
+ all_warmup_microsteps: bool = False,
1064
+ ) -> None:
1065
+ """
1066
+ Run the 1F1B loop without overlap.
1067
+
1068
+ The 1F1B loop without overlap consists of the following steps:
1069
+ 1. Perform the forward pass.
1070
+ 2. Perform the backward pass.
1071
+ 3. Send the forward output of this iteration to the next stage, and send the backward output of this iteration
1072
+ to the previous stage, and receive the forward and backward inputs for the next iteration.
1073
+
1074
+ Args:
1075
+ engine (Engine): The engine to use for computation.
1076
+ num_warmup_microsteps (int): The number of warmup microsteps.
1077
+ num_1f1b_micropairs (int): The number of 1F1B micro-pairs.
1078
+ all_warmup_microsteps (bool, optional): Whether to run all warmup microsteps. Defaults to False.
1079
+ """
1080
+ for k in range(num_1f1b_micropairs):
1081
+ # Forward pass.
1082
+ forward_microstep_id = k + num_warmup_microsteps
1083
+ forward_chunk_id = self._get_chunk_by_microbatch(forward_microstep_id)
1084
+ output_obj = self._forward_step(engine, forward_chunk_id)
1085
+
1086
+ # Backward pass.
1087
+ backward_microstep_id = k
1088
+ backward_chunk_id = self._get_chunk_by_microbatch(backward_microstep_id, backward=True)
1089
+ input_obj_grad = self._backward_step(engine, backward_chunk_id, backward_microstep_id)
1090
+
1091
+ # Send output_obj and input_obj_grad, receive input_obj
1092
+ # and output_obj_grad.
1093
+
1094
+ # Determine if current stage has anything to send in either direction,
1095
+ # otherwise set obj to None.
1096
+ gpc.set_virtual_pipeline_parallel_rank(forward_chunk_id)
1097
+ if gpc.is_pipeline_last_stage():
1098
+ output_obj = None
1099
+
1100
+ gpc.set_virtual_pipeline_parallel_rank(backward_chunk_id)
1101
+ if gpc.is_pipeline_first_stage():
1102
+ input_obj_grad = None
1103
+
1104
+ # Determine if peers are sending, and where in data structure to put
1105
+ # received tensors.
1106
+ next_forward_chunk_id = self._get_chunk_by_microbatch(forward_microstep_id + 1)
1107
+ with switch_virtual_pipeline_parallel_rank(next_forward_chunk_id):
1108
+ if gpc.is_pipeline_first_stage() or k == num_1f1b_micropairs - 1:
1109
+ recv_prev = False
1110
+ else:
1111
+ recv_prev = True
1112
+
1113
+ next_backward_chunk_id = self._get_chunk_by_microbatch(backward_microstep_id + 1, backward=True)
1114
+ with switch_virtual_pipeline_parallel_rank(next_backward_chunk_id):
1115
+ if gpc.is_pipeline_last_stage():
1116
+ recv_next = False
1117
+ else:
1118
+ recv_next = True
1119
+
1120
+ input_shape = self._input_obj_shapes[next_forward_chunk_id] if recv_prev else None
1121
+ output_shape = self._output_obj_shapes[next_backward_chunk_id] if recv_next else None
1122
+
1123
+ # Communicate objs.
1124
+ input_obj, output_obj_grad = comm.send_forward_backward_recv_forward_backward(
1125
+ output_obj,
1126
+ input_obj_grad,
1127
+ input_shape,
1128
+ output_shape,
1129
+ dtype=self.dtype,
1130
+ scatter_gather_tensors=self.scatter_gather_tensors,
1131
+ )
1132
+
1133
+ # Put input_obj and output_obj_grad in data structures in the
1134
+ # right location.
1135
+ if recv_prev:
1136
+ self._input_objs[next_forward_chunk_id].append(input_obj)
1137
+ if recv_next:
1138
+ self._output_obj_grads[next_backward_chunk_id].append(output_obj_grad)
1139
+
1140
+ # receive necessary data for next cooldown loop
1141
+ if all_warmup_microsteps:
1142
+ if not gpc.is_pipeline_last_stage():
1143
+ self._output_obj_grads[self._num_chunks - 1].append(
1144
+ comm.recv_backward(
1145
+ self._output_obj_shapes[self._num_chunks - 1],
1146
+ dtype=self.dtype,
1147
+ scatter_gather_tensors=self.scatter_gather_tensors,
1148
+ )
1149
+ )
1150
+ else:
1151
+ self._output_obj_grads[self._num_chunks - 1].append(None)
1152
+
1153
+ def _run_cooldown_loop(self, engine: Engine, num_microsteps: int, num_1f1b_micropairs: int) -> None:
1154
+ """
1155
+ Run the cooldown loop.
1156
+
1157
+ The cooldown loop consists of the following steps:
1158
+ 1. Perform the backward step.
1159
+ 2. Send the backward output to the next stage and receive inputs for next backward.
1160
+
1161
+ Args:
1162
+ engine (Engine): The engine to use for computation.
1163
+ num_microsteps (int): The total number of microsteps.
1164
+ num_1f1b_micropairs (int): The number of 1F1B micro-pairs.
1165
+ """
1166
+ for k in range(num_1f1b_micropairs, num_microsteps):
1167
+ chunk_id = self._get_chunk_by_microbatch(k, backward=True)
1168
+
1169
+ input_obj_grad = self._backward_step(engine, chunk_id, k)
1170
+
1171
+ next_backward_chunk_id = self._get_chunk_by_microbatch(k + 1, backward=True)
1172
+
1173
+ if k != (num_microsteps - 1) and not (
1174
+ gpc.is_pipeline_last_stage(ignore_virtual=True) and next_backward_chunk_id == (self._num_chunks - 1)
1175
+ ):
1176
+ output_shape = self._output_obj_shapes[next_backward_chunk_id]
1177
+ else:
1178
+ output_shape = None
1179
+
1180
+ self._output_obj_grads[next_backward_chunk_id].append(
1181
+ comm.send_backward_recv_backward(
1182
+ input_obj_grad,
1183
+ output_shape,
1184
+ dtype=self.dtype,
1185
+ scatter_gather_tensors=self.scatter_gather_tensors,
1186
+ )
1187
+ )
1188
+
1189
+ def _forward_only_step(self, engine: Engine):
1190
+ num_microsteps = self.num_microbatches * self._num_chunks
1191
+ num_warmup_microsteps = num_microsteps
1192
+
1193
+ self._run_warmup_loop(
1194
+ engine,
1195
+ num_microsteps,
1196
+ num_warmup_microsteps,
1197
+ receive_extra_backward=False,
1198
+ forward_only=True,
1199
+ )
1200
+
1201
+ def _forward_backward_step(self, engine: Engine):
1202
+ # Compute number of warmup and remaining microbatches.
1203
+ all_warmup_microsteps = False
1204
+ num_microsteps = self.num_microbatches * self._num_chunks
1205
+
1206
+ # Run all forward passes and then all backward passes if number of
1207
+ # microbatches is just the number of pipeline stages.
1208
+ # Otherwise, perform (num_chunks-1)*pipeline_parallel_size warmup microsteps on
1209
+ # all workers, followed by additional microsteps depending on the
1210
+ # stage ID (more forward passes for earlier stages, later stages can
1211
+ # immediately start with 1F1B).
1212
+ if self.num_microbatches == self._pp_size:
1213
+ num_warmup_steps = num_microsteps
1214
+ all_warmup_microsteps = True
1215
+ else:
1216
+ num_warmup_steps = (self._pp_size - self._pp_rank - 1) * 2
1217
+ num_warmup_steps += (self._num_chunks - 1) * self._pp_size
1218
+ num_warmup_steps = min(num_warmup_steps, num_microsteps)
1219
+ num_1f1b_micropairs = num_microsteps - num_warmup_steps
1220
+
1221
+ # We usually need to prepare an extra backward data for the 1F1B stage when the WarmUp stage ends,
1222
+ # because the 1F1B stage typically performs one forward and backward pass together,
1223
+ # except in the following cases:
1224
+ receive_extra_backward = not (
1225
+ all_warmup_microsteps # Only warmup microsteps
1226
+ or gpc.is_pipeline_last_stage(ignore_virtual=True) # The rank is the last pipeline stage
1227
+ )
1228
+
1229
+ # 1. Warmup
1230
+ self._run_warmup_loop(
1231
+ engine,
1232
+ num_microsteps,
1233
+ num_warmup_steps,
1234
+ receive_extra_backward=receive_extra_backward,
1235
+ )
1236
+
1237
+ # 2. 1F1B
1238
+ self._run_1f1b_loop(
1239
+ engine,
1240
+ num_warmup_steps,
1241
+ num_1f1b_micropairs=num_1f1b_micropairs,
1242
+ all_warmup_microsteps=all_warmup_microsteps,
1243
+ )
1244
+
1245
+ # 3. Cooldown
1246
+ self._run_cooldown_loop(engine, num_microsteps, num_1f1b_micropairs=num_1f1b_micropairs)
1247
+
1248
+ @llm_timeout(func_name="interleaved_forward_backward_step")
1249
+ def forward_backward_step(self, engine, data_iter, forward_only=False, return_loss=True, return_output_label=True):
1250
+ """Run interleaved 1F1B schedule (model split into model chunks), with
1251
+ communication between pipeline stages as needed.
1252
+
1253
+ Args:
1254
+ engine (colossalai.engine.Engine): Colossalai engine for training and inference.
1255
+ data_iter (Iterable): Dataloader as the form of an iterator, obtained by calling iter(dataloader).
1256
+ forward_only (bool, optional):
1257
+ Whether run forward step only. Default is false. If true, no backward will be run.
1258
+ return_loss (bool, optional): Whether returns the loss value. Default is true.
1259
+ return_output_label (bool, optional): If False, the output and label won't be returned.
1260
+
1261
+ Returns:
1262
+ Tuple[:class:`torch.Tensor`]: A tuple of (output, label, loss), loss and label could be None.
1263
+ The loss is returned only on the last pipeline stage.
1264
+ """
1265
+ assert (
1266
+ forward_only or return_loss
1267
+ ), "The argument 'return_loss' has to be True when 'forward_only' is False, but got False."
1268
+
1269
+ gpc.set_virtual_pipeline_parallel_rank(0)
1270
+
1271
+ self.load_batch(engine, data_iter)
1272
+
1273
+ if return_loss and gpc.is_pipeline_last_stage(ignore_virtual=True):
1274
+ self._accum_loss = torch.zeros(1, device=get_current_device())
1275
+ if return_output_label:
1276
+ self._return_tensors = []
1277
+
1278
+ if forward_only:
1279
+ self._forward_only_step(engine)
1280
+ else:
1281
+ self._forward_backward_step(engine)
1282
+
1283
+ if return_output_label and len(self._return_tensors) > 0:
1284
+ output, label = pack_return_tensors(self._return_tensors)
1285
+ else:
1286
+ output, label = (None, None)
1287
+ accum_loss = self._accum_loss
1288
+
1289
+ self._clear_state()
1290
+
1291
+ return output, label, accum_loss
1292
+
1293
+
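The interleaved warmup sizing in `_forward_backward_step` above is easiest to verify numerically. A standalone check under a hypothetical configuration (pp_size=4, num_chunks=2, num_microbatches=8):

pp_size, num_chunks, num_microbatches = 4, 2, 8
num_microsteps = num_microbatches * num_chunks          # 16 microsteps in total

for pp_rank in range(pp_size):
    if num_microbatches == pp_size:
        num_warmup = num_microsteps                     # run everything as warmup/cooldown
    else:
        num_warmup = (pp_size - pp_rank - 1) * 2 + (num_chunks - 1) * pp_size
        num_warmup = min(num_warmup, num_microsteps)
    print(f"rank {pp_rank}: warmup={num_warmup}, 1F1B pairs={num_microsteps - num_warmup}")

# rank 0: warmup=10, 1F1B pairs=6
# rank 1: warmup=8,  1F1B pairs=8
# rank 2: warmup=6,  1F1B pairs=10
# rank 3: warmup=4,  1F1B pairs=12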
1294
+ class KDPipelineScheduler(PipelineScheduler):
1295
+
1296
+ def __init__(
1297
+ self,
1298
+ num_microbatches: int,
1299
+ dtype: torch.dtype = torch.float,
1300
+ data_process_func: Callable = None,
1301
+ tensor_shape: Union[torch.Size, List[int], Tuple[int]] = None,
1302
+ scatter_gather_tensors: bool = False,
1303
+ scheduler_hooks: Optional[List[SchedulerHook]] = None,
1304
+ ):
1305
+ super().__init__(
1306
+ num_microbatches=num_microbatches,
1307
+ dtype=dtype,
1308
+ data_process_func=data_process_func,
1309
+ tensor_shape=tensor_shape,
1310
+ scatter_gather_tensors=scatter_gather_tensors,
1311
+ scheduler_hooks=scheduler_hooks,
1312
+ )
1313
+
1314
+ def _forward_step(self, engine, input_obj, return_tensors, return_output_label=True, accum_loss=None):
1315
+ """
1316
+ Forward step for passed-in model. If it is the first stage, the input tensor
1317
+ is obtained from data_iterator, otherwise the passed-in input_obj is used.
1318
+ Returns output tensor. This is a helper function and can be ignored by users.
1319
+
1320
+ Args:
1321
+ engine (colossalai.engine.Engine): Colossalai engine for training and inference.
1322
+ input_obj (Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]): Input tensor for this pipeline stage.
1323
+ return_tensors (List[:class:`torch.Tensor`]): A list of tensors to return.
1324
+ return_output_label (bool, optional): Whether to return output labels.
1325
+ accum_loss (optional): Where the accumulated loss is stored.
1326
+ Returns:
1327
+ Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]: output or the loss value of the current
1328
+ pipeline stage.
1329
+ """
1330
+ micro_batch_data = self.load_micro_batch()
1331
+ data, label = self._get_data_label_for_current_step(input_obj, micro_batch_data)
1332
+
1333
+ self._call_hooks("before_forward", data)
1334
+ output_obj = self._call_engine(engine.model, data)
1335
+ self._call_hooks("after_forward", output_obj)
1336
+
1337
+ if gpc.is_last_rank(ParallelMode.PIPELINE):
1338
+ self._call_hooks("post_helper_func", output_obj, label)
1339
+ if return_output_label:
1340
+ return_tensors.append((output_obj, label))
1341
+ if accum_loss is not None:
1342
+ self._call_hooks("before_criterion", output_obj, label)
1343
+ loss_gt = gpc.config.kd_config['gt_weight'] * self._call_engine_criterion(engine.criterion, output_obj,
1344
+ label)
1345
+
1346
+ with torch.no_grad():
1347
+ engine.teacher.eval()
1348
+ output_obj_t = self._call_engine(engine.teacher, data)
1349
+
1350
+ loss_kd = gpc.config.kd_config['kd_weight'] * self._call_engine_criterion(engine.kd_criterion,
1351
+ output_obj,
1352
+ (output_obj_t, label))
1353
+ # loss = (gpc.config.kd_config['gt_weight'] * loss_gt +
1354
+ # gpc.config.kd_config['kd_weight'] * loss_kd)
1355
+ self._call_hooks("after_criterion", loss_gt + loss_kd)
1356
+
1357
+ loss_gt_reduced = loss_gt / self.num_microbatches
1358
+ loss_kd_reduced = loss_kd / self.num_microbatches
1359
+ accum_loss['loss_gt'].add_(loss_gt_reduced.detach())
1360
+ accum_loss['loss_kd'].add_(loss_kd_reduced.detach())
1361
+ output_obj = loss_gt_reduced + loss_kd_reduced
1362
+
1363
+ return output_obj
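The KD scheduler above combines a ground-truth loss and a distillation loss using the weights from `gpc.config.kd_config`. The concrete `kd_criterion` is defined elsewhere in the repo; the sketch below shows one common choice (temperature-scaled KL divergence between student and teacher logits) purely as an illustration, not as the uploaded implementation:

import torch
import torch.nn.functional as F

def kd_kl_loss(student_logits, teacher_logits, temperature=2.0):
    # Soft-target distillation loss; the T**2 factor keeps its gradient scale
    # comparable to the hard-label cross-entropy term.
    t = temperature
    return F.kl_div(
        F.log_softmax(student_logits / t, dim=-1),
        F.softmax(teacher_logits / t, dim=-1),
        reduction="batchmean",
    ) * (t * t)

student_logits = torch.randn(4, 16, requires_grad=True)
teacher_logits = torch.randn(4, 16)            # produced under torch.no_grad() from the teacher
labels = torch.randint(0, 16, (4,))

gt_weight, kd_weight = 1.0, 0.5                # stand-ins for gpc.config.kd_config values
loss_gt = gt_weight * F.cross_entropy(student_logits, labels)
loss_kd = kd_weight * kd_kl_loss(student_logits, teacher_logits)
(loss_gt + loss_kd).backward()                 # matches the summed loss the scheduler backpropagates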
InternLM/internlm/core/trainer.py ADDED
@@ -0,0 +1,190 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ # adopted from https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/engine
5
+
6
+ import json
7
+ from typing import Iterable, Optional
8
+
9
+ from internlm.core.engine import Engine
10
+ from internlm.core.scheduler import (
11
+ BaseScheduler,
12
+ InterleavedPipelineScheduler,
13
+ NonPipelineScheduler,
14
+ PipelineScheduler,
15
+ )
16
+
17
+
18
+ class TrainState:
19
+ """
20
+ The TrainState class is used to record the current state of training.
21
+
22
+ Args:
23
+ train_dl (DataLoader): The DataLoader object used for training.
24
+ """
25
+
26
+ def __init__(self, config, batch_sampler) -> None:
27
+ """
28
+ Args:
29
+ config (Config): internlm config
30
+ batch_sampler (torch.utils.data.Sampler): Because the dataloader loading is
31
+ asynchronous and prefetched, the batch_sampler state maintained inside the
32
+ dataloader are faster then the actual training progress, so we copy the
33
+ batch_sampler as the anchor point of ckpt reload.
34
+ """
35
+ # The number of batches produced by the data iterator
36
+ self.batch_count: int = 0
37
+ # Used to store the number of samples consumed in the current epoch
38
+ self.num_consumed_samples_in_epoch: int = 0
39
+ # Total number of tokens consumed
40
+ self.num_consumed_tokens: int = 0
41
+ # Number of batches skipped due to inf or nan values
42
+ self.inf_nan_skip_batches: int = 0
43
+ # Records the number of updates, skipped batches and inf batches are not counted
44
+ self.step_count: int = 0
45
+
46
+ # Total step count
47
+ self.total_steps: int = config.data.total_steps
48
+
49
+ # resume tensorboard folder, need load from checkpoint or set manually.
50
+ self.resume_tb_folder = config.resume_tb_folder
51
+
52
+ self.tensorboard_folder = config.tensorboard_folder
53
+
54
+ # learning rate
55
+ self.lr = config.adam.lr
56
+
57
+ # smapler state
58
+ if batch_sampler:
59
+ self.init_batch_sampler(batch_sampler)
60
+
61
+ def init_batch_sampler(self, batch_sampler):
62
+ """
63
+ Args:
64
+ batch_sampler (torch.utils.data.Sampler): sampler.
65
+ """
66
+ # make a copy of batch_sampler.
67
+ self.batch_sampler = batch_sampler.copy()
68
+ # Iterator for the batch sampler
69
+ self.batch_sampler_iter = iter(self.batch_sampler)
70
+
71
+ def __str__(self) -> str:
72
+ """Returns a string representation of the training state in JSON format."""
73
+ info = {
74
+ "batch_count": self.batch_count,
75
+ "num_consumed_samples_in_epoch": self.num_consumed_samples_in_epoch,
76
+ "num_consumed_tokens": self.num_consumed_tokens,
77
+ "inf_nan_skip_batches": self.inf_nan_skip_batches,
78
+ "step_count": self.step_count,
79
+ }
80
+
81
+ return json.dumps(info, indent=4, sort_keys=True)
82
+
83
+ def load_state_dict(self, other_stuffs):
84
+ """
85
+ Resumes training from a checkpoint.
86
+
87
+ Args:
88
+ other_stuffs (dict): Other information needed to resume training.
89
+ """
90
+ self.num_consumed_samples_in_epoch = other_stuffs["num_consumed_samples_in_epoch"]
91
+ self.num_consumed_tokens = other_stuffs["num_consumed_tokens"]
92
+ self.inf_nan_skip_batches = other_stuffs["inf_nan_skip_batches"]
93
+
94
+ # Because the ckpt save occurs after updating 'step_count',
95
+ # there is no need to increment 'step_count' here (Does our step count start from 0 ?),
96
+ # However, 'batch_count' is updating before ckpt storage, so it need to inc 1 when resume.
97
+ self.batch_count = other_stuffs["batch_count"] + 1 # here you need to shift a batch backward
98
+ self.step_count = other_stuffs.get("step_count", self.batch_count)
99
+
100
+ # resume tensorboard from older tensorboard_folder
101
+ self.resume_tb_folder = other_stuffs.get("tensorboard_folder", None)
102
+
103
+ def state_dict(self):
104
+ return {
105
+ "batch_count": self.batch_count,
106
+ "num_consumed_samples_in_epoch": self.num_consumed_samples_in_epoch,
107
+ "num_consumed_tokens": self.num_consumed_tokens,
108
+ "inf_nan_skip_batches": self.inf_nan_skip_batches,
109
+ "step_count": self.step_count,
110
+ "tensorboard_folder": self.tensorboard_folder,
111
+ }
112
+
113
+
114
+ class Trainer:
115
+ """This is a class tending for easy deployments of users' training and evaluation instead of
116
+ writing their own scripts.
117
+
118
+ Args:
119
+ engine (:class:`Engine`): Engine responsible for the process function.
120
+ schedule (:class:`BaseScheduler`, optional): Runtime schedule. Defaults to None.
121
+ """
122
+
123
+ def __init__(
124
+ self,
125
+ engine: Engine,
126
+ schedule: Optional[BaseScheduler] = None,
127
+ ):
128
+ """Initializes the Trainer class.
129
+
130
+ Args:
131
+ engine (Engine): The engine responsible for the process function.
132
+ schedule (Optional[BaseScheduler], optional): The runtime schedule. Defaults to None.
133
+ """
134
+ self._engine = engine
135
+
136
+ # build schedule
137
+ if schedule is None:
138
+ self._schedule = NonPipelineScheduler()
139
+ else:
140
+ assert isinstance(
141
+ schedule, BaseScheduler
142
+ ), f"expected schedule to be of type BaseSchedule, but got {type(schedule)}"
143
+ self._schedule = schedule
144
+
145
+ self._schedule.pre_processing(self._engine)
146
+
147
+ @property
148
+ def engine(self):
149
+ """Returns the engine that responsible for managing the training and evaluation process."""
150
+ return self._engine
151
+
152
+ @property
153
+ def schedule(self):
154
+ """Returns the runtime scheduler."""
155
+ return self._schedule
156
+
157
+ @property
158
+ def uses_pipeline(self):
159
+ """Returns whether the pipeline parallel is used or not."""
160
+ return isinstance(self._schedule, (PipelineScheduler, InterleavedPipelineScheduler))
161
+
162
+ def train(self):
163
+ """Sets the model to training mode."""
164
+ self._engine.train()
165
+
166
+ def eval(self):
167
+ """Sets the model to evaluation mode."""
168
+ self._engine.eval()
169
+
170
+ def zero_grad(self):
171
+ """Sets the gradient of all parameters in the model to zero."""
172
+ self._engine.zero_grad()
173
+
174
+ def step(self):
175
+ """Executes the parameter update step."""
176
+ return self._engine.step()
177
+
178
+ def execute_schedule(self, data_iter: Iterable, **kwargs):
179
+ """Runs the forward, loss computation, and backward for the model.
180
+ Returns a tuple of (output, label, loss).
181
+
182
+ Args:
183
+ data_iter (Iterable): The data iterator.
184
+ **kwargs: Additional keyword arguments.
185
+
186
+ Returns:
187
+ Tuple[:class:`torch.Tensor`]: A tuple of (output, label, loss).
188
+ """
189
+ output, label, loss = self._schedule.forward_backward_step(self._engine, data_iter, **kwargs)
190
+ return output, label, loss
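A typical way to drive the Trainer API above, assuming `engine`, `scheduler`, `train_dataloader`, and `total_steps` have already been produced by the usual InternLM initialization path (the loop itself is illustrative, not taken from the repo):

trainer = Trainer(engine=engine, schedule=scheduler)
trainer.train()                                # put the wrapped model into training mode

train_iter = iter(train_dataloader)
for _ in range(total_steps):
    trainer.zero_grad()
    # forward + loss + backward for one global batch, split into microbatches by the schedule
    _, _, loss = trainer.execute_schedule(train_iter, forward_only=False, return_loss=True)
    trainer.step()                             # optimizer update via engine.step()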
InternLM/internlm/data/__init__.py ADDED
@@ -0,0 +1,13 @@
1
+ from .batch_sampler import get_dpsampler_dataloader
2
+ from .collaters import jsonl_ds_collate_fn, packed_collate_fn
3
+ from .dummy_dataset import RandomDataset
4
+ from .packed_dataset import PackedDataset, PackedDatasetWithoutCuSeqlen
5
+
6
+ __all__ = [
7
+ "jsonl_ds_collate_fn",
8
+ "packed_collate_fn",
9
+ "RandomDataset",
10
+ "PackedDataset",
11
+ "PackedDatasetWithoutCuSeqlen",
12
+ "get_dpsampler_dataloader",
13
+ ]
InternLM/internlm/data/batch_sampler.py ADDED
@@ -0,0 +1,354 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ import math
5
+ import random
6
+ from typing import Iterator, TypeVar
7
+
8
+ import numpy as np
9
+ import torch
10
+ from torch.utils.data import DataLoader, Dataset, Sampler
11
+
12
+ from internlm.core.context import ParallelMode
13
+ from internlm.core.context import global_context as gpc
14
+ from internlm.utils.logger import get_logger
15
+
16
+ logger = get_logger(__file__)
17
+
18
+ T_co = TypeVar("T_co", covariant=True)
19
+
20
+
21
+ class DataParallelSampler(Sampler):
22
+ """A data sampler for distributed data parallelism.
23
+
24
+ Args:
25
+ dataset (:class:`torch.utils.data.Dataset`): The Dataset for sampling.
26
+ shuffle (bool, optional): Whether to shuffle data, defaults to False.
27
+ seed (int, optional): The random seed used for sampling, defaults to 0.
28
+ drop_last (bool, optional): Set to True to drop the last incomplete batch, if the dataset size
29
+ is not divisible by the batch size. If False and the size of dataset is not divisible by
30
+ the batch size, then the last batch will be smaller, defaults to False.
31
+ """
32
+
33
+ def __init__(
34
+ self,
35
+ dataset: Dataset,
36
+ shuffle: bool = False,
37
+ seed: int = 0,
38
+ drop_last: bool = False,
39
+ ) -> None:
40
+ self.dataset = dataset
41
+ self.num_replicas = gpc.get_world_size(ParallelMode.DATA)
42
+ self.rank = gpc.get_local_rank(ParallelMode.DATA)
43
+ self.epoch = 0
44
+ self.drop_last = drop_last
45
+ # If the dataset length is evenly divisible by # of replicas, then there
46
+ # is no need to drop any data, since the dataset will be split equally.
47
+ # type: ignore[arg-type]
48
+ if self.drop_last and len(self.dataset) % self.num_replicas != 0:
49
+ # Split to nearest available length that is evenly divisible.
50
+ # This is to ensure each rank receives the same amount of data when
51
+ # using this Sampler.
52
+ self.num_samples = math.ceil(
53
+ # `type:ignore` is required because Dataset cannot provide a default __len__
54
+ # see NOTE in pytorch/torch/utils/data/sampler.py
55
+ (len(self.dataset) - self.num_replicas)
56
+ / self.num_replicas # type: ignore[arg-type]
57
+ )
58
+ else:
59
+ self.num_samples = math.ceil(len(self.dataset) / self.num_replicas) # type: ignore[arg-type]
60
+ self.total_size = self.num_samples * self.num_replicas
61
+ self.shuffle = shuffle
62
+ self.seed = seed
63
+
64
+ def __iter__(self) -> Iterator[T_co]:
65
+ if self.shuffle:
66
+ # deterministically shuffle based on epoch and seed
67
+ g = torch.Generator()
68
+ g.manual_seed(self.seed + self.epoch)
69
+ # type: ignore[arg-type]
70
+ indices = torch.randperm(len(self.dataset), generator=g).tolist()
71
+
72
+ # update for next epoch so that there is no need to call
73
+ # set_epoch manually
74
+ self.epoch += 1
75
+ else:
76
+ indices = list(range(len(self.dataset))) # type: ignore[arg-type]
77
+
78
+ if not self.drop_last:
79
+ # add extra samples to make it evenly divisible
80
+ padding_size = self.total_size - len(indices)
81
+ if padding_size <= len(indices):
82
+ indices += indices[:padding_size]
83
+ else:
84
+ indices += (indices * math.ceil(padding_size / len(indices)))[:padding_size]
85
+ else:
86
+ # remove tail of data to make it evenly divisible.
87
+ indices = indices[: self.total_size]
88
+ assert len(indices) == self.total_size
89
+
90
+ # subsample
91
+ indices = indices[self.rank : self.total_size : self.num_replicas]
92
+ assert len(indices) == self.num_samples
93
+
94
+ return iter(indices)
95
+
96
+ def __len__(self) -> int:
97
+ return self.num_samples
98
+
99
+ def set_epoch(self, epoch: int) -> None:
100
+ r"""Sets the epoch for this sampler. When :attr:`shuffle=True`, this ensures all replicas
101
+ use a different random ordering for each epoch. Otherwise, the next iteration of this
102
+ sampler will yield the same ordering.
103
+
104
+ Args:
105
+ epoch (int): Epoch number.
106
+ """
107
+ self.epoch = epoch
108
+
109
+
110
+ def get_dpsampler_dataloader(
111
+ dataset,
112
+ shuffle=False,
113
+ seed=1024,
114
+ add_sampler=True,
115
+ drop_last=False,
116
+ pin_memory=False,
117
+ num_workers=0,
118
+ **kwargs,
119
+ ):
120
+ r"""Set up a deterministic dataloader (also configures worker seeds, samplers, and whether to shuffle).
121
+
122
+ Note:
123
+ When pipeline parallel is enabled, shuffle cannot be True as it will result in mismatch between input data
124
+ on the 1st stage and label on the last stage.
125
+
126
+ Args:
127
+ dataset (:class:`torch.utils.data.Dataset`): The dataset to be loaded.
128
+ shuffle (bool, optional): Whether to shuffle the dataset. Defaults to False.
129
+ seed (int, optional): Random worker seed for sampling, defaults to 1024.
130
+ add_sampler (bool, optional): Whether to add a ``DataParallelSampler`` for the dataset. Defaults to True.
131
+ drop_last (bool, optional): Set to True to drop the last incomplete batch, if the dataset size
132
+ is not divisible by the batch size. If False and the size of dataset is not divisible by
133
+ the batch size, then the last batch will be smaller, defaults to False.
134
+ pin_memory (bool, optional): Whether to pin memory address in CPU memory. Defaults to False.
135
+ num_workers (int, optional): Number of worker processes for this dataloader. Defaults to 0.
136
+ kwargs (dict): optional parameters for ``torch.utils.data.DataLoader``, more details could be found in
137
+ `DataLoader <https://pytorch.org/docs/stable/_modules/torch/utils/data/dataloader.html#DataLoader>`_.
138
+
139
+ Returns:
140
+ :class:`torch.utils.data.DataLoader`: A DataLoader used for training or testing.
141
+ """
142
+ _kwargs = kwargs.copy()
143
+
144
+ if add_sampler and gpc.is_initialized(ParallelMode.DATA) and gpc.get_world_size(ParallelMode.DATA) > 1:
145
+ sampler = DataParallelSampler(dataset, shuffle=shuffle, drop_last=drop_last)
146
+ else:
147
+ sampler = None
148
+
149
+ # Deterministic dataloader
150
+ def seed_worker(_):  # worker_init_fn is called with the worker id, which is unused here
151
+ worker_seed = seed
152
+ np.random.seed(worker_seed)
153
+ torch.manual_seed(worker_seed)
154
+ random.seed(worker_seed)
155
+
156
+ if sampler is None:
157
+ return DataLoader(
158
+ dataset,
159
+ worker_init_fn=seed_worker,
160
+ shuffle=shuffle,
161
+ drop_last=drop_last,
162
+ pin_memory=pin_memory,
163
+ num_workers=num_workers,
164
+ **_kwargs,
165
+ )
166
+ else:
167
+ return DataLoader(
168
+ dataset,
169
+ sampler=sampler,
170
+ worker_init_fn=seed_worker,
171
+ drop_last=drop_last,
172
+ pin_memory=pin_memory,
173
+ num_workers=num_workers,
174
+ **_kwargs,
175
+ )
176
+
177
+
178
+ class StaticBatchSampler:
179
+ """
180
+ A static batch sampler that generates batches with a fixed micro-batch size.
181
+
182
+ Args:
183
+ datasets (list): The datasets providing the samples.
184
+ batch_size (int): The batch size for the current rank. Defaults to 192.
185
+ rampup_batch_size (str): A string with three space-separated integers representing the
186
+ starting batch size, the increment, and the number of steps between
187
+ each increment. For example, "192 24 8" means that the batch size
188
+ starts at 192 and increases by 24 every 8 steps. Defaults to
189
+ "6 2 8", i.e. the batch size starts at 6 and increases by 2 every 8 steps.
190
+ micro_bsz (int): The micro-batch size. Defaults to 2.
191
+ seed (int): The random seed for shuffling the indices. Defaults to 0.
192
+ drop_last (bool): If True, drop the last incomplete batch. Currently only supports True. Defaults to True.
193
+ data_rank (int): The rank of the current process in the data parallel group. Defaults to 0.
194
+ data_world_size (int): The number of processes in the data parallel group. Defaults to 1.
195
+ """
196
+
197
+ def __init__(
198
+ self,
199
+ datasets,
200
+ batch_size=192,
201
+ rampup_batch_size="6 2 8",
202
+ micro_bsz=2,
203
+ seed=0,
204
+ drop_last=True,
205
+ data_rank=0,
206
+ data_world_size=1,
207
+ ):
208
+ assert drop_last is True, "Currently only support drop last"
209
+ if rampup_batch_size:
210
+ # Ramp the batch size up gradually until it reaches batch_size
211
+ start_bsz, bsz_incre, incre_every = map(int, rampup_batch_size.split())
212
+ else:
213
+ start_bsz, bsz_incre, incre_every = batch_size, batch_size, 1
214
+ self.raw_rampup_batch_size = rampup_batch_size
215
+ self.start_bsz = start_bsz
216
+ self.bsz_incre = bsz_incre
217
+ self.incre_every = incre_every
218
+ if gpc.is_initialized(ParallelMode.PIPELINE):
219
+ assert (
220
+ batch_size - self.start_bsz
221
+ ) % self.bsz_incre == 0, f"{batch_size} - {self.start_bsz} should be multiple of {self.bsz_incre}"
222
+ assert batch_size % micro_bsz == 0, f"batch_size({batch_size}) should be multiple of micro_bsz({micro_bsz})"
223
+ assert (
224
+ self.start_bsz % micro_bsz == 0
225
+ ), f"start_bsz({self.start_bsz}) should be multiple of micro_bsz({micro_bsz})"
226
+ assert (
227
+ self.bsz_incre % micro_bsz == 0
228
+ ), f"bsz_incre({self.bsz_incre}) should be multiple of micro_bsz({micro_bsz})"
229
+
230
+ self.batch_size = batch_size
231
+ self.epoch = 0
232
+ self.seed = seed
233
+ self.rng = np.random.RandomState(seed)
234
+ self.batch_count = 0
235
+ self.micro_bsz = micro_bsz
236
+ self.data_rank = data_rank
237
+ self.data_world_size = data_world_size
238
+ self.num_consumed_samples_in_epoch = 0
239
+ self.datasets = datasets
240
+ self.num_samples = sum([len(ds) for ds in datasets])
241
+
242
+ self.get_indices() # get data
243
+
244
+ def get_indices(self, old_indices=None):
245
+ if old_indices is not None:
246
+ assert (
247
+ len(old_indices) <= self.num_samples
248
+ ), f"The checkpoint has {len(old_indices)} samples, \
249
+ while the new restart uses fewer samples ({self.num_samples})"
250
+
251
+ else:
252
+ old_indices = np.array([])
253
+
254
+ # indices covers the range [len(old_indices), self.num_samples)
255
+ indices = np.arange(len(old_indices), self.num_samples)
256
+ self.rng_state = self.rng.get_state()
257
+ self.rng.shuffle(indices)
258
+ # Need to consider drop_last
259
+ ramp_steps = (self.batch_size - self.start_bsz) // self.bsz_incre
260
+ if self.batch_count < ramp_steps * self.incre_every:
261
+ rampup_samples = 0
262
+ for i in range(ramp_steps):
263
+ rampup_samples += (i * self.bsz_incre + self.start_bsz) * self.incre_every
264
+ assert (
265
+ rampup_samples * self.data_world_size <= self.num_samples
266
+ ), f"Too many rampup samples: \
267
+ {rampup_samples*self.data_world_size} Vs. self.num_samples: {self.num_samples}"
268
+
269
+ num_samples = (self.num_samples - rampup_samples * self.data_world_size) // (
270
+ self.batch_size * self.data_world_size
271
+ )
272
+ num_samples = num_samples * self.batch_size * self.data_world_size + rampup_samples * self.data_world_size
273
+ else:
274
+ num_samples = self.num_samples // (self.batch_size * self.data_world_size)
275
+ num_samples = num_samples * self.batch_size * self.data_world_size
276
+ indices = np.concatenate([old_indices, indices]).astype(int)  # splice with the previously consumed indices
277
+ indices = indices[:num_samples]
278
+ self.indices = indices
279
+ assert len(self.indices) >= self.batch_size, "The number of samples should be larger than batch_size"
280
+ self.num_consumed_samples_in_epoch = 0
281
+
282
+ def set_epoch(self, epoch):
283
+ self.epoch = epoch
284
+ self.rng = np.random.RandomState(self.seed + self.epoch)
285
+
286
+ def __len__(self):
287
+ ramp_steps = (self.batch_size - self.start_bsz) // self.bsz_incre
288
+ if self.batch_count < ramp_steps * self.incre_every:
289
+ rampup_samples = 0
290
+ for i in range(ramp_steps):
291
+ rampup_samples += (i * self.bsz_incre + self.start_bsz) * self.incre_every
292
+ assert (
293
+ rampup_samples * self.data_world_size <= self.num_samples
294
+ ), f"Too many rampup samples: {rampup_samples*self.data_world_size} \
295
+ Vs. self.num_samples: {self.num_samples}"
296
+
297
+ num_batches = (self.num_samples - rampup_samples * self.data_world_size) // self.batch_size
298
+ num_batches = num_batches // self.data_world_size + self.incre_every * ramp_steps
299
+ else:
300
+ num_batches = self.num_samples // self.batch_size // self.data_world_size
301
+
302
+ return num_batches
303
+
304
+ def __iter__(self):
305
+ indices = self.indices[self.data_rank :: self.data_world_size]
306
+ while self.num_consumed_samples_in_epoch < len(indices):
307
+ batch_rampup_idx = self.batch_count // self.incre_every
308
+ cur_batch_size = batch_rampup_idx * self.bsz_incre + self.start_bsz
309
+ cur_batch_size = min(cur_batch_size, self.batch_size)
310
+ batch = indices[self.num_consumed_samples_in_epoch : self.num_consumed_samples_in_epoch + cur_batch_size]
311
+ yield batch
312
+ self.num_consumed_samples_in_epoch += len(batch) # Consider multiple processes.
313
+ self.batch_count += 1
314
+ self.get_indices() # get a new round
315
+
316
+ def state_dict(self):
317
+ states = {
318
+ "batch_size": self.batch_size,
319
+ "raw_rampup_batch_size": self.raw_rampup_batch_size,
320
+ "rng_state": self.rng_state,
321
+ "epoch": self.epoch,
322
+ "seed": self.seed,
323
+ "data_world_size": self.data_world_size,
324
+ "num_consumed_samples_in_epoch": self.num_consumed_samples_in_epoch,
325
+ "batch_count": self.batch_count,  # with multiple processes, extra batches may already have been dispatched,
326
+ # so this value needs to be overwritten by the external batch_count
327
+ "indices": self.indices,  # keeps the sample order identical when resuming from a checkpoint
328
+ }
329
+
330
+ return states
331
+
332
+ def load_state_dict(self, states):
333
+ for name in ("data_world_size", "raw_rampup_batch_size", "seed"): # 'batch_size'
334
+ assert states[name] == getattr(self, name), (name, states[name], getattr(self, name)) # should not change
335
+ self.rng.set_state(states["rng_state"])
336
+ self.get_indices(old_indices=None) # Regenerate indices based on random state
337
+ self.epoch = states["epoch"]
338
+ self.batch_count = states["batch_count"]
339
+ self.num_consumed_samples_in_epoch = states["num_consumed_samples_in_epoch"]
340
+
341
+ def copy(self):
342
+ copy_sampler = StaticBatchSampler(
343
+ self.datasets,
344
+ self.batch_size,
345
+ self.raw_rampup_batch_size,
346
+ self.micro_bsz,
347
+ self.seed,
348
+ drop_last=True,
349
+ data_rank=self.data_rank,
350
+ data_world_size=self.data_world_size,
351
+ )
352
+
353
+ copy_sampler.load_state_dict(self.state_dict())
354
+ return copy_sampler
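The batch-size ramp-up encoded by `rampup_batch_size` can be summarized with a small standalone helper. The sketch below mirrors the arithmetic in `StaticBatchSampler.__iter__` and is only an illustration, not part of the uploaded file.

```python
# Illustration of the "start incre every" ramp-up used by StaticBatchSampler:
# the batch size starts at `start`, grows by `incre` every `every` batches,
# and is capped at the target batch size.
def rampup_batch_size_at(batch_count: int, rampup: str = "6 2 8", target: int = 192) -> int:
    start, incre, every = map(int, rampup.split())
    return min(start + (batch_count // every) * incre, target)

for step in (0, 7, 8, 16, 800):
    print(step, rampup_batch_size_at(step))   # 6, 6, 8, 10, 192
```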
InternLM/internlm/data/collaters.py ADDED
@@ -0,0 +1,88 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ import torch
5
+
6
+
7
+ def packed_collate_fn(batch, packed_length):
8
+
9
+ """
10
+ Collate function for packed input sequences.
11
+
12
+ Args:
13
+ batch (List[Dict]): List of dictionaries representing each sample in batch.
14
+ Each dictionary contains "tokens", "labels", "type_ids", "cu_seqlens", and "indexes" keys.
15
+ packed_length (int): The length of packed sequence.
16
+
17
+ Returns:
18
+ Tuple[Dict[str, torch.Tensor], torch.Tensor]: A tuple containing a dictionary of tensors with "input_ids",
19
+ "cu_seqlens", "indexes", and "type_ids" keys, and the tensor of padded "labels".
20
+
21
+ Raises:
22
+ AssertionError: If the length of a sample is not equal to packed_length.
23
+ AssertionError: If the shape of the padded "input_ids" tensor does not have the correct shape.
24
+ """
25
+
26
+ xs, ys, cu_seqlens, indexes, ts = [], [], [], [], []
27
+ for b in batch:
28
+ assert (
29
+ len(b["tokens"]) == packed_length
30
+ ), f"length of a sample should be equal to packed_length, but got {len(b['tokens'])} and {packed_length}"
31
+ assert (
32
+ len(b["labels"]) == packed_length
33
+ ), f"length of a sample should be equal to packed_length, but got {len(b['labels'])} and {packed_length}"
34
+ assert (
35
+ len(b["type_ids"]) == packed_length
36
+ ), f"length of a sample should be equal to packed_length, but got {len(b['type_ids'])} and {packed_length}"
37
+
38
+ tokens = [abs(w) for w in b["tokens"]]
39
+ labels = [w if w > 0 else -100 for w in b["labels"]]
40
+
41
+ xs.append(torch.LongTensor(tokens))
42
+ # The labels have been shifted here, so they are aligned with the output corresponding to the token
43
+ ys.append(torch.LongTensor(labels))
44
+ ts.append(torch.LongTensor(b["type_ids"]))
45
+ cu_seqlens.append(torch.IntTensor(b["cu_seqlens"]))
46
+ indexes.append(torch.LongTensor(b["indexes"]))
47
+
48
+ xs = torch.nn.utils.rnn.pad_sequence(xs, batch_first=True)
49
+ ys = torch.nn.utils.rnn.pad_sequence(ys, batch_first=True, padding_value=-100)
50
+ ts = torch.nn.utils.rnn.pad_sequence(ts, batch_first=True, padding_value=0)
51
+ indexes = torch.stack(indexes, dim=0)
52
+ if len(set(map(len, cu_seqlens))) == 1:  # if the lengths are uniform, stack them to save device transfer time
53
+ cu_seqlens = torch.stack(cu_seqlens, dim=0)
54
+
55
+ assert xs.shape[1] == packed_length, (xs.shape[1], packed_length)
56
+
57
+ return {"input_ids": xs, "cu_seqlens": cu_seqlens, "indexes": indexes, "type_ids": ts}, ys
58
+
59
+
60
+ def jsonl_ds_collate_fn(batch, max_length_per_sample):
61
+ """
62
+ Collate function for json dataset.
63
+
64
+ Args:
65
+ batch (List[Dict]): List of dictionaries representing each sample in batch.
66
+ Each dictionary contains "tokens".
67
+ max_length_per_sample (int): The length of output sequence.
68
+
69
+ Returns:
70
+ Tuple[Dict[str, torch.Tensor], torch.Tensor]: A tuple containing a dictionary of tensors with "input_ids",
71
+ and the tensor of padded "labels".
72
+
73
+ """
74
+ xs, ys = [], []
75
+ for x in batch:
76
+ x["tokens"] = x["tokens"][:max_length_per_sample]
77
+ tokens = [abs(w) for w in x["tokens"]]
78
+ labels = [w if w > 0 else -100 for w in x["tokens"]]
79
+ labels = labels[1:] + [-100]
80
+ xs.append(torch.as_tensor(tokens))
81
+ ys.append(torch.as_tensor(labels)) # y has been shifted
82
+ xs = torch.nn.utils.rnn.pad_sequence(xs, batch_first=True)
83
+ ys = torch.nn.utils.rnn.pad_sequence(ys, batch_first=True, padding_value=-100)
84
+
85
+ xs = torch.cat([xs, xs.new_zeros(len(xs), max_length_per_sample - len(xs[0]))], dim=-1)
86
+ ys = torch.cat([ys, ys.new_full((len(ys), max_length_per_sample - len(ys[0])), fill_value=-100)], dim=-1)
87
+
88
+ return {"input_ids": xs}, ys
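Both collate functions rely on the same label convention: non-positive token ids are masked with -100, and labels are the tokens shifted left by one so that position i is supervised with token i+1. A small self-contained illustration (not part of the uploaded file):

```python
import torch

# Same convention as jsonl_ds_collate_fn: a non-positive id marks a position
# that must not be predicted, and labels are shifted left by one.
tokens = [5, 9, -3, 7]
input_ids = torch.tensor([abs(t) for t in tokens])
labels = [t if t > 0 else -100 for t in tokens]
labels = torch.tensor(labels[1:] + [-100])     # shift; pad the tail with -100

print(input_ids.tolist())   # [5, 9, 3, 7]
print(labels.tolist())      # [9, -100, 7, -100]
```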
InternLM/internlm/data/dataset.py ADDED
@@ -0,0 +1,56 @@
1
+ import os
2
+ from typing import Dict
3
+
4
+ from torch.utils.data import ConcatDataset
5
+
6
+ from internlm.data.single_dataset import JsonlDataset
7
+
8
+
9
+ def get_dataset_dict(folder, split="valid") -> Dict:
10
+ """
11
+ Return a dictionary of Datasets from a folder containing data files for validation.
12
+
13
+ Args:
14
+ folder (str): The path to the folder containing data files.
15
+ split (str): The split of the data files to be used, default is "valid".
16
+
17
+ Returns:
18
+ A dictionary containing Datasets for each folder in the given path
19
+ that contains data files with the specified split.
20
+
21
+ Raises:
22
+ AssertionError: If the given folder does not exist.
23
+
24
+ Example:
25
+ If the given folder is as follows,
26
+ - data
27
+ - zhihu
28
+ - xxx.bin
29
+ - valid.bin
30
+ - baike
31
+ - xxx.bin
32
+ - valid.bin
33
+
34
+ The returned dictionary will be,
35
+ {
36
+ 'zhihu': Dataset,
37
+ 'baike': Dataset
38
+ }
39
+ """
40
+
41
+ assert os.path.exists(folder), f"folder `{folder}` does not exist"
42
+ data_dict = {}
43
+
44
+ for root, dirs, files in os.walk(folder, followlinks=True):
45
+ dirs.sort()  # Guarantee a fixed traversal order; newly added data prefixed with "z" is placed at the end
46
+ datasets = []
47
+ for fn in sorted(files): # Need sorted to ensure that the order is consistent
48
+ if fn.endswith(".bin") and split in fn:
49
+ fp = os.path.join(root, fn)
50
+ ds = JsonlDataset(fp)
51
+ datasets.append(ds)
52
+ if datasets:
53
+ ds = ConcatDataset(datasets=datasets)
54
+ data_dict[os.path.basename(root)] = ds
55
+
56
+ return data_dict
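A usage sketch for `get_dataset_dict` (not part of the uploaded file); the folder layout mirrors the docstring above, and every `*.bin` file is assumed to have the `*.bin.meta` cache that `JsonlDataset` requires. The folder name `data` is only an example.

```python
from internlm.data.dataset import get_dataset_dict

# data/
#   zhihu/  xxx.bin, valid.bin  (+ .meta caches)
#   baike/  xxx.bin, valid.bin  (+ .meta caches)
valid_dict = get_dataset_dict(folder="data", split="valid")
for name, ds in valid_dict.items():
    print(name, len(ds))    # e.g. "baike 5678", "zhihu 1234"
```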
InternLM/internlm/data/dummy_dataset.py ADDED
@@ -0,0 +1,44 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ import numpy as np
5
+ from torch.utils.data import Dataset
6
+
7
+
8
+ class RandomDataset(Dataset):
9
+ """
10
+ RandomDataset for generating random dataset.
11
+
12
+ Args:
13
+ num_samples (int): The number of samples to generate.
14
+ max_len (int): The maximum length of each sample.
15
+
16
+ """
17
+
18
+ def __init__(self, num_samples=10000, max_len=1024) -> None:
19
+ super().__init__()
20
+ rng = np.random.RandomState(1999)
21
+ max_num = rng.randint(1, 30, size=(num_samples,))
22
+ rep_num = rng.randint(10, 200, size=(num_samples,))
23
+ data = []
24
+ lengths = []
25
+ for n, r in zip(max_num, rep_num):
26
+ d = list(range(n)) * r
27
+ d = [n, r] + d
28
+ d = d[:max_len]
29
+ data.append(d)
30
+ lengths.append(len(d))
31
+ self.data = data
32
+ self.max_len = max_len
33
+ self.lengths = np.array(lengths, dtype=int)
34
+
35
+ def __getitem__(self, index):
36
+ d = self.data[index]
37
+ input_ids = np.array(d, dtype=int)
38
+ return {"tokens": list(input_ids), "type_id": 0}
39
+
40
+ def get_dataset_name(self):
41
+ return "dummy_path/dummy_lang/dummy_ds/train.bin"
42
+
43
+ def __len__(self):
44
+ return len(self.data)
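A quick sketch of what `RandomDataset` produces (not part of the uploaded file), assuming the `internlm` package is importable:

```python
from internlm.data.dummy_dataset import RandomDataset

ds = RandomDataset(num_samples=4, max_len=16)
sample = ds[0]
print(len(ds))             # 4
print(sample["type_id"])   # 0
print(sample["tokens"])    # [n, r, 0, 1, ..., n - 1, 0, 1, ...], truncated to 16 tokens
```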
InternLM/internlm/data/packed_dataset.py ADDED
@@ -0,0 +1,421 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ import itertools as it
5
+ import operator
6
+ import os
7
+ from copy import deepcopy
8
+ from typing import Dict
9
+
10
+ import numpy as np
11
+ import torch
12
+ from torch.utils.data import ConcatDataset
13
+ from tqdm import tqdm
14
+
15
+ from internlm.core.context import global_context as gpc
16
+ from internlm.data.single_dataset import JsonlDataset
17
+ from internlm.data.utils import get_dataset_type_id
18
+ from internlm.utils.logger import get_logger
19
+
20
+ DEFAULT_SEED = 1024
21
+ logger = get_logger(__file__)
22
+
23
+
24
+ class PackedDataset(torch.utils.data.Dataset):
25
+ """
26
+ The class PackedDataset takes in a dataset and aggregates samples of different
27
+ lengths together based on the packed_length.
28
+
29
+ Args:
30
+ dataset: The original dataset to pack.
31
+ max_length_per_sample: The maximum length of each original sample. Default is 2048.
32
+ packed_length: The length of each packed sample. Default is 4096.
33
+ """
34
+
35
+ def __init__(
36
+ self,
37
+ dataset,
38
+ max_length_per_sample: int = 2048,
39
+ packed_length: int = 4096,
40
+ ):
41
+ assert hasattr(dataset, "lengths")
42
+ assert len(getattr(dataset, "lengths")) == len(
43
+ dataset
44
+ ), "The dataset must have lengths attribute and have the same length as the dataset"
45
+ self.dataset = dataset
46
+ self.max_length_per_sample = max_length_per_sample
47
+ self.lengths = getattr(self.dataset, "lengths")
48
+ self.packed_length = packed_length
49
+ # Force a seed to be fixed to prevent problems caused by the seed not being restored when restarting
50
+
51
+ self.seed = DEFAULT_SEED
52
+ self.sample_indices, self.len_samples_shuffled, self.acm_len_samples = self.accu_sample_len(seed=self.seed)
53
+ self.num_tokens = sum(self.lengths)
54
+
55
+ def get_dataset_name(self):
56
+ return self.dataset.get_dataset_name()
57
+
58
+ def accu_sample_len(self, seed=None):
59
+ """accumulative length of samples"""
60
+ if seed is not None:
61
+ rng = np.random.RandomState(seed)
62
+ else:
63
+ rng = np.random.RandomState(self.seed - 1)
64
+
65
+ sample_indices = np.arange(len(self.lengths))
66
+ rng.shuffle(sample_indices)
67
+ len_samples_shuffled = list(map(self.lengths.__getitem__, sample_indices))
68
+ acm_len_samples = list(it.accumulate(len_samples_shuffled, operator.add))
69
+ return sample_indices, len_samples_shuffled, acm_len_samples
70
+
71
+ def __len__(self):
72
+ # Samples are spliced directly, following line 405 of metaseq's document_to_sequence.py,
73
+ # without any special handling of sos or eos tokens
74
+ n_packs = self.num_tokens // self.packed_length
75
+ return n_packs
76
+
77
+ def cal_map(self, carriage_idx: int = 0):
78
+ assert carriage_idx >= 0
79
+ length_train = (carriage_idx + 1) * self.packed_length
80
+ post_pos = np.searchsorted(self.acm_len_samples, length_train, side="left")
81
+ return post_pos
82
+
83
+ def mapping(self, pack_idx: int = 0):
84
+ # pack_idx is zero-based
85
+ pre_pos, pre_token_id = 0, 0
86
+ if pack_idx > 0:
87
+ pre_pos = self.cal_map(pack_idx - 1)
88
+ pre_token_id = self.len_samples_shuffled[pre_pos] - (
89
+ self.acm_len_samples[pre_pos] - (pack_idx) * self.packed_length
90
+ )
91
+ if pre_token_id == self.len_samples_shuffled[pre_pos]:
92
+ pre_pos += 1
93
+ pre_token_id = 0
94
+
95
+ pos = self.cal_map(pack_idx)
96
+ token_id = self.len_samples_shuffled[pos] - (self.acm_len_samples[pos] - (pack_idx + 1) * self.packed_length)
97
+ return pre_pos, pre_token_id, pos, token_id
98
+
99
+ def build_pack(self, pre_pos: int, pre_token_id: int, pos: int, token_id: int):
100
+ pack, cu_seqlens, indexes, labels, type_ids = [], [0], [], [], []
101
+
102
+ while pre_pos < pos:
103
+ sample_idx = self.sample_indices[pre_pos]
104
+ sample = self.dataset[sample_idx]
105
+ chunk = sample["tokens"][pre_token_id:]
106
+ pack.extend(chunk)
107
+ _labels = deepcopy(chunk)
108
+ _labels = list(_labels[1:]) + [-100]
109
+ assert len(_labels) == len(chunk), (_labels, chunk)
110
+ labels.extend(_labels)
111
+ type_ids.extend([sample.get("type_id", 0)] * len(chunk))
112
+ num_new_samples, tokens_left = divmod(len(chunk), self.max_length_per_sample)
113
+ for _ in range(num_new_samples):
114
+ cu_seqlens.append(cu_seqlens[-1] + self.max_length_per_sample)
115
+ indexes.extend(list(range(self.max_length_per_sample)))
116
+ if tokens_left > 0:
117
+ cu_seqlens.append(cu_seqlens[-1] + tokens_left)
118
+ indexes.extend(list(range(tokens_left)))
119
+ pre_pos = pre_pos + 1
120
+ pre_token_id = 0
121
+
122
+ sample_idx = self.sample_indices[pos]
123
+ sample = self.dataset[sample_idx]
124
+ chunk = sample["tokens"][pre_token_id:token_id]  # fragment of a sample
125
+ pack.extend(chunk)
126
+ _labels = deepcopy(chunk)
127
+ if token_id == len(sample["tokens"]):
128
+ _labels = list(_labels[1:]) + [-100]
129
+ else:
130
+ if token_id > len(sample["tokens"]):
131
+ print(f"token_id {token_id}, len of sample {len(sample['tokens'])}")
132
+ _labels = list(_labels[1:]) + [sample["tokens"][token_id]]
133
+ assert len(_labels) == len(chunk), (_labels, chunk)
134
+ labels.extend(_labels)
135
+ type_ids.extend([sample.get("type_id", 0)] * len(chunk))
136
+ num_new_samples, tokens_left = divmod(len(chunk), self.max_length_per_sample)
137
+ for _ in range(num_new_samples):
138
+ cu_seqlens.append(cu_seqlens[-1] + self.max_length_per_sample)
139
+ indexes.extend(list(range(self.max_length_per_sample)))
140
+ if tokens_left > 0:
141
+ cu_seqlens.append(cu_seqlens[-1] + tokens_left)
142
+ indexes.extend(list(range(tokens_left)))
143
+
144
+ out = {"tokens": pack, "cu_seqlens": cu_seqlens, "indexes": indexes, "labels": labels, "type_ids": type_ids}
145
+ return out
146
+
147
+ def cal_pos_unpack(self, index):
148
+ if index == 0:
149
+ pre_pos = 0
150
+ else:
151
+ pre_pos = index * gpc.config.data["micro_bsz"]
152
+
153
+ pos = (index + 1) * gpc.config.data["micro_bsz"]
154
+ return pre_pos, pos
155
+
156
+ def build_unpack(self, index):
157
+
158
+ pre_pos, pos = self.cal_pos_unpack(index)
159
+
160
+ pack, cu_seqlens, indexes, labels, type_ids = [], [0], [], [], []
161
+
162
+ while pre_pos < pos and pre_pos < len(self.dataset):
163
+ sample_idx = self.sample_indices[pre_pos]
164
+ sample = self.dataset[sample_idx]
165
+ length = min(len(sample["tokens"]), self.max_length_per_sample)
166
+ chunk = sample["tokens"][0:length]
167
+ pack.extend(chunk)
168
+ _labels = deepcopy(chunk)
169
+ _labels = list(_labels[1:]) + [-100]
170
+ assert len(_labels) == len(chunk), (_labels, chunk)
171
+ labels.extend(_labels)
172
+ type_ids.extend([sample.get("type_id", 0)] * len(chunk))
173
+ cu_seqlens.append(cu_seqlens[-1] + len(chunk))
174
+ indexes.extend(list(range(length)))
175
+ pre_pos = pre_pos + 1
176
+
177
+ if cu_seqlens[-1] != self.packed_length:
178
+ pack = pack + [0] * (self.packed_length - cu_seqlens[-1])
179
+ labels = labels + [0] * (self.packed_length - cu_seqlens[-1])
180
+ type_ids = type_ids + [0] * (self.packed_length - cu_seqlens[-1])
181
+ indexes.extend(list(range(self.packed_length - cu_seqlens[-1])))
182
+ cu_seqlens.append(self.packed_length)
183
+
184
+ assert len(pack) == self.packed_length
185
+
186
+ out = {"tokens": pack, "cu_seqlens": cu_seqlens, "indexes": indexes, "labels": labels, "type_ids": type_ids}
187
+ return out
188
+
189
+ def __getitem__(self, item: int) -> Dict:
190
+ """Given the index, it returns a dict as
191
+ {
192
+ 'tokens': List[int],
193
+ 'cu_seqlens': List[int],
194
+ 'indexes': List[int], # position indices aligned with 'tokens'
195
+ 'labels': List[int], # labels aligned with 'tokens', already shifted; -100 marks positions excluded from the loss
196
+ }
197
+ """
198
+
199
+ if gpc.config.model.use_flash_attn:
200
+ pos_before, token_id_before, pos_after, token_id_after = self.mapping(item)
201
+ return self.build_pack(pos_before, token_id_before, pos_after, token_id_after)
202
+
203
+ return self.build_unpack(item)
204
+
205
+
206
+ class PackedDatasetWithoutCuSeqlen(torch.utils.data.Dataset):
207
+ """
208
+ A dataset wrapper that aggregates samples with different lengths based on packed_length.
209
+ If a sample is shorter than max_length_per_sample, it will be merged with other samples.
210
+ For example, given a dataset whose (shuffled) samples are:
211
+ [1, 2, 3, 4, 5]
212
+ [6, 7]
213
+ [8, 9, 10, 11]
214
+ [12, ..., 100]
215
+ ...
216
+
217
+ Args:
218
+ dataset: The original dataset to be wrapped.
219
+ max_length_per_sample (int): The maximum length allowed for each sample.
220
+ packed_length (int): The desired length for each packed sample.
221
+ """
222
+
223
+ def __init__(
224
+ self,
225
+ dataset,
226
+ max_length_per_sample: int = 2048,
227
+ packed_length: int = 4096,
228
+ debug=False,
229
+ ):
230
+ assert packed_length % max_length_per_sample == 0
231
+ assert hasattr(dataset, "lengths")
232
+ assert len(getattr(dataset, "lengths")) == len(
233
+ dataset
234
+ ), "The dataset must have lengths attribute and have the same length as the dataset"
235
+ self.dataset = dataset
236
+ self.max_length_per_sample = max_length_per_sample
237
+ self.lengths = getattr(self.dataset, "lengths")
238
+ self.bsz = packed_length // max_length_per_sample
239
+ self.packed_length = packed_length
240
+ self.debug = debug
241
+ # Force a seed to be fixed to prevent problems caused by the seed not being restored when restarting
242
+
243
+ self.seed = DEFAULT_SEED
244
+ indices = np.arange(len(self.lengths))
245
+ rng = np.random.RandomState(self.seed)
246
+ rng.shuffle(indices)
247
+ self.indices = indices
248
+ self.cum_lens = np.cumsum(self.lengths[self.indices])
249
+ self.num_tokens = sum(self.lengths)
250
+
251
+ def get_dataset_name(self):
252
+ return self.dataset.get_dataset_name()
253
+
254
+ def __len__(self):
255
+ n_packs = self.num_tokens // self.packed_length
256
+ return n_packs
257
+
258
+ def find_offset(self, offset):
259
+ idx = np.searchsorted(self.cum_lens, offset, side="right")
260
+ if idx == 0:
261
+ return idx, offset
262
+ length = offset - self.cum_lens[idx - 1]
263
+ return idx, length
264
+
265
+ def pdebug(self, line):
266
+ if self.debug:
267
+ print(line, flush=True)
268
+
269
+ def __getitem__(self, item: int) -> Dict:
270
+ """Given the index, it returns a dict as
271
+ {
272
+ 'tokens': List[int],
273
+ 'cu_seqlens': List[int],
274
+ 'indexes': List[int], # position indices aligned with 'tokens'
275
+ 'labels': List[int], # labels aligned with 'tokens', already shifted; -100 marks positions excluded from the loss
276
+ }
277
+ """
278
+
279
+ start_idx, start_length = self.find_offset(item * self.packed_length)
280
+ end_idx, end_length = self.find_offset((item + 1) * self.packed_length)
281
+ pack_tokens = []
282
+ pack_labels = []
283
+ type_ids = []
284
+
285
+ self.pdebug(f"item : {item}, start_idx:{start_idx}, start_length:{start_length} ")
286
+ self.pdebug(f"item : {item}, end_idx:{end_idx}, end_length:{end_length} ")
287
+
288
+ if start_idx == end_idx:
289
+ idx = self.indices[start_idx]
290
+ sample = self.dataset[idx]
291
+ self.pdebug(f"item : {item}, idx: {idx}, len : {len(sample['tokens'])}")
292
+ tokens = sample["tokens"][start_length:end_length]
293
+ pack_tokens.extend(tokens)
294
+ pack_labels.extend(tokens[1:] + [-100])
295
+ type_ids.extend([sample["type_id"]] * len(tokens))
296
+ return {
297
+ "tokens": pack_tokens,
298
+ "cu_seqlens": [i * self.max_length_per_sample for i in range(self.bsz + 1)],
299
+ "indexes": list(range(self.max_length_per_sample)) * self.bsz,
300
+ "labels": pack_labels,
301
+ "type_ids": type_ids,
302
+ }
303
+
304
+ idx = self.indices[start_idx]
305
+ sample = self.dataset[idx]
306
+ self.pdebug(f"item : {item}, idx: {idx}, len : {len(sample['tokens'])}")
307
+ tokens = sample["tokens"][start_length:]
308
+ pack_tokens.extend(tokens)
309
+ pack_labels.extend(tokens[1:] + [-100])
310
+ type_ids.extend([sample["type_id"]] * len(tokens))
311
+
312
+ for i in range(start_idx + 1, end_idx):
313
+ idx = self.indices[i]
314
+ sample = self.dataset[idx]
315
+ self.pdebug(f"item : {item}, idx: {idx}, len : {len(sample['tokens'])}")
316
+ tokens = sample["tokens"]
317
+ pack_tokens.extend(tokens)
318
+ pack_labels.extend(tokens[1:] + [-100])
319
+ type_ids.extend([sample.get("type_id")] * len(tokens))
320
+
321
+ # corner case, the last sample is useless
322
+ if end_length == 0:
323
+ pass
324
+ else:
325
+ idx = self.indices[end_idx]
326
+ sample = self.dataset[idx]
327
+ self.pdebug(f"item : {item}, idx: {idx}, len : {len(sample['tokens'])}")
328
+ tokens = sample["tokens"][:end_length]
329
+ pack_tokens.extend(tokens)
330
+ pack_labels.extend(tokens[1:] + [-100])
331
+ type_ids.extend([sample.get("type_id")] * len(tokens))
332
+
333
+ return {
334
+ "tokens": pack_tokens,
335
+ "cu_seqlens": [i * self.max_length_per_sample for i in range(self.bsz + 1)],
336
+ "indexes": list(range(self.max_length_per_sample)) * self.bsz,
337
+ "labels": pack_labels,
338
+ "type_ids": type_ids,
339
+ }
340
+
341
+
342
+ def get_packed_dataset_without_short_length(
343
+ folder,
344
+ max_length_per_sample=2048,
345
+ packed_length=4096,
346
+ show_progress=False,
347
+ min_length=50,
348
+ min_length_dict=None,
349
+ pack_into_one_sample=False,
350
+ ):
351
+ """
352
+ Given a folder, combine all the .bin files into a single large dataset.
353
+ Samples shorter than 'min_length' are filtered out.
354
+
355
+ Each .bin file is treated as a separate dataset.
356
+
357
+ Args:
358
+ folder (str): Path to the folder containing the .bin files.
359
+ max_length_per_sample (int): Maximum length of each sample.
360
+ packed_length (int): Length to pack samples to.
361
+ show_progress (bool): Whether to show the progress bar.
362
+ min_length (int): The minimum length of the sample.
363
+ min_length_dict (dict): The minimum length of the sample for each dataset.
364
+ The format is something like {'pile-arxiv': 50}
365
+ pack_into_one_sample (bool): If True, wrap each dataset with PackedDatasetWithoutCuSeqlen instead of PackedDataset.
366
+
367
+ Returns:
368
+ A packed dataset containing all the data from the .bin files.
369
+ """
370
+
371
+ assert os.path.exists(folder), f"{folder} does not exist."
372
+ datasets = []
373
+ delete_samples = 0
374
+
375
+ for root, dirs, files in os.walk(folder, followlinks=True):
376
+ dirs.sort()  # Traverse folders in a fixed, deterministic order
377
+ if gpc.is_rank_for_log():
378
+ logger.info(f"Reading {root}...")
379
+ num_token_in_folder = 0
380
+
381
+ for fn in tqdm(sorted(files), total=len(files), leave=False, disable=not show_progress):
382
+ if fn.endswith(".bin"):
383
+ fp = os.path.join(root, fn)
384
+ catch_ml_keys = []
385
+ min_length_num = min_length
386
+ if min_length_dict is not None:
387
+ for k, v in min_length_dict.items():
388
+ if k in fp:
389
+ min_length_num = v
390
+ catch_ml_keys.append(k)
391
+ assert (
392
+ len(catch_ml_keys) < 2
393
+ ), f"The file name `{fp}` matched the following resample keys:{catch_ml_keys}"
394
+
395
+ ds_type_id = get_dataset_type_id(path=fp)
396
+ ds = JsonlDataset(fp, ds_type_id, min_length=min_length_num)
397
+
398
+ if hasattr(ds, "old_length"):
399
+ delete_samples += ds.old_length - len(ds)
400
+ if len(ds) == 0:
401
+ if gpc.is_rank_for_log():
402
+ logger.info(f"None of the data in `{fp}` is longer than {min_length}")
403
+ continue
404
+
405
+ if pack_into_one_sample:
406
+ ds = PackedDatasetWithoutCuSeqlen(ds, max_length_per_sample, packed_length)
407
+ else:
408
+ ds = PackedDataset(ds, max_length_per_sample, packed_length)
409
+
410
+ num_token_in_folder += len(ds) * packed_length
411
+ datasets.append(ds)
412
+
413
+ dataset = ConcatDataset(datasets=datasets)
414
+ if gpc.is_rank_for_log():
415
+ logger.info(
416
+ f"Found `{len(datasets)}` datasets, \
417
+ {len(dataset)} samples, \
418
+ deleted `{delete_samples}` samples because of short length",
419
+ )
420
+
421
+ return dataset
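The index arithmetic behind `PackedDataset.cal_map`/`mapping` can be reproduced with a few lines of numpy. The sketch below (not part of the uploaded file) shows how a pack boundary is mapped to a (sample index, offset) pair via cumulative lengths and `searchsorted`.

```python
import itertools
import numpy as np

lengths = [5, 3, 9, 4, 7]                      # shuffled sample lengths
packed_length = 8
acm = list(itertools.accumulate(lengths))      # [5, 8, 17, 21, 28]

def boundary(pack_idx: int):
    end = (pack_idx + 1) * packed_length       # token position of the pack end
    pos = int(np.searchsorted(acm, end, side="left"))
    offset = lengths[pos] - (acm[pos] - end)   # how far into sample `pos` it lands
    return pos, offset

print(boundary(0))   # (1, 3): pack 0 ends exactly at the end of sample 1
print(boundary(1))   # (2, 8): pack 1 ends 8 tokens into sample 2
```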
InternLM/internlm/data/single_dataset.py ADDED
@@ -0,0 +1,117 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ """
5
+ A .bin file corresponds to a Dataset instance here.
6
+ """
7
+
8
+ import json
9
+ import mmap
10
+ import os
11
+ import threading
12
+ from pathlib import Path
13
+
14
+ import numpy as np
15
+ import torch
16
+
17
+
18
+ class JsonlDataset(torch.utils.data.Dataset):
19
+ """
20
+
21
+ JSONL format is expected to roughly follow that of The Pile.
22
+ One-line-per-document of the form:
23
+ ```
24
+ {
25
+ "tokens": List[int],
26
+ }
27
+ ```
28
+
29
+ Note that only the "tokens" key is used.
30
+ """
31
+
32
+ def __init__(self, path: str, dataset_type_id: int = 0, min_length=50):
33
+ self.path = path
34
+ self.threadlocal = threading.local()
35
+ resolved_path = Path(path).resolve()
36
+ self.resolved_path = resolved_path
37
+ self.meta = Path(f"{resolved_path}.meta")
38
+ self.type_id = dataset_type_id
39
+
40
+ # only build the cache on the primary worker to prevent overloading NFS
41
+ assert os.path.exists(self.meta), f"The cache file:{self.meta} is not found for file:{self.path}"
42
+ try:
43
+ with open(self.meta, "rb") as f:
44
+ meta = np.load(f)
45
+ except Exception as e:
46
+ print(f"Cannot load file {self.meta}...")
47
+ raise e
48
+ self.offsets = meta[:, 0]
49
+ self.lengths = meta[:, -1]
50
+
51
+ if min_length > 0:
52
+ mask = self.lengths >= min_length
53
+ self.old_lengths = self.lengths.copy()
54
+ self.old_length = len(self.offsets)
55
+ self.offsets = self.offsets[mask]
56
+ self.lengths = self.lengths[mask]
57
+
58
+ def __getitem__(self, idx):
59
+ f = self._get_mmap()
60
+ position = self.offsets[idx]
61
+ f.seek(position)
62
+ item = f.readline().decode("utf-8")
63
+ try:
64
+ item = json.loads(item)
65
+ item["length"] = len(item["tokens"]) # add a length info
66
+ item["type_id"] = self.type_id
67
+ except Exception as err:
68
+ raise json.decoder.JSONDecodeError(
69
+ doc=self.path,
70
+ pos=position,
71
+ msg=(
72
+ f"Error while loading JSONL line in file {self.path} at byte "
73
+ f"{position}. Contents of line:\n{item}\n{err}"
74
+ ),
75
+ )
76
+ return item
77
+
78
+ def get_dataset_name(self):
79
+ return str(self.resolved_path)
80
+
81
+ def _get_mmap(self):
82
+ if not hasattr(self.threadlocal, "handles"):
83
+ with open(self.path, "rb") as f:
84
+ mm = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
85
+ self.threadlocal.handles = [f, mm]
86
+ if self.path.endswith(".gz") or self.path.endswith(".bz") or self.path.endswith(".bz2"):
87
+ raise NotImplementedError(
88
+ "Compressed files are not supported because .seek() would require "
89
+ "rereading the entire file, making performance too slow."
90
+ )
91
+ return self.threadlocal.handles[-1]
92
+
93
+ def __setstate__(self, state):
94
+ self.__dict__ = state
95
+ self.threadlocal = threading.local()
96
+
97
+ def __getstate__(self):
98
+ d = {}
99
+ for i, v in self.__dict__.items():
100
+ if i != "threadlocal":
101
+ d[i] = v
102
+ return d
103
+
104
+ def __del__(self):
105
+ if hasattr(self.threadlocal, "handles"):
106
+ # cleanup files we opened on initialization
107
+ while self.threadlocal.handles:
108
+ self.threadlocal.handles.pop().close()
109
+
110
+ @staticmethod
111
+ def exists(path):
112
+ return os.path.exists(path)
113
+
114
+ def __len__(self):
115
+ # The length is the number of documents remaining after the
116
+ # min_length filter applied at construction time
117
+ return len(self.offsets)
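`JsonlDataset` refuses to open a `.bin` file without a sibling `.bin.meta` cache whose first column stores the byte offset of each line and whose last column stores its token count. The repository ships its own tooling for building these caches; the snippet below is only a toy reconstruction of the format (not part of the uploaded file).

```python
import json
import numpy as np

path = "toy_dataset.bin"
samples = [{"tokens": [1, 2, 3, 4]}, {"tokens": [5, 6, 7]}]

rows = []
with open(path, "wb") as f:
    for s in samples:
        rows.append((f.tell(), len(s["tokens"])))      # (byte offset, token count)
        f.write((json.dumps(s) + "\n").encode("utf-8"))

with open(path + ".meta", "wb") as f:
    np.save(f, np.array(rows, dtype=np.int64))
# JsonlDataset(path, min_length=0) can now read both samples;
# the default min_length=50 would filter such short samples out.
```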
InternLM/internlm/data/utils.py ADDED
@@ -0,0 +1,46 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ import torch
5
+
6
+ from internlm.core.context import global_context as gpc
7
+
8
+ DATASET_TYPE_IDS_MAP = {"vision": 0}
9
+
10
+
11
+ def get_dataset_type_id(path):
12
+ import re
13
+
14
+ match_idxes = []
15
+ for key, idx in DATASET_TYPE_IDS_MAP.items():
16
+ if re.search(rf"/[z_]*{key}/", path):
17
+ match_idxes.append(idx)
18
+ assert len(match_idxes) == 1, f"{path}, match_idxes should be 1, but got {match_idxes} from {DATASET_TYPE_IDS_MAP}"
19
+ return match_idxes[0]
20
+
21
+
22
+ def unpack_data(input_ids, cu_seqlens):
23
+ """
24
+ input_ids: (n, packed_length)
25
+ Return:
26
+ output: (batch_size, max_length)
27
+ """
28
+
29
+ bsz = input_ids.shape[0]
30
+
31
+ num_sequence = gpc.config.data["micro_bsz"]
32
+
33
+ outputs = torch.zeros(bsz, num_sequence, gpc.config.data.seq_len, device=input_ids.device, dtype=input_ids.dtype)
34
+
35
+ for i in range(bsz):
36
+ output = torch.zeros(num_sequence, gpc.config.data.seq_len, device=input_ids.device, dtype=input_ids.dtype)
37
+ cu_seqlens_slice = cu_seqlens[i]
38
+ for j in range(num_sequence):
39
+ seq_length = cu_seqlens_slice[j + 1] - cu_seqlens_slice[j]
40
+ output[j, 0:seq_length] = input_ids[i, cu_seqlens_slice[j] : cu_seqlens_slice[j + 1]]
41
+ outputs[i] = output
42
+
43
+ if bsz == 1:
44
+ outputs = outputs.squeeze(0)
45
+
46
+ return outputs
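A gpc-free sketch of the slicing that `unpack_data` performs (not part of the uploaded file): `cu_seqlens` marks the boundaries of the sequences packed into one row, and each sequence is copied into its own fixed-length, zero-padded row.

```python
import torch

packed = torch.tensor([[11, 12, 13, 21, 22, 31, 32, 33, 34]])
cu_seqlens = [0, 3, 5, 9]                 # three sequences of lengths 3, 2 and 4
seq_len, micro_bsz = 4, 3

out = torch.zeros(micro_bsz, seq_len, dtype=packed.dtype)
for j in range(micro_bsz):
    length = cu_seqlens[j + 1] - cu_seqlens[j]
    out[j, :length] = packed[0, cu_seqlens[j]:cu_seqlens[j + 1]]

print(out.tolist())
# [[11, 12, 13, 0], [21, 22, 0, 0], [31, 32, 33, 34]]
```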
InternLM/internlm/initialize/__init__.py ADDED
@@ -0,0 +1,16 @@
1
+ from .initialize_trainer import initialize_trainer, initialize_kd_trainer
2
+ from .launch import (
3
+ get_default_parser,
4
+ initialize_distributed_env,
5
+ launch_from_slurm,
6
+ launch_from_torch,
7
+ )
8
+
9
+ __all__ = [
10
+ "get_default_parser",
11
+ "initialize_trainer",
12
+ "initialize_kd_trainer",
13
+ "launch_from_slurm",
14
+ "launch_from_torch",
15
+ "initialize_distributed_env",
16
+ ]
InternLM/internlm/initialize/initialize_tensor.py ADDED
@@ -0,0 +1,63 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ import math
5
+
6
+ from torch import Tensor, nn
7
+
8
+
9
+ def scaled_init_method_normal(sigma: float = 1.0, num_layers: int = 1):
10
+ """Init method based on N(0, sigma/sqrt(2*num_layers)."""
11
+ std = sigma / math.sqrt(2.0 * num_layers)
12
+
13
+ def init_(tensor):
14
+ return nn.init.normal_(tensor, mean=0.0, std=std)
15
+
16
+ return init_
17
+
18
+
19
+ def normal_(mean: float = 0.0, std: float = 1.0):
20
+ r"""Return the initializer filling the input Tensor with values drawn from the normal distribution
21
+
22
+ .. math::
23
+ \mathcal{N}(\text{mean}, \text{std}^2)
24
+
25
+ Args:
26
+ mean (float): the mean of the normal distribution. Defaults 0.0.
27
+ std (float): the standard deviation of the normal distribution. Defaults 1.0.
28
+ """
29
+
30
+ def initializer(tensor: Tensor):
31
+ return nn.init.normal_(tensor, mean, std)
32
+
33
+ return initializer
34
+
35
+
36
+ def scaled_init_method_uniform(sigma: float = 1.0, num_layers: int = 1):
37
+ """Init method based on p(x)=Uniform(-a, a) where std(x)=sigma/sqrt(2*num_layers)."""
38
+ std = sigma / math.sqrt(2.0 * num_layers)
39
+ a = math.sqrt(3.0 * std)
40
+
41
+ def init_(tensor):
42
+ return nn.init.uniform_(tensor, -a, a)
43
+
44
+ return init_
45
+
46
+
47
+ def uniform_(mean: float = 0.0, std: float = 1.0):
48
+ r"""Return the initializer filling the input Tensor with values drawn from the uniform distribution
49
+
50
+ .. math::
51
+ \mathcal{U}(mean-a, mean+a), where a satisfies \mathcal{U}_{std}=std.
52
+
53
+ Args:
54
+ mean (float): the mean of the uniform distribution. Defaults 0.0.
55
+ std (float): the standard deviation of the uniform distribution. Defaults 1.0.
56
+ """
57
+
58
+ a = math.sqrt(3.0 * std)
59
+
60
+ def initializer(tensor: Tensor):
61
+ return nn.init.uniform_(tensor, mean - a, mean + a)
62
+
63
+ return initializer
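These helpers return initializer closures that are later applied to parameter tensors when the model is built; a short sketch (not part of the uploaded file):

```python
import math
from torch import nn
from internlm.initialize.initialize_tensor import scaled_init_method_normal

init_fn = scaled_init_method_normal(sigma=0.02, num_layers=24)
layer = nn.Linear(1024, 1024)
init_fn(layer.weight)            # N(0, std^2) with std = 0.02 / sqrt(2 * 24)

print(layer.weight.std().item(), 0.02 / math.sqrt(48))   # the two values should be close
```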
InternLM/internlm/initialize/initialize_trainer.py ADDED
@@ -0,0 +1,235 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ # adapted from https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/initialize
5
+
6
+ from typing import Callable, Iterable, List, Optional, Tuple
7
+
8
+ from torch import nn
9
+ from torch.nn.modules.loss import _Loss
10
+ from torch.optim.lr_scheduler import _LRScheduler
11
+ from torch.optim.optimizer import Optimizer
12
+ from torch.utils.data import DataLoader
13
+
14
+ from internlm.core.context import global_context as gpc
15
+ from internlm.core.context import ParallelMode
16
+ from internlm.core.engine import Engine, KDEngine
17
+ from internlm.core.gradient_handler import PipelineSharedModuleGradientHandler
18
+ from internlm.core.scheduler import (InterleavedPipelineScheduler, KDNonPipelineScheduler, KDPipelineScheduler,
19
+ NonPipelineScheduler, PipelineScheduler, SchedulerHook)
20
+ from internlm.core.scheduler.pipeline_scheduler import get_tensor_shape
21
+ from internlm.core.trainer import Trainer
22
+ from internlm.data.utils import unpack_data
23
+ from internlm.solver.beta2_scheduler import Beta2Scheduler
24
+ from internlm.solver.optimizer.hybrid_zero_optim import BaseOptimizer
25
+ from internlm.utils.common import get_current_device
26
+
27
+
28
+ def initialize_kd_trainer(
29
+ model: nn.Module,
30
+ teacher: nn.Module,
31
+ optimizer: Optimizer,
32
+ criterion: Optional[_Loss] = None,
33
+ kd_criterion: Optional[_Loss] = None,
34
+ train_dataloader: Optional[Iterable] = None,
35
+ test_dataloader: Optional[Iterable] = None,
36
+ lr_scheduler: Optional[_LRScheduler] = None,
37
+ beta2_scheduler: Optional[Beta2Scheduler] = None,
38
+ scheduler_hooks: Optional[List[SchedulerHook]] = None,
39
+ ) -> Tuple[Trainer, DataLoader, DataLoader, _LRScheduler]:
40
+ """Core function to wrap the essential training components with our functionality based on the config which is
41
+ loaded into gpc.config.
42
+
43
+ Args:
44
+ model (:class:`torch.nn.Module` or `Callable`): Your model instance or a function to build the model.
45
+ optimizer (:class:`BaseOptimizer`): Your optimizer for training.
46
+ criterion (:class:`torch.nn.modules.loss._Loss`, optional): Your criterion instance.
47
+ train_dataloader (:class:`torch.utils.data.DataLoader`, optional): Dataloader for training.
48
+ test_dataloader (:class:`torch.utils.data.DataLoader`, optional): Dataloader for testing.
49
+ lr_scheduler (:class:`torch.nn.lr_scheduler._LRScheduler`, optional): Your lr scheduler instance, optional.
50
+
51
+ Returns:
52
+ Tuple (trainer, train_dataloader, test_dataloader, lr_scheduler):
53
+ A tuple of ``(trainer, train_dataloader, test_dataloader, lr_scheduler)``
54
+ where only ``trainer`` is guaranteed to be non-None.
55
+ """
56
+
57
+ if isinstance(model, nn.Module):
58
+ # first sync model across dp ranks
59
+ model.to(get_current_device())
60
+ elif isinstance(model, Callable):
61
+ model = model().to(get_current_device())
62
+
63
+ # clip grad norm
64
+ clip_grad_norm = gpc.config.hybrid_zero_optimizer.get("clip_grad_norm", 0.0)
65
+
66
+ assert isinstance(optimizer, BaseOptimizer), "optimizer must be instance of BaseOptimizer"
67
+
68
+ # gradient handler, only support PipelineSharedModuleGradientHandler now
69
+ if gpc.is_using_pp():
70
+ gpc.config.gradient_handler = [dict(type="PipelineSharedModuleGradientHandler")]
71
+ gradient_handler_cfg = gpc.config.get("gradient_handler", [])
72
+ gradient_handlers = []
73
+ assert isinstance(gradient_handler_cfg, list), f"gradient_handler must be list but got {type(gradient_handler_cfg)}"
74
+ for config in gradient_handler_cfg:
75
+ if isinstance(config, dict) and config.get("type") == "PipelineSharedModuleGradientHandler":
76
+ handler = PipelineSharedModuleGradientHandler(model=model, optimizer=optimizer)
77
+ gradient_handlers.append(handler)
78
+
79
+ # initialize scheduler for trainer
80
+ scheduler = None
81
+ if gpc.config.model.use_flash_attn:
82
+ data_fn = None
83
+ else:
84
+ data_fn = unpack_data
85
+ if gpc.is_using_pp():
86
+ gpc.config.NUM_MICRO_BATCHES = gpc.config.data.micro_num
87
+ tensor_shape = get_tensor_shape()
88
+ use_interleaved = (
89
+ hasattr(gpc.config, "model") and hasattr(gpc.config.model,
90
+ "num_chunks") and gpc.config.model.num_chunks > 1
91
+ )
92
+ scatter_gather = gpc.is_initialized(ParallelMode.TENSOR)
93
+ if use_interleaved:
94
+ raise NotImplementedError('InterleavedPipelineScheduler for KD is not implemented')
95
+
96
+ else:
97
+ scheduler = KDPipelineScheduler(
98
+ data_process_func=data_fn,
99
+ num_microbatches=gpc.config.NUM_MICRO_BATCHES,
100
+ dtype=gpc.config.model["dtype"],
101
+ tensor_shape=tensor_shape,
102
+ scatter_gather_tensors=scatter_gather,
103
+ scheduler_hooks=scheduler_hooks,
104
+ )
105
+ else:
106
+ scheduler = KDNonPipelineScheduler(
107
+ data_process_func=data_fn,
108
+ gradient_accumulation_size=gpc.config.data.gradient_accumulation,
109
+ scheduler_hooks=scheduler_hooks,
110
+ )
111
+
112
+ # initialize engine for trainer
113
+ engine = KDEngine(
114
+ model=model,
115
+ teacher=teacher,
116
+ optimizer=optimizer,
117
+ lr_scheduler=lr_scheduler,
118
+ beta2_scheduler=beta2_scheduler,
119
+ criterion=criterion,
120
+ kd_criterion=kd_criterion,
121
+ gradient_handlers=gradient_handlers,
122
+ clip_grad_norm=clip_grad_norm,
123
+ )
124
+
125
+ trainer = Trainer(engine, scheduler)
126
+
127
+ return trainer, train_dataloader, test_dataloader, lr_scheduler
128
+
129
+
130
+ def initialize_trainer(
131
+ model: nn.Module,
132
+ optimizer: Optimizer,
133
+ criterion: Optional[_Loss] = None,
134
+ train_dataloader: Optional[Iterable] = None,
135
+ test_dataloader: Optional[Iterable] = None,
136
+ lr_scheduler: Optional[_LRScheduler] = None,
137
+ beta2_scheduler: Optional[Beta2Scheduler] = None,
138
+ scheduler_hooks: Optional[List[SchedulerHook]] = None,
139
+ ) -> Tuple[Trainer, DataLoader, DataLoader, _LRScheduler]:
140
+ """Core function to wrap the essential training components with our functionality based on the config which is
141
+ loaded into gpc.config.
142
+
143
+ Args:
144
+ model (:class:`torch.nn.Module` or `Callable`): Your model instance or a function to build the model.
145
+ optimizer (:class:`BaseOptimizer`): Your optimizer for training.
146
+ criterion (:class:`torch.nn.modules.loss._Loss`, optional): Your criterion instance.
147
+ train_dataloader (:class:`torch.utils.data.DataLoader`, optional): Dataloader for training.
148
+ test_dataloader (:class:`torch.utils.data.DataLoader`, optional): Dataloader for testing.
149
+ lr_scheduler (:class:`torch.nn.lr_scheduler._LRScheduler`, optional): Your lr scheduler instance, optional.
150
+
151
+ Returns:
152
+ Tuple (trainer, train_dataloader, test_dataloader, lr_scheduler):
153
+ A tuple of ``(trainer, train_dataloader, test_dataloader, lr_scheduler)``
154
+ where only ``trainer`` is guaranteed to be non-None.
155
+ """
156
+
157
+ if isinstance(model, nn.Module):
158
+ # first sync model across dp ranks
159
+ model.to(get_current_device())
160
+ elif isinstance(model, Callable):
161
+ model = model().to(get_current_device())
162
+
163
+ # clip grad norm
164
+ clip_grad_norm = gpc.config.hybrid_zero_optimizer.get("clip_grad_norm", 0.0)
165
+
166
+ assert isinstance(optimizer, BaseOptimizer), "optimizer must be instance of BaseOptimizer"
167
+
168
+ # gradient handler, only support PipelineSharedModuleGradientHandler now
169
+ if gpc.is_using_pp():
170
+ gpc.config.gradient_handler = [dict(type="PipelineSharedModuleGradientHandler")]
171
+ gradient_handler_cfg = gpc.config.get("gradient_handler", [])
172
+ gradient_handlers = []
173
+ assert isinstance(gradient_handler_cfg, list), f"gradient_handler must be list but got {type(gradient_handler_cfg)}"
174
+ for config in gradient_handler_cfg:
175
+ if isinstance(config, dict) and config.get("type") == "PipelineSharedModuleGradientHandler":
176
+ handler = PipelineSharedModuleGradientHandler(model=model, optimizer=optimizer)
177
+ gradient_handlers.append(handler)
178
+
179
+ # initialize scheduler for trainer
180
+ scheduler = None
181
+ if gpc.config.model.use_flash_attn:
182
+ data_fn = None
183
+ else:
184
+ data_fn = unpack_data
185
+ if gpc.is_using_pp():
186
+ gpc.config.NUM_MICRO_BATCHES = gpc.config.data.micro_num
187
+ tensor_shape = get_tensor_shape()
188
+ use_interleaved = (
189
+ hasattr(gpc.config, "model") and hasattr(gpc.config.model, "num_chunks") and gpc.config.model.num_chunks > 1
190
+ )
191
+ scatter_gather = gpc.is_initialized(ParallelMode.TENSOR)
192
+ if use_interleaved:
193
+ if isinstance(model, nn.Sequential):
194
+ model = nn.ModuleList([model])
195
+
196
+ communication_overlap = gpc.config.parallel["pipeline"].get("interleaved_overlap", False)
197
+ scheduler = InterleavedPipelineScheduler(
198
+ num_microbatches=gpc.config.NUM_MICRO_BATCHES,
199
+ num_chunks=gpc.config.model.num_chunks,
200
+ dtype=gpc.config.model["dtype"],
201
+ tensor_shape=tensor_shape,
202
+ scatter_gather_tensors=scatter_gather,
203
+ scheduler_hooks=scheduler_hooks,
204
+ communication_overlap=communication_overlap,
205
+ )
206
+ else:
207
+ scheduler = PipelineScheduler(
208
+ data_process_func=data_fn,
209
+ num_microbatches=gpc.config.NUM_MICRO_BATCHES,
210
+ dtype=gpc.config.model["dtype"],
211
+ tensor_shape=tensor_shape,
212
+ scatter_gather_tensors=scatter_gather,
213
+ scheduler_hooks=scheduler_hooks,
214
+ )
215
+ else:
216
+ scheduler = NonPipelineScheduler(
217
+ data_process_func=data_fn,
218
+ gradient_accumulation_size=gpc.config.data.gradient_accumulation,
219
+ scheduler_hooks=scheduler_hooks,
220
+ )
221
+
222
+ # initialize engine for trainer
223
+ engine = Engine(
224
+ model=model,
225
+ optimizer=optimizer,
226
+ lr_scheduler=lr_scheduler,
227
+ beta2_scheduler=beta2_scheduler,
228
+ criterion=criterion,
229
+ gradient_handlers=gradient_handlers,
230
+ clip_grad_norm=clip_grad_norm,
231
+ )
232
+
233
+ trainer = Trainer(engine, scheduler)
234
+
235
+ return trainer, train_dataloader, test_dataloader, lr_scheduler
InternLM/internlm/initialize/launch.py ADDED
@@ -0,0 +1,511 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ import argparse
5
+ import os
6
+ from pathlib import Path
7
+ from typing import Dict, Union
8
+
9
+ import torch
10
+
11
+ from internlm.core.context import Config
12
+ from internlm.core.context import global_context as gpc
13
+ from internlm.monitor import initialize_light_monitor
14
+ from internlm.utils.common import get_master_node
15
+ from internlm.utils.logger import get_logger
16
+ from internlm.utils.timeout import llm_timeout
17
+
18
+ logger = get_logger(__file__)
19
+
20
+
21
+ def get_default_parser():
22
+ """Reads user command line and uses an argument parser to parse the input arguments.
23
+ Input arguments include configuration, host, port, world size, local rank, backend for torch.distributed.
24
+
25
+ Returns:
26
+ Parser: Returns the parser with the default arguments, the user may add customized arguments into this parser.
27
+ """
28
+ parser = argparse.ArgumentParser()
29
+ parser.add_argument("--config", type=str, help="path to the config file")
30
+ parser.add_argument(
31
+ "--launcher",
32
+ type=str,
33
+ default="slurm",
34
+ choices=["slurm", "torch"],
35
+ help="launcher for launching distributed environment",
36
+ )
37
+ parser.add_argument("--host", type=str, help="the master address for distributed training")
38
+ parser.add_argument("--port", type=int, default=8888, help="the master port for distributed training")
39
+ parser.add_argument("--world_size", type=int, help="world size for distributed training")
40
+ parser.add_argument("--rank", type=int, help="rank for the default process group")
41
+ parser.add_argument("--local_rank", type=int, help="local rank on the node")
42
+ parser.add_argument("--backend", type=str, default="nccl", help="backend for distributed communication")
43
+ parser.add_argument("--seed", type=int, default=1024)
44
+ parser.add_argument("--profiling", default=False, action="store_true", help="enable/disable profiling.")
45
+ return parser
46
+
47
+
48
+ def args_sanity_check():
49
+ assert gpc.config is not None, "config is not loaded!"
50
+
51
+ # the default model type is INTERNLM
52
+ if "model_type" not in gpc.config:
53
+ gpc.config._add_item("model_type", "INTERNLM")
54
+
55
+ # processing the parallel config in gpc
56
+ if "zero1" not in gpc.config.parallel:
57
+ gpc.config.parallel._add_item("zero1", -1)
58
+
59
+ if "pipeline" not in gpc.config.parallel:
60
+ gpc.config.parallel._add_item("pipeline", 1)
61
+
62
+ if "tensor" not in gpc.config.parallel:
63
+ gpc.config.parallel._add_item("tensor", 1)
64
+
65
+ # processing the data config in gpc
66
+ data = gpc.config.data
67
+
68
+ assert data.seq_len is not None, "'seq_len' must be given a value"
69
+ assert data.micro_bsz is not None, "'micro_bsz' must be given a value"
70
+
71
+ if "packed_length" in data and gpc.is_rank_for_log():
72
+ logger.warning("packed_length in the config will be ignored and set to seq_len * micro_bsz.")
73
+
74
+ data._add_item("packed_length", data.seq_len * data.micro_bsz)
75
+
76
+ if "micro_num" not in data:
77
+ data._add_item("micro_num", 1)
78
+
79
+ data._add_item("gradient_accumulation", data.micro_num)
80
+ if gpc.is_rank_for_log():
81
+ logger.info(f"gradient_accumulation size will be set to {data.micro_num}.")
82
+
83
+ # batch_size should be equal to micro_num; do not use it directly
84
+ data._add_item("batch_size", data.micro_num)
85
+
86
+ if "min_length" not in data:
87
+ data._add_item("min_length", 0)
88
+
89
+ if "train_folder" not in data:
90
+ data._add_item("train_folder", None)
91
+
92
+ if "valid_folder" not in data:
93
+ data._add_item("valid_folder", None)
94
+
95
+ if "valid_micro_num" not in data:
96
+ data._add_item("valid_micro_num", data.micro_num)
97
+
98
+ if "valid_every" not in data:
99
+ data._add_item("valid_every", 0)
100
+
101
+ if "empty_cache_and_diag_interval" not in data:
102
+ data._add_item("empty_cache_and_diag_interval", 50)
103
+
104
+ if "diag_outlier_ratio" not in data:
105
+ data._add_item("diag_outlier_ratio", 1.1)
106
+ data.diag_outlier_ratio = max(1, data.diag_outlier_ratio)
107
+
108
+ if gpc.is_rank_for_log():
109
+ logger.info("+" * 15 + " Data Info " + "+" * 15) # pylint: disable=W1201
110
+ logger.info(f"seq_len: {data.seq_len}")
111
+ logger.info(f"micro_num: {data.micro_num}")
112
+ logger.info(f"micro_bsz: {data.micro_bsz}")
113
+ logger.info(f"packed_length: {data.packed_length}")
114
+ logger.info(f"pack_sample_into_one: {data.pack_sample_into_one}")
115
+ logger.info(f"min_length: {data.min_length}")
116
+ logger.info(f"valid_micro_num: {data.valid_micro_num}")
117
+ logger.info(f"valid_every: {data.valid_every}")
118
+
119
+ # processing the checkpoint config
120
+ ckpt = gpc.config.ckpt
121
+ if "enable_save_ckpt" not in ckpt:
122
+ ckpt._add_item("enable_save_ckpt", True)
123
+
124
+ # Saving checkpoint args.
125
+ if ckpt.enable_save_ckpt:
126
+ assert "checkpoint_every" in ckpt, "If checkpoint saving is enabled, checkpoint_every must be given in config.ckpt!"
127
+ assert ckpt.checkpoint_every > 0
128
+ assert "save_ckpt_folder" in ckpt, "If checkpoint saving is enabled, save_ckpt_folder must be given in config.ckpt!"
129
+
130
+ if "async_upload" not in ckpt:
131
+ ckpt._add_item("async_upload", False) # async default is False.
132
+ else:
133
+ if ckpt.async_upload:
134
+ assert "save_ckpt_folder" in ckpt
135
+ if "boto3:" not in ckpt.save_ckpt_folder:
136
+ if gpc.is_rank_for_log():
137
+ logger.warning(
138
+ "Storing ckpt on file system does not support asynchronous storage, will use sync save!"
139
+ )
140
+ ckpt.async_upload = False
141
+ else:
142
+ if "async_upload_tmp_folder" not in ckpt:
143
+ ckpt._add_item("async_upload_tmp_folder", "/dev/shm/internlm_tmp_ckpt/")
144
+
145
+ if not ckpt.async_upload:
146
+ ckpt._add_item("async_upload_tmp_folder", None)
147
+
148
+ if "oss_snapshot_freq" not in ckpt:
149
+ ckpt._add_item("oss_snapshot_freq", float("inf")) # if oss_snapshot_freq not given, we disable.
150
+ else:
151
+ ckpt._add_item("checkpoint_every", float("inf"))
152
+ ckpt._add_item("oss_snapshot_freq", float("inf"))
153
+ ckpt._add_item("save_ckpt_folder", None)
154
+ ckpt._add_item("async_upload", False)
155
+ ckpt._add_item("async_upload_tmp_folder", None)
156
+ ckpt._add_item("snapshot_ckpt_folder", None)
157
+
158
+ if "load_ckpt_folder" not in ckpt:
159
+ ckpt._add_item("load_ckpt_folder", None)
160
+
161
+ if "stop_file_path" not in ckpt:
162
+ ckpt._add_item("stop_file_path", None)
163
+
164
+ if "auto_resume" not in ckpt:
165
+ # If 'auto_resume' is not given, we set it to True, so internlm has the opportunity
166
+ # to auto-load latest checkpoint.
167
+ ckpt._add_item("auto_resume", True)
168
+
169
+ if gpc.is_rank_for_log():
170
+ logger.info("+" * 15 + " Ckpt Info " + "+" * 15) # pylint: disable=W1201
171
+ logger.info(f"is enable save ckpt: {ckpt.enable_save_ckpt}")
172
+ logger.info(f"save_ckpt_folder: {ckpt.save_ckpt_folder}")
173
+ logger.info(f"checkpoint_every: {ckpt.checkpoint_every}")
174
+
175
+ # tensorboard writer config
176
+ if "enable_tb" not in gpc.config:
177
+ gpc.config._add_item("enable_tb", True)
178
+ if "tensorboard_folder" not in gpc.config:
179
+ gpc.config._add_item(
180
+ "tensorboard_folder", os.environ["tensorboard_folder"] if "tensorboard_folder" in os.environ else None
181
+ )
182
+ if "resume_tb_folder" not in gpc.config:
183
+ gpc.config._add_item(
184
+ "resume_tb_folder", os.environ["resume_tb_folder"] if "resume_tb_folder" in os.environ else None
185
+ )
186
+
187
+ if gpc.is_rank_for_log():
188
+ logger.info(f"tensorboard_folder: {gpc.config.tensorboard_folder}")
189
+ logger.info(f"resume_tb_folder: {gpc.config.resume_tb_folder}")
190
+
191
+ # cudnn
192
+ torch.backends.cudnn.benchmark = gpc.config.get("cudnn_benchmark", False)
193
+ torch.backends.cudnn.deterministic = gpc.config.get("cudnn_deterministic", False)
194
+ clip_grad_norm = gpc.config.hybrid_zero_optimizer.get("clip_grad_norm", 0.0)
195
+
196
+ if gpc.is_rank_for_log():
197
+ logger.info("+" * 15 + " Other Info " + "+" * 15) # pylint: disable=W1201
198
+ logger.info(f"cudnn.benchmark: {torch.backends.cudnn.benchmark}")
199
+ logger.info(f"cudnn.deterministic: {torch.backends.cudnn.deterministic}")
200
+ logger.info(f"clip_grad_norm: {clip_grad_norm}")
201
+
202
+ model = gpc.config.model
203
+ if "dtype" not in model:
204
+ logger.warning("dtype is not set, using torch.float16 by default!")
205
+ model._add_item("dtype", torch.float16)
206
+ else:
207
+ if gpc.config.model.dtype == "torch.bfloat16":
208
+ gpc.config.model.dtype = torch.bfloat16
209
+ elif gpc.config.model.dtype in ("torch.float16", "torch.half"):
210
+ gpc.config.model.dtype = torch.float16
211
+ elif gpc.config.model.dtype == "torch.float32":
212
+ gpc.config.model.dtype = torch.float32
213
+ elif gpc.config.model.dtype == "torch.tf32":
214
+ torch.backends.cudnn.allow_tf32 = True
215
+ torch.backends.cuda.matmul.allow_tf32 = True
216
+ gpc.config.model.dtype = torch.float32
217
+ else:
218
+ assert gpc.config.model.dtype in [
219
+ "torch.float16",
220
+ "torch.half",
221
+ "torch.bfloat16",
222
+ "torch.float32",
223
+ "torch.tf32",
224
+ ]
225
+
226
+ if "checkpoint" in model:
227
+ if model.checkpoint is True:
228
+ model.checkpoint = 1
229
+ elif model.checkpoint is False:
230
+ model.checkpoint = 0
231
+ else:
232
+ assert (
233
+ model.checkpoint >= 0 and model.checkpoint <= 1
234
+ ), f'model.checkpoint: "{model.checkpoint}" should >=0 and <=1'
235
+
236
+ if "teacher" in gpc.config:
237
+ teacher = gpc.config.teacher
238
+ if "dtype" not in teacher:
239
+ logger.warning("dtype is not set, using torch.float16 by default!")
240
+ teacher._add_item("dtype", torch.float16)
241
+ else:
242
+ if gpc.config.teacher.dtype == "torch.bfloat16":
243
+ gpc.config.teacher.dtype = torch.bfloat16
244
+ elif gpc.config.teacher.dtype in ("torch.float16", "torch.half"):
245
+ gpc.config.teacher.dtype = torch.float16
246
+ elif gpc.config.teacher.dtype == "torch.float32":
247
+ gpc.config.teacher.dtype = torch.float32
248
+ elif gpc.config.teacher.dtype == "torch.tf32":
249
+ torch.backends.cudnn.allow_tf32 = True
250
+ torch.backends.cuda.matmul.allow_tf32 = True
251
+ gpc.config.teacher.dtype = torch.float32
252
+ else:
253
+ assert gpc.config.teacher.dtype in [
254
+ "torch.float16",
255
+ "torch.half",
256
+ "torch.bfloat16",
257
+ "torch.float32",
258
+ "torch.tf32",
259
+ ]
260
+
261
+ if "checkpoint" in teacher:
262
+ if teacher.checkpoint is True:
263
+ teacher.checkpoint = 1
264
+ elif teacher.checkpoint is False:
265
+ teacher.checkpoint = 0
266
+ else:
267
+ assert (
268
+ teacher.checkpoint >= 0 and teacher.checkpoint <= 1
269
+ ), f'teacher.checkpoint: "{teacher.checkpoint}" should >=0 and <=1'
270
+
271
+ if gpc.is_rank_for_log():
272
+ logger.info("+" * 15 + " Model Info " + "+" * 15) # pylint: disable=W1201
273
+ logger.info(f"Model: {gpc.config.model}")
274
+
275
+ logger.info("+" * 15 + " grad_scaler Info " + "+" * 15) # pylint: disable=W1201
276
+ logger.info(f"grad_scaler: {gpc.config.grad_scaler}")
277
+
278
+ logger.info("+" * 15 + " hybrid_zero_optimizer Info " + "+" * 15) # pylint: disable=W1201
279
+ logger.info(f"hybrid_zero_optimizer: {gpc.config.hybrid_zero_optimizer}")
280
+
281
+ logger.info("+" * 15 + " adam Info " + "+" * 15) # pylint: disable=W1201
282
+ logger.info(f"adam: {gpc.config.adam}")
283
+
284
+ logger.info("+" * 15 + " beta2_scheduler Info " + "+" * 15) # pylint: disable=W1201
285
+ logger.info(f"beta2_scheduler: {gpc.config.beta2_scheduler}")
286
+
287
+ # process the model config
288
+ if "use_flash_attn" not in gpc.config.model:
289
+ gpc.config.model._add_item("use_flash_attn", True)
290
+
291
+ # process the parallel config
292
+ if "sequence_parallel" not in gpc.config.parallel:
293
+ gpc.config.parallel._add_item("sequence_parallel", False)
294
+ else:
295
+ assert not (
296
+ gpc.config.parallel.sequence_parallel is True and gpc.config.model.use_flash_attn is False
297
+ ), "sequence parallel does not support use_flash_attn=False"
298
+
299
+ # monitoring default config
300
+ monitor_default_config = {
301
+ "alert_address": None, # compatible with old alert config
302
+ "monitor": { # new monitoring config
303
+ "alert": {"enable_feishu_alert": False, "feishu_alert_address": None, "light_monitor_address": None}
304
+ },
305
+ }
306
+
307
+ for key, value in monitor_default_config.items():
308
+ if key not in gpc.config:
309
+ gpc.config._add_item(key, value)
310
+
311
+ alert = gpc.config.monitor.alert
312
+
313
+ if alert.enable_feishu_alert and not alert.feishu_alert_address and gpc.is_rank_for_log():
314
+ logger.warning("alert is enabled but alert_address is not set")
315
+
316
+ optim_ckpt = gpc.config.hybrid_zero_optimizer
317
+ if "zero_overlap_communication" in optim_ckpt:
318
+ # Compatible with the old interfaces.
319
+ optim_ckpt._add_item("overlap_sync_grad", optim_ckpt.zero_overlap_communication)
320
+ if "overlap_sync_grad" not in optim_ckpt:
321
+ optim_ckpt._add_item("overlap_sync_grad", False)
322
+ if "overlap_sync_param" not in optim_ckpt:
323
+ optim_ckpt._add_item("overlap_sync_param", False)
324
+ if gpc.is_rank_for_log():
325
+ logger.info(
326
+ f"overlap_sync_grad:{optim_ckpt.overlap_sync_grad}, overlap_sync_param:{optim_ckpt.overlap_sync_param}"
327
+ )
328
+
329
+
330
+ def launch(
331
+ config: Union[str, Path, Config, Dict],
332
+ rank: int,
333
+ world_size: int,
334
+ host: str,
335
+ port: int,
336
+ backend: str = "nccl",
337
+ local_rank: int = None,
338
+ seed: int = 1024,
339
+ ):
340
+ """This function first parses the configuration arguments, using :func:`parse_args()` in case one of the input
341
+ arguments is not given. Then it initializes and sets up the distributed environment via global_context's functions.
342
+
343
+ Args:
344
+ config (Union[str, dict, Config]): Config file or config file path are both acceptable
345
+ rank (int): Rank for the default process group
346
+ world_size (int): World size of the default process group
347
+ host (str): The master address for distributed training
348
+ port (int): The master port for distributed training
349
+ backend (str, optional): Backend for ``torch.distributed``, defaults to ``nccl``
350
+ local_rank (int, optional):
351
+ Rank for the process on the node and is used to set the default CUDA device,
352
+ defaults to None. If local_rank = None, the default device ordinal will be calculated automatically.
353
+ seed (int, optional): Specified random seed for every process. Defaults to 1024.
354
+
355
+ Raises:
356
+ Exception: Raise exception when config type is wrong
357
+ """
358
+
359
+ # set config
360
+ assert isinstance(
361
+ config, (Config, str, Path, dict)
362
+ ), f"expected argument config to be Config, str or Path, but got {type(config)}"
363
+ if not isinstance(config, Config) and isinstance(config, dict):
364
+ config = Config(config)
365
+ if isinstance(config, (str, Path)):
366
+ config = Config.from_file(config)
367
+ gpc.load_config(config)
368
+
369
+ # init default process group
370
+ gpc.init_global_dist(rank, world_size, backend, host, port)
371
+
372
+ # init process groups for different parallel modes from config
373
+ gpc.init_parallel_groups()
374
+
375
+ # set cuda device
376
+ if torch.cuda.is_available():
377
+ # if local rank is not given, calculate automatically
378
+ gpc.set_device(local_rank)
379
+
380
+ # set the number of processes running on the same node
381
+ gpc.detect_num_processes_on_current_node()
382
+
383
+ gpc.set_seed(seed)
384
+
385
+ if gpc.is_rank_for_log():
386
+ logger.info(
387
+ f"Distributed environment is initialized, "
388
+ f"data parallel size: {gpc.data_parallel_size}, pipeline parallel size: {gpc.pipeline_parallel_size}, "
389
+ f"tensor parallel size: {gpc.tensor_parallel_size}",
390
+ )
391
+
392
+
393
+ def launch_from_slurm(
394
+ config: Union[str, Path, Config, Dict],
395
+ host: str,
396
+ port: int,
397
+ backend: str = "nccl",
398
+ seed: int = 1024,
399
+ ):
400
+ """A wrapper for internlm.launch for SLURM launcher by reading rank and world size from the environment variables
401
+ set by SLURM
402
+
403
+ Args:
404
+ config (Union[str, dict, Config]): Config file or config file path are both acceptable
405
+ host (str): The master address for distributed training
406
+ port (int): The master port for distributed training
407
+ backend (str, optional): Backend for ``torch.distributed``, defaults to ``nccl``
408
+ seed (int, optional): Specified random seed for every process. Defaults to 1024.
409
+ """
410
+ try:
411
+ rank = int(os.environ["SLURM_PROCID"])
412
+ world_size = int(os.environ["SLURM_NPROCS"])
413
+ except KeyError as e:
414
+ raise RuntimeError(f"Could not find {e} in the SLURM environment")
415
+
416
+ launch(
417
+ config=config,
418
+ rank=rank,
419
+ world_size=world_size,
420
+ host=host,
421
+ port=port,
422
+ backend=backend,
423
+ seed=seed,
424
+ )
425
+
426
+
427
+ def launch_from_torch(
428
+ config: Union[str, Path, Config, Dict],
429
+ backend: str = "nccl",
430
+ seed: int = 1024,
431
+ ):
432
+ """A wrapper for internlm.launch for torchrun or torch.distributed.launch by reading rank and world size
433
+ from the environment variables set by PyTorch
434
+
435
+ Args:
436
+ config (Union[str, dict, Config]): Config file or config file path are both acceptable
437
+ backend (str, optional): Backend for ``torch.distributed``, defaults to ``nccl``
438
+ seed (int, optional): Specified random seed for every process. Defaults to 1024.
439
+ """
440
+ try:
441
+ rank = int(os.environ["RANK"])
442
+ local_rank = int(os.environ["LOCAL_RANK"])
443
+ world_size = int(os.environ["WORLD_SIZE"])
444
+ host = os.environ["MASTER_ADDR"]
445
+ port = int(os.environ["MASTER_PORT"])
446
+ except KeyError as e:
447
+ raise RuntimeError(f"Could not find {e} in the torch environment")
448
+
449
+ launch(
450
+ config=config,
451
+ local_rank=local_rank,
452
+ rank=rank,
453
+ world_size=world_size,
454
+ host=host,
455
+ port=port,
456
+ backend=backend,
457
+ seed=seed,
458
+ )
459
+
460
+
461
+ @llm_timeout(func_name="initialize_distributed_env")
462
+ def initialize_distributed_env(
463
+ config: str,
464
+ launcher: str = "slurm",
465
+ master_port: int = 8888,
466
+ seed: int = 1024,
467
+ args_check=True,
468
+ ):
469
+ """
470
+ Initialize distributed environment for distributed training.
471
+
472
+ Args:
473
+ config (str): Config file path.
474
+ launcher (str): Launcher for launching distributed environment, can be slurm or torch. "slurm" by default.
475
+ master_port (int): The master port for distributed training. 8888 by default.
476
+ seed (int, optional): Specified random seed for every process. 1024 by default.
477
+ """
478
+
479
+ torch.cuda.empty_cache()
480
+
481
+ if launcher == "torch":
482
+ launch_from_torch(config=config, seed=seed)
483
+ elif launcher == "slurm":
484
+ launch_from_slurm(
485
+ config=config,
486
+ host=get_master_node(),
487
+ port=master_port,
488
+ seed=seed,
489
+ )
490
+ else:
491
+ assert launcher in ["slurm", "torch"], "launcher only supports slurm or torch"
492
+
493
+ if args_check:
494
+ args_sanity_check()
495
+
496
+ # init light monitor client
497
+ alert_config = gpc.config.monitor.alert
498
+ if alert_config.enable_feishu_alert and gpc.is_rank_for_log():
499
+ light_monitor_address = alert_config.light_monitor_address
500
+ if light_monitor_address:
501
+ initialize_light_monitor(light_monitor_address)
502
+ else:
503
+ logger.warning("light monitor address is None, so the light monitor cannot be used!")
504
+
505
+
506
+ def get_config_value(config, key, default):
507
+ try:
508
+ value = config[key]
509
+ except KeyError:
510
+ value = default
511
+ return value
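
For orientation, a minimal sketch of how the helpers above are usually wired together from a training entry point; the script name and config path are placeholders, not files defined in this diff:

# torchrun --nproc_per_node=8 train.py --config ./configs/pretrain_300m.py --launcher torch
from internlm.initialize.launch import get_default_parser, initialize_distributed_env

args = get_default_parser().parse_args()
initialize_distributed_env(
    config=args.config,      # path to the python config file
    launcher=args.launcher,  # "torch" reads RANK/WORLD_SIZE/MASTER_ADDR/MASTER_PORT set by torchrun
    master_port=args.port,   # only used on the slurm launcher path
    seed=args.seed,
)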
InternLM/internlm/initialize/legacy/__init__.py ADDED
File without changes
InternLM/internlm/initialize/legacy/launch.py ADDED
@@ -0,0 +1,40 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ from internlm.initialize.launch import get_config_value
5
+ from internlm.utils.logger import get_logger
6
+
7
+ logger = get_logger(__file__)
8
+
9
+
10
+ def auto_resume_sanity_check(ckpt_config):
11
+ load_given_ckpt = get_config_value(ckpt_config, "load_given_ckpt", None)
12
+ if load_given_ckpt is None:
13
+ return True # default value is True
14
+ else:
15
+ return not load_given_ckpt
16
+
17
+
18
+ def ckpt_info_sanity_check(ckpt_config):
19
+ load_ckpt_folder = get_config_value(ckpt_config, "load_ckpt_folder", None)
20
+
21
+ load_model_only_folder = get_config_value(ckpt_config, "load_model_only_folder", None)
22
+
23
+ if load_model_only_folder is not None:
24
+ assert (
25
+ load_ckpt_folder is None
26
+ ), "Detected 'load_ckpt_folder' and 'load_model_only_folder' set at the same time; \
27
+ please keep only one of them in config.ckpt"
28
+ return dict(path=load_model_only_folder, content=("model",), ckpt_type="internlm")
29
+ else:
30
+ load_optimizer = get_config_value(ckpt_config, "load_optimizer", True)
31
+
32
+ if isinstance(load_ckpt_folder, str):
33
+ if load_optimizer:
34
+ return dict(path=load_ckpt_folder, content=("model", "sampler", "optimizer"), ckpt_type="internlm")
35
+ else:
36
+ return dict(path=load_ckpt_folder, content=("model", "sampler"), ckpt_type="internlm")
37
+ elif load_ckpt_folder is None:
38
+ return None
39
+ else:
40
+ assert False, f"Unsupported data type '{type(load_ckpt_folder)}' for config.ckpt arg 'load_ckpt_folder'"
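
A brief sketch of how ckpt_info_sanity_check above maps an old-style ckpt section onto the new load-info dict; the folder path is a placeholder:

old_ckpt_config = dict(load_ckpt_folder="local:llm_ckpts/500", load_optimizer=False)
load_info = ckpt_info_sanity_check(old_ckpt_config)
# -> {"path": "local:llm_ckpts/500", "content": ("model", "sampler"), "ckpt_type": "internlm"}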
InternLM/internlm/model/__init__.py ADDED
@@ -0,0 +1,23 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ from .embedding import Embedding1D, RotaryEmbedding
5
+ from .linear import FeedForward, RewardModelLinear, ScaleColumnParallelLinear
6
+ from .metrics import AccPerplex
7
+ from .modeling_internlm import build_model_with_cfg
8
+ from .modeling_vit import build_vit_model_with_cfg
9
+ from .multi_head_attention import MHA
10
+ from .utils import gather_forward_split_backward
11
+
12
+ __all__ = [
13
+ "Embedding1D",
14
+ "FeedForward",
15
+ "RotaryEmbedding",
16
+ "RewardModelLinear",
17
+ "ScaleColumnParallelLinear",
18
+ "AccPerplex",
19
+ "MHA",
20
+ "gather_forward_split_backward",
21
+ "build_model_with_cfg",
22
+ "build_vit_model_with_cfg"
23
+ ]
InternLM/internlm/model/embedding.py ADDED
@@ -0,0 +1,273 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ from typing import Tuple
5
+
6
+ import rotary_emb
7
+ import torch
8
+ import torch.nn.functional as F
9
+ from einops import rearrange
10
+ from flash_attn.layers.rotary import ApplyRotaryEmb as LegacyApplyRotaryEmb
11
+ from flash_attn.layers.rotary import ApplyRotaryEmbQKV_ as LegacyApplyRotaryEmbQKV_
12
+ from torch import Tensor, nn
13
+
14
+ from internlm.core.context import ParallelMode
15
+ from internlm.core.context import global_context as gpc
16
+
17
+ from .utils import gather_forward_split_backward, split_forward_gather_backward
18
+
19
+
20
+ from .muse import VQGANModel
21
+
22
+ class Embedding1DLVM(nn.Module):
23
+ def __init__(
24
+ self,
25
+ vq_model_path: str,
26
+ embedding_dim: int = None,
27
+ freeze_vq_model: bool = True
28
+ ):
29
+ super().__init__()
30
+
31
+ self.vq_model = VQGANModel.from_pretrained(vq_model_path)
32
+ if freeze_vq_model:
33
+ self.vq_model.requires_grad_(False)
34
+ self.vq_model.eval()
35
+
36
+ self.num_embeddings, vq_embed_dim = self.vq_model.quantize.embedding.weight.shape
37
+
38
+ if embedding_dim is not None:
39
+ self.embed_proj = nn.Linear(vq_embed_dim, embedding_dim, bias=False)
40
+ self.embedding_dim = embedding_dim
41
+ else:
42
+ self.embed_proj = None
43
+ self.embedding_dim = vq_embed_dim
44
+
45
+ def forward(self, input_: Tensor) -> Tensor:
46
+
47
+ # input: N x seq
48
+ output_parallel = self.vq_model.quantize.get_codebook_entry_for_lvm(input_) # N x vq_embed_dim x sqrt(seq) x sqrt(seq)
49
+
50
+ if self.embed_proj is not None:
51
+ output_parallel = self.embed_proj(output_parallel)
52
+
53
+ output = gather_forward_split_backward(output_parallel, ParallelMode.TENSOR, dim=-1)
54
+
55
+ if gpc.config.parallel.sequence_parallel:
56
+ output = split_forward_gather_backward(output, ParallelMode.TENSOR, dim=1)
57
+
58
+ return output
59
+
60
+
61
+ class Embedding1D(nn.Module):
62
+ """
63
+ 1D Embedding.
64
+
65
+ Args:
66
+ num_embeddings (int): The size of vocab.
67
+ embedding_dim (int): The dimension of the model.
68
+ padding_idx (int): If specified, the entries at :attr:`padding_idx` do not contribute to the gradient;
69
+ therefore, the embedding vector at :attr:`padding_idx` is not updated during training,
70
+ i.e. it remains as a fixed "pad". None by default.
71
+ dtype (Optional[torch.dtype]): Data type. None by default.
72
+
73
+ """
74
+
75
+ def __init__(
76
+ self,
77
+ num_embeddings: int,
78
+ embedding_dim: int,
79
+ *args,
80
+ padding_idx: int = None,
81
+ dtype: torch.dtype = None,
82
+ **kwargs,
83
+ ):
84
+ super().__init__()
85
+
86
+ self.num_embeddings = num_embeddings
87
+ self.embed_dim = embedding_dim
88
+ embed_dim_per_partition = embedding_dim // gpc.tensor_parallel_size
89
+
90
+ self.padding_idx = padding_idx
91
+ self.embed_args = args
92
+ self.embed_kwargs = kwargs
93
+
94
+ self.weight = nn.Parameter(torch.empty((num_embeddings, embed_dim_per_partition), dtype=dtype))
95
+
96
+ def forward(self, input_: Tensor) -> Tensor:
97
+ output_parallel = F.embedding(input_, self.weight, self.padding_idx, *self.embed_args, **self.embed_kwargs)
98
+
99
+ output = gather_forward_split_backward(output_parallel, ParallelMode.TENSOR, dim=-1)
100
+
101
+ if gpc.config.parallel.sequence_parallel:
102
+ output = split_forward_gather_backward(output, ParallelMode.TENSOR, dim=1)
103
+
104
+ return output
105
+
106
+
107
+ class ApplyRotaryEmbQKV_(torch.autograd.Function):
108
+ """
109
+ ApplyRotaryEmbQKV_
110
+ """
111
+
112
+ @staticmethod
113
+ def forward(ctx, qkv, cos, sin, cos_k=None, sin_k=None):
114
+ """
115
+ qkv: (total, 3, nheads, headdim)
116
+ cos, sin: (seqlen, rotary_dim / 2)
117
+ cos_k, sin_k: (seqlen, rotary_dim / 2), optional
118
+ rotary_dim must be <= headdim
119
+ Apply rotary embedding *inplace* to the first rotary_dim of q and k.
120
+ """
121
+ _, three, _, headdim = qkv.shape
122
+ assert three == 3
123
+ rotary_seqlen, rotary_dim = cos.shape
124
+ rotary_dim *= 2
125
+ assert rotary_dim <= headdim
126
+ cos_k = cos if cos_k is None else cos_k
127
+ sin_k = sin if sin_k is None else sin_k
128
+ assert sin.shape == cos_k.shape == sin_k.shape == (rotary_seqlen, rotary_dim // 2)
129
+ q1, q2 = qkv[:, 0, :, :rotary_dim].chunk(2, dim=-1)
130
+ rotary_emb.apply_rotary(q1, q2, rearrange(cos, "s d -> s 1 d"), rearrange(sin, "s d -> s 1 d"), q1, q2, False)
131
+ k1, k2 = qkv[:, 1, :, :rotary_dim].chunk(2, dim=-1)
132
+ rotary_emb.apply_rotary(
133
+ k1, k2, rearrange(cos_k, "s d -> s 1 d"), rearrange(sin_k, "s d -> s 1 d"), k1, k2, False
134
+ )
135
+ ctx.save_for_backward(cos, sin, cos_k, sin_k)
136
+ return qkv
137
+
138
+ @staticmethod
139
+ def backward(ctx, dqkv):
140
+ cos, sin, cos_k, sin_k = ctx.saved_tensors
141
+ rotary_dim = cos.shape[-1]
142
+ rotary_dim *= 2
143
+ dq1, dq2 = dqkv[:, 0, :, :rotary_dim].chunk(2, dim=-1)
144
+ rotary_emb.apply_rotary(
145
+ dq1, dq2, rearrange(cos, "s d -> s 1 d"), rearrange(sin, "s d -> s 1 d"), dq1, dq2, True
146
+ )
147
+ dk1, dk2 = dqkv[:, 1, :, :rotary_dim].chunk(2, dim=-1)
148
+ rotary_emb.apply_rotary(
149
+ dk1, dk2, rearrange(cos_k, "s d -> s 1 d"), rearrange(sin_k, "s d -> s 1 d"), dk1, dk2, True
150
+ )
151
+ return dqkv, None, None, None, None
152
+
153
+
154
+ apply_rotary_emb_qkv_ = ApplyRotaryEmbQKV_.apply
155
+ legacy_apply_rotary_embed_qkv = LegacyApplyRotaryEmbQKV_.apply
156
+ legacy_apply_rotary_embed = LegacyApplyRotaryEmb.apply
157
+
158
+
159
+ class RotaryEmbedding(torch.nn.Module):
160
+ """
161
+ The rotary position embeddings from RoFormer_ (Su et al.).
162
+ A crucial insight from the method is that the query and keys are
163
+ transformed by rotation matrices which depend on the relative positions.
164
+
165
+ Other implementations are available in the Rotary Transformer repo_ and in
166
+ GPT-NeoX_, GPT-NeoX was an inspiration
167
+
168
+ .. _RoFormer: https://arxiv.org/abs/2104.09864
169
+ .. _repo: https://github.com/ZhuiyiTechnology/roformer
170
+ .. _GPT-NeoX: https://github.com/EleutherAI/gpt-neox
171
+
172
+ If scale_base > 0, this implements XPos (Sun et al., https://arxiv.org/abs/2212.10554).
173
+ A recommended value for scale_base is 512: https://github.com/HazyResearch/flash-attention/issues/96
174
+ Reference: https://github.com/sunyt32/torchscale/blob/main/torchscale/component/xpos_relative_position.py
175
+ """
176
+
177
+ def __init__(self, dim: int, base=10000, scale_base=0, device=None):
178
+ """ """
179
+ super().__init__()
180
+ # Generate and save the inverse frequency buffer (non trainable)
181
+ self.inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, device=device, dtype=torch.float32) / dim))
182
+ self.scale_base = scale_base
183
+ self.scale = (
184
+ (torch.arange(0, dim, 2, device=device, dtype=torch.float32) + 0.4 * dim) / (1.4 * dim)
185
+ if scale_base > 0
186
+ else None
187
+ )
188
+
189
+ self._seq_len_cached = 0
190
+ self._cos_cached = None
191
+ self._sin_cached = None
192
+ self._cos_k_cached = None
193
+ self._sin_k_cached = None
194
+
195
+ def _update_cos_sin_cache(self, x, indexes):
196
+ """x: (batch, seqlen, nheads, headdim) or (batch, seqlen, 3, nheads, headdim)"""
197
+ if not isinstance(indexes, int):
198
+ seqlen = indexes.max().item() + 1
199
+ else:
200
+ seqlen = indexes + 1 # eval_forward
201
+ # Reset the tables if the sequence length has changed,
202
+ # or if we're on a new device (possibly due to tracing for instance)
203
+ if seqlen > self._seq_len_cached or self._cos_cached.device != x.device or self._cos_cached.dtype != x.dtype:
204
+ self._seq_len_cached = seqlen
205
+ t = torch.arange(seqlen, device=x.device, dtype=self.inv_freq.dtype)
206
+ # Don't do einsum, it converts fp32 to fp16
207
+ # freqs = torch.einsum("i,j->ij", t, self.inv_freq)
208
+ freqs = torch.outer(t, self.inv_freq.to(device=t.device))
209
+ if self.scale is None:
210
+ self._cos_cached = torch.cos(freqs).to(x.dtype)
211
+ self._sin_cached = torch.sin(freqs).to(x.dtype)
212
+ else:
213
+ power = (
214
+ torch.arange(seqlen, dtype=self.scale.dtype, device=self.scale.device) - seqlen // 2
215
+ ) / self.scale_base
216
+ scale = self.scale.to(device=power.device) ** rearrange(power, "s -> s 1")
217
+ # We want the multiplication by scale to happen in fp32
218
+ self._cos_cached = (torch.cos(freqs) * scale).to(x.dtype)
219
+ self._sin_cached = (torch.sin(freqs) * scale).to(x.dtype)
220
+ self._cos_k_cached = (torch.cos(freqs) / scale).to(x.dtype)
221
+ self._sin_k_cached = (torch.sin(freqs) / scale).to(x.dtype)
222
+
223
+ def forward(self, qkv: torch.Tensor, **kwargs):
224
+ if kwargs.get("indexes", None) is not None:
225
+ return self._forward(qkv, kwargs.pop("indexes"))
226
+ if kwargs.get("inference_params", None) is not None:
227
+ return self._eval_forward(qkv, seqlen_offset=kwargs.get("inference_params", None).sequence_len_offset)
228
+ else:
229
+ return self._eval_forward(qkv)
230
+
231
+ def _forward(self, qkv: torch.Tensor, indexes=0) -> Tuple[torch.Tensor, torch.Tensor]:
232
+ self._update_cos_sin_cache(qkv, indexes)
233
+ if self.scale is None:
234
+ return apply_rotary_emb_qkv_(qkv, self._cos_cached[indexes], self._sin_cached[indexes])
235
+ else:
236
+ return apply_rotary_emb_qkv_(
237
+ qkv,
238
+ self._cos_cached[indexes],
239
+ self._sin_cached[indexes],
240
+ self._cos_k_cached[indexes],
241
+ self._sin_k_cached[indexes],
242
+ )
243
+
244
+ def _eval_forward(self, qkv, seqlen_offset=0):
245
+ """
246
+ seqlen_offset: can be used in generation where the qkv being passed in is only the last
247
+ token in the batch.
248
+ """
249
+ self._update_cos_sin_cache(qkv, seqlen_offset + qkv.shape[1])
250
+ if self.scale is None:
251
+ return legacy_apply_rotary_embed_qkv(
252
+ qkv, self._cos_cached[seqlen_offset:], self._sin_cached[seqlen_offset:]
253
+ )
254
+ else:
255
+ return legacy_apply_rotary_embed_qkv(
256
+ qkv,
257
+ self._cos_cached[seqlen_offset:],
258
+ self._sin_cached[seqlen_offset:],
259
+ self._cos_k_cached[seqlen_offset:],
260
+ self._sin_k_cached[seqlen_offset:],
261
+ )
262
+
263
+ def _single_forward(self, x, indexes=0):
264
+ assert self.scale is None
265
+ self._update_cos_sin_cache(x, indexes)
266
+ x = x[None, ...]
267
+ ret = legacy_apply_rotary_embed(x, self._cos_cached[indexes], self._sin_cached[indexes]).squeeze(0)
268
+ return ret
269
+
270
+ def _single_eval_forward(self, x, seqlen_offset=0):
271
+ assert self.scale is None
272
+ self._update_cos_sin_cache(x, seqlen_offset + x.shape[1])
273
+ return legacy_apply_rotary_embed(x, self._cos_cached[seqlen_offset:], self._sin_cached[seqlen_offset:])
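
A minimal numerical sketch of the cos/sin tables that RotaryEmbedding._update_cos_sin_cache builds for the non-XPos case (scale_base == 0); dim, base and seqlen are illustrative values:

import torch

dim, base, seqlen = 8, 10000, 4
inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))  # (dim/2,)
t = torch.arange(seqlen, dtype=torch.float32)
freqs = torch.outer(t, inv_freq)                       # (seqlen, dim/2), position times frequency
cos_cached, sin_cached = torch.cos(freqs), torch.sin(freqs)
# _forward then indexes these tables by token position and rotates q/k over pairs of head dims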
InternLM/internlm/model/linear.py ADDED
@@ -0,0 +1,201 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ from typing import Optional
5
+
6
+ import torch
7
+ import torch.nn.functional as F
8
+ from flash_attn.ops.fused_dense import ColumnParallelLinear, RowParallelLinear
9
+ from flash_attn.utils.distributed import all_reduce, reduce_scatter
10
+ from torch import nn
11
+
12
+ from internlm.core.context import ParallelMode
13
+ from internlm.core.context import global_context as gpc
14
+ from internlm.model.utils import fused_dense_func_torch
15
+
16
+
17
+ class ScaleColumnParallelLinear(nn.Linear):
18
+ """
19
+ ScaleColumnParallelLinear.
20
+
21
+ Args:
22
+ in_features (int): size of each input sample
23
+ out_features (int): size of each output sample
24
+ process_group (Optional[torch.distributed.ProcessGroup]): The group of the current device for `parallel_mode`.
25
+ bias (bool): Whether the bias is needed for linears. True by default. But it is typically set to False
26
+ in the config.
27
+ sequence_parallel (bool): If sequence_parallel is True, we're doing Tensor Parallel with sequence parallelism:
28
+ we do an all_gather of x before doing the matmul.
29
+ If not, then the input is already gathered.
30
+ device (Optional[Union[str, torch.device]]): The device will be used.
31
+ dtype (Optional[torch.dtype]): The type of data.
32
+ weight_scale (int): For training stability. 1 by default.
33
+ """
34
+
35
+ def __init__(
36
+ self,
37
+ in_features: int,
38
+ out_features: int,
39
+ process_group: Optional[torch.distributed.ProcessGroup],
40
+ bias: bool = True,
41
+ device: Optional[torch.device] = None,
42
+ dtype: Optional[torch.dtype] = None,
43
+ weight_scale: int = 1,
44
+ ) -> None:
45
+ world_size = torch.distributed.get_world_size(process_group)
46
+ if out_features % world_size != 0:
47
+ raise ValueError(f"out_features ({out_features}) must be divisible by " f"world_size ({world_size})")
48
+ super().__init__(in_features, out_features // world_size, bias=bias, device=device, dtype=dtype)
49
+ self.process_group = process_group
50
+ self.weight_scale = weight_scale
51
+
52
+ def forward(self, input): # pylint: disable=W0622
53
+ # If self.sequence_parallel is True, we're doing Tensor Parallel with sequence parallelism:
54
+ # we do an all_gather of x before doing the matmul.
55
+ # If not, then the input is already gathered.
56
+ if self.weight_scale != 1:
57
+ weight = self.weight * self.weight_scale + (1 - self.weight_scale) * self.weight.detach()
58
+ else:
59
+ weight = self.weight
60
+ return fused_dense_func_torch(
61
+ input,
62
+ weight,
63
+ self.bias,
64
+ process_group=self.process_group,
65
+ sequence_parallel=gpc.config.parallel.sequence_parallel,
66
+ )
67
+
68
+
69
+ class RewardModelLinear(ScaleColumnParallelLinear):
70
+ """
71
+ RewardModelLinear.
72
+ Args:
73
+ in_features (int): size of each input sample
74
+ out_features (int): size of each output sample
75
+ process_group (Optional[torch.distributed.ProcessGroup]): The group of the current device for `parallel_mode`.
76
+ bias (bool): Whether the bias is needed for linears. True by default. But it is typically set to False
77
+ in the config.
78
+ sequence_parallel (bool): If sequence_parallel is True, we're doing Tensor Parallel with sequence parallelism:
79
+ we do an all_gather of x before doing the matmul.
80
+ If not, then the input is already gathered.
81
+ device (Optional[Union[str, torch.device]]): The device will be used.
82
+ dtype (Optional[torch.dtype]): The type of data.
83
+ weight_scale (int): For training stability. 1 by default.
84
+ """
85
+
86
+ def __init__(
87
+ self,
88
+ in_features: int,
89
+ out_features: int,
90
+ process_group: Optional[torch.distributed.ProcessGroup],
91
+ bias: bool = True,
92
+ device: Optional[torch.device] = None,
93
+ dtype: Optional[torch.dtype] = None,
94
+ weight_scale: int = 1,
95
+ ) -> None:
96
+ super().__init__(in_features, out_features, process_group, bias, device, dtype, weight_scale)
97
+ torch.distributed.broadcast(self.weight, gpc.get_ranks_in_group(ParallelMode.TENSOR)[0], process_group)
98
+ if bias:
99
+ torch.distributed.broadcast(self.bias, gpc.get_ranks_in_group(ParallelMode.TENSOR)[0], process_group)
100
+
101
+ def forward(self, input): # pylint: disable=W0622
102
+ # If self.sequence_parallel is True, we're doing Tensor Parallel with sequence parallelism:
103
+ # we do an all_gather of x before doing the matmul.
104
+ # If not, then the input is already gathered.
105
+ if self.weight_scale != 1:
106
+ weight = self.weight * self.weight_scale + (1 - self.weight_scale) * self.weight.detach()
107
+ else:
108
+ weight = self.weight
109
+ return fused_dense_func_torch(
110
+ input,
111
+ weight,
112
+ self.bias,
113
+ process_group=self.process_group,
114
+ sequence_parallel=gpc.config.parallel.sequence_parallel,
115
+ )
116
+
117
+
118
+ class ColumnParallelLinearTorch(ColumnParallelLinear):
119
+ def forward(self, x):
120
+ # If self.sequence_parallel is True, we're doing Tensor Parallel with sequence parallelism:
121
+ # we do an all_gather of x before doing the matmul.
122
+ # If not, then the input is already gathered.
123
+
124
+ return fused_dense_func_torch(
125
+ x, self.weight, self.bias, process_group=self.process_group, sequence_parallel=self.sequence_parallel
126
+ )
127
+
128
+
129
+ class RowParallelLinearTorch(RowParallelLinear):
130
+ def forward(self, x):
131
+ """
132
+ We're doing Tensor Parallel with sequence parallelism: we do the matmul and then
133
+ a reduce_scatter of the result.
134
+ """
135
+ out = fused_dense_func_torch(x, self.weight, self.bias)
136
+ reduce_fn = reduce_scatter if self.sequence_parallel else all_reduce
137
+ return reduce_fn(out, self.process_group)
138
+
139
+
140
+ class FeedForward(nn.Module):
141
+ """
142
+ FeedForward.
143
+
144
+ Args:
145
+ in_features (int): size of each input sample
146
+ hidden_features (int): size of hidden state of FFN
147
+ out_features (int): size of each output sample
148
+ process_group (Optional[torch.distributed.ProcessGroup]): The group of the current device for `parallel_mode`.
149
+ bias (bool): Whether the bias is needed for linears. True by default. But it is typically set to False
150
+ in the config.
151
+ device (Optional[Union[str, torch.device]]): The device will be used.
152
+ dtype (Optional[torch.dtype]): The type of data.
153
+ multiple_of (int): Round the hidden feature size up to a multiple of this value for efficient training. 256 by default.
154
+ """
155
+
156
+ def __init__(
157
+ self,
158
+ in_features: int,
159
+ hidden_features: int,
160
+ out_features: int = None,
161
+ process_group: Optional[torch.distributed.ProcessGroup] = None,
162
+ bias: bool = True,
163
+ device: Optional[torch.device] = None,
164
+ dtype: Optional[torch.dtype] = None,
165
+ multiple_of: int = 256,
166
+ ):
167
+ super().__init__()
168
+
169
+ hidden_features = multiple_of * ((hidden_features + multiple_of - 1) // multiple_of)
170
+
171
+ self.w1 = ColumnParallelLinearTorch(
172
+ in_features,
173
+ hidden_features,
174
+ process_group,
175
+ bias,
176
+ sequence_parallel=gpc.config.parallel.sequence_parallel,
177
+ device=device,
178
+ dtype=dtype,
179
+ )
180
+ self.w2 = ColumnParallelLinearTorch(
181
+ in_features,
182
+ hidden_features,
183
+ process_group,
184
+ bias,
185
+ sequence_parallel=gpc.config.parallel.sequence_parallel,
186
+ device=device,
187
+ dtype=dtype,
188
+ )
189
+ self.w3 = RowParallelLinearTorch(
190
+ hidden_features,
191
+ out_features,
192
+ process_group,
193
+ bias=bias,
194
+ sequence_parallel=gpc.config.parallel.sequence_parallel,
195
+ device=device,
196
+ dtype=dtype,
197
+ )
198
+
199
+ def forward(self, x):
200
+ out = self.w3(F.silu(self.w1(x)) * self.w2(x))
201
+ return out
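
FeedForward above is the SwiGLU-style MLP, out = w3(silu(w1(x)) * w2(x)), with the hidden width rounded up to a multiple of `multiple_of` before the parallel linears are built. A quick standalone check of that rounding; the widths are illustrative:

multiple_of, hidden_features = 256, 7338
padded = multiple_of * ((hidden_features + multiple_of - 1) // multiple_of)
assert padded == 7424  # the next multiple of 256 at or above 7338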
InternLM/internlm/model/loss.py ADDED
@@ -0,0 +1,81 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ import torch.nn.functional as F
5
+ from flash_attn.losses.cross_entropy import CrossEntropyLoss as FlashCrossEntropyLoss
6
+ from torch import nn
7
+
8
+ from internlm.core.context import ParallelMode
9
+ from internlm.core.context import global_context as gpc
10
+
11
+
12
+ class FlashGPTLMLoss(nn.Module):
13
+ """
14
+ Loss function for flash GPT Language Model.
15
+ """
16
+
17
+ def __init__(self, parallel_output=True, label_smoothing=0):
18
+ super().__init__()
19
+
20
+ if label_smoothing is not None:
21
+ if label_smoothing != 0:
22
+ if gpc.is_rank_for_log():
23
+ print(f"use label_smoothing: {label_smoothing}")
24
+ else:
25
+ label_smoothing = 0
26
+ self.label_smoothing = label_smoothing
27
+
28
+ if parallel_output:
29
+ self.loss_fn = FlashCrossEntropyLoss(
30
+ reduction="mean",
31
+ inplace_backward=True,
32
+ process_group=gpc.get_group(ParallelMode.TENSOR),
33
+ label_smoothing=label_smoothing,
34
+ ) # The loss in this place is bound to the gather_output initialized by VocabParallelClassifier1D
35
+ else:
36
+ # Here, gather_output is enabled in the model so the logits are already gathered; use the ordinary loss
37
+ self.loss_fn = nn.CrossEntropyLoss(reduction="mean", label_smoothing=label_smoothing)
38
+
39
+ def forward(self, *args):
40
+ if len(args) == 3:
41
+ # residual is to match prenorm
42
+ logits, _, labels = args
43
+ elif len(args) == 2:
44
+ # When using postnorm
45
+ logits, labels = args
46
+ else:
47
+ raise RuntimeError(f"The number of criterion inputs are:{len(args)}")
48
+ shift_logits = logits.contiguous().view(-1, logits.size(-1))
49
+ shift_labels = labels.contiguous().view(-1)
50
+ loss = self.loss_fn(
51
+ shift_logits, shift_labels
52
+ ) # No need to handle ignore_index explicitly here: the loss is only computed over the
53
+ # valid label range, and -100 always falls outside that range, so it is excluded automatically
54
+
55
+ return loss
56
+
57
+
58
+ class KLDivLoss(nn.Module):
59
+ def __init__(self):
60
+ super().__init__()
61
+ self.temperature = gpc.config.kd_config.get('temperature', 1)
62
+ self.inverse = gpc.config.kd_config.get('inverse', False)
63
+
64
+ def forward(self, *args):
65
+ if len(args) == 3:
66
+ if self.inverse:
67
+ logits_teacher, logits_student, _ = args
68
+ else:
69
+ logits_student, logits_teacher, _ = args
70
+ else:
71
+ raise RuntimeError(f"The number of criterion inputs are:{len(args)}")
72
+
73
+ logits_teacher = logits_teacher.contiguous().view(-1, logits_teacher.size(-1))
74
+ logits_student = logits_student.contiguous().view(-1, logits_student.size(-1))
75
+
76
+ log_pred_student = F.log_softmax(logits_student / self.temperature, dim=1)
77
+ pred_teacher = F.softmax(logits_teacher / self.temperature, dim=1)
78
+ loss_kd = F.kl_div(log_pred_student, pred_teacher, reduction='batchmean')
79
+ loss_kd *= self.temperature ** 2
80
+
81
+ return loss_kd
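
For reference, KLDivLoss above computes the standard temperature-scaled distillation objective, L_KD = T^2 * KL(softmax(z_teacher / T) || softmax(z_student / T)); the `inverse` flag only swaps which of the first two criterion inputs is treated as the student. A tiny shape-level sketch mirroring the forward pass, with random placeholder logits:

import torch
import torch.nn.functional as F

T = 2.0
z_student, z_teacher = torch.randn(4, 32000), torch.randn(4, 32000)
loss_kd = F.kl_div(
    F.log_softmax(z_student / T, dim=1),
    F.softmax(z_teacher / T, dim=1),
    reduction="batchmean",
) * T**2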
InternLM/internlm/model/metrics.py ADDED
@@ -0,0 +1,263 @@
1
+ from typing import List
2
+
3
+ import torch
4
+ from flash_attn.losses.cross_entropy import CrossEntropyLoss as FlashCrossEntropyLoss
5
+ from torch_scatter import scatter
6
+
7
+ from internlm.core.context import ParallelMode
8
+ from internlm.core.context import global_context as gpc
9
+ from internlm.utils.parallel import is_no_pp_or_last_stage
10
+
11
+
12
+ class AccPerplex:
13
+ """
14
+ AccPerplex module for calculating model's accuracy and perplexity metrics.
15
+
16
+ Args:
17
+ device: The GPU device.
18
+ tp_pg: The tensor parallel process group.
19
+ dp_pg: The data parallel process group.
20
+ tokenizer: For calculating BPB.
21
+ dataset_types (List[str]): Various data types that will be used in the current training process,
22
+ such as ['en', 'cn', 'code']. The order of the List should be consistent with the type_id specified
23
+ in the dataset. Changed parameters need to be used in conjunction with set_current_type_ids().
24
+ """
25
+
26
+ def __init__(self, device, tp_pg, dp_pg, tokenizer=None, dataset_types: List[str] = None):
27
+ self.device = device
28
+ self.right = torch.Tensor([0]).to(device=device)
29
+ self.total = torch.Tensor([0]).to(device=device)
30
+ self.total_log_probs = torch.Tensor([0]).to(device=device)
31
+ self.tp_pg = tp_pg
32
+ self.dp_pg = dp_pg
33
+ self.tp_local_rank = torch.distributed.get_rank(self.tp_pg)
34
+ self.tokenizer = tokenizer
35
+ self.total_bytes = torch.Tensor([0]).to(device=device).view(1)
36
+ self.batch_shift = 0
37
+ self.type_ids = None
38
+ if dataset_types is not None:
39
+ self.dataset_types = dataset_types
40
+ self.total_type_count = len(dataset_types)
41
+ self.ds_right = torch.zeros(self.total_type_count, dtype=torch.long, device=device)
42
+ self.ds_tokens = torch.zeros(self.total_type_count, dtype=torch.long, device=device)
43
+
44
+ self.loss_with_type_id = LossWithTypeId(device, dp_pg, dataset_types)
45
+
46
+ def set_current_type_ids(self, type_ids: torch.Tensor):
47
+ self.batch_shift = 0
48
+ self.type_ids = type_ids.cuda()
49
+
50
+ def __call__(self, logits, labels):
51
+ return self.update(logits, labels, type_ids=self.type_ids)
52
+
53
+ def update(self, logits, labels, type_ids=None):
54
+ if gpc.config.model.use_flash_attn:
55
+ micro_bsz = labels.size(0)
56
+ else:
57
+ micro_bsz = 1
58
+ if type_ids is not None:
59
+ type_ids = type_ids[self.batch_shift * micro_bsz : (self.batch_shift + 1) * micro_bsz].view(-1)
60
+ self.batch_shift += 1
61
+ self.loss_with_type_id.update(logits, labels, type_ids)
62
+
63
+ with torch.no_grad():
64
+ if isinstance(logits, (list, tuple)):
65
+ logits = logits[0]
66
+
67
+ logits = logits.detach().clone()
68
+ labels = labels.detach().clone()
69
+
70
+ if self.tokenizer: # need to calculate bits per bytes
71
+ sequences = self.tokenizer.decode_ids(labels.tolist())
72
+ self.total_bytes += sum(map(lambda x: len(x.encode("utf-8")), sequences))
73
+
74
+ shift_logits = logits.view(-1, logits.size(-1))
75
+ shift_labels = labels.view(-1)
76
+ # There is a shift according to the current rank, because the logits are split
77
+ pred_shift = self.tp_local_rank * logits.shape[-1]
78
+
79
+ logits_max = torch.max(shift_logits, dim=-1)[0]
80
+ torch.distributed.all_reduce(logits_max, op=torch.distributed.ReduceOp.MAX, group=self.tp_pg)
81
+ # Determine whether the maximum value of the current local tensor is the global maximum value
82
+ logits_global = logits_max == torch.max(shift_logits, dim=-1)[0]
83
+
84
+ corrects = torch.logical_and(
85
+ (shift_labels == (shift_logits.argmax(dim=-1) + pred_shift)), logits_global
86
+ ).long()
87
+ mask = shift_labels.ne(-100).long()
88
+ if hasattr(self, "total_type_count"):
89
+ ds_acc = scatter(corrects, type_ids, dim=0, reduce="sum")
90
+ token_num_type = scatter(mask, type_ids, dim=0, reduce="sum")
91
+ if len(ds_acc) < self.total_type_count:
92
+ ds_acc = torch.cat([ds_acc, ds_acc.new_zeros(self.total_type_count - len(ds_acc))])
93
+ token_num_type = torch.cat(
94
+ [token_num_type, token_num_type.new_zeros(self.total_type_count - len(token_num_type))]
95
+ )
96
+ self.ds_tokens += token_num_type
97
+ sync_tensor = ds_acc
98
+ torch.distributed.all_reduce(sync_tensor, op=torch.distributed.ReduceOp.SUM, group=self.tp_pg)
99
+ self.ds_right += sync_tensor.view(-1)
100
+
101
+ acc = corrects.sum()
102
+ torch.distributed.all_reduce(acc, op=torch.distributed.ReduceOp.SUM, group=self.tp_pg)
103
+ self.right += acc # Masked_fill is not needed here because -100 is not available anyway
104
+ self.total += mask.sum()
105
+
106
+ # Subtract the maximum value.
107
+ shift_logits = shift_logits.sub(logits_max.unsqueeze(dim=-1))
108
+
109
+ # Get the partition's vocab indices
110
+ partition_vocab_size = shift_logits.size()[-1]
111
+ vocab_start_index = partition_vocab_size * self.tp_local_rank
112
+ vocab_end_index = vocab_start_index + partition_vocab_size
113
+
114
+ # Create a mask of valid vocab ids (1 means it needs to be masked).
115
+ target_mask = (shift_labels < vocab_start_index) | (shift_labels >= vocab_end_index)
116
+ masked_target = shift_labels - vocab_start_index
117
+ masked_target[target_mask] = 0
118
+
119
+ # Get predicted-logits = logits[target].
120
+ # For simplicity, we reshape the logits to a 2-D tensor with size
121
+ # [*, partition-vocab-size] and target to a 1-D tensor of size [*].
122
+ logits_2d = shift_logits.view(-1, partition_vocab_size)
123
+ masked_target_1d = masked_target.view(-1)
124
+ arange_1d = torch.arange(start=0, end=logits_2d.size()[0], device=logits_2d.device)
125
+ predicted_logits_1d = logits_2d[arange_1d, masked_target_1d]
126
+ predicted_logits_1d = predicted_logits_1d.clone().contiguous()
127
+ predicted_logits = predicted_logits_1d.view_as(shift_labels) # bsz x max_len
128
+ predicted_logits[target_mask] = 0.0
129
+ # All reduce is needed to get the chunks from other GPUs.
130
+ torch.distributed.all_reduce(predicted_logits, op=torch.distributed.ReduceOp.SUM, group=self.tp_pg)
131
+
132
+ pred_exp_logits = torch.exp(predicted_logits)
133
+ # Sum of exponential of logits along vocab dimension across all GPUs.
134
+ sum_exp_logits = torch.exp(shift_logits).sum(dim=-1)
135
+ torch.distributed.all_reduce(sum_exp_logits, op=torch.distributed.ReduceOp.SUM, group=self.tp_pg)
136
+
137
+ total_log_probs = -(pred_exp_logits / sum_exp_logits).log().masked_fill(shift_labels.eq(-100), 0).sum()
138
+ self.total_log_probs += total_log_probs
139
+
140
+ def get_metric(self, reset=True):
141
+ if is_no_pp_or_last_stage() and self.dp_pg is not None:
142
+ torch.distributed.all_reduce(self.right, op=torch.distributed.ReduceOp.SUM, group=self.dp_pg)
143
+ torch.distributed.all_reduce(self.total, op=torch.distributed.ReduceOp.SUM, group=self.dp_pg)
144
+ torch.distributed.all_reduce(self.total_log_probs, op=torch.distributed.ReduceOp.SUM, group=self.dp_pg)
145
+ if hasattr(self, "total_type_count"):
146
+ torch.distributed.all_reduce(self.ds_right, op=torch.distributed.ReduceOp.SUM, group=self.dp_pg)
147
+ torch.distributed.all_reduce(self.ds_tokens, op=torch.distributed.ReduceOp.SUM, group=self.dp_pg)
148
+ if self.tokenizer:
149
+ torch.distributed.all_reduce(self.total_bytes, op=torch.distributed.ReduceOp.SUM, group=self.dp_pg)
150
+
151
+ acc = round((self.right / self.total).item(), 4)
152
+ perplexity = round(torch.exp(self.total_log_probs / self.total).item(), 4)
153
+ bits_per_bytes = round((self.total_log_probs / self.total_bytes).item(), 4) if self.tokenizer else 0
154
+
155
+ if hasattr(self, "total_type_count"):
156
+ ds_acc = {}
157
+ ds_tokens = {}
158
+ for i in range(self.total_type_count):
159
+ ds_acc[f"acc/{self.dataset_types[i]}"] = round(
160
+ (self.ds_right[i].float() / (self.ds_tokens[i].float() + 1e-5)).item(), 4
161
+ )
162
+ ds_tokens[f"tokens/{self.dataset_types[i]}"] = self.ds_tokens[i].item()
163
+ if reset:
164
+ self.right.fill_(0)
165
+ self.total.fill_(0)
166
+ self.total_log_probs.fill_(0)
167
+ self.total_bytes.fill_(0)
168
+ if hasattr(self, "total_type_count"):
169
+ self.ds_right.fill_(0)
170
+ self.ds_tokens.fill_(0)
171
+ if self.tokenizer is not None:
172
+ res = {"acc": acc, "perplexity": perplexity, "BPB": bits_per_bytes}
173
+ else:
174
+ res = {"acc": acc, "perplexity": perplexity}
175
+ if hasattr(self, "total_type_count"):
176
+ res.update(ds_acc)
177
+ res.update(ds_tokens)
178
+
179
+ loss_res = self.loss_with_type_id.get_metric(reset)
180
+ res.update(loss_res)
181
+
182
+ return res
183
+
184
+
185
+ class LossWithTypeId:
186
+ """
187
+ Notice the loss value computed here may be not the same with the main info loss,
188
+ cause loss here is the reduced result of the data parallel.
189
+ """
190
+
191
+ def __init__(self, device, dp_pg, dataset_types: List[str] = None) -> None:
192
+ self.device = device
193
+ self.dp_pg = dp_pg
194
+
195
+ self.loss = torch.Tensor([0.0]).to(device=device)
196
+ self.token_num = torch.Tensor([0.0]).to(device=device)
197
+
198
+ if dataset_types is not None:
199
+ self.dataset_types = dataset_types
200
+ self.total_type_count = len(dataset_types)
201
+ self.ds_loss = torch.zeros(self.total_type_count, dtype=torch.float, device=device)
202
+ self.ds_token_num = torch.zeros(self.total_type_count, dtype=torch.float, device=device)
203
+
204
+ self.loss_fn = FlashCrossEntropyLoss(
205
+ reduction="none", inplace_backward=True, process_group=gpc.get_group(ParallelMode.TENSOR)
206
+ )
207
+
208
+ def update(self, logits, labels, type_ids=None):
209
+ with torch.no_grad():
210
+ if isinstance(logits, (list, tuple)):
211
+ logits = logits[0]
212
+ logits = logits.contiguous().view(-1, logits.size(-1))
213
+ labels = labels.contiguous().view(-1)
214
+ loss_list = self.loss_fn(logits, labels)
215
+
216
+ cond = labels != -100
217
+ real_loss_list = loss_list[cond]
218
+ self.loss += real_loss_list.sum()
219
+ self.token_num += real_loss_list.numel()
220
+
221
+ if hasattr(self, "total_type_count"):
222
+ type_ids = type_ids.contiguous().view(-1).to(self.device)
223
+ real_type_ids = type_ids[cond]
224
+
225
+ loss_list_type = scatter(real_loss_list, real_type_ids, dim=0, reduce="sum")
226
+ token_num_type = scatter(torch.ones_like(real_loss_list), real_type_ids, dim=0, reduce="sum")
227
+
228
+ if len(loss_list_type) < self.total_type_count:
229
+ loss_list_type = torch.cat(
230
+ [loss_list_type, loss_list_type.new_zeros(self.total_type_count - len(loss_list_type))]
231
+ )
232
+ token_num_type = torch.cat(
233
+ [token_num_type, token_num_type.new_zeros(self.total_type_count - len(token_num_type))]
234
+ )
235
+ self.ds_loss += loss_list_type
236
+ self.ds_token_num += token_num_type
237
+
238
+ def get_metric(self, reset=True):
239
+ if is_no_pp_or_last_stage() and self.dp_pg is not None:
240
+ torch.distributed.all_reduce(self.loss, op=torch.distributed.ReduceOp.SUM, group=self.dp_pg)
241
+ torch.distributed.all_reduce(self.token_num, op=torch.distributed.ReduceOp.SUM, group=self.dp_pg)
242
+ if hasattr(self, "total_type_count"):
243
+ torch.distributed.all_reduce(self.ds_loss, op=torch.distributed.ReduceOp.SUM, group=self.dp_pg)
244
+ torch.distributed.all_reduce(self.ds_token_num, op=torch.distributed.ReduceOp.SUM, group=self.dp_pg)
245
+
246
+ loss = round((self.loss / self.token_num).item(), 4)
247
+ res = {
248
+ "loss_from_metric": loss,
249
+ }
250
+ if hasattr(self, "total_type_count"):
251
+ ds_loss = {}
252
+ for i in range(self.total_type_count):
253
+ ds_loss[f"loss/{self.dataset_types[i]}"] = round((self.ds_loss[i] / self.ds_token_num[i]).item(), 4)
254
+ res.update(ds_loss)
255
+
256
+ if reset:
257
+ self.loss.fill_(0.0)
258
+ self.token_num.fill_(0.0)
259
+ if hasattr(self, "total_type_count"):
260
+ self.ds_loss.fill_(0.0)
261
+ self.ds_token_num.fill_(0.0)
262
+
263
+ return res
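
In summary, the values reported by AccPerplex.get_metric() reduce to acc = right / total and perplexity = exp(total_log_probs / total), plus BPB = total_log_probs / total_bytes when a tokenizer is supplied, with all counters all-reduced over the data-parallel group first; LossWithTypeId adds loss_from_metric = loss / token_num and the per-dataset-type breakdowns. A toy illustration of the perplexity reduction, with made-up counter values:

import torch

total_log_probs, total_tokens = torch.tensor([1234.5]), torch.tensor([500.0])
perplexity = torch.exp(total_log_probs / total_tokens)  # exp of the mean per-token negative log-likelihood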
InternLM/internlm/model/modeling_internlm.py ADDED
@@ -0,0 +1,524 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ import math
5
+ from typing import Optional
6
+
7
+ import torch
8
+ from flash_attn.modules.embedding import ParallelGPT2Embeddings
9
+ from flash_attn.modules.mlp import ParallelFusedMLP
10
+ from torch import nn
11
+
12
+ from internlm.core.context import IS_TENSOR_PARALLEL, ParallelMode
13
+ from internlm.core.context.parallel_context import global_context as gpc
14
+ from internlm.initialize.initialize_tensor import normal_, scaled_init_method_normal
15
+ from internlm.model.embedding import Embedding1D, Embedding1DLVM
16
+ from internlm.model.linear import (
17
+ FeedForward,
18
+ RewardModelLinear,
19
+ ScaleColumnParallelLinear,
20
+ )
21
+ from internlm.model.multi_head_attention import MHA
22
+ from internlm.model.utils import gather_forward_split_backward, try_import_RMSNorm
23
+ from internlm.solver.pipeline_utils import partition_uniform
24
+ from internlm.utils.checkpoint import activation_checkpoint
25
+ from internlm.utils.common import filter_kwargs
26
+ from internlm.utils.logger import get_logger
27
+ from internlm.utils.registry import MODEL_INITIALIZER
28
+
29
+ MODEL_TYPE = "INTERNLM"
30
+
31
+ logger = get_logger(__file__)
32
+ RMSNorm = try_import_RMSNorm()
33
+
34
+
35
+ class PackedFlashBaseLayer1D(nn.Module):
36
+ """
37
+ 1D Packed Flash Base Layer.
38
+
39
+ Args:
40
+ hidden_size (int): The hidden size of model. 768 by default.
41
+ num_attention_heads (int): The number of attention heads. 12 by default.
42
+ mlp_ratio (int): The ratio of MLP layers. 4 by default.
43
+ attn_drop_rate (float): The dropout rate of attention module. 0 by default.
44
+ drop_rate (float): The dropout rate of the input hidden state. 0.0 by default.
45
+ dtype (torch.dtype): Type of data. torch.float by default.
46
+ layer_norm_epsilon (float): A value added to the denominator for numerical stability. 1e-6 by default.
47
+ checkpoint (bool): Whether to use checkpointing to save VRAM. False by default.
48
+ layer_idx (int): The index of current layer. 0 by default.
49
+ residual_in_fp32 (bool): Whether to use residual in fp32. False by default.
50
+ device (Optional[Union[str, torch.device]]): The device will be used.
51
+ norm_type (str): Use RMSNorm or LayerNorm. "rmsnorm" by default.
52
+ use_flash_attn (bool): Whether to use flash-attn. True by default.
53
+ """
54
+
55
+ def __init__(
56
+ self,
57
+ hidden_size: int = 768,
58
+ num_attention_heads: int = 12,
59
+ mlp_ratio: int = 4,
60
+ attn_drop_rate: float = 0,
61
+ drop_rate: float = 0.0,
62
+ dtype: torch.dtype = torch.float,
63
+ layer_norm_epsilon: float = 1e-6,
64
+ checkpoint: bool = False,
65
+ layer_idx: int = 0,
66
+ residual_in_fp32: bool = False,
67
+ device: Optional[torch.device] = None,
68
+ norm_type: str = "rmsnorm",
69
+ dropout_selective_checkpoint: bool = True,
70
+ use_scaled_init: bool = True,
71
+ use_swiglu: bool = True,
72
+ use_flash_attn: bool = True,
73
+ ):
74
+ super().__init__()
75
+ self.checkpoint = checkpoint
76
+ # dropout selective checkpoint can only be enabled when checkpoint is disabled.
77
+ self.dropout_selective_checkpoint = dropout_selective_checkpoint is True and checkpoint is False
78
+ self.layer_idx = layer_idx
79
+ self.use_flash_attn = use_flash_attn
80
+
81
+ head_dim = hidden_size // num_attention_heads
82
+ self.mixer = MHA(
83
+ embed_dim=hidden_size,
84
+ num_heads=num_attention_heads,
85
+ process_group=gpc.get_group(ParallelMode.TENSOR),
86
+ dropout=attn_drop_rate,
87
+ softmax_scale=1 / math.sqrt(head_dim),
88
+ causal=True,
89
+ layer_idx=layer_idx,
90
+ rotary_emb_dim=head_dim,
91
+ rotary_emb_scale_base=0,
92
+ use_flash_attn=use_flash_attn,
93
+ device=device,
94
+ dtype=dtype,
95
+ )
96
+
97
+ self.dropout1 = nn.Dropout(drop_rate)
98
+ if norm_type == "rmsnorm":
99
+ self.norm1 = RMSNorm(hidden_size, eps=layer_norm_epsilon)
100
+ self.norm2 = RMSNorm(hidden_size, eps=layer_norm_epsilon)
101
+ else:
102
+ self.norm1 = nn.LayerNorm(hidden_size, eps=layer_norm_epsilon)
103
+ self.norm2 = nn.LayerNorm(hidden_size, eps=layer_norm_epsilon)
104
+
105
+ if use_swiglu:
106
+ self.mlp = FeedForward(
107
+ hidden_size,
108
+ int(hidden_size * mlp_ratio),
109
+ out_features=hidden_size,
110
+ process_group=gpc.get_group(ParallelMode.TENSOR),
111
+ bias=False,
112
+ device=device,
113
+ dtype=dtype,
114
+ )
115
+ else:
116
+ self.mlp = ParallelFusedMLP(
117
+ hidden_size,
118
+ int(hidden_size * mlp_ratio),
119
+ out_features=hidden_size,
120
+ activation="gelu_approx",
121
+ process_group=gpc.get_group(ParallelMode.TENSOR),
122
+ bias1=False,
123
+ bias2=False,
124
+ sequence_parallel=gpc.config.parallel.sequence_parallel,
125
+ checkpoint_lvl=0,
126
+ heuristic="auto",
127
+ device=device,
128
+ dtype=dtype,
129
+ )
130
+ for _, param in self.mlp.named_parameters():
131
+ if gpc.get_world_size(ParallelMode.TENSOR) > 1:
132
+ setattr(param, IS_TENSOR_PARALLEL, True)
133
+ self.dropout2 = nn.Dropout(drop_rate)
134
+ self.use_swiglu = use_swiglu
135
+ self.use_scaled_init = use_scaled_init
136
+ self.residual_in_fp32 = residual_in_fp32 # only makes sense when using pre-norm
137
+ self.return_residual = False
138
+ self.reset_parameters()
139
+
140
+ def reset_parameters(self):
141
+ with torch.no_grad():
142
+ for name, param in self.mixer.named_parameters():
143
+ if param.ndim == 1:
144
+ param.data.zero_()
145
+ elif "Wqkv" in name:
146
+ normal_(std=0.006)(param.data)
147
+ elif self.use_scaled_init:
148
+ scaled_init_method_normal(sigma=0.006, num_layers=self.layer_idx + 1)(param.data)
149
+ else:
150
+ normal_(std=0.0015)(param.data)
151
+
152
+ for name, param in self.mlp.named_parameters():
153
+ if param.ndim == 1 and "bias" in name:
154
+ param.data.zero_()
155
+ elif self.use_swiglu:
156
+ if self.use_scaled_init and "w2" in name:
157
+ scaled_init_method_normal(sigma=0.006, num_layers=self.layer_idx + 1)(param.data)
158
+ else:
159
+ normal_(std=0.006 if "w1" in name or "w2" in name else 0.0015)(param.data)
160
+ else:
161
+ if self.use_scaled_init and "fc1" not in name:
162
+ scaled_init_method_normal(sigma=0.006, num_layers=self.layer_idx + 1)(param.data)
163
+ else:
164
+ normal_(std=0.006 if "fc1" in name else 0.0015)(param.data)
165
+
166
+ def forward(self, hidden_states, cu_seqlens=None, indexes=None, inference_params=None, max_seqlen=None):
167
+ if self.checkpoint and self.training:
168
+ return activation_checkpoint(
169
+ self._forward, False, hidden_states, cu_seqlens, indexes, inference_params, max_seqlen
170
+ )
171
+ else:
172
+ return self._forward(hidden_states, cu_seqlens, indexes, inference_params, max_seqlen)
173
+
174
+ def _forward(self, hidden_states=None, cu_seqlens=None, indexes=None, inference_params=None, max_seqlen=None):
175
+ r"""Pass the input through the encoder layer.
176
+
177
+ Args:
178
+ hidden_states: the sequence to the encoder layer (required).
179
+ residual: hidden_states = Attn/MLP(LN(residual))
180
+ cu_seqlens: 1d LongTensor of cumulative sequence lengths, len(cu_seqlens) = number of packed sequences + 1
181
+ indexes: a tensor with the same length as hidden_states, giving each token's position within its own sequence
182
+ """
183
+ mixer_kwargs = {
184
+ "cu_seqlens": cu_seqlens,
185
+ "max_seqlen": max_seqlen,
186
+ "indexes": indexes,
187
+ "inference_params": inference_params,
188
+ }
189
+
190
+ def _dropout_and_norm_attn(_hidden_states):
191
+ _dropped = self.dropout1(_hidden_states)
192
+ _residual = _dropped
193
+ _hidden_states = self.norm1(_residual.float())
194
+ return _residual, _hidden_states
195
+
196
+ if self.dropout_selective_checkpoint:
197
+ residual, hidden_states = activation_checkpoint(_dropout_and_norm_attn, False, hidden_states)
198
+ else:
199
+ residual, hidden_states = _dropout_and_norm_attn(hidden_states)
200
+
201
+ if self.residual_in_fp32:
202
+ residual = residual.to(torch.float32)
203
+
204
+ hidden_states = self.mixer(hidden_states, **mixer_kwargs)
205
+
206
+ def _dropout_and_norm_ffn(_residual, _hidden_states):
207
+ _dropped = self.dropout2(_hidden_states)
208
+ _residual = (_dropped + _residual) if _residual is not None else _dropped
209
+ _hidden_states = self.norm2(_residual.float())
210
+ return _residual, _hidden_states
211
+
212
+ if self.dropout_selective_checkpoint:
213
+ residual, hidden_states = activation_checkpoint(_dropout_and_norm_ffn, False, residual, hidden_states)
214
+ else:
215
+ residual, hidden_states = _dropout_and_norm_ffn(residual, hidden_states)
216
+
217
+ if self.residual_in_fp32:
218
+ residual = residual.to(torch.float32)
219
+
220
+ hidden_states = self.mlp(hidden_states)
221
+
222
+ return hidden_states + residual
223
+
224
+
225
+ class PackedFlashInternLm1D(nn.Module):
226
+ """
227
+ 1D Packed Flash InternLm.
228
+
229
+ Args:
230
+ num_layers (int): The number of layer. 12 by default.
231
+ hidden_size (int): The size of hidden state. 768 by default.
232
+ num_attention_heads (int): The number of attention head. 12 by default.
233
+ vocab_size (int): The size of vocabulary. 50304 by default.
234
+ mlp_ratio (int): The ratio of MLP layers. 4 by default.
235
+ attn_drop_rate (float): The dropout rate of attention module. 0.0 by default.
236
+ drop_rate (float): The dropout rate of input hidden state. 0.0 by default.
237
+ dtype (torch.dtype): The type of data. torch.float by default.
238
+ checkpoint (float): The proportion of layers that need to be checkpointed compared to the total number
239
+ of layers. 0.0 by default.
240
+ layer_norm_epsilon (float): A value added to the denominator for numerical stability. 1e-5 by default.
241
+ first (bool): Whether this stage contains the input embedding layer. False by default.
242
+ last (bool): Whether this stage contains the output norm and head. False by default.
243
+ embed_split_hidden (bool): Split the embedding layer along the hidden dimension or the vocabulary dimension.
244
+ False by default.
245
+ embed_grad_scale (float): Refer to GLM-130B, for training stability. 0.1 by default.
246
+ parallel_output (bool): If it is necessary to collect the output of parallel computing. True by default.
247
+ start_layer_idx (int): The index of start layer in the pipeline. 0 by default.
248
+ device (Optional[Union[str, torch.device]]): The device will be used. None by default.
249
+ residual_in_fp32 (bool): Whether to use residual in fp32. False by default.
250
+ norm_type (str): Normalization type. Use RMSNorm or LayerNorm. "rmsnorm" by default.
251
+ use_flash_attn (bool): Whether to use flash-attn. True by default.
252
+
253
+ """
254
+
255
+ def __init__(
256
+ self,
257
+ num_layers: int = 12,
258
+ hidden_size: int = 768,
259
+ num_attention_heads: int = 12,
260
+ vocab_size: int = 50304,
261
+ mlp_ratio: int = 4.0,
262
+ attn_drop_rate: float = 0.0,
263
+ drop_rate: float = 0.0,
264
+ dtype: torch.dtype = torch.float,
265
+ checkpoint: float = 0.0,
266
+ layer_norm_epsilon: float = 1e-5,
267
+ first: bool = False,
268
+ last: bool = False,
269
+ embed_split_hidden: bool = False,
270
+ embed_grad_scale: float = 0.1,
271
+ parallel_output: bool = True,
272
+ start_layer_idx: int = 0,
273
+ device: Optional[torch.device] = None,
274
+ residual_in_fp32: bool = False,
275
+ norm_type: str = "rmsnorm",
276
+ is_reward: bool = False,
277
+ dropout_selective_checkpoint: bool = True,
278
+ use_scaled_init: bool = True,
279
+ use_swiglu: bool = True,
280
+ use_flash_attn: bool = True,
281
+ lvm_config: dict = None,
282
+ ):
283
+ super().__init__()
284
+ self.lvm_config = lvm_config
285
+
286
+ checkpoint_layer_num = int(num_layers * checkpoint)
287
+
288
+ if is_reward:
289
+ head_cls = RewardModelLinear
290
+ else:
291
+ head_cls = ScaleColumnParallelLinear
292
+ if first:
293
+ if self.lvm_config is not None and self.lvm_config.get('enable', False):
294
+ self.embedding = Embedding1DLVM(**self.lvm_config.get('embedding_cfg'))
295
+ if self.embedding.embed_proj is not None:
296
+ for _, param in self.embedding.embed_proj.named_parameters():
297
+ normal_(std=0.0052)(param)
298
+ if gpc.get_world_size(ParallelMode.TENSOR) > 1:
299
+ setattr(param, IS_TENSOR_PARALLEL, True)
300
+ else:
301
+ if embed_split_hidden:
302
+ self.embedding = Embedding1D(num_embeddings=vocab_size, embedding_dim=hidden_size)
303
+ else:
304
+ self.embedding = ParallelGPT2Embeddings(
305
+ embed_dim=hidden_size,
306
+ vocab_size=vocab_size,
307
+ max_position_embeddings=-1,
308
+ process_group=gpc.get_group(ParallelMode.TENSOR),
309
+ padding_idx=None,
310
+ sequence_parallel=gpc.config.parallel.sequence_parallel,
311
+ device=device,
312
+ dtype=dtype,
313
+ )
314
+ for _, param in self.embedding.named_parameters():
315
+ normal_(std=0.0052)(param)
316
+ if gpc.get_world_size(ParallelMode.TENSOR) > 1:
317
+ setattr(param, IS_TENSOR_PARALLEL, True)
318
+ self.embed_grad_scale = embed_grad_scale
319
+ self.blocks = nn.ModuleList(
320
+ [
321
+ PackedFlashBaseLayer1D(
322
+ hidden_size=hidden_size,
323
+ num_attention_heads=num_attention_heads,
324
+ mlp_ratio=mlp_ratio,
325
+ attn_drop_rate=attn_drop_rate,
326
+ drop_rate=drop_rate,
327
+ dtype=dtype,
328
+ layer_norm_epsilon=layer_norm_epsilon,
329
+ checkpoint=lid < checkpoint_layer_num,
330
+ layer_idx=lid + start_layer_idx, # This parameter is used for caching during generation
331
+ residual_in_fp32=residual_in_fp32,
332
+ device=device,
333
+ norm_type=norm_type,
334
+ dropout_selective_checkpoint=dropout_selective_checkpoint,
335
+ use_scaled_init=use_scaled_init,
336
+ use_swiglu=use_swiglu,
337
+ use_flash_attn=use_flash_attn,
338
+ )
339
+ for lid in range(num_layers)
340
+ ]
341
+ )
342
+ if last:
343
+ if norm_type == "rmsnorm":
344
+ self.norm = RMSNorm(hidden_size, eps=layer_norm_epsilon)
345
+ else:
346
+ self.norm = nn.LayerNorm(hidden_size, eps=layer_norm_epsilon)
347
+ self.head = head_cls(
348
+ in_features=hidden_size,
349
+ out_features=gpc.get_world_size(ParallelMode.TENSOR) if is_reward else vocab_size,
350
+ process_group=gpc.get_group(ParallelMode.TENSOR),
351
+ bias=False,
352
+ device=device,
353
+ dtype=dtype,
354
+ weight_scale=embed_grad_scale,
355
+ )
356
+ for _, param in self.head.named_parameters():
357
+ normal_(std=0.0052)(param)
358
+ if gpc.get_world_size(ParallelMode.TENSOR) > 1:
359
+ setattr(param, IS_TENSOR_PARALLEL, True)
360
+ self.parallel_output = parallel_output
361
+
362
+ def forward(self, hidden_states=None, cu_seqlens=None, input_ids=None, indexes=None, inference_params=None):
363
+ # attention_mask: compute attention on the places where the value is 1
364
+ if hasattr(self, "embedding"):
365
+ hidden_states = self.embedding(input_ids)
366
+ if self.embed_grad_scale != 1:
367
+ hidden_states = (
368
+ self.embed_grad_scale * hidden_states + (1 - self.embed_grad_scale) * hidden_states.detach()
369
+ )
370
+ if isinstance(cu_seqlens, list):
371
+ assert len(cu_seqlens) == 1
372
+ cu_seqlens = cu_seqlens[0].to(hidden_states.device)
373
+
374
+ if cu_seqlens is not None:
375
+ cu_seqlens = cu_seqlens.squeeze(0)
376
+ hidden_states = hidden_states.squeeze(0) # If cu_seqlens is passed in, the input is packed,
377
+ # so the batch dimension of size 1 is squeezed off directly.
378
+
379
+ if indexes is not None:
380
+ assert len(indexes) == 1
381
+ # The indexes are used to indicate the actual position IDs of each token in the packed input.
382
+ indexes = indexes[0]
383
+ max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() if cu_seqlens is not None else None
384
+
385
+ for _, block in enumerate(self.blocks):
386
+ hidden_states = block(
387
+ hidden_states,
388
+ cu_seqlens=cu_seqlens,
389
+ indexes=indexes,
390
+ inference_params=inference_params,
391
+ max_seqlen=max_seqlen,
392
+ )
393
+
394
+ if hasattr(self, "norm"):
395
+ hidden_states = self.norm(hidden_states.float())
396
+ if hasattr(self, "head"):
397
+ hidden_states = self.head(hidden_states)
398
+
399
+ if not self.parallel_output:
400
+ hidden_states = gather_forward_split_backward(hidden_states, ParallelMode.TENSOR, dim=-1)
401
+ return hidden_states
402
+
403
+
404
+ def _build_generic_model_1d(num_layers, num_chunks, device=torch.device("cuda"), **kwargs):
405
+ """
406
+ build generic model 1d
407
+
408
+ Args:
409
+ num_layers (int): The number of layer.
410
+ num_chunks (int): The number of partitions in pipeline parallel.
411
+ device (Optional[Union[str, torch.device]]): The device will be used. torch.device("cuda") by default.
412
+
413
+ """
414
+ pipeline_size = gpc.get_world_size(ParallelMode.PIPELINE)
415
+ pipeline_rank = gpc.get_local_rank(ParallelMode.PIPELINE)
416
+
417
+ all_parts = partition_uniform(num_layers, pipeline_size, num_chunks)
418
+ parts = all_parts[pipeline_rank]
419
+ if gpc.is_rank_for_log():
420
+ logger.info(f"The layer sharding is {all_parts}.")
421
+
422
+ models = []
423
+
424
+ for start, end in parts:
425
+ kwargs["num_layers"] = end - start
426
+ kwargs["first"] = start == 0
427
+ # Mark this chunk as the last stage only if it ends at the final layer and the last partition is non-empty.
428
+ kwargs["last"] = end == num_layers and len(all_parts[-1]) != 0
429
+ kwargs["device"] = device
430
+ kwargs["start_layer_idx"] = start
431
+ chunk = PackedFlashInternLm1D(**filter_kwargs(PackedFlashInternLm1D.__init__, kwargs)).to(device)
432
+
433
+ models.append(chunk)
434
+ torch.distributed.barrier()
435
+ if len(models) == 1:
436
+ model = models[0]
437
+ else:
438
+ model = nn.ModuleList(models)
439
+
440
+ return model
441
+
442
+
443
+ @MODEL_INITIALIZER.register_module(module_name=MODEL_TYPE)
444
+ def build_model_with_cfg(
445
+ num_chunks=1,
446
+ checkpoint=0.0,
447
+ dtype=torch.float,
448
+ embed_split_hidden=False,
449
+ num_layers=48,
450
+ hidden_size=2048,
451
+ vocab_size=50304,
452
+ embed_grad_scale=1,
453
+ parallel_output=True,
454
+ num_attention_heads=32,
455
+ mlp_ratio=4.0,
456
+ residual_in_fp32=False,
457
+ norm_type="rmsnorm",
458
+ drop_rate=0,
459
+ attn_drop_rate=0,
460
+ apply_post_layer_norm=False, # pylint: disable=W0613
461
+ layer_norm_epsilon=1e-5,
462
+ is_reward=False,
463
+ dropout_selective_checkpoint=True,
464
+ use_scaled_init: bool = True,
465
+ use_swiglu: bool = True,
466
+ use_flash_attn: bool = True,
467
+ lvm_config=None,
468
+ ):
469
+ """
470
+ Build model with config.
471
+
472
+ Args:
473
+ num_chunks (int): The number of partitions in pipeline parallel. 1 by default.
474
+ checkpoint (float): The proportion of layers to checkpoint to save VRAM. 0.0 by default.
475
+ dtype (torch.dtype): The type of data. torch.float by default.
476
+ embed_split_hidden (bool): Split the embedding layer along the hidden dimension or the vocabulary dimension.
477
+ False by default.
478
+ num_layers (int): The number of layer. 48 by default.
479
+ hidden_size (int): The size of hidden state. 2048 by default.
480
+ vocab_size (int): The size of vocabulary. 50304 by default.
481
+ embed_grad_scale (float): Refer to GLM-130B, for training stability. 0.1 by default.
482
+ parallel_output (bool): If it is necessary to collect the output of parallel computing. True by default.
483
+ num_attention_heads (int): The number of attention head. 32 by default.
484
+ mlp_ratio (int): The ratio of MLP layers. 4.0 by default.
485
+ residual_in_fp32 (bool): Whether to use residual in fp32. False by default. It cannot be used temporarily
486
+ because this parameter requires inconsistent data types to be passed between pipelines,
487
+ which requires significant modifications to internlm.
488
+ norm_type (str): Normalization type. Use RMSNorm or LayerNorm. "rmsnorm" by default.
489
+ drop_rate (float): The dropout rate of input hidden state. 0 by default.
490
+ attn_drop_rate (float): The dropout rate of attention module. 0 by default.
491
+ apply_post_layer_norm (bool): Whether to apply post layer norm. False by default.
492
+ layer_norm_epsilon (float): A value added to the denominator for numerical stability. 1e-5 by default.
493
+ is_reward (bool): Whether to use reward model. False by default.
494
+ dropout_selective_checkpoint (bool): It can only be enabled when checkpoint is disabled. True by default.
495
+ use_scaled_init (bool): Whether to use scaled init. True by default.
496
+ use_swiglu (bool): Whether to use swiglu. True by default.
497
+ use_flash_attn (bool): Whether to use flash-attn. True by default.
498
+
499
+ """
500
+
501
+ cfg = dict(
502
+ hidden_size=hidden_size,
503
+ num_attention_heads=num_attention_heads,
504
+ checkpoint=checkpoint,
505
+ dtype=dtype,
506
+ embed_split_hidden=embed_split_hidden,
507
+ vocab_size=vocab_size,
508
+ embed_grad_scale=embed_grad_scale,
509
+ parallel_output=parallel_output,
510
+ mlp_ratio=mlp_ratio,
511
+ residual_in_fp32=residual_in_fp32,
512
+ norm_type=norm_type,
513
+ drop_rate=drop_rate,
514
+ attn_drop_rate=attn_drop_rate,
515
+ layer_norm_epsilon=layer_norm_epsilon,
516
+ is_reward=is_reward,
517
+ dropout_selective_checkpoint=dropout_selective_checkpoint,
518
+ use_scaled_init=use_scaled_init,
519
+ use_swiglu=use_swiglu,
520
+ use_flash_attn=use_flash_attn,
521
+ lvm_config=lvm_config,
522
+ )
523
+
524
+ return _build_generic_model_1d(num_layers=num_layers, num_chunks=num_chunks, **cfg)
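
For context on the (start, end) layer ranges consumed above, here is a simplified, self-contained sketch of uniform pipeline layer partitioning. It only mimics the output shape that partition_uniform feeds into _build_generic_model_1d (a list of (start, end) chunks per pipeline rank) and is not the repository's actual implementation:

def partition_uniform_sketch(num_layers, pipeline_size, num_chunks=1):
    # For every pipeline rank, return a list of (start, end) layer ranges,
    # one range per chunk, covering [0, num_layers) without overlap.
    parts = [[] for _ in range(pipeline_size)]
    chunk_size = num_layers // (pipeline_size * num_chunks)
    start = 0
    for chunk in range(num_chunks):
        for rank in range(pipeline_size):
            end = start + chunk_size
            if chunk == num_chunks - 1 and rank == pipeline_size - 1:
                end = num_layers  # hand any remainder to the final chunk
            parts[rank].append((start, end))
            start = end
    return parts

print(partition_uniform_sketch(12, pipeline_size=4))                # [[(0, 3)], [(3, 6)], [(6, 9)], [(9, 12)]]
print(partition_uniform_sketch(12, pipeline_size=2, num_chunks=2))  # [[(0, 3), (6, 9)], [(3, 6), (9, 12)]]
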
InternLM/internlm/model/modeling_vit.py ADDED
@@ -0,0 +1,527 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ import math
5
+ from typing import Optional
6
+
7
+ import torch
8
+ from flash_attn.modules.embedding import ParallelGPT2Embeddings
9
+ from flash_attn.modules.mlp import ParallelFusedMLP
10
+ from torch import nn
11
+
12
+ from internlm.core.context import IS_TENSOR_PARALLEL, ParallelMode
13
+ from internlm.core.context.parallel_context import global_context as gpc
14
+ from internlm.initialize.initialize_tensor import normal_, scaled_init_method_normal
15
+ from internlm.model.embedding import Embedding1D, Embedding1DLVM
16
+ from internlm.model.linear import (
17
+ FeedForward,
18
+ RewardModelLinear,
19
+ ScaleColumnParallelLinear,
20
+ )
21
+ from internlm.model.multi_head_attention import MHA
22
+ from internlm.model.utils import gather_forward_split_backward, try_import_RMSNorm, try_import_LayerNorm
23
+ from internlm.solver.pipeline_utils import partition_uniform
24
+ from internlm.utils.checkpoint import activation_checkpoint
25
+ from internlm.utils.common import filter_kwargs
26
+ from internlm.utils.logger import get_logger
27
+ from internlm.utils.registry import MODEL_INITIALIZER
28
+
29
+ MODEL_TYPE = "ViT"
30
+
31
+ logger = get_logger(__file__)
32
+ RMSNorm = try_import_RMSNorm()
33
+ LayerNorm = try_import_LayerNorm()
34
+
35
+ def drop_path(x, drop_prob: float = 0., training: bool = False, scale_by_keep: bool = True):
36
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
37
+
38
+ This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
39
+ the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
40
+ See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
41
+ changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
42
+ 'survival rate' as the argument.
43
+
44
+ """
45
+ if drop_prob == 0. or not training:
46
+ return x
47
+ keep_prob = 1 - drop_prob
48
+ shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
49
+ random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
50
+ if keep_prob > 0.0 and scale_by_keep:
51
+ random_tensor.div_(keep_prob)
52
+ return x * random_tensor
53
+
54
+
55
+ class DropPath(nn.Module):
56
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
57
+ """
58
+ def __init__(self, drop_prob=None, scale_by_keep=True):
59
+ super(DropPath, self).__init__()
60
+ self.drop_prob = drop_prob
61
+ self.scale_by_keep = scale_by_keep
62
+
63
+ def forward(self, x):
64
+ return drop_path(x, self.drop_prob, self.training, self.scale_by_keep)
65
+
66
+
67
+ class PackedFlashBaseLayer1D(nn.Module):
68
+ """
69
+ 1D Packed Flash Base Layer.
70
+
71
+ Args:
72
+ hidden_size (int): The hidden size of model. 768 by default.
73
+ num_attention_heads (int): The number of attention heads. 12 by default.
74
+ mlp_ratio (int): The ratio of MLP layers. 4 by default.
75
+ attn_drop_rate (float): The dropout rate of attention module. 0 by default.
76
+ drop_path_rate (float): The drop path rate of the input hidden state. 0.0 by default.
77
+ dtype (torch.dtype): Type of data. torch.float by default.
78
+ layer_norm_epsilon (float): A value added to the denominator for numerical stability. 1e-6 by default.
80
+ checkpoint (bool): Whether to use checkpointing to save VRAM. False by default.
80
+ layer_idx (int): The index of current layer. 0 by default.
81
+ residual_in_fp32 (bool): Whether to use residual in fp32. False by default.
82
+ device (Optional[Union[str, torch.device]]): The device will be used.
83
+ norm_type (str): Use RMSNorm or LayerNorm. "rmsnorm" by default.
84
+ use_flash_attn (bool): Whether to use flash-attn. True by default.
85
+ """
86
+
87
+ def __init__(
88
+ self,
89
+ hidden_size: int = 768,
90
+ num_attention_heads: int = 12,
91
+ mlp_ratio: int = 4,
92
+ mlp_bias: bool = False,
93
+ attn_drop_rate: float = 0,
94
+ drop_path_rate: float = 0.0,
95
+ dtype: torch.dtype = torch.float,
96
+ layer_norm_epsilon: float = 1e-6,
97
+ checkpoint: bool = False,
98
+ layer_idx: int = 0,
99
+ residual_in_fp32: bool = False,
100
+ device: Optional[torch.device] = None,
101
+ norm_type: str = "rmsnorm",
102
+ dropout_selective_checkpoint: bool = True,
103
+ use_scaled_init: bool = True,
104
+ use_swiglu: bool = True,
105
+ use_flash_attn: bool = True,
106
+ ):
107
+ super().__init__()
108
+ self.checkpoint = checkpoint
109
+ # dropout selective checkpoint can only be enabled when checkpoint is disabled.
110
+ self.dropout_selective_checkpoint = dropout_selective_checkpoint is True and checkpoint is False
111
+ self.layer_idx = layer_idx
112
+ self.use_flash_attn = use_flash_attn
113
+
114
+ head_dim = hidden_size // num_attention_heads
115
+ self.mixer = MHA(
116
+ embed_dim=hidden_size,
117
+ num_heads=num_attention_heads,
118
+ process_group=gpc.get_group(ParallelMode.TENSOR),
119
+ dropout=attn_drop_rate,
120
+ softmax_scale=1 / math.sqrt(head_dim),
121
+ causal=True,
122
+ layer_idx=layer_idx,
123
+ rotary_emb_dim=head_dim,
124
+ rotary_emb_scale_base=0,
125
+ use_flash_attn=use_flash_attn,
126
+ device=device,
127
+ dtype=dtype,
128
+ )
129
+
130
+ self.dropout1 = DropPath(drop_path_rate)
131
+ if norm_type == "rmsnorm":
132
+ self.norm1 = RMSNorm(hidden_size, eps=layer_norm_epsilon)
133
+ self.norm2 = RMSNorm(hidden_size, eps=layer_norm_epsilon)
134
+ else:
135
+ self.norm1 = LayerNorm(hidden_size, eps=layer_norm_epsilon)
136
+ self.norm2 = LayerNorm(hidden_size, eps=layer_norm_epsilon)
137
+
138
+ self.mlp = ParallelFusedMLP(
139
+ hidden_size,
140
+ int(hidden_size * mlp_ratio),
141
+ out_features=hidden_size,
142
+ activation="gelu_approx",
143
+ process_group=gpc.get_group(ParallelMode.TENSOR),
144
+ bias1=mlp_bias,
145
+ bias2=mlp_bias,
146
+ sequence_parallel=gpc.config.parallel.sequence_parallel,
147
+ checkpoint_lvl=0,
148
+ heuristic="auto",
149
+ device=device,
150
+ dtype=dtype,
151
+ )
152
+ for _, param in self.mlp.named_parameters():
153
+ if gpc.get_world_size(ParallelMode.TENSOR) > 1:
154
+ setattr(param, IS_TENSOR_PARALLEL, True)
155
+ self.dropout2 = DropPath(drop_path_rate)
156
+ self.use_swiglu = use_swiglu
157
+ self.use_scaled_init = use_scaled_init
158
+ self.residual_in_fp32 = residual_in_fp32 # only makes sense when using pre-norm
159
+ self.return_residual = False
160
+ self.reset_parameters()
161
+
162
+ def reset_parameters(self):
163
+ with torch.no_grad():
164
+ for name, param in self.mixer.named_parameters():
165
+ if param.ndim == 1:
166
+ param.data.zero_()
167
+ elif "Wqkv" in name:
168
+ normal_(std=0.006)(param.data)
169
+ elif self.use_scaled_init:
170
+ scaled_init_method_normal(sigma=0.006, num_layers=self.layer_idx + 1)(param.data)
171
+ else:
172
+ normal_(std=0.0015)(param.data)
173
+
174
+ for name, param in self.mlp.named_parameters():
175
+ if param.ndim == 1 and "bias" in name:
176
+ param.data.zero_()
177
+ elif self.use_swiglu:
178
+ if self.use_scaled_init and "w2" in name:
179
+ scaled_init_method_normal(sigma=0.006, num_layers=self.layer_idx + 1)(param.data)
180
+ else:
181
+ normal_(std=0.006 if "w1" in name or "w2" in name else 0.0015)(param.data)
182
+ else:
183
+ if self.use_scaled_init and "fc1" not in name:
184
+ scaled_init_method_normal(sigma=0.006, num_layers=self.layer_idx + 1)(param.data)
185
+ else:
186
+ normal_(std=0.006 if "fc1" in name else 0.0015)(param.data)
187
+
188
+ def forward(self, hidden_states, cu_seqlens=None, indexes=None, inference_params=None, max_seqlen=None):
189
+ if self.checkpoint and self.training:
190
+ return activation_checkpoint(
191
+ self._forward, False, hidden_states, cu_seqlens, indexes, inference_params, max_seqlen
192
+ )
193
+ else:
194
+ return self._forward(hidden_states, cu_seqlens, indexes, inference_params, max_seqlen)
195
+
196
+ def _forward(self, hidden_states=None, cu_seqlens=None, indexes=None, inference_params=None, max_seqlen=None):
197
+ r"""Pass the input through the encoder layer.
198
+
199
+ Args:
200
+ hidden_states: the sequence to the encoder layer (required).
201
+ residual: hidden_states = Attn/MLP(LN(residual))
202
+ cu_seqlens: 1d LongTensor of cumulative sequence lengths, len(cu_seqlens) = number of packed sequences + 1
203
+ indexes: a tensor with the same length as hidden_states, giving each token's position within its own sequence
204
+ """
205
+ mixer_kwargs = {
206
+ "cu_seqlens": cu_seqlens,
207
+ "max_seqlen": max_seqlen,
208
+ "indexes": indexes,
209
+ "inference_params": inference_params,
210
+ }
211
+
212
+ residual = hidden_states
213
+
214
+ hidden_states = self.norm1(residual.float())
215
+ hidden_states = self.mixer(hidden_states, **mixer_kwargs)
216
+ hidden_states = self.dropout1(hidden_states)
217
+
218
+ residual = residual + hidden_states
219
+
220
+ hidden_states = self.norm2(residual.float())
221
+ hidden_states = self.mlp(hidden_states)
222
+ hidden_states = self.dropout2(hidden_states)
223
+
224
+ return hidden_states + residual
225
+
226
+
227
+ class PackedFlashInternLm1D(nn.Module):
228
+ """
229
+ 1D Packed Flash InternLm.
230
+
231
+ Args:
232
+ num_layers (int): The number of layer. 12 by default.
233
+ hidden_size (int): The size of hidden state. 768 by default.
234
+ num_attention_heads (int): The number of attention head. 12 by default.
235
+ vocab_size (int): The size of vocabulary. 50304 by default.
236
+ mlp_ratio (int): The ratio of MLP layers. 4 by default.
237
+ attn_drop_rate (float): The dropout rate of attention module. 0.0 by default.
238
+ drop_path_rate (float): The drop path rate of input hidden state. 0.0 by default.
239
+ dtype (torch.dtype): The type of data. torch.float by default.
240
+ checkpoint (float): The proportion of layers that need to be checkpointed compared to the total number
241
+ of layers. 0.0 by default.
242
+ layer_norm_epsilon (float): A value added to the denominator for numerical stability. 1e-5 by default.
243
+ first (bool): Whether this stage contains the input embedding layer. False by default.
244
+ last (bool): Whether this stage contains the output norm and head. False by default.
245
+ embed_split_hidden (bool): Split the embedding layer along the hidden dimension or the vocabulary dimension.
246
+ False by default.
247
+ embed_grad_scale (float): Refer to GLM-130B, for training stability. 0.1 by default.
248
+ parallel_output (bool): If it is necessary to collect the output of parallel computing. True by default.
249
+ start_layer_idx (int): The index of start layer in the pipeline. 0 by default.
250
+ device (Optional[Union[str, torch.device]]): The device will be used. None by default.
251
+ residual_in_fp32 (bool): Whether to use residual in fp32. False by default.
252
+ norm_type (str): Normalization type. Use RMSNorm or LayerNorm. "rmsnorm" by default.
253
+ use_flash_attn (bool): Whether to use flash-attn. True by default.
254
+
255
+ """
256
+
257
+ def __init__(
258
+ self,
259
+ num_layers: int = 12,
260
+ hidden_size: int = 768,
261
+ num_attention_heads: int = 12,
262
+ vocab_size: int = 50304,
263
+ mlp_ratio: int = 4.0,
264
+ mlp_bias: bool = False,
265
+ attn_drop_rate: float = 0.0,
266
+ drop_path_rate: float = 0.0,
267
+ dtype: torch.dtype = torch.float,
268
+ checkpoint: float = 0.0,
269
+ layer_norm_epsilon: float = 1e-5,
270
+ first: bool = False,
271
+ last: bool = False,
272
+ embed_split_hidden: bool = False,
273
+ embed_grad_scale: float = 0.1,
274
+ parallel_output: bool = True,
275
+ start_layer_idx: int = 0,
276
+ device: Optional[torch.device] = None,
277
+ residual_in_fp32: bool = False,
278
+ norm_type: str = "rmsnorm",
279
+ is_reward: bool = False,
280
+ dropout_selective_checkpoint: bool = True,
281
+ use_scaled_init: bool = True,
282
+ use_swiglu: bool = True,
283
+ use_flash_attn: bool = True,
284
+ lvm_config: dict = None,
285
+ ):
286
+ super().__init__()
287
+ self.lvm_config = lvm_config
288
+
289
+ checkpoint_layer_num = int(num_layers * checkpoint)
290
+
291
+ head_cls = ScaleColumnParallelLinear
292
+ if first:
293
+ if self.lvm_config is not None and self.lvm_config.get('enable', False):
294
+ self.embedding = Embedding1DLVM(**self.lvm_config.get('embedding_cfg'))
295
+ if self.embedding.embed_proj is not None:
296
+ for _, param in self.embedding.embed_proj.named_parameters():
297
+ normal_(std=0.0052)(param)
298
+ if gpc.get_world_size(ParallelMode.TENSOR) > 1:
299
+ setattr(param, IS_TENSOR_PARALLEL, True)
300
+ else:
301
+ if embed_split_hidden:
302
+ self.embedding = Embedding1D(num_embeddings=vocab_size, embedding_dim=hidden_size)
303
+ else:
304
+ self.embedding = ParallelGPT2Embeddings(
305
+ embed_dim=hidden_size,
306
+ vocab_size=vocab_size,
307
+ max_position_embeddings=-1,
308
+ process_group=gpc.get_group(ParallelMode.TENSOR),
309
+ padding_idx=None,
310
+ sequence_parallel=gpc.config.parallel.sequence_parallel,
311
+ device=device,
312
+ dtype=dtype,
313
+ )
314
+ for _, param in self.embedding.named_parameters():
315
+ normal_(std=0.0052)(param)
316
+ if gpc.get_world_size(ParallelMode.TENSOR) > 1:
317
+ setattr(param, IS_TENSOR_PARALLEL, True)
318
+ self.embed_grad_scale = embed_grad_scale
319
+ self.blocks = nn.ModuleList(
320
+ [
321
+ PackedFlashBaseLayer1D(
322
+ hidden_size=hidden_size,
323
+ num_attention_heads=num_attention_heads,
324
+ mlp_ratio=mlp_ratio,
325
+ mlp_bias=mlp_bias,
326
+ attn_drop_rate=attn_drop_rate,
327
+ drop_path_rate=drop_path_rate,
328
+ dtype=dtype,
329
+ layer_norm_epsilon=layer_norm_epsilon,
330
+ checkpoint=lid < checkpoint_layer_num,
331
+ layer_idx=lid + start_layer_idx, # This parameter is used for caching during generation
332
+ residual_in_fp32=residual_in_fp32,
333
+ device=device,
334
+ norm_type=norm_type,
335
+ dropout_selective_checkpoint=dropout_selective_checkpoint,
336
+ use_scaled_init=use_scaled_init,
337
+ use_swiglu=use_swiglu,
338
+ use_flash_attn=use_flash_attn,
339
+ )
340
+ for lid in range(num_layers)
341
+ ]
342
+ )
343
+ if last:
344
+ if norm_type == "rmsnorm":
345
+ self.norm = RMSNorm(hidden_size, eps=layer_norm_epsilon)
346
+ else:
347
+ self.norm = LayerNorm(hidden_size, eps=layer_norm_epsilon)
348
+ self.head = head_cls(
349
+ in_features=hidden_size,
350
+ out_features=gpc.get_world_size(ParallelMode.TENSOR) if is_reward else vocab_size,
351
+ process_group=gpc.get_group(ParallelMode.TENSOR),
352
+ bias=False,
353
+ device=device,
354
+ dtype=dtype,
355
+ weight_scale=embed_grad_scale,
356
+ )
357
+ for _, param in self.head.named_parameters():
358
+ normal_(std=0.0052)(param)
359
+ if gpc.get_world_size(ParallelMode.TENSOR) > 1:
360
+ setattr(param, IS_TENSOR_PARALLEL, True)
361
+ self.parallel_output = parallel_output
362
+
363
+ def forward(self, hidden_states=None, cu_seqlens=None, input_ids=None, indexes=None, inference_params=None):
364
+ # attention_mask: compute attention on the places where the value is 1
365
+ if hasattr(self, "embedding"):
366
+ hidden_states = self.embedding(input_ids)
367
+ if self.embed_grad_scale != 1:
368
+ hidden_states = (
369
+ self.embed_grad_scale * hidden_states + (1 - self.embed_grad_scale) * hidden_states.detach()
370
+ )
371
+ if isinstance(cu_seqlens, list):
372
+ assert len(cu_seqlens) == 1
373
+ cu_seqlens = cu_seqlens[0].to(hidden_states.device)
374
+
375
+ if cu_seqlens is not None:
376
+ cu_seqlens = cu_seqlens.squeeze(0)
377
+ hidden_states = hidden_states.squeeze(0) # If cu_seqlens is passed in, the input is packed,
377
+ # so the batch dimension of size 1 is squeezed off directly.
379
+
380
+ if indexes is not None:
381
+ assert len(indexes) == 1
382
+ # The indexes are used to indicate the actual position IDs of each token in the packed input.
383
+ indexes = indexes[0]
384
+ max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() if cu_seqlens is not None else None
385
+
386
+ for _, block in enumerate(self.blocks):
387
+ hidden_states = block(
388
+ hidden_states,
389
+ cu_seqlens=cu_seqlens,
390
+ indexes=indexes,
391
+ inference_params=inference_params,
392
+ max_seqlen=max_seqlen,
393
+ )
394
+
395
+ if hasattr(self, "norm"):
396
+ hidden_states = self.norm(hidden_states.float())
397
+ if hasattr(self, "head"):
398
+ hidden_states = self.head(hidden_states)
399
+
400
+ if not self.parallel_output:
401
+ hidden_states = gather_forward_split_backward(hidden_states, ParallelMode.TENSOR, dim=-1)
402
+ return hidden_states
403
+
404
+
405
+ def _build_generic_model_1d(num_layers, num_chunks, device=torch.device("cuda"), **kwargs):
406
+ """
407
+ build generic model 1d
408
+
409
+ Args:
410
+ num_layers (int): The number of layer.
411
+ num_chunks (int): The number of partitions in pipeline parallel.
412
+ device (Optional[Union[str, torch.device]]): The device will be used. torch.device("cuda") by default.
413
+
414
+ """
415
+ pipeline_size = gpc.get_world_size(ParallelMode.PIPELINE)
416
+ pipeline_rank = gpc.get_local_rank(ParallelMode.PIPELINE)
417
+
418
+ all_parts = partition_uniform(num_layers, pipeline_size, num_chunks)
419
+ parts = all_parts[pipeline_rank]
420
+ if gpc.is_rank_for_log():
421
+ logger.info(f"The layer sharding is {all_parts}.")
422
+
423
+ models = []
424
+
425
+ for start, end in parts:
426
+ kwargs["num_layers"] = end - start
427
+ kwargs["first"] = start == 0
428
+ # Mark this chunk as the last stage only if it ends at the final layer and the last partition is non-empty.
429
+ kwargs["last"] = end == num_layers and len(all_parts[-1]) != 0
430
+ kwargs["device"] = device
431
+ kwargs["start_layer_idx"] = start
432
+ chunk = PackedFlashInternLm1D(**filter_kwargs(PackedFlashInternLm1D.__init__, kwargs)).to(device)
433
+
434
+ models.append(chunk)
435
+ torch.distributed.barrier()
436
+ if len(models) == 1:
437
+ model = models[0]
438
+ else:
439
+ model = nn.ModuleList(models)
440
+
441
+ return model
442
+
443
+
444
+ @MODEL_INITIALIZER.register_module(module_name=MODEL_TYPE)
445
+ def build_vit_model_with_cfg(
446
+ num_chunks=1,
447
+ checkpoint=0.0,
448
+ dtype=torch.float,
449
+ embed_split_hidden=False,
450
+ num_layers=48,
451
+ hidden_size=2048,
452
+ vocab_size=50304,
453
+ embed_grad_scale=1,
454
+ parallel_output=True,
455
+ num_attention_heads=32,
456
+ mlp_ratio=4.0,
457
+ mlp_bias: bool = False,
458
+ residual_in_fp32=False,
459
+ norm_type="rmsnorm",
460
+ drop_path_rate=0,
461
+ attn_drop_rate=0,
462
+ apply_post_layer_norm=False, # pylint: disable=W0613
463
+ layer_norm_epsilon=1e-5,
464
+ is_reward=False,
465
+ dropout_selective_checkpoint=True,
466
+ use_scaled_init: bool = True,
467
+ use_swiglu: bool = True,
468
+ use_flash_attn: bool = True,
469
+ lvm_config=None,
470
+ ):
471
+ """
472
+ Build model with config.
473
+
474
+ Args:
475
+ num_chunks (int): The number of partitions in pipeline parallel. 1 by default.
476
+ checkpoint (float): The proportion of layers to checkpoint to save VRAM. 0.0 by default.
477
+ dtype (torch.dtype): The type of data. torch.float by default.
478
+ embed_split_hidden (bool): Split the embedding layer along the hidden dimension or the vocabulary dimension.
479
+ False by default.
480
+ num_layers (int): The number of layer. 48 by default.
481
+ hidden_size (int): The size of hidden state. 2048 by default.
482
+ vocab_size (int): The size of vocabulary. 50304 by default.
483
+ embed_grad_scale (float): Refer to GLM-130B, for training stability. 0.1 by default.
484
+ parallel_output (bool): If it is necessary to collect the output of parallel computing. True by default.
485
+ num_attention_heads (int): The number of attention head. 32 by default.
486
+ mlp_ratio (int): The ratio of MLP layers. 4.0 by default.
487
+ residual_in_fp32 (bool): Whether to use residual in fp32. False by default. It cannot be used temporarily
488
+ because this parameter requires inconsistent data types to be passed between pipelines,
489
+ which requires significant modifications to internlm.
490
+ norm_type (str): Normalization type. Use RMSNorm or LayerNorm. "rmsnorm" by default.
491
+ drop_path_rate (float): The drop path rate of the input hidden state. 0 by default.
492
+ attn_drop_rate (float): The dropout rate of attention module. 0 by default.
493
+ apply_post_layer_norm (bool): Whether to apply post layer norm. False by default.
494
+ layer_norm_epsilon (float): A value added to the denominator for numerical stability. 1e-5 by default.
495
+ is_reward (bool): Whether to use reward model. False by default.
496
+ dropout_selective_checkpoint (bool): It can only be enabled when checkpoint is disabled. True by default.
497
+ use_scaled_init (bool): Whether to use scaled init. True by default.
498
+ use_swiglu (bool): Whether to use swiglu. True by default.
499
+ use_flash_attn (bool): Whether to use flash-attn. True by default.
500
+
501
+ """
502
+
503
+ cfg = dict(
504
+ hidden_size=hidden_size,
505
+ num_attention_heads=num_attention_heads,
506
+ checkpoint=checkpoint,
507
+ dtype=dtype,
508
+ embed_split_hidden=embed_split_hidden,
509
+ vocab_size=vocab_size,
510
+ embed_grad_scale=embed_grad_scale,
511
+ parallel_output=parallel_output,
512
+ mlp_ratio=mlp_ratio,
513
+ mlp_bias=mlp_bias,
514
+ residual_in_fp32=residual_in_fp32,
515
+ norm_type=norm_type,
516
+ drop_path_rate=drop_path_rate,
517
+ attn_drop_rate=attn_drop_rate,
518
+ layer_norm_epsilon=layer_norm_epsilon,
519
+ is_reward=is_reward,
520
+ dropout_selective_checkpoint=dropout_selective_checkpoint,
521
+ use_scaled_init=use_scaled_init,
522
+ use_swiglu=use_swiglu,
523
+ use_flash_attn=use_flash_attn,
524
+ lvm_config=lvm_config,
525
+ )
526
+
527
+ return _build_generic_model_1d(num_layers=num_layers, num_chunks=num_chunks, **cfg)
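
As a quick, standalone illustration of the stochastic-depth behaviour used by the ViT blocks above: at evaluation time drop_path is the identity, while during training it zeroes whole samples and rescales the survivors by 1/keep_prob. The function below repeats the drop_path logic from this file so the snippet runs without the repository:

import torch

def drop_path(x, drop_prob=0.0, training=False, scale_by_keep=True):
    # Same logic as drop_path in modeling_vit.py.
    if drop_prob == 0.0 or not training:
        return x
    keep_prob = 1 - drop_prob
    shape = (x.shape[0],) + (1,) * (x.ndim - 1)          # one Bernoulli draw per sample
    random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
    if keep_prob > 0.0 and scale_by_keep:
        random_tensor.div_(keep_prob)
    return x * random_tensor

x = torch.ones(4, 2, 3)                                   # (batch, seq, hidden)
print(drop_path(x, 0.5, training=False)[:, 0, 0])         # tensor([1., 1., 1., 1.]) -> identity at eval
print(drop_path(x, 0.5, training=True)[:, 0, 0])          # each entry is 0. or 2. (= 1 / keep_prob)
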
InternLM/internlm/model/multi_head_attention.py ADDED
@@ -0,0 +1,186 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ from typing import Optional
5
+
6
+ import torch
7
+ from einops import rearrange
8
+ from flash_attn.modules.mha import (
9
+ CrossAttention,
10
+ FlashCrossAttention,
11
+ FlashSelfAttention,
12
+ SelfAttention,
13
+ _update_kv_cache,
14
+ )
15
+ from torch import nn
16
+
17
+ from internlm.core.context import IS_TENSOR_PARALLEL, ParallelMode
18
+ from internlm.core.context import global_context as gpc
19
+ from internlm.model.embedding import RotaryEmbedding
20
+ from internlm.model.linear import ColumnParallelLinearTorch, RowParallelLinearTorch
21
+
22
+
23
+ class MHA(nn.Module):
24
+ """
25
+ Multi-head self-attention and cross-attention.
26
+
27
+ Args:
28
+ embed_dim (int): The dimension of the hidden state.
29
+ num_heads (int): The number of attention heads.
30
+ process_group (torch.distributed.ProcessGroup): The group of the current device for `parallel_mode`.
31
+ bias (boolean): Whether the bias is needed for linears. Will be used when initializing QKV matrix and
32
+ output projection. True by default.
33
+ dropout (float): The dropout rate for cross attention and self attention. 0.0 by default.
34
+ softmax_scale (float): The temperature to use for the softmax attention.
35
+ causal (boolean): Whether to apply causal attention mask. False by default.
36
+ layer_idx (int): The index of current layer. None by default.
37
+ rotary_emb_dim (int): The dimension of the rotary embedding. 0 by default.
38
+ rotary_emb_scale_base (int): The scaling factor of Rotary Embedding. If scale_base > 0, this implements
39
+ XPos(Sun et al., https://arxiv.org/abs/2212.10554). 0 by default.
40
+ use_flash_attn (boolean): Whether to use flash attention or not. If False, the vanilla attention module will be used.
42
+ True by default.
42
+ sequence_parallel (boolean): If True, we're doing Tensor Parallel with sequence parallelism. An all_gather_raw
43
+ of x will be done before doing the matmul.
44
+ device (Optional[Union[str, torch.device]]): The device will be used.
45
+ dtype (Optional[torch.dtype]): The type of data.
46
+ use_flash_attn (bool): Whether to use flash-attn. True by default.
47
+
48
+ """
49
+
50
+ def __init__(
51
+ self,
52
+ embed_dim: int,
53
+ num_heads: int,
54
+ process_group: Optional[torch.distributed.ProcessGroup],
55
+ dropout: float = 0.0,
56
+ softmax_scale: float = None,
57
+ causal: bool = False,
58
+ layer_idx: int = None,
59
+ rotary_emb_dim: int = 0,
60
+ rotary_emb_scale_base: int = 0,
61
+ use_flash_attn: bool = True,
62
+ device: Optional[torch.device] = None,
63
+ dtype: Optional[torch.dtype] = None,
64
+ ) -> None:
65
+ factory_kwargs = {"device": device, "dtype": dtype}
66
+ super().__init__()
67
+ self.embed_dim = embed_dim
68
+ self.causal = causal
69
+ self.layer_idx = layer_idx
70
+ self.rotary_emb_dim = rotary_emb_dim
71
+ self.use_flash_attn = use_flash_attn
72
+ self.num_heads = num_heads
73
+ assert self.embed_dim % num_heads == 0, "self.kdim must be divisible by num_heads"
74
+ self.head_dim = self.embed_dim // num_heads
75
+
76
+ if self.rotary_emb_dim > 0:
77
+ self.rotary_emb = RotaryEmbedding(self.rotary_emb_dim, scale_base=rotary_emb_scale_base, device=device)
78
+
79
+ # notice here should change bias=True
80
+ self.Wqkv = ColumnParallelLinearTorch(
81
+ embed_dim,
82
+ 3 * embed_dim,
83
+ process_group,
84
+ bias=True,
85
+ sequence_parallel=gpc.config.parallel.sequence_parallel,
86
+ **factory_kwargs,
87
+ ) # according to https://spaces.ac.cn/archives/9577
88
+
89
+ inner_attn_cls = FlashSelfAttention if use_flash_attn else SelfAttention
90
+ inner_cross_attn_cls = FlashCrossAttention if use_flash_attn else CrossAttention
91
+ self.inner_attn = inner_attn_cls(causal=causal, softmax_scale=softmax_scale, attention_dropout=dropout)
92
+ self.inner_cross_attn = inner_cross_attn_cls(
93
+ causal=causal, softmax_scale=softmax_scale, attention_dropout=dropout
94
+ )
95
+
96
+ # output projection always have the bias (for now)
97
+ self.out_proj = RowParallelLinearTorch(
98
+ embed_dim,
99
+ embed_dim,
100
+ process_group,
101
+ sequence_parallel=gpc.config.parallel.sequence_parallel,
102
+ **factory_kwargs,
103
+ )
104
+ # need to assign tp attribute so that internlm know it is tensor parallel module
105
+ if gpc.get_world_size(ParallelMode.TENSOR) > 1:
106
+ for name in ["out_proj", "Wqkv"]:
107
+ for param in getattr(self, name).parameters():
108
+ setattr(param, IS_TENSOR_PARALLEL, True)
109
+
110
+ def forward(self, x, seqlen=None, inference_params=None, **kwargs):
111
+ if kwargs.get("indexes", None) is not None:
112
+ return self._packed_forward(x=x, inference_params=inference_params, **kwargs)
113
+ else:
114
+ return self._forward(x=x, seqlen=seqlen, inference_params=inference_params, **kwargs)
115
+
116
+ def _forward(self, x, seqlen=None, inference_params=None, **kwargs):
117
+ """
118
+ Arguments:
119
+ x: (batch, seqlen, hidden_dim) (where hidden_dim = num heads * head dim) if seqlen=None.
120
+ If seqlen is not None, x is (batch * seqlen, hidden_dim). This is so that when we
121
+ split x during sequence parallel, we split the batch * seqlen dimension
122
+ (in case batch is small).
123
+ """
124
+ qkv = self.Wqkv(x)
125
+ if seqlen is None:
126
+ qkv = rearrange(qkv, "b s (three h d) -> b s three h d", three=3, d=self.head_dim)
127
+ else:
128
+ qkv = rearrange(qkv, "(b s) (three h d) -> b s three h d", s=seqlen, three=3, d=self.head_dim)
129
+
130
+ if self.rotary_emb_dim > 0:
131
+ kwargs["inference_params"] = inference_params
132
+ qkv = self.rotary_emb(qkv, **kwargs)
133
+
134
+ if inference_params is None:
135
+ if gpc.config.model.dtype is torch.float32 and gpc.config.model.use_flash_attn:
136
+ with torch.cuda.amp.autocast(dtype=torch.bfloat16):
137
+ if qkv.dtype not in [torch.float16, torch.bfloat16]:
138
+ qkv = qkv.to(torch.bfloat16)
139
+ context = self.inner_attn(qkv).to(x.dtype)
140
+ else:
141
+ context = self.inner_attn(qkv)
142
+ else:
143
+ q = qkv[:, :, 0]
144
+ assert self.layer_idx is not None, "Generation requires layer_idx in the constructor"
145
+ kv = _update_kv_cache(qkv[:, :, 1:], inference_params, self.layer_idx)
146
+ # If we're processing the prompt, causal=None (use self.causal).
147
+ # If we're decoding, then causal=False.
148
+ causal = None if inference_params.sequence_len_offset == 0 else False
149
+ context = self.inner_cross_attn(q, kv, causal=causal)
150
+
151
+ if seqlen is None:
152
+ context = rearrange(context, "b s h d -> b s (h d)")
153
+ else:
154
+ context = rearrange(context, "b s h d -> (b s) (h d)")
155
+
156
+ out = self.out_proj(context)
157
+ return out
158
+
159
+ def _packed_forward(self, x, inference_params=None, **kwargs):
160
+ """
161
+ Arguments:
162
+ x: (batch, seqlen, hidden_dim) (where hidden_dim = num heads * head dim) if seqlen=None.
163
+ If seqlen is not None, x is (batch * seqlen, hidden_dim). This is so that when we
164
+ split x during sequence parallel, we split the batch * seqlen dimension
165
+ (in case batch is small).
166
+ """
167
+ qkv = self.Wqkv(x) # total x hsz'
168
+ qkv = rearrange(qkv, "t (three h d) -> t three h d", three=3, d=self.head_dim) # total x 3 x n_head x d
169
+ qkv = self.rotary_emb(qkv, **kwargs)
170
+ kwargs.pop("indexes")
171
+
172
+ if inference_params is None:
173
+ if gpc.config.model.dtype is torch.float32 and gpc.config.model.use_flash_attn:
174
+ with torch.cuda.amp.autocast(dtype=torch.bfloat16):
175
+ if qkv.dtype not in [torch.float16, torch.bfloat16]:
176
+ qkv = qkv.to(torch.bfloat16)
177
+ context = self.inner_attn(qkv, **kwargs).to(x.dtype)
178
+ else:
179
+ context = self.inner_attn(qkv, **kwargs)
180
+
181
+ else:
182
+ raise RuntimeError("Not support this right now")
183
+
184
+ context = rearrange(context, "b h d -> b (h d)") # recover the shape
185
+ out = self.out_proj(context)
186
+ return out
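
For readers unfamiliar with the packed (variable-length) layout that _packed_forward consumes, the sketch below shows how cu_seqlens, indexes and max_seqlen are typically derived from per-sequence lengths. The actual construction lives in the data pipeline, so treat this as illustrative only:

import torch

seqlens = torch.tensor([3, 5, 2])                             # three packed sequences
cu_seqlens = torch.cat([torch.zeros(1, dtype=torch.int32),
                        torch.cumsum(seqlens, dim=0).to(torch.int32)])
indexes = torch.cat([torch.arange(int(n)) for n in seqlens])  # position of each token in its sequence
max_seqlen = int(seqlens.max())

print(cu_seqlens)   # tensor([ 0,  3,  8, 10], dtype=torch.int32)
print(indexes)      # tensor([0, 1, 2, 0, 1, 2, 3, 4, 0, 1])
print(max_seqlen)   # 5
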
InternLM/internlm/model/muse/__init__.py ADDED
@@ -0,0 +1,18 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ __version__ = "0.0.1"
17
+
18
+ from .modeling_taming_vqgan import VQGANModel
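
The VQGAN building blocks that follow (modeling_taming_vqgan.py) rely on two small spatial patterns: Upsample uses nearest-neighbour 2x interpolation followed by a 3x3 convolution, and Downsample pads only the right and bottom edges before a stride-2 convolution. A minimal shape check with illustrative tensors:

import torch
import torch.nn.functional as F
from torch import nn

x = torch.randn(1, 8, 16, 16)

# Upsample pattern: nearest 2x interpolation, then a shape-preserving 3x3 conv.
up_conv = nn.Conv2d(8, 8, kernel_size=3, stride=1, padding=1)
y = up_conv(F.interpolate(x, scale_factor=2.0, mode="nearest"))
print(y.shape)  # torch.Size([1, 8, 32, 32])

# Downsample pattern: asymmetric (left=0, right=1, top=0, bottom=1) padding, then a stride-2 conv.
down_conv = nn.Conv2d(8, 8, kernel_size=3, stride=2, padding=0)
z = down_conv(F.pad(y, (0, 1, 0, 1), mode="constant", value=0))
print(z.shape)  # torch.Size([1, 8, 16, 16])
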
InternLM/internlm/model/muse/modeling_taming_vqgan.py ADDED
@@ -0,0 +1,591 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 The Taming Transformers Authors and The HuggingFace Inc. team.
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import math
16
+ from functools import partial
17
+ from typing import Tuple
18
+
19
+ import torch
20
+ import torch.nn.functional as F
21
+ import torch.utils.checkpoint
22
+ from torch import nn
23
+
24
+ from .modeling_utils import ConfigMixin, ModelMixin, register_to_config
25
+
26
+
27
+ class Upsample(nn.Module):
28
+ def __init__(self, in_channels: int, with_conv: bool):
29
+ super().__init__()
30
+ self.with_conv = with_conv
31
+ if self.with_conv:
32
+ self.conv = nn.Conv2d(
33
+ in_channels,
34
+ in_channels,
35
+ kernel_size=3,
36
+ stride=1,
37
+ padding=1,
38
+ )
39
+
40
+ def forward(self, hidden_states):
41
+ hidden_states = torch.nn.functional.interpolate(hidden_states, scale_factor=2.0, mode="nearest")
42
+ if self.with_conv:
43
+ hidden_states = self.conv(hidden_states)
44
+ return hidden_states
45
+
46
+
47
+ class Downsample(nn.Module):
48
+ def __init__(self, in_channels: int, with_conv: bool):
49
+ super().__init__()
50
+
51
+ self.with_conv = with_conv
52
+ if self.with_conv:
53
+ self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0)
54
+
55
+ def forward(self, hidden_states):
56
+ if self.with_conv:
57
+ pad = (0, 1, 0, 1) # pad height and width dim
58
+ hidden_states = torch.nn.functional.pad(hidden_states, pad, mode="constant", value=0)
59
+ hidden_states = self.conv(hidden_states)
60
+ else:
61
+ hidden_states = torch.nn.functional.avg_pool2d(hidden_states, kernel_size=2, stride=2)
62
+ return hidden_states
63
+
64
+
65
+ class ResnetBlock(nn.Module):
66
+ def __init__(
67
+ self,
68
+ in_channels: int,
69
+ out_channels: int = None,
70
+ use_conv_shortcut: bool = False,
71
+ dropout_prob: float = 0.0,
72
+ ):
73
+ super().__init__()
74
+
75
+ self.in_channels = in_channels
76
+ self.out_channels = out_channels
77
+ self.out_channels_ = self.in_channels if self.out_channels is None else self.out_channels
78
+ self.use_conv_shortcut = use_conv_shortcut
79
+
80
+ self.norm1 = nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
81
+ self.conv1 = nn.Conv2d(
82
+ self.in_channels,
83
+ self.out_channels_,
84
+ kernel_size=3,
85
+ stride=1,
86
+ padding=1,
87
+ )
88
+
89
+ self.norm2 = nn.GroupNorm(num_groups=32, num_channels=self.out_channels_, eps=1e-6, affine=True)
90
+ self.dropout = nn.Dropout(dropout_prob)
91
+ self.conv2 = nn.Conv2d(
92
+ self.out_channels_,
93
+ self.out_channels_,
94
+ kernel_size=3,
95
+ stride=(1, 1),
96
+ padding=1,
97
+ )
98
+
99
+ if self.in_channels != self.out_channels_:
100
+ if use_conv_shortcut:
101
+ self.conv_shortcut = nn.Conv2d(
102
+ self.in_channels,
103
+ self.out_channels_,
104
+ kernel_size=3,
105
+ stride=1,
106
+ padding=1,
107
+ )
108
+ else:
109
+ self.nin_shortcut = nn.Conv2d(
110
+ self.in_channels,
111
+ self.out_channels_,
112
+ kernel_size=1,
113
+ stride=1,
114
+ padding=0,
115
+ )
116
+
117
+ def forward(self, hidden_states):
118
+ residual = hidden_states
119
+ hidden_states = self.norm1(hidden_states)
120
+ hidden_states = F.silu(hidden_states)
121
+ hidden_states = self.conv1(hidden_states)
122
+
123
+ hidden_states = self.norm2(hidden_states)
124
+ hidden_states = F.silu(hidden_states)
125
+ hidden_states = self.dropout(hidden_states)
126
+ hidden_states = self.conv2(hidden_states)
127
+
128
+ if self.in_channels != self.out_channels_:
129
+ if self.use_conv_shortcut:
130
+ residual = self.conv_shortcut(residual)
131
+ else:
132
+ residual = self.nin_shortcut(residual)
133
+
134
+ return hidden_states + residual
135
+
136
+
137
+ class AttnBlock(nn.Module):
138
+ def __init__(self, in_channels: int):
139
+ super().__init__()
140
+
141
+ self.in_channels = in_channels
142
+ conv = partial(nn.Conv2d, self.in_channels, self.in_channels, kernel_size=1, stride=1, padding=0)
143
+
144
+ self.norm = nn.GroupNorm(num_groups=32, num_channels=self.in_channels, eps=1e-6, affine=True)
145
+ self.q, self.k, self.v = conv(), conv(), conv()
146
+ self.proj_out = conv()
147
+
148
+ def forward(self, hidden_states):
149
+ residual = hidden_states
150
+ hidden_states = self.norm(hidden_states)
151
+
152
+ query = self.q(hidden_states)
153
+ key = self.k(hidden_states)
154
+ value = self.v(hidden_states)
155
+
156
+ # compute attentions
157
+ batch, channels, height, width = query.shape
158
+ query = query.reshape((batch, channels, height * width))
159
+ query = query.permute(0, 2, 1) # (b, hw, c)
160
+ key = key.reshape((batch, channels, height * width))
161
+
162
+ attn_weights = torch.bmm(query, key) # b,hw,hw
163
+ attn_weights = attn_weights * (int(channels) ** -0.5)
164
+ attn_weights = nn.functional.softmax(attn_weights, dim=2)
165
+
166
+ # attend to values
167
+ value = value.reshape((batch, channels, height * width))
168
+ attn_weights = attn_weights.permute(0, 2, 1)
169
+ hidden_states = torch.bmm(value, attn_weights)
170
+ hidden_states = hidden_states.reshape((batch, channels, height, width))
171
+
172
+ hidden_states = self.proj_out(hidden_states)
173
+ hidden_states = hidden_states + residual
174
+ return hidden_states
175
+
176
+
177
+ class UpsamplingBlock(nn.Module):
178
+ def __init__(self, config, curr_res: int, block_idx: int):
179
+ super().__init__()
180
+
181
+ self.config = config
182
+ self.block_idx = block_idx
183
+ self.curr_res = curr_res
184
+
185
+ if self.block_idx == self.config.num_resolutions - 1:
186
+ block_in = self.config.hidden_channels * self.config.channel_mult[-1]
187
+ else:
188
+ block_in = self.config.hidden_channels * self.config.channel_mult[self.block_idx + 1]
189
+
190
+ block_out = self.config.hidden_channels * self.config.channel_mult[self.block_idx]
191
+
192
+ res_blocks = []
193
+ attn_blocks = []
194
+ for _ in range(self.config.num_res_blocks + 1):
195
+ res_blocks.append(ResnetBlock(block_in, block_out, dropout_prob=self.config.dropout))
196
+ block_in = block_out
197
+ if self.curr_res in self.config.attn_resolutions:
198
+ attn_blocks.append(AttnBlock(block_in))
199
+
200
+ self.block = nn.ModuleList(res_blocks)
201
+ self.attn = nn.ModuleList(attn_blocks)
202
+
203
+ self.upsample = None
204
+ if self.block_idx != 0:
205
+ self.upsample = Upsample(block_in, self.config.resample_with_conv)
206
+
207
+ def forward(self, hidden_states):
208
+ for i, res_block in enumerate(self.block):
209
+ hidden_states = res_block(hidden_states)
210
+ if len(self.attn) > 1:
211
+ hidden_states = self.attn[i](hidden_states)
212
+
213
+ if self.upsample is not None:
214
+ hidden_states = self.upsample(hidden_states)
215
+
216
+ return hidden_states
217
+
218
+
219
+ class DownsamplingBlock(nn.Module):
220
+ def __init__(self, config, curr_res: int, block_idx: int):
221
+ super().__init__()
222
+
223
+ self.config = config
224
+ self.curr_res = curr_res
225
+ self.block_idx = block_idx
226
+
227
+ in_channel_mult = (1,) + tuple(self.config.channel_mult)
228
+ block_in = self.config.hidden_channels * in_channel_mult[self.block_idx]
229
+ block_out = self.config.hidden_channels * self.config.channel_mult[self.block_idx]
230
+
231
+ res_blocks = nn.ModuleList()
232
+ attn_blocks = nn.ModuleList()
233
+ for _ in range(self.config.num_res_blocks):
234
+ res_blocks.append(ResnetBlock(block_in, block_out, dropout_prob=self.config.dropout))
235
+ block_in = block_out
236
+ if self.curr_res in self.config.attn_resolutions:
237
+ attn_blocks.append(AttnBlock(block_in))
238
+
239
+ self.block = res_blocks
240
+ self.attn = attn_blocks
241
+
242
+ self.downsample = None
243
+ if self.block_idx != self.config.num_resolutions - 1:
244
+ self.downsample = Downsample(block_in, self.config.resample_with_conv)
245
+
246
+ def forward(self, hidden_states):
247
+ for i, res_block in enumerate(self.block):
248
+ hidden_states = res_block(hidden_states)
249
+ if len(self.attn) > 1:
250
+ hidden_states = self.attn[i](hidden_states)
251
+
252
+ if self.downsample is not None:
253
+ hidden_states = self.downsample(hidden_states)
254
+
255
+ return hidden_states
256
+
257
+
258
+ class MidBlock(nn.Module):
259
+ def __init__(self, config, in_channels: int, no_attn: bool, dropout: float):
260
+ super().__init__()
261
+
262
+ self.config = config
263
+ self.in_channels = in_channels
264
+ self.no_attn = no_attn
265
+ self.dropout = dropout
266
+
267
+ self.block_1 = ResnetBlock(
268
+ self.in_channels,
269
+ self.in_channels,
270
+ dropout_prob=self.dropout,
271
+ )
272
+ if not no_attn:
273
+ self.attn_1 = AttnBlock(self.in_channels)
274
+ self.block_2 = ResnetBlock(
275
+ self.in_channels,
276
+ self.in_channels,
277
+ dropout_prob=self.dropout,
278
+ )
279
+
280
+ def forward(self, hidden_states):
281
+ hidden_states = self.block_1(hidden_states)
282
+ if not self.no_attn:
283
+ hidden_states = self.attn_1(hidden_states)
284
+ hidden_states = self.block_2(hidden_states)
285
+ return hidden_states
286
+
287
+
288
+ class Encoder(nn.Module):
289
+ def __init__(self, config):
290
+ super().__init__()
291
+
292
+ self.config = config
293
+
294
+ # downsampling
295
+ self.conv_in = nn.Conv2d(
296
+ self.config.num_channels,
297
+ self.config.hidden_channels,
298
+ kernel_size=3,
299
+ stride=1,
300
+ padding=1,
301
+ )
302
+
303
+ curr_res = self.config.resolution
304
+ downsample_blocks = []
305
+ for i_level in range(self.config.num_resolutions):
306
+ downsample_blocks.append(DownsamplingBlock(self.config, curr_res, block_idx=i_level))
307
+
308
+ if i_level != self.config.num_resolutions - 1:
309
+ curr_res = curr_res // 2
310
+ self.down = nn.ModuleList(downsample_blocks)
311
+
312
+ # middle
313
+ mid_channels = self.config.hidden_channels * self.config.channel_mult[-1]
314
+ self.mid = MidBlock(config, mid_channels, self.config.no_attn_mid_block, self.config.dropout)
315
+
316
+ # end
317
+ self.norm_out = nn.GroupNorm(num_groups=32, num_channels=mid_channels, eps=1e-6, affine=True)
318
+ self.conv_out = nn.Conv2d(
319
+ mid_channels,
320
+ self.config.z_channels,
321
+ kernel_size=3,
322
+ stride=1,
323
+ padding=1,
324
+ )
325
+
326
+ def forward(self, pixel_values):
327
+ # downsampling
328
+ hidden_states = self.conv_in(pixel_values)
329
+ for block in self.down:
330
+ hidden_states = block(hidden_states)
331
+
332
+ # middle
333
+ hidden_states = self.mid(hidden_states)
334
+
335
+ # end
336
+ hidden_states = self.norm_out(hidden_states)
337
+ hidden_states = F.silu(hidden_states)
338
+ hidden_states = self.conv_out(hidden_states)
339
+
340
+ return hidden_states
341
+
342
+
343
+ class Decoder(nn.Module):
344
+ def __init__(self, config):
345
+ super().__init__()
346
+
347
+ self.config = config
348
+
349
+ # compute in_channel_mult, block_in and curr_res at lowest res
350
+ block_in = self.config.hidden_channels * self.config.channel_mult[self.config.num_resolutions - 1]
351
+ curr_res = self.config.resolution // 2 ** (self.config.num_resolutions - 1)
352
+ self.z_shape = (1, self.config.z_channels, curr_res, curr_res)
353
+
354
+ # z to block_in
355
+ self.conv_in = nn.Conv2d(
356
+ self.config.z_channels,
357
+ block_in,
358
+ kernel_size=3,
359
+ stride=1,
360
+ padding=1,
361
+ )
362
+
363
+ # middle
364
+ self.mid = MidBlock(config, block_in, self.config.no_attn_mid_block, self.config.dropout)
365
+
366
+ # upsampling
367
+ upsample_blocks = []
368
+ for i_level in reversed(range(self.config.num_resolutions)):
369
+ upsample_blocks.append(UpsamplingBlock(self.config, curr_res, block_idx=i_level))
370
+ if i_level != 0:
371
+ curr_res = curr_res * 2
372
+ self.up = nn.ModuleList(list(reversed(upsample_blocks))) # reverse to get consistent order
373
+
374
+ # end
375
+ block_out = self.config.hidden_channels * self.config.channel_mult[0]
376
+ self.norm_out = nn.GroupNorm(num_groups=32, num_channels=block_out, eps=1e-6, affine=True)
377
+ self.conv_out = nn.Conv2d(
378
+ block_out,
379
+ self.config.num_channels,
380
+ kernel_size=3,
381
+ stride=1,
382
+ padding=1,
383
+ )
384
+
385
+ def forward(self, hidden_states):
386
+ # z to block_in
387
+ hidden_states = self.conv_in(hidden_states)
388
+
389
+ # middle
390
+ hidden_states = self.mid(hidden_states)
391
+
392
+ # upsampling
393
+ for block in reversed(self.up):
394
+ hidden_states = block(hidden_states)
395
+
396
+ # end
397
+ hidden_states = self.norm_out(hidden_states)
398
+ hidden_states = F.silu(hidden_states)
399
+ hidden_states = self.conv_out(hidden_states)
400
+
401
+ return hidden_states
402
+
403
+
404
+ class VectorQuantizer(nn.Module):
405
+ """
406
+ see https://github.com/MishaLaskin/vqvae/blob/d761a999e2267766400dc646d82d3ac3657771d4/models/quantizer.py
407
+ Discretization bottleneck part of the VQ-VAE.
408
+ """
409
+
410
+ def __init__(self, num_embeddings, embedding_dim, commitment_cost):
411
+ r"""
412
+ Args:
413
+ num_embeddings: number of vectors in the quantized space.
414
+ embedding_dim: dimensionality of the tensors in the quantized space.
415
+ Inputs to the modules must be in this format as well.
416
+ commitment_cost: scalar which controls the weighting of the loss terms
417
+ (see equation 4 in the paper https://arxiv.org/abs/1711.00937 - this variable is Beta).
418
+ """
419
+ super().__init__()
420
+
421
+ self.num_embeddings = num_embeddings
422
+ self.embedding_dim = embedding_dim
423
+ self.commitment_cost = commitment_cost
424
+
425
+ self.embedding = nn.Embedding(num_embeddings, embedding_dim)
426
+ self.embedding.weight.data.uniform_(-1.0 / num_embeddings, 1.0 / num_embeddings)
427
+
428
+ def forward(self, hidden_states, return_loss=False):
429
+ """
430
+ Inputs the output of the encoder network z and maps it to a discrete one-hot vector that is the index of the
431
+ closest embedding vector e_j z (continuous) -> z_q (discrete) z.shape = (batch, channel, height, width)
432
+ quantization pipeline:
433
+ 1. get encoder input (B,C,H,W)
434
+ 2. flatten input to (B*H*W,C)
435
+ """
436
+ # reshape z -> (batch, height, width, channel) and flatten
437
+ hidden_states = hidden_states.permute(0, 2, 3, 1).contiguous()
438
+
439
+ distances = self.compute_distances(hidden_states)
440
+ min_encoding_indices = torch.argmin(distances, axis=1).unsqueeze(1)
441
+ min_encodings = torch.zeros(min_encoding_indices.shape[0], self.num_embeddings).to(hidden_states)
442
+ min_encodings.scatter_(1, min_encoding_indices, 1)
443
+
444
+ # get quantized latent vectors
445
+ z_q = torch.matmul(min_encodings, self.embedding.weight).view(hidden_states.shape)
446
+
447
+ # reshape to (batch, num_tokens)
448
+ min_encoding_indices = min_encoding_indices.reshape(hidden_states.shape[0], -1)
449
+
450
+ # compute loss for embedding
451
+ loss = None
452
+ if return_loss:
453
+ loss = torch.mean((z_q.detach() - hidden_states) ** 2) + self.commitment_cost * torch.mean(
454
+ (z_q - hidden_states.detach()) ** 2
455
+ )
456
+ # preserve gradients
457
+ z_q = hidden_states + (z_q - hidden_states).detach()
458
+
459
+ # reshape back to match original input shape
460
+ z_q = z_q.permute(0, 3, 1, 2).contiguous()
461
+
462
+ return z_q, min_encoding_indices, loss
463
+
464
+ def compute_distances(self, hidden_states):
465
+ # distances from z to embeddings e_j (z - e)^2 = z^2 + e^2 - 2 e * z
466
+ hidden_states_flattened = hidden_states.reshape((-1, self.embedding_dim))
467
+ emb_weights = self.embedding.weight.t()
468
+
469
+ inputs_norm_sq = hidden_states_flattened.pow(2.0).sum(dim=1, keepdim=True)
470
+ codebook_t_norm_sq = emb_weights.pow(2.0).sum(dim=0, keepdim=True)
471
+ distances = torch.addmm(
472
+ inputs_norm_sq + codebook_t_norm_sq,
473
+ hidden_states_flattened,
474
+ emb_weights,
475
+ alpha=-2.0,
476
+ )
477
+ return distances
478
+
479
+ def get_codebook_entry(self, indices):
480
+ # indices are expected to be of shape (batch, num_tokens)
481
+ # get quantized latent vectors
482
+ batch, num_tokens = indices.shape
483
+ z_q = self.embedding(indices)
484
+ z_q = z_q.reshape(batch, int(math.sqrt(num_tokens)), int(math.sqrt(num_tokens)), -1).permute(0, 3, 1, 2)
485
+ return z_q
486
+
487
+ def get_codebook_entry_for_lvm(self, indices):
488
+ batch, num_tokens = indices.shape
489
+ z_q = self.embedding(indices)
490
+ z_q = z_q.reshape(batch, num_tokens, -1)
491
+ return z_q
492
+
493
+ # adapted from https://github.com/kakaobrain/rq-vae-transformer/blob/main/rqvae/models/rqvae/quantizations.py#L372
494
+ def get_soft_code(self, hidden_states, temp=1.0, stochastic=False):
495
+ hidden_states = hidden_states.permute(0, 2, 3, 1).contiguous() # (batch, height, width, channel)
496
+ distances = self.compute_distances(hidden_states) # (batch * height * width, num_embeddings)
497
+
498
+ soft_code = F.softmax(-distances / temp, dim=-1) # (batch * height * width, num_embeddings)
499
+ if stochastic:
500
+ code = torch.multinomial(soft_code, 1) # (batch * height * width, 1)
501
+ else:
502
+ code = distances.argmin(dim=-1) # (batch * height * width)
503
+
504
+ code = code.reshape(hidden_states.shape[0], -1) # (batch, height * width)
505
+ batch, num_tokens = code.shape
506
+ soft_code = soft_code.reshape(batch, num_tokens, -1) # (batch, height * width, num_embeddings)
507
+ return soft_code, code
508
+
509
+ def get_code(self, hidden_states):
510
+ # reshape z -> (batch, height, width, channel)
511
+ hidden_states = hidden_states.permute(0, 2, 3, 1).contiguous()
512
+ distances = self.compute_distances(hidden_states)
513
+ indices = torch.argmin(distances, axis=1).unsqueeze(1)
514
+ indices = indices.reshape(hidden_states.shape[0], -1)
515
+ return indices
516
+
517
+
518
+ class VQGANModel(ModelMixin, ConfigMixin):
519
+ @register_to_config
520
+ def __init__(
521
+ self,
522
+ resolution: int = 256,
523
+ num_channels: int = 3,
524
+ hidden_channels: int = 128,
525
+ channel_mult: Tuple = (1, 1, 2, 2, 4),
526
+ num_res_blocks: int = 2,
527
+ attn_resolutions: Tuple = (16,),
528
+ no_attn_mid_block: bool = False,
529
+ z_channels: int = 256,
530
+ num_embeddings: int = 1024,
531
+ quantized_embed_dim: int = 256,
532
+ dropout: float = 0.0,
533
+ resample_with_conv: bool = True,
534
+ commitment_cost: float = 0.25,
535
+ ):
536
+ super().__init__()
537
+
538
+ self.config.num_resolutions = len(channel_mult)
539
+ self.config.reduction_factor = 2 ** (self.config.num_resolutions - 1)
540
+ self.config.latent_size = resolution // self.config.reduction_factor
541
+
542
+ self.encoder = Encoder(self.config)
543
+ self.decoder = Decoder(self.config)
544
+ self.quantize = VectorQuantizer(
545
+ self.config.num_embeddings, self.config.quantized_embed_dim, self.config.commitment_cost
546
+ )
547
+ self.quant_conv = nn.Conv2d(
548
+ self.config.z_channels,
549
+ self.config.quantized_embed_dim,
550
+ kernel_size=1,
551
+ )
552
+ self.post_quant_conv = nn.Conv2d(
553
+ self.config.quantized_embed_dim,
554
+ self.config.z_channels,
555
+ kernel_size=1,
556
+ )
557
+
558
+ def encode(self, pixel_values, return_loss=False):
559
+ hidden_states = self.encoder(pixel_values)
560
+ hidden_states = self.quant_conv(hidden_states)
561
+ quantized_states, codebook_indices, codebook_loss = self.quantize(hidden_states, return_loss)
562
+ output = (quantized_states, codebook_indices)
563
+ if return_loss:
564
+ output = output + (codebook_loss,)
565
+ return output
566
+
567
+ def decode(self, quantized_states):
568
+ hidden_states = self.post_quant_conv(quantized_states)
569
+ reconstructed_pixel_values = self.decoder(hidden_states)
570
+ return reconstructed_pixel_values
571
+
572
+ def decode_code(self, codebook_indices):
573
+ quantized_states = self.quantize.get_codebook_entry(codebook_indices)
574
+ reconstructed_pixel_values = self.decode(quantized_states)
575
+ return reconstructed_pixel_values
576
+
577
+ def get_code(self, pixel_values):
578
+ hidden_states = self.encoder(pixel_values)
579
+ hidden_states = self.quant_conv(hidden_states)
580
+ codebook_indices = self.quantize.get_code(hidden_states)
581
+ return codebook_indices
582
+
583
+ def forward(self, pixel_values, return_loss=False):
584
+ hidden_states = self.encoder(pixel_values)
585
+ hidden_states = self.quant_conv(hidden_states)
586
+ quantized_states, codebook_indices, codebook_loss = self.quantize(hidden_states, return_loss)
587
+ reconstructed_pixel_values = self.decode(quantized_states)
588
+ outputs = (reconstructed_pixel_values, quantized_states, codebook_indices)
589
+ if return_loss:
590
+ outputs = outputs + (codebook_loss,)
591
+ return outputs
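Taken together, `VQGANModel.forward` runs encoder → `quant_conv` → `VectorQuantizer` → `post_quant_conv` → decoder. Below is a minimal sketch of that round trip, assuming only the default constructor arguments shown above and that `register_to_config` from `modeling_utils.py` behaves like its diffusers counterpart; the weights are randomly initialized, so it is useful only for checking shapes.

```python
# Hedged sketch: exercises the encode -> quantize -> decode path with default hyperparameters.
# Shapes follow from resolution=256 and channel_mult=(1, 1, 2, 2, 4), i.e. a 16x16 latent grid.
import torch

from internlm.model.muse.modeling_taming_vqgan import VQGANModel

model = VQGANModel()  # defaults: 256x256 inputs, 1024-entry codebook, 256-dim codes
model.eval()

with torch.no_grad():
    pixels = torch.randn(2, 3, 256, 256)
    quantized, indices, commit_loss = model.encode(pixels, return_loss=True)
    print(quantized.shape)   # torch.Size([2, 256, 16, 16]) -- quantized latents
    print(indices.shape)     # torch.Size([2, 256])         -- one codebook id per latent position
    recon = model.decode(quantized)
    print(recon.shape)       # torch.Size([2, 3, 256, 256])
```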
InternLM/internlm/model/muse/modeling_utils.py ADDED
@@ -0,0 +1,1171 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import functools
17
+ import inspect
18
+ import json
19
+ import os
20
+ from collections import OrderedDict
21
+ from functools import partial
22
+ from pathlib import PosixPath
23
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
24
+
25
+ import accelerate
26
+ import numpy as np
27
+ import torch
28
+ from accelerate.utils import set_module_tensor_to_device
29
+ from huggingface_hub import hf_hub_download
30
+ from huggingface_hub.utils import (
31
+ EntryNotFoundError,
32
+ RepositoryNotFoundError,
33
+ RevisionNotFoundError,
34
+ )
35
+ from requests import HTTPError
36
+ from torch import Tensor, device
37
+
38
+ from . import __version__
39
+ from internlm.utils.logger import get_logger
40
+
41
+ logger = get_logger(__file__)
42
+
43
+
44
+ hf_cache_home = os.path.expanduser(
45
+ os.getenv("HF_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "huggingface"))
46
+ )
47
+ default_cache_path = os.path.join(hf_cache_home, "muse")
48
+
49
+
50
+ CONFIG_NAME = "config.json"
51
+ WEIGHTS_NAME = "pytorch_model.bin"
52
+ SAFETENSORS_WEIGHTS_NAME = "pytorch_model.safetensors"
53
+ HUGGINGFACE_CO_RESOLVE_ENDPOINT = "https://huggingface.co"
54
+ MUSE_CACHE = default_cache_path
55
+ MUSE_DYNAMIC_MODULE_NAME = "muse_modules"
56
+ HF_MODULES_CACHE = os.getenv("HF_MODULES_CACHE", os.path.join(hf_cache_home, "modules"))
57
+
58
+
59
+ _LOW_CPU_MEM_USAGE_DEFAULT = True
60
+
61
+
62
+ def get_parameter_device(parameter: torch.nn.Module):
63
+ try:
64
+ return next(parameter.parameters()).device
65
+ except StopIteration:
66
+ # For torch.nn.DataParallel compatibility in PyTorch 1.5
67
+
68
+ def find_tensor_attributes(module: torch.nn.Module) -> List[Tuple[str, Tensor]]:
69
+ tuples = [(k, v) for k, v in module.__dict__.items() if torch.is_tensor(v)]
70
+ return tuples
71
+
72
+ gen = parameter._named_members(get_members_fn=find_tensor_attributes)
73
+ first_tuple = next(gen)
74
+ return first_tuple[1].device
75
+
76
+
77
+ def get_parameter_dtype(parameter: torch.nn.Module):
78
+ try:
79
+ return next(parameter.parameters()).dtype
80
+ except StopIteration:
81
+ # For torch.nn.DataParallel compatibility in PyTorch 1.5
82
+
83
+ def find_tensor_attributes(module: torch.nn.Module) -> List[Tuple[str, Tensor]]:
84
+ tuples = [(k, v) for k, v in module.__dict__.items() if torch.is_tensor(v)]
85
+ return tuples
86
+
87
+ gen = parameter._named_members(get_members_fn=find_tensor_attributes)
88
+ first_tuple = next(gen)
89
+ return first_tuple[1].dtype
90
+
91
+
92
+ def load_state_dict(checkpoint_file: Union[str, os.PathLike]):
93
+ """
94
+ Reads a checkpoint file, returning properly formatted errors if they arise.
95
+ """
96
+ try:
97
+ if os.path.basename(checkpoint_file) == WEIGHTS_NAME:
98
+ return torch.load(checkpoint_file, map_location="cpu")
99
+ except Exception as e:
100
+ try:
101
+ with open(checkpoint_file) as f:
102
+ if f.read().startswith("version"):
103
+ raise OSError(
104
+ "You seem to have cloned a repository without having git-lfs installed. Please install "
105
+ "git-lfs and run `git lfs install` followed by `git lfs pull` in the folder "
106
+ "you cloned."
107
+ )
108
+ else:
109
+ raise ValueError(
110
+ f"Unable to locate the file {checkpoint_file} which is necessary to load this pretrained "
111
+ "model. Make sure you have saved the model properly."
112
+ ) from e
113
+ except (UnicodeDecodeError, ValueError):
114
+ raise OSError(
115
+ f"Unable to load weights from checkpoint file for '{checkpoint_file}' "
116
+ f"at '{checkpoint_file}'. "
117
+ "If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True."
118
+ )
119
+
120
+
121
+ def _load_state_dict_into_model(model_to_load, state_dict):
122
+ # Convert old format to new format if needed from a PyTorch state_dict
123
+ # copy state_dict so _load_from_state_dict can modify it
124
+ state_dict = state_dict.copy()
125
+ error_msgs = []
126
+
127
+ # PyTorch's `_load_from_state_dict` does not copy parameters in a module's descendants
128
+ # so we need to apply the function recursively.
129
+ def load(module: torch.nn.Module, prefix=""):
130
+ args = (state_dict, prefix, {}, True, [], [], error_msgs)
131
+ module._load_from_state_dict(*args)
132
+
133
+ for name, child in module._modules.items():
134
+ if child is not None:
135
+ load(child, prefix + name + ".")
136
+
137
+ load(model_to_load)
138
+
139
+ return error_msgs
140
+
141
+
142
+ def _get_model_file(
143
+ pretrained_model_name_or_path,
144
+ *,
145
+ weights_name,
146
+ subfolder,
147
+ cache_dir,
148
+ force_download,
149
+ proxies,
150
+ resume_download,
151
+ local_files_only,
152
+ use_auth_token,
153
+ user_agent,
154
+ revision,
155
+ ):
156
+ pretrained_model_name_or_path = str(pretrained_model_name_or_path)
157
+ if os.path.isfile(pretrained_model_name_or_path):
158
+ return pretrained_model_name_or_path
159
+ elif os.path.isdir(pretrained_model_name_or_path):
160
+ if os.path.isfile(os.path.join(pretrained_model_name_or_path, weights_name)):
161
+ # Load from a PyTorch checkpoint
162
+ model_file = os.path.join(pretrained_model_name_or_path, weights_name)
163
+ return model_file
164
+ elif subfolder is not None and os.path.isfile(
165
+ os.path.join(pretrained_model_name_or_path, subfolder, weights_name)
166
+ ):
167
+ model_file = os.path.join(pretrained_model_name_or_path, subfolder, weights_name)
168
+ return model_file
169
+ else:
170
+ raise EnvironmentError(
171
+ f"Error no file named {weights_name} found in directory {pretrained_model_name_or_path}."
172
+ )
173
+ else:
174
+ try:
175
+ # Load from URL or cache if already cached
176
+ model_file = hf_hub_download(
177
+ pretrained_model_name_or_path,
178
+ filename=weights_name,
179
+ cache_dir=cache_dir,
180
+ force_download=force_download,
181
+ proxies=proxies,
182
+ resume_download=resume_download,
183
+ local_files_only=local_files_only,
184
+ use_auth_token=use_auth_token,
185
+ user_agent=user_agent,
186
+ subfolder=subfolder,
187
+ revision=revision,
188
+ )
189
+ return model_file
190
+
191
+ except RepositoryNotFoundError:
192
+ raise EnvironmentError(
193
+ f"{pretrained_model_name_or_path} is not a local folder and is not a valid model identifier "
194
+ "listed on 'https://huggingface.co/models'\nIf this is a private repository, make sure to pass a "
195
+ "token having permission to this repo with `use_auth_token` or log in with `huggingface-cli "
196
+ "login`."
197
+ )
198
+ except RevisionNotFoundError:
199
+ raise EnvironmentError(
200
+ f"{revision} is not a valid git identifier (branch name, tag name or commit id) that exists for "
201
+ "this model name. Check the model page at "
202
+ f"'https://huggingface.co/{pretrained_model_name_or_path}' for available revisions."
203
+ )
204
+ except EntryNotFoundError:
205
+ raise EnvironmentError(
206
+ f"{pretrained_model_name_or_path} does not appear to have a file named {weights_name}."
207
+ )
208
+ except HTTPError as err:
209
+ raise EnvironmentError(
210
+ f"There was a specific connection error when trying to load {pretrained_model_name_or_path}:\n{err}"
211
+ )
212
+ except ValueError:
213
+ raise EnvironmentError(
214
+ f"We couldn't connect to '{HUGGINGFACE_CO_RESOLVE_ENDPOINT}' to load this model, couldn't find it"
215
+ f" in the cached files and it looks like {pretrained_model_name_or_path} is not the path to a"
216
+ f" directory containing a file named {weights_name} or"
217
+ " \nCheck your internet connection or see how to run the library in"
218
+ " offline mode at 'https://huggingface.co/docs/diffusers/installation#offline-mode'."
219
+ )
220
+ except EnvironmentError:
221
+ raise EnvironmentError(
222
+ f"Can't load the model for '{pretrained_model_name_or_path}'. If you were trying to load it from "
223
+ "'https://huggingface.co/models', make sure you don't have a local directory with the same name. "
224
+ f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory "
225
+ f"containing a file named {weights_name}"
226
+ )
227
+
228
+
229
+ class ModelMixin(torch.nn.Module):
230
+ r"""
231
+ Base class for all models.
232
+
233
+ [`ModelMixin`] takes care of storing the configuration of the models and handles methods for loading, downloading
234
+ and saving models.
235
+
236
+ - **config_name** ([`str`]) -- A filename under which the model should be stored when calling
237
+ [`~models.ModelMixin.save_pretrained`].
238
+ """
239
+ config_name = CONFIG_NAME
240
+ _automatically_saved_args = ["_version", "_class_name", "_name_or_path"]
241
+ _supports_gradient_checkpointing = False
242
+
243
+ def __init__(self):
244
+ super().__init__()
245
+
246
+ @property
247
+ def is_gradient_checkpointing(self) -> bool:
248
+ """
249
+ Whether gradient checkpointing is activated for this model or not.
250
+
251
+ Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint
252
+ activations".
253
+ """
254
+ return any(hasattr(m, "gradient_checkpointing") and m.gradient_checkpointing for m in self.modules())
255
+
256
+ def enable_gradient_checkpointing(self):
257
+ """
258
+ Activates gradient checkpointing for the current model.
259
+
260
+ Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint
261
+ activations".
262
+ """
263
+ if not self._supports_gradient_checkpointing:
264
+ raise ValueError(f"{self.__class__.__name__} does not support gradient checkpointing.")
265
+ self.apply(partial(self._set_gradient_checkpointing, value=True))
266
+
267
+ def disable_gradient_checkpointing(self):
268
+ """
269
+ Deactivates gradient checkpointing for the current model.
270
+
271
+ Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint
272
+ activations".
273
+ """
274
+ if self._supports_gradient_checkpointing:
275
+ self.apply(partial(self._set_gradient_checkpointing, value=False))
276
+
277
+ def set_use_memory_efficient_attention_xformers(
278
+ self, valid: bool, attention_op: Optional[Callable] = None
279
+ ) -> None:
280
+ # Recursively walk through all the children.
281
+ # Any children which exposes the set_use_memory_efficient_attention_xformers method
282
+ # gets the message
283
+ def fn_recursive_set_mem_eff(module: torch.nn.Module):
284
+ if hasattr(module, "set_use_memory_efficient_attention_xformers"):
285
+ module.set_use_memory_efficient_attention_xformers(valid, attention_op)
286
+
287
+ for child in module.children():
288
+ fn_recursive_set_mem_eff(child)
289
+
290
+ for module in self.children():
291
+ if isinstance(module, torch.nn.Module):
292
+ fn_recursive_set_mem_eff(module)
293
+
294
+ def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
295
+ r"""
296
+ Enable memory efficient attention as implemented in xformers.
297
+
298
+ When this option is enabled, you should observe lower GPU memory usage and a potential speed up at inference
299
+ time. Speed up at training time is not guaranteed.
300
+
301
+ Warning: When Memory Efficient Attention and Sliced attention are both enabled, the Memory Efficient Attention
302
+ is used.
303
+
304
+ Parameters:
305
+ attention_op (`Callable`, *optional*):
306
+ Override the default `None` operator for use as `op` argument to the
307
+ [`memory_efficient_attention()`](https://facebookresearch.github.io/xformers/components/ops.html#xformers.ops.memory_efficient_attention)
308
+ function of xFormers.
309
+
310
+ Examples:
311
+
312
+ ```py
313
+ >>> import torch
314
+ >>> from diffusers import UNet2DConditionModel
315
+ >>> from xformers.ops import MemoryEfficientAttentionFlashAttentionOp
316
+
317
+ >>> model = UNet2DConditionModel.from_pretrained(
318
+ ... "stabilityai/stable-diffusion-2-1", subfolder="unet", torch_dtype=torch.float16
319
+ ... )
320
+ >>> model = model.to("cuda")
321
+ >>> model.enable_xformers_memory_efficient_attention(attention_op=MemoryEfficientAttentionFlashAttentionOp)
322
+ ```
323
+ """
324
+ self.set_use_memory_efficient_attention_xformers(True, attention_op)
325
+
326
+ def disable_xformers_memory_efficient_attention(self):
327
+ r"""
328
+ Disable memory efficient attention as implemented in xformers.
329
+ """
330
+ self.set_use_memory_efficient_attention_xformers(False)
331
+
332
+ def save_pretrained(
333
+ self,
334
+ save_directory: Union[str, os.PathLike],
335
+ is_main_process: bool = True,
336
+ save_function: Callable = None,
337
+ state_dict: Optional[Dict[str, torch.Tensor]] = None,
338
+ ):
339
+ """
340
+ Save a model and its configuration file to a directory, so that it can be re-loaded using the
341
+ `[`~models.ModelMixin.from_pretrained`]` class method.
342
+
343
+ Arguments:
344
+ save_directory (`str` or `os.PathLike`):
345
+ Directory to which to save. Will be created if it doesn't exist.
346
+ is_main_process (`bool`, *optional*, defaults to `True`):
347
+ Whether the process calling this is the main process or not. Useful when in distributed training like
348
+ TPUs and need to call this function on all processes. In this case, set `is_main_process=True` only on
349
+ the main process to avoid race conditions.
350
+ save_function (`Callable`):
351
+ The function to use to save the state dictionary. Useful on distributed training like TPUs when one
352
+ need to replace `torch.save` by another method. Can be configured with the environment variable
353
+ `DIFFUSERS_SAVE_MODE`.
354
+ state_dict (`Dict[str, torch.Tensor]`, *optional*):
355
+ The state dictionary to save. If `None`, the model's state dictionary will be saved.
356
+ """
357
+ if os.path.isfile(save_directory):
358
+ logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
359
+ return
360
+
361
+ if save_function is None:
362
+ save_function = torch.save
363
+
364
+ os.makedirs(save_directory, exist_ok=True)
365
+
366
+ model_to_save = self
367
+
368
+ # Attach architecture to the config
369
+ # Save the config
370
+ if is_main_process:
371
+ model_to_save.save_config(save_directory)
372
+
373
+ # Save the model
374
+ if state_dict is None:
375
+ state_dict = model_to_save.state_dict()
376
+
377
+ weights_name = WEIGHTS_NAME
378
+
379
+ # Save the model
380
+ save_function(state_dict, os.path.join(save_directory, weights_name))
381
+
382
+ logger.info(f"Model weights saved in {os.path.join(save_directory, weights_name)}")
383
+
384
+ @classmethod
385
+ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs):
386
+ r"""
387
+ Instantiate a pretrained pytorch model from a pre-trained model configuration.
388
+
389
+ The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated). To train
390
+ the model, you should first set it back in training mode with `model.train()`.
391
+
392
+ The warning *Weights from XXX not initialized from pretrained model* means that the weights of XXX do not come
393
+ pretrained with the rest of the model. It is up to you to train those weights with a downstream fine-tuning
394
+ task.
395
+
396
+ The warning *Weights from XXX not used in YYY* means that the layer XXX is not used by YYY, therefore those
397
+ weights are discarded.
398
+
399
+ Parameters:
400
+ pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*):
401
+ Can be either:
402
+
403
+ - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
404
+ Valid model ids should have an organization name, like `google/ddpm-celebahq-256`.
405
+ - A path to a *directory* containing model weights saved using [`~ModelMixin.save_config`], e.g.,
406
+ `./my_model_directory/`.
407
+
408
+ cache_dir (`Union[str, os.PathLike]`, *optional*):
409
+ Path to a directory in which a downloaded pretrained model configuration should be cached if the
410
+ standard cache should not be used.
411
+ torch_dtype (`str` or `torch.dtype`, *optional*):
412
+ Override the default `torch.dtype` and load the model under this dtype. If `"auto"` is passed the dtype
413
+ will be automatically derived from the model's weights.
414
+ force_download (`bool`, *optional*, defaults to `False`):
415
+ Whether or not to force the (re-)download of the model weights and configuration files, overriding the
416
+ cached versions if they exist.
417
+ resume_download (`bool`, *optional*, defaults to `False`):
418
+ Whether or not to delete incompletely received files. Will attempt to resume the download if such a
419
+ file exists.
420
+ proxies (`Dict[str, str]`, *optional*):
421
+ A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
422
+ 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
423
+ output_loading_info(`bool`, *optional*, defaults to `False`):
424
+ Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
425
+ local_files_only(`bool`, *optional*, defaults to `False`):
426
+ Whether or not to only look at local files (i.e., do not try to download the model).
427
+ use_auth_token (`str` or *bool*, *optional*):
428
+ The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
429
+ when running `diffusers-cli login` (stored in `~/.huggingface`).
430
+ revision (`str`, *optional*, defaults to `"main"`):
431
+ The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
432
+ git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
433
+ identifier allowed by git.
434
+ from_flax (`bool`, *optional*, defaults to `False`):
435
+ Load the model weights from a Flax checkpoint save file.
436
+ subfolder (`str`, *optional*, defaults to `""`):
437
+ In case the relevant files are located inside a subfolder of the model repo (either remote in
438
+ huggingface.co or downloaded locally), you can specify the folder name here.
439
+
440
+ mirror (`str`, *optional*):
441
+ Mirror source to accelerate downloads in China. If you are from China and have an accessibility
442
+ problem, you can set this option to resolve it. Note that we do not guarantee the timeliness or safety.
443
+ Please refer to the mirror site for more information.
444
+ device_map (`str` or `Dict[str, Union[int, str, torch.device]]`, *optional*):
445
+ A map that specifies where each submodule should go. It doesn't need to be refined to each
446
+ parameter/buffer name, once a given module name is inside, every submodule of it will be sent to the
447
+ same device.
448
+
449
+ To have Accelerate compute the most optimized `device_map` automatically, set `device_map="auto"`. For
450
+ more information about each option see [designing a device
451
+ map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map).
452
+ low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`):
453
+ Speed up model loading by not initializing the weights and only loading the pre-trained weights. This
454
+ also tries to not use more than 1x model size in CPU memory (including peak memory) while loading the
455
+ model. This is only supported when torch version >= 1.9.0. If you are using an older version of torch,
456
+ setting this argument to `True` will raise an error.
457
+
458
+ <Tip>
459
+
460
+ It is required to be logged in (`huggingface-cli login`) when you want to use private or [gated
461
+ models](https://huggingface.co/docs/hub/models-gated#gated-models).
462
+
463
+ </Tip>
464
+
465
+ <Tip>
466
+
467
+ Activate the special ["offline-mode"](https://huggingface.co/diffusers/installation.html#offline-mode) to use
468
+ this method in a firewalled environment.
469
+
470
+ </Tip>
471
+
472
+ """
473
+ cache_dir = kwargs.pop("cache_dir", MUSE_CACHE)
474
+ ignore_mismatched_sizes = kwargs.pop("ignore_mismatched_sizes", False)
475
+ force_download = kwargs.pop("force_download", False)
476
+ resume_download = kwargs.pop("resume_download", False)
477
+ proxies = kwargs.pop("proxies", None)
478
+ output_loading_info = kwargs.pop("output_loading_info", False)
479
+ local_files_only = kwargs.pop("local_files_only", False) # TODO
480
+ use_auth_token = kwargs.pop("use_auth_token", None)
481
+ revision = kwargs.pop("revision", None)
482
+ torch_dtype = kwargs.pop("torch_dtype", None)
483
+ subfolder = kwargs.pop("subfolder", None)
484
+ device_map = kwargs.pop("device_map", None)
485
+ low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", _LOW_CPU_MEM_USAGE_DEFAULT)
486
+
487
+ if low_cpu_mem_usage is False and device_map is not None:
488
+ raise ValueError(
489
+ f"You cannot set `low_cpu_mem_usage` to `False` while using device_map={device_map} for loading and"
490
+ " dispatching. Please make sure to set `low_cpu_mem_usage=True`."
491
+ )
492
+
493
+ user_agent = {
494
+ "diffusers": __version__,
495
+ "file_type": "model",
496
+ "framework": "pytorch",
497
+ }
498
+
499
+ # Load config if we don't provide a configuration
500
+ config_path = pretrained_model_name_or_path
501
+
502
+ # This variable will flag if we're loading a sharded checkpoint. In this case the archive file is just the
503
+ # Load model
504
+
505
+ model_file = None
506
+
507
+ if model_file is None:
508
+ model_file = _get_model_file(
509
+ pretrained_model_name_or_path,
510
+ weights_name=WEIGHTS_NAME,
511
+ cache_dir=cache_dir,
512
+ force_download=force_download,
513
+ resume_download=resume_download,
514
+ proxies=proxies,
515
+ local_files_only=local_files_only,
516
+ use_auth_token=use_auth_token,
517
+ revision=revision,
518
+ subfolder=subfolder,
519
+ user_agent=user_agent,
520
+ )
521
+
522
+ if low_cpu_mem_usage:
523
+ # Instantiate model with empty weights
524
+ with accelerate.init_empty_weights():
525
+ config, unused_kwargs = cls.load_config(
526
+ config_path,
527
+ cache_dir=cache_dir,
528
+ return_unused_kwargs=True,
529
+ force_download=force_download,
530
+ resume_download=resume_download,
531
+ proxies=proxies,
532
+ local_files_only=local_files_only,
533
+ use_auth_token=use_auth_token,
534
+ revision=revision,
535
+ subfolder=subfolder,
536
+ device_map=device_map,
537
+ **kwargs,
538
+ )
539
+ model = cls.from_config(config, **unused_kwargs)
540
+
541
+ # if device_map is None, load the state dict and move the params from meta device to the cpu
542
+ if device_map is None:
543
+ param_device = "cpu"
544
+ state_dict = load_state_dict(model_file)
545
+ # move the params from meta device to cpu
546
+ missing_keys = set(model.state_dict().keys()) - set(state_dict.keys())
547
+ if len(missing_keys) > 0:
548
+ raise ValueError(
549
+ f"Cannot load {cls} from {pretrained_model_name_or_path} because the following keys are"
550
+ f" missing: \n {', '.join(missing_keys)}. \n Please make sure to pass"
551
+ " `low_cpu_mem_usage=False` and `device_map=None` if you want to randomly initialize"
552
+ " those weights or else make sure your checkpoint file is correct."
553
+ )
554
+
555
+ for param_name, param in state_dict.items():
556
+ accepts_dtype = "dtype" in set(inspect.signature(set_module_tensor_to_device).parameters.keys())
557
+ if accepts_dtype:
558
+ set_module_tensor_to_device(model, param_name, param_device, value=param, dtype=torch_dtype)
559
+ else:
560
+ set_module_tensor_to_device(model, param_name, param_device, value=param)
561
+ else: # else let accelerate handle loading and dispatching.
562
+ # Load weights and dispatch according to the device_map
563
+ # by default the device_map is None and the weights are loaded on the CPU
564
+ accelerate.load_checkpoint_and_dispatch(model, model_file, device_map, dtype=torch_dtype)
565
+
566
+ loading_info = {
567
+ "missing_keys": [],
568
+ "unexpected_keys": [],
569
+ "mismatched_keys": [],
570
+ "error_msgs": [],
571
+ }
572
+ else:
573
+ config, unused_kwargs = cls.load_config(
574
+ config_path,
575
+ cache_dir=cache_dir,
576
+ return_unused_kwargs=True,
577
+ force_download=force_download,
578
+ resume_download=resume_download,
579
+ proxies=proxies,
580
+ local_files_only=local_files_only,
581
+ use_auth_token=use_auth_token,
582
+ revision=revision,
583
+ subfolder=subfolder,
584
+ device_map=device_map,
585
+ **kwargs,
586
+ )
587
+ model = cls.from_config(config, **unused_kwargs)
588
+
589
+ state_dict = load_state_dict(model_file)
590
+
591
+ model, missing_keys, unexpected_keys, mismatched_keys, error_msgs = cls._load_pretrained_model(
592
+ model,
593
+ state_dict,
594
+ model_file,
595
+ pretrained_model_name_or_path,
596
+ ignore_mismatched_sizes=ignore_mismatched_sizes,
597
+ )
598
+
599
+ loading_info = {
600
+ "missing_keys": missing_keys,
601
+ "unexpected_keys": unexpected_keys,
602
+ "mismatched_keys": mismatched_keys,
603
+ "error_msgs": error_msgs,
604
+ }
605
+
606
+ if torch_dtype is not None and not isinstance(torch_dtype, torch.dtype):
607
+ raise ValueError(
608
+ f"{torch_dtype} needs to be of type `torch.dtype`, e.g. `torch.float16`, but is {type(torch_dtype)}."
609
+ )
610
+ elif torch_dtype is not None:
611
+ model = model.to(torch_dtype)
612
+
613
+ model.register_to_config(_name_or_path=pretrained_model_name_or_path)
614
+
615
+ # Set model in evaluation mode to deactivate DropOut modules by default
616
+ model.eval()
617
+ if output_loading_info:
618
+ return model, loading_info
619
+
620
+ return model
621
+
622
+ @classmethod
623
+ def _load_pretrained_model(
624
+ cls,
625
+ model,
626
+ state_dict,
627
+ resolved_archive_file,
628
+ pretrained_model_name_or_path,
629
+ ignore_mismatched_sizes=False,
630
+ ):
631
+ # Retrieve missing & unexpected_keys
632
+ model_state_dict = model.state_dict()
633
+ loaded_keys = [k for k in state_dict.keys()]
634
+
635
+ expected_keys = list(model_state_dict.keys())
636
+
637
+ original_loaded_keys = loaded_keys
638
+
639
+ missing_keys = list(set(expected_keys) - set(loaded_keys))
640
+ unexpected_keys = list(set(loaded_keys) - set(expected_keys))
641
+
642
+ # Make sure we are able to load base models as well as derived models (with heads)
643
+ model_to_load = model
644
+
645
+ def _find_mismatched_keys(
646
+ state_dict,
647
+ model_state_dict,
648
+ loaded_keys,
649
+ ignore_mismatched_sizes,
650
+ ):
651
+ mismatched_keys = []
652
+ if ignore_mismatched_sizes:
653
+ for checkpoint_key in loaded_keys:
654
+ model_key = checkpoint_key
655
+
656
+ if (
657
+ model_key in model_state_dict
658
+ and state_dict[checkpoint_key].shape != model_state_dict[model_key].shape
659
+ ):
660
+ mismatched_keys.append(
661
+ (checkpoint_key, state_dict[checkpoint_key].shape, model_state_dict[model_key].shape)
662
+ )
663
+ del state_dict[checkpoint_key]
664
+ return mismatched_keys
665
+
666
+ if state_dict is not None:
667
+ # Whole checkpoint
668
+ mismatched_keys = _find_mismatched_keys(
669
+ state_dict,
670
+ model_state_dict,
671
+ original_loaded_keys,
672
+ ignore_mismatched_sizes,
673
+ )
674
+ error_msgs = _load_state_dict_into_model(model_to_load, state_dict)
675
+
676
+ if len(error_msgs) > 0:
677
+ error_msg = "\n\t".join(error_msgs)
678
+ if "size mismatch" in error_msg:
679
+ error_msg += (
680
+ "\n\tYou may consider adding `ignore_mismatched_sizes=True` in the model `from_pretrained` method."
681
+ )
682
+ raise RuntimeError(f"Error(s) in loading state_dict for {model.__class__.__name__}:\n\t{error_msg}")
683
+
684
+ if len(unexpected_keys) > 0:
685
+ logger.warning(
686
+ f"Some weights of the model checkpoint at {pretrained_model_name_or_path} were not used when"
687
+ f" initializing {model.__class__.__name__}: {unexpected_keys}\n- This IS expected if you are"
688
+ f" initializing {model.__class__.__name__} from the checkpoint of a model trained on another task"
689
+ " or with another architecture (e.g. initializing a BertForSequenceClassification model from a"
690
+ " BertForPreTraining model).\n- This IS NOT expected if you are initializing"
691
+ f" {model.__class__.__name__} from the checkpoint of a model that you expect to be exactly"
692
+ " identical (initializing a BertForSequenceClassification model from a"
693
+ " BertForSequenceClassification model)."
694
+ )
695
+ else:
696
+ logger.info(f"All model checkpoint weights were used when initializing {model.__class__.__name__}.\n")
697
+ if len(missing_keys) > 0:
698
+ logger.warning(
699
+ f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at"
700
+ f" {pretrained_model_name_or_path} and are newly initialized: {missing_keys}\nYou should probably"
701
+ " TRAIN this model on a down-stream task to be able to use it for predictions and inference."
702
+ )
703
+ elif len(mismatched_keys) == 0:
704
+ logger.info(
705
+ f"All the weights of {model.__class__.__name__} were initialized from the model checkpoint at"
706
+ f" {pretrained_model_name_or_path}.\nIf your task is similar to the task the model of the"
707
+ f" checkpoint was trained on, you can already use {model.__class__.__name__} for predictions"
708
+ " without further training."
709
+ )
710
+ if len(mismatched_keys) > 0:
711
+ mismatched_warning = "\n".join(
712
+ [
713
+ f"- {key}: found shape {shape1} in the checkpoint and {shape2} in the model instantiated"
714
+ for key, shape1, shape2 in mismatched_keys
715
+ ]
716
+ )
717
+ logger.warning(
718
+ f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at"
719
+ f" {pretrained_model_name_or_path} and are newly initialized because the shapes did not"
720
+ f" match:\n{mismatched_warning}\nYou should probably TRAIN this model on a down-stream task to be"
721
+ " able to use it for predictions and inference."
722
+ )
723
+
724
+ return model, missing_keys, unexpected_keys, mismatched_keys, error_msgs
725
+
726
+ @property
727
+ def device(self) -> device:
728
+ """
729
+ `torch.device`: The device on which the module is (assuming that all the module parameters are on the same
730
+ device).
731
+ """
732
+ return get_parameter_device(self)
733
+
734
+ @property
735
+ def dtype(self) -> torch.dtype:
736
+ """
737
+ `torch.dtype`: The dtype of the module (assuming that all the module parameters have the same dtype).
738
+ """
739
+ return get_parameter_dtype(self)
740
+
741
+ def num_parameters(self, only_trainable: bool = False, exclude_embeddings: bool = False) -> int:
742
+ """
743
+ Get number of (optionally, trainable or non-embeddings) parameters in the module.
744
+
745
+ Args:
746
+ only_trainable (`bool`, *optional*, defaults to `False`):
747
+ Whether or not to return only the number of trainable parameters
748
+
749
+ exclude_embeddings (`bool`, *optional*, defaults to `False`):
750
+ Whether or not to return only the number of non-embeddings parameters
751
+
752
+ Returns:
753
+ `int`: The number of parameters.
754
+ """
755
+
756
+ if exclude_embeddings:
757
+ embedding_param_names = [
758
+ f"{name}.weight"
759
+ for name, module_type in self.named_modules()
760
+ if isinstance(module_type, torch.nn.Embedding)
761
+ ]
762
+ non_embedding_parameters = [
763
+ parameter for name, parameter in self.named_parameters() if name not in embedding_param_names
764
+ ]
765
+ return sum(p.numel() for p in non_embedding_parameters if p.requires_grad or not only_trainable)
766
+ else:
767
+ return sum(p.numel() for p in self.parameters() if p.requires_grad or not only_trainable)
768
+
769
+
770
+ """ ConfigMixin base class and utilities."""
771
+
772
+
773
+ class FrozenDict(OrderedDict):
774
+ def __init__(self, *args, **kwargs):
775
+ super().__init__(*args, **kwargs)
776
+
777
+ for key, value in self.items():
778
+ setattr(self, key, value)
779
+
780
+ self.__frozen = True
781
+
782
+ def __delitem__(self, *args, **kwargs):
783
+ raise Exception(f"You cannot use ``__delitem__`` on a {self.__class__.__name__} instance.")
784
+
785
+ def setdefault(self, *args, **kwargs):
786
+ raise Exception(f"You cannot use ``setdefault`` on a {self.__class__.__name__} instance.")
787
+
788
+ def pop(self, *args, **kwargs):
789
+ raise Exception(f"You cannot use ``pop`` on a {self.__class__.__name__} instance.")
790
+
791
+ def update(self, *args, **kwargs):
792
+ raise Exception(f"You cannot use ``update`` on a {self.__class__.__name__} instance.")
793
+
794
+ def __setattr__(self, name, value):
795
+ if hasattr(self, "__frozen") and self.__frozen:
796
+ raise Exception(f"You cannot use ``__setattr__`` on a {self.__class__.__name__} instance.")
797
+ super().__setattr__(name, value)
798
+
799
+ def __setitem__(self, name, value):
800
+ if hasattr(self, "__frozen") and self.__frozen:
801
+ raise Exception(f"You cannot use ``__setattr__`` on a {self.__class__.__name__} instance.")
802
+ super().__setitem__(name, value)
803
+
804
+
805
+ class ConfigMixin:
806
+ r"""
807
+ Base class for all configuration classes. Stores all configuration parameters under `self.config` Also handles all
808
+ methods for loading/downloading/saving classes inheriting from [`ConfigMixin`] with
809
+ - [`~ConfigMixin.from_config`]
810
+ - [`~ConfigMixin.save_config`]
811
+
812
+ Class attributes:
813
+ - **config_name** (`str`) -- A filename under which the config should stored when calling
814
+ [`~ConfigMixin.save_config`] (should be overridden by parent class).
815
+ - **ignore_for_config** (`List[str]`) -- A list of attributes that should not be saved in the config (should be
816
+ overridden by subclass).
817
+ - **has_compatibles** (`bool`) -- Whether the class has compatible classes (should be overridden by subclass).
818
+ - **_deprecated_kwargs** (`List[str]`) -- Keyword arguments that are deprecated. Note that the init function
819
+ should only have a `kwargs` argument if at least one argument is deprecated (should be overridden by
820
+ subclass).
821
+ """
822
+ config_name = None
823
+ ignore_for_config = []
824
+ has_compatibles = False
825
+
826
+ _deprecated_kwargs = []
827
+
828
+ def register_to_config(self, **kwargs):
829
+ if self.config_name is None:
830
+ raise NotImplementedError(f"Make sure that {self.__class__} has defined a class name `config_name`")
831
+ # Special case for `kwargs` used in deprecation warning added to schedulers
832
+ # TODO: remove this when we remove the deprecation warning, and the `kwargs` argument,
833
+ # or solve in a more general way.
834
+ kwargs.pop("kwargs", None)
835
+ for key, value in kwargs.items():
836
+ try:
837
+ setattr(self, key, value)
838
+ except AttributeError as err:
839
+ logger.error(f"Can't set {key} with value {value} for {self}")
840
+ raise err
841
+
842
+ if not hasattr(self, "_internal_dict"):
843
+ internal_dict = kwargs
844
+ else:
845
+ previous_dict = dict(self._internal_dict)
846
+ internal_dict = {**self._internal_dict, **kwargs}
847
+ logger.debug(f"Updating config from {previous_dict} to {internal_dict}")
848
+
849
+ self._internal_dict = FrozenDict(internal_dict)
850
+
851
+ def save_config(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs):
852
+ """
853
+ Save a configuration object to the directory `save_directory`, so that it can be re-loaded using the
854
+ [`~ConfigMixin.from_config`] class method.
855
+
856
+ Args:
857
+ save_directory (`str` or `os.PathLike`):
858
+ Directory where the configuration JSON file will be saved (will be created if it does not exist).
859
+ """
860
+ if os.path.isfile(save_directory):
861
+ raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file")
862
+
863
+ os.makedirs(save_directory, exist_ok=True)
864
+
865
+ # If we save using the predefined names, we can load using `from_config`
866
+ output_config_file = os.path.join(save_directory, self.config_name)
867
+
868
+ self.to_json_file(output_config_file)
869
+ logger.info(f"Configuration saved in {output_config_file}")
870
+
871
+ @classmethod
872
+ def from_config(cls, config: Union[FrozenDict, Dict[str, Any]] = None, **kwargs):
873
+ r"""
874
+ Instantiate a Python class from a config dictionary
875
+
876
+ Parameters:
877
+ config (`Dict[str, Any]`):
878
+ A config dictionary from which the Python class will be instantiated. Make sure to only load
879
+ configuration files of compatible classes.
880
+ return_unused_kwargs (`bool`, *optional*, defaults to `False`):
881
+ Whether kwargs that are not consumed by the Python class should be returned or not.
882
+
883
+ kwargs (remaining dictionary of keyword arguments, *optional*):
884
+ Can be used to update the configuration object (after it has been loaded) and to initialize the Python class.
885
+ `**kwargs` will be directly passed to the underlying scheduler/model's `__init__` method and eventually
886
+ overwrite same named arguments of `config`.
887
+
888
+ Examples:
889
+
890
+ ```python
891
+ >>> from diffusers import DDPMScheduler, DDIMScheduler, PNDMScheduler
892
+
893
+ >>> # Download scheduler from huggingface.co and cache.
894
+ >>> scheduler = DDPMScheduler.from_pretrained("google/ddpm-cifar10-32")
895
+
896
+ >>> # Instantiate DDIM scheduler class with same config as DDPM
897
+ >>> scheduler = DDIMScheduler.from_config(scheduler.config)
898
+
899
+ >>> # Instantiate PNDM scheduler class with same config as DDPM
900
+ >>> scheduler = PNDMScheduler.from_config(scheduler.config)
901
+ ```
902
+ """
903
+ # <===== TO BE REMOVED WITH DEPRECATION
904
+ # TODO(Patrick) - make sure to remove the following lines when config=="model_path" is deprecated
905
+ if "pretrained_model_name_or_path" in kwargs:
906
+ config = kwargs.pop("pretrained_model_name_or_path")
907
+
908
+ if config is None:
909
+ raise ValueError("Please make sure to provide a config as the first positional argument.")
910
+ # ======>
911
+
912
+ # Instantiate and return the model directly from the config dict
913
+ model = cls(**config)
914
+ return model
915
+
916
+ @classmethod
917
+ def load_config(
918
+ cls, pretrained_model_name_or_path: Union[str, os.PathLike], return_unused_kwargs=False, **kwargs
919
+ ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
920
+ r"""
921
+ Load a configuration dictionary from a local file or directory, or from a model repo on huggingface.co
922
+
923
+ Parameters:
924
+ pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*):
925
+ Can be either:
926
+
927
+ - A string, the *model id* of a model repo on huggingface.co. Valid model ids should have an
928
+ organization name, like `google/ddpm-celebahq-256`.
929
+ - A path to a *directory* containing model weights saved using [`~ConfigMixin.save_config`], e.g.,
930
+ `./my_model_directory/`.
931
+
932
+ cache_dir (`Union[str, os.PathLike]`, *optional*):
933
+ Path to a directory in which a downloaded pretrained model configuration should be cached if the
934
+ standard cache should not be used.
935
+ force_download (`bool`, *optional*, defaults to `False`):
936
+ Whether or not to force the (re-)download of the model weights and configuration files, overriding the
937
+ cached versions if they exist.
938
+ resume_download (`bool`, *optional*, defaults to `False`):
939
+ Whether or not to delete incompletely received files. Will attempt to resume the download if such a
940
+ file exists.
941
+ proxies (`Dict[str, str]`, *optional*):
942
+ A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
943
+ 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
944
+ output_loading_info(`bool`, *optional*, defaults to `False`):
945
+ Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
946
+ local_files_only(`bool`, *optional*, defaults to `False`):
947
+ Whether or not to only look at local files (i.e., do not try to download the model).
948
+ use_auth_token (`str` or *bool*, *optional*):
949
+ The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
950
+ when running `transformers-cli login` (stored in `~/.huggingface`).
951
+ revision (`str`, *optional*, defaults to `"main"`):
952
+ The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
953
+ git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
954
+ identifier allowed by git.
955
+ subfolder (`str`, *optional*, defaults to `""`):
956
+ In case the relevant files are located inside a subfolder of the model repo (either remote in
957
+ huggingface.co or downloaded locally), you can specify the folder name here.
958
+
959
+ <Tip>
960
+
961
+ It is required to be logged in (`huggingface-cli login`) when you want to use private or [gated
962
+ models](https://huggingface.co/docs/hub/models-gated#gated-models).
963
+
964
+ </Tip>
965
+
966
+ <Tip>
967
+
968
+ Activate the special ["offline-mode"](https://huggingface.co/transformers/installation.html#offline-mode) to
969
+ use this method in a firewalled environment.
970
+
971
+ </Tip>
972
+ """
973
+ cache_dir = kwargs.pop("cache_dir", MUSE_CACHE)
974
+ force_download = kwargs.pop("force_download", False)
975
+ resume_download = kwargs.pop("resume_download", False)
976
+ proxies = kwargs.pop("proxies", None)
977
+ use_auth_token = kwargs.pop("use_auth_token", None)
978
+ local_files_only = kwargs.pop("local_files_only", False)
979
+ revision = kwargs.pop("revision", None)
980
+ _ = kwargs.pop("mirror", None)
981
+ subfolder = kwargs.pop("subfolder", None)
982
+
983
+ user_agent = {"file_type": "config"}
984
+
985
+ pretrained_model_name_or_path = str(pretrained_model_name_or_path)
986
+
987
+ if cls.config_name is None:
988
+ raise ValueError(
989
+ "`self.config_name` is not defined. Note that one should not load a config from "
990
+ "`ConfigMixin`. Please make sure to define `config_name` in a class inheriting from `ConfigMixin`"
991
+ )
992
+
993
+ if os.path.isfile(pretrained_model_name_or_path):
994
+ config_file = pretrained_model_name_or_path
995
+ elif os.path.isdir(pretrained_model_name_or_path):
996
+ if os.path.isfile(os.path.join(pretrained_model_name_or_path, cls.config_name)):
997
+ # Load from a PyTorch checkpoint
998
+ config_file = os.path.join(pretrained_model_name_or_path, cls.config_name)
999
+ elif subfolder is not None and os.path.isfile(
1000
+ os.path.join(pretrained_model_name_or_path, subfolder, cls.config_name)
1001
+ ):
1002
+ config_file = os.path.join(pretrained_model_name_or_path, subfolder, cls.config_name)
1003
+ else:
1004
+ raise EnvironmentError(
1005
+ f"Error no file named {cls.config_name} found in directory {pretrained_model_name_or_path}."
1006
+ )
1007
+ else:
1008
+ try:
1009
+ # Load from URL or cache if already cached
1010
+ config_file = hf_hub_download(
1011
+ pretrained_model_name_or_path,
1012
+ filename=cls.config_name,
1013
+ cache_dir=cache_dir,
1014
+ force_download=force_download,
1015
+ proxies=proxies,
1016
+ resume_download=resume_download,
1017
+ local_files_only=local_files_only,
1018
+ use_auth_token=use_auth_token,
1019
+ user_agent=user_agent,
1020
+ subfolder=subfolder,
1021
+ revision=revision,
1022
+ )
1023
+
1024
+ except RepositoryNotFoundError:
1025
+ raise EnvironmentError(
1026
+ f"{pretrained_model_name_or_path} is not a local folder and is not a valid model identifier"
1027
+ " listed on 'https://huggingface.co/models'\nIf this is a private repository, make sure to pass a"
1028
+ " token having permission to this repo with `use_auth_token` or log in with `huggingface-cli"
1029
+ " login`."
1030
+ )
1031
+ except RevisionNotFoundError:
1032
+ raise EnvironmentError(
1033
+ f"{revision} is not a valid git identifier (branch name, tag name or commit id) that exists for"
1034
+ " this model name. Check the model page at"
1035
+ f" 'https://huggingface.co/{pretrained_model_name_or_path}' for available revisions."
1036
+ )
1037
+ except EntryNotFoundError:
1038
+ raise EnvironmentError(
1039
+ f"{pretrained_model_name_or_path} does not appear to have a file named {cls.config_name}."
1040
+ )
1041
+ except HTTPError as err:
1042
+ raise EnvironmentError(
1043
+ "There was a specific connection error when trying to load"
1044
+ f" {pretrained_model_name_or_path}:\n{err}"
1045
+ )
1046
+ except ValueError:
1047
+ raise EnvironmentError(
1048
+ f"We couldn't connect to '{HUGGINGFACE_CO_RESOLVE_ENDPOINT}' to load this model, couldn't find it"
1049
+ f" in the cached files and it looks like {pretrained_model_name_or_path} is not the path to a"
1050
+ f" directory containing a {cls.config_name} file.\nCheckout your internet connection or see how to"
1051
+ " run the library in offline mode at"
1052
+ " 'https://huggingface.co/docs/diffusers/installation#offline-mode'."
1053
+ )
1054
+ except EnvironmentError:
1055
+ raise EnvironmentError(
1056
+ f"Can't load config for '{pretrained_model_name_or_path}'. If you were trying to load it from "
1057
+ "'https://huggingface.co/models', make sure you don't have a local directory with the same name. "
1058
+ f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory "
1059
+ f"containing a {cls.config_name} file"
1060
+ )
1061
+
1062
+ try:
1063
+ # Load config dict
1064
+ config_dict = cls._dict_from_json_file(config_file)
1065
+ except (json.JSONDecodeError, UnicodeDecodeError):
1066
+ raise EnvironmentError(f"It looks like the config file at '{config_file}' is not a valid JSON file.")
1067
+
1068
+ if return_unused_kwargs:
1069
+ return config_dict, kwargs
1070
+
1071
+ return config_dict
1072
+
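To make the branching above easier to follow, here is a condensed sketch of the local-path resolution order; `resolve_config_file` is a hypothetical helper and the `hf_hub_download` branch for remote repos is omitted.

```python
import os
from typing import Optional


def resolve_config_file(path: str, config_name: str, subfolder: Optional[str] = None) -> str:
    """Sketch of load_config's local-path resolution order (hub download omitted)."""
    if os.path.isfile(path):
        return path                                               # explicit config file path
    if os.path.isdir(path):
        candidate = os.path.join(path, config_name)
        if os.path.isfile(candidate):
            return candidate                                      # <dir>/<config_name>
        if subfolder is not None:
            nested = os.path.join(path, subfolder, config_name)
            if os.path.isfile(nested):
                return nested                                     # <dir>/<subfolder>/<config_name>
        raise EnvironmentError(f"No file named {config_name} found in directory {path}.")
    raise EnvironmentError(f"{path} is neither a file nor a directory; the real method falls back to hf_hub_download.")
```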
1073
+ @staticmethod
1074
+ def _get_init_keys(cls):
1075
+ return set(dict(inspect.signature(cls.__init__).parameters).keys())
1076
+
1077
+ @classmethod
1078
+ def _dict_from_json_file(cls, json_file: Union[str, os.PathLike]):
1079
+ with open(json_file, "r", encoding="utf-8") as reader:
1080
+ text = reader.read()
1081
+ return json.loads(text)
1082
+
1083
+ def __repr__(self):
1084
+ return f"{self.__class__.__name__} {self.to_json_string()}"
1085
+
1086
+ @property
1087
+ def config(self) -> Dict[str, Any]:
1088
+ """
1089
+ Returns the config of the class as a frozen dictionary
1090
+
1091
+ Returns:
1092
+ `Dict[str, Any]`: Config of the class.
1093
+ """
1094
+ return self._internal_dict
1095
+
1096
+ def to_json_string(self) -> str:
1097
+ """
1098
+ Serializes this instance to a JSON string.
1099
+
1100
+ Returns:
1101
+ `str`: String containing all the attributes that make up this configuration instance in JSON format.
1102
+ """
1103
+ config_dict = dict(self._internal_dict) if hasattr(self, "_internal_dict") else {}  # copy so the keys added below do not mutate the stored config
1104
+ config_dict["_class_name"] = self.__class__.__name__
1105
+ config_dict["_version"] = __version__
1106
+
1107
+ def to_json_saveable(value):
1108
+ if isinstance(value, np.ndarray):
1109
+ value = value.tolist()
1110
+ elif isinstance(value, PosixPath):
1111
+ value = str(value)
1112
+ return value
1113
+
1114
+ config_dict = {k: to_json_saveable(v) for k, v in config_dict.items()}
1115
+ return json.dumps(config_dict, indent=2, sort_keys=True) + "\n"
1116
+
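The nested `to_json_saveable` helper exists because numpy arrays and filesystem paths are not JSON-serializable. A standalone sketch of the same conversion (using `pathlib.Path` here rather than `PosixPath`, purely for illustration):

```python
import json
from pathlib import Path

import numpy as np


def to_json_saveable(value):
    # Mirror of the nested helper above: arrays become lists, paths become strings.
    if isinstance(value, np.ndarray):
        return value.tolist()
    if isinstance(value, Path):
        return str(value)
    return value


cfg = {"betas": np.array([0.1, 0.2]), "cache_dir": Path("/tmp/muse")}
print(json.dumps({k: to_json_saveable(v) for k, v in cfg.items()}, indent=2, sort_keys=True))
```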
1117
+ def to_json_file(self, json_file_path: Union[str, os.PathLike]):
1118
+ """
1119
+ Save this instance to a JSON file.
1120
+
1121
+ Args:
1122
+ json_file_path (`str` or `os.PathLike`):
1123
+ Path to the JSON file in which this configuration instance's parameters will be saved.
1124
+ """
1125
+ with open(json_file_path, "w", encoding="utf-8") as writer:
1126
+ writer.write(self.to_json_string())
1127
+
1128
+
1129
+ def register_to_config(init):
1130
+ r"""
1131
+ Decorator to apply on the init of classes inheriting from [`ConfigMixin`] so that all the arguments are
1132
+ automatically sent to `self.register_to_config`. To ignore a specific argument accepted by the init but that
1133
+ shouldn't be registered in the config, use the `ignore_for_config` class variable.
1134
+
1135
+ Warning: Once decorated, all private arguments (beginning with an underscore) are recorded in the config but are not passed on to the init!
1136
+ """
1137
+
1138
+ @functools.wraps(init)
1139
+ def inner_init(self, *args, **kwargs):
1140
+ # Ignore private kwargs in the init.
1141
+ init_kwargs = {k: v for k, v in kwargs.items() if not k.startswith("_")}
1142
+ config_init_kwargs = {k: v for k, v in kwargs.items() if k.startswith("_")}
1143
+ if not isinstance(self, ConfigMixin):
1144
+ raise RuntimeError(
1145
+ f"`@register_for_config` was applied to {self.__class__.__name__} init method, but this class does "
1146
+ "not inherit from `ConfigMixin`."
1147
+ )
1148
+
1149
+ ignore = getattr(self, "ignore_for_config", [])
1150
+ # Get positional arguments aligned with kwargs
1151
+ new_kwargs = {}
1152
+ signature = inspect.signature(init)
1153
+ parameters = {
1154
+ name: p.default for i, (name, p) in enumerate(signature.parameters.items()) if i > 0 and name not in ignore
1155
+ }
1156
+ for arg, name in zip(args, parameters.keys()):
1157
+ new_kwargs[name] = arg
1158
+
1159
+ # Then add all kwargs
1160
+ new_kwargs.update(
1161
+ {
1162
+ k: init_kwargs.get(k, default)
1163
+ for k, default in parameters.items()
1164
+ if k not in ignore and k not in new_kwargs
1165
+ }
1166
+ )
1167
+ new_kwargs = {**config_init_kwargs, **new_kwargs}
1168
+ getattr(self, "register_to_config")(**new_kwargs)
1169
+ init(self, *args, **init_kwargs)
1170
+
1171
+ return inner_init
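Putting the pieces together, here is a minimal sketch of how `ConfigMixin`, `FrozenDict`, and the `register_to_config` decorator interact. `ToyScheduler` is hypothetical and assumes the definitions above (including the module's `__version__` and logger) are importable.

```python
import tempfile


class ToyScheduler(ConfigMixin):
    config_name = "toy_config.json"

    @register_to_config
    def __init__(self, num_steps: int = 10, beta: float = 0.1):
        self.num_steps = num_steps
        self.beta = beta


sched = ToyScheduler(num_steps=20)
print(sched.config)  # FrozenDict([('num_steps', 20), ('beta', 0.1)])

with tempfile.TemporaryDirectory() as tmp:
    sched.save_config(tmp)               # writes <tmp>/toy_config.json
    cfg = ToyScheduler.load_config(tmp)  # plain dict parsed back from the JSON file
    # from_config() calls cls(**config), so strip the private bookkeeping keys first.
    clone = ToyScheduler.from_config({k: v for k, v in cfg.items() if not k.startswith("_")})
```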
InternLM/internlm/model/norm.py ADDED
@@ -0,0 +1,46 @@
1
+ # adopted from https://github.com/NVIDIA/apex/blob/master/apex/normalization/fused_layer_norm
2
+
3
+ import numbers
4
+
5
+ import torch
6
+ from torch.nn import init
7
+ from torch.nn.parameter import Parameter
8
+
9
+
10
+ def manual_rms_norm(my_input, normalized_shape, weight, eps):
11
+ # layer norm should always be calculated in float32
12
+ dims = tuple(i for i in range(-1, -len(normalized_shape) - 1, -1))
13
+ variance = my_input.to(torch.float32).pow(2).mean(dims, keepdim=True)
14
+ my_input = my_input * torch.rsqrt(variance + eps)
15
+
16
+ if weight is None:
17
+ return my_input
18
+
19
+ # model_hf into half-precision if necessary
20
+ if weight.dtype in [torch.float16, torch.bfloat16]:
21
+ my_input = my_input.to(weight.dtype)
22
+
23
+ return weight * my_input
24
+
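As a quick numerical sanity check (a sketch assuming `manual_rms_norm` above is importable), the function matches the closed form y = w * x / sqrt(mean(x^2) + eps), with the statistics computed in float32:

```python
import torch

x = torch.randn(2, 8, dtype=torch.float16)
w = torch.ones(8, dtype=torch.float16)
eps = 1e-5

y = manual_rms_norm(x, (8,), w, eps)

# Closed form, also computed in float32 and cast back to the weight dtype.
expected = (x.float() * torch.rsqrt(x.float().pow(2).mean(-1, keepdim=True) + eps)).to(w.dtype) * w
assert torch.allclose(y, expected)
```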
25
+
26
+ class RMSNormTorch(torch.nn.Module):
27
+ """A custom PyTorch module for RMS normalization."""
28
+
29
+ def __init__(self, normalized_shape, eps=1e-5):
30
+ super().__init__()
31
+
32
+ if isinstance(normalized_shape, numbers.Integral):
33
+ normalized_shape = (normalized_shape,)
34
+ self.normalized_shape = torch.Size(normalized_shape)
35
+ self.eps = eps
36
+ self.weight = Parameter(torch.empty(*normalized_shape))
37
+ self.reset_parameters()
38
+
39
+ def forward(self, _input: torch.Tensor):
40
+ return manual_rms_norm(_input, self.normalized_shape, self.weight, self.eps)
41
+
42
+ def reset_parameters(self):
43
+ init.ones_(self.weight)
44
+
45
+ def extra_repr(self):
46
+ return "{normalized_shape}, eps={eps}, ".format(**self.__dict__)
InternLM/internlm/model/utils.py ADDED
@@ -0,0 +1,224 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ from typing import Optional
5
+
6
+ import torch
7
+ import torch.nn.functional as F
8
+ from flash_attn.ops.fused_dense import FusedDenseFunc
9
+ from flash_attn.utils.distributed import (
10
+ all_gather_raw,
11
+ all_reduce_raw,
12
+ reduce_scatter_raw,
13
+ )
14
+ from torch import Tensor
15
+ from torch.cuda.amp import custom_bwd
16
+ from torch.distributed import ProcessGroup
17
+
18
+ from internlm.core.context import global_context as gpc
19
+ from internlm.utils.logger import get_logger
20
+
21
+ logger = get_logger(__file__)
22
+
23
+
24
+ def _split(input_, parallel_mode, dim=-1):
25
+ # skip if only one rank involved
26
+ world_size = gpc.get_world_size(parallel_mode)
27
+ if world_size == 1:
28
+ return input_
29
+
30
+ # Split along last dimension.
31
+ dim_size = input_.size(dim)
32
+ assert dim_size % world_size == 0, (
33
+ f"The dimension to split ({dim_size}) is not a multiple of world size ({world_size}), "
34
+ f"cannot split tensor evenly"
35
+ )
36
+
37
+ tensor_list = torch.split(input_, dim_size // world_size, dim=dim)
38
+ rank = gpc.get_local_rank(parallel_mode)
39
+ output = tensor_list[rank].contiguous()
40
+
41
+ return output
42
+
43
+
44
+ def _gather(input_, parallel_mode, dim=-1):
45
+ # skip if only one rank involved
46
+ world_size = gpc.get_world_size(parallel_mode)
47
+ if world_size == 1:
48
+ return input_
49
+
50
+ # all gather
51
+ rank = gpc.get_local_rank(parallel_mode)
52
+ tensor_list = [torch.empty_like(input_) for _ in range(world_size)]
53
+ tensor_list[rank] = input_
54
+ group = gpc.get_cpu_group(parallel_mode) if input_.device.type == "cpu" else gpc.get_group(parallel_mode)
55
+ torch.distributed.all_gather(tensor_list, input_, group=group)
56
+
57
+ # concat
58
+ output = torch.cat(tensor_list, dim=dim).contiguous()
59
+
60
+ return output
61
+
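What `_split` does per rank, without the distributed plumbing (a hypothetical single-process sketch; `_gather` is the inverse, concatenating the chunks back along the same dimension):

```python
import torch


def split_for_rank(x: torch.Tensor, rank: int, world_size: int, dim: int = -1) -> torch.Tensor:
    # Each rank keeps one contiguous, equally sized chunk of `dim`.
    assert x.size(dim) % world_size == 0
    return torch.split(x, x.size(dim) // world_size, dim=dim)[rank].contiguous()


x = torch.arange(12).reshape(2, 6)
print(split_for_rank(x, rank=0, world_size=2))  # columns 0..2
print(split_for_rank(x, rank=1, world_size=2))  # columns 3..5
```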
62
+
63
+ class _GatherForwardSplitBackward(torch.autograd.Function):
64
+ """Gather the input from model parallel region and concatenate.
65
+
66
+ Args:
67
+ input_: input matrix.
68
+ parallel_mode: parallel mode.
69
+ dim: dimension
70
+ """
71
+
72
+ @staticmethod
73
+ def symbolic(input_):
74
+ return _gather(input_, parallel_mode=None)
75
+
76
+ @staticmethod
77
+ def forward(ctx, input_, parallel_mode, dim):
78
+ ctx.mode = parallel_mode
79
+ ctx.dim = dim
80
+ return _gather(input_, parallel_mode, dim)
81
+
82
+ @staticmethod
83
+ def backward(ctx, grad_output):
84
+ return _split(grad_output, ctx.mode, ctx.dim), None, None
85
+
86
+
87
+ def gather_forward_split_backward(input_, parallel_mode, dim):
88
+ return _GatherForwardSplitBackward.apply(input_, parallel_mode, dim)
89
+
90
+
91
+ def linear_bias_wgrad_torch(my_input, grad_output, has_d_bias):
92
+ assert my_input.dtype == grad_output.dtype
93
+ grad_weight = torch.matmul(grad_output.t(), my_input)
94
+ grad_bias = grad_output.sum(dim=0) if has_d_bias else None
95
+ return grad_weight, grad_bias
96
+
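A small check (sketch) that `linear_bias_wgrad_torch` reproduces autograd's weight and bias gradients for y = x @ W^T + b:

```python
import torch

x = torch.randn(8, 16)
w = torch.randn(32, 16, requires_grad=True)
b = torch.zeros(32, requires_grad=True)
grad_out = torch.randn(8, 32)

(x @ w.t() + b).backward(grad_out)
grad_w, grad_b = linear_bias_wgrad_torch(x, grad_out, has_d_bias=True)
assert torch.allclose(grad_w, w.grad, atol=1e-5)
assert torch.allclose(grad_b, b.grad, atol=1e-5)
```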
97
+
98
+ # adapted from https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/ops/fused_dense.py
99
+ class FusedDenseFuncTorch(FusedDenseFunc):
100
+ """A custom PyTorch module extending FusedDenseFunc."""
101
+
102
+ @staticmethod
103
+ @custom_bwd
104
+ def backward(ctx, grad_output, *args):
105
+ grad_output = grad_output.contiguous()
106
+ if ctx.return_residual:
107
+ (grad_input,) = args
108
+ grad_input = grad_input.contiguous()
109
+ process_group = ctx.process_group
110
+ sequence_parallel = ctx.sequence_parallel
111
+ if ctx.compute_weight_gradient:
112
+ x, weight = ctx.saved_tensors
113
+ if process_group is not None and sequence_parallel:
114
+ total_x, handle_x = all_gather_raw(x, process_group, async_op=True)
115
+ else:
116
+ total_x = x
117
+ else:
118
+ (weight,) = ctx.saved_tensors
119
+ total_x = None
120
+ batch_shape = grad_output.shape[:-1]
121
+ batch_dim = batch_shape.numel()
122
+ grad_output = grad_output.reshape(batch_dim, grad_output.shape[-1])
123
+ if ctx.needs_input_grad[0]:
124
+ if not ctx.return_residual:
125
+ grad_input = F.linear(grad_output, weight.t())
126
+ else:
127
+ grad_input = torch.addmm(grad_input.reshape(batch_dim, grad_input.shape[-1]), grad_output, weight)
128
+ grad_input = grad_input.reshape(*batch_shape, grad_input.shape[-1])
129
+ if process_group is not None:
130
+ reduce_fn = reduce_scatter_raw if sequence_parallel else all_reduce_raw
131
+ grad_input, handle_grad_input = reduce_fn(grad_input, process_group, async_op=True)
132
+ else:
133
+ grad_input = None
134
+ if ctx.needs_input_grad[1]:
135
+ assert ctx.compute_weight_gradient
136
+ if process_group is not None and sequence_parallel:
137
+ handle_x.wait()
138
+ # unlike flash_attn's fused kernel, the weight gradient here is computed in pure torch (no CUDA dependency).
139
+ grad_weight, grad_bias = linear_bias_wgrad_torch(
140
+ total_x.reshape(batch_dim, total_x.shape[-1]), grad_output, ctx.needs_input_grad[2]
141
+ )
142
+ else:
143
+ grad_weight = None
144
+ grad_bias = grad_output if ctx.needs_input_grad[2] else None
145
+ if process_group is not None and ctx.needs_input_grad[0]:
146
+ handle_grad_input.wait()
147
+ return grad_input, grad_weight, grad_bias, None, None, None
148
+
149
+
150
+ def fused_dense_func_torch(
151
+ x: Tensor,
152
+ weight: Tensor,
153
+ bias: Optional[Tensor] = None,
154
+ return_residual: bool = False,
155
+ process_group: Optional[ProcessGroup] = None,
156
+ sequence_parallel: bool = True,
157
+ ):
158
+ dtype_eligible = x.dtype in [torch.float16, torch.bfloat16] or (
159
+ x.dtype == torch.float32 and torch.is_autocast_enabled()
160
+ )
161
+ if x.is_cuda and weight.is_cuda and (bias is None or bias.is_cuda) and dtype_eligible:
162
+ return FusedDenseFunc.apply(x, weight, bias, return_residual, process_group, sequence_parallel)
163
+ else:
164
+ return FusedDenseFuncTorch.apply(x, weight, bias, return_residual, process_group, sequence_parallel)
165
+
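The dispatch rule in `fused_dense_func_torch`, restated as a standalone predicate (a sketch; `use_fused_kernel` is hypothetical): the fused flash-attn kernel is used only when all tensors are on CUDA and the dtype is fp16/bf16 (or fp32 under autocast); otherwise the pure-torch backward above is used.

```python
import torch


def use_fused_kernel(x: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor = None) -> bool:
    dtype_eligible = x.dtype in (torch.float16, torch.bfloat16) or (
        x.dtype == torch.float32 and torch.is_autocast_enabled()
    )
    return x.is_cuda and weight.is_cuda and (bias is None or bias.is_cuda) and dtype_eligible


print(use_fused_kernel(torch.randn(4, 8), torch.randn(16, 8)))  # False: CPU fp32 falls back to torch
```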
166
+
167
+ class _SplitForwardGatherBackward(torch.autograd.Function):
168
+ """
169
+ Split the input and keep only the chunk corresponding to this rank.
170
+
171
+ Args:
172
+ input_: input matrix.
173
+ parallel_mode: parallel mode.
174
+ dim: dimension
175
+ """
176
+
177
+ @staticmethod
178
+ def symbolic(input_):
179
+ return _split(input_, parallel_mode=None)
180
+
181
+ @staticmethod
182
+ def forward(ctx, input_, parallel_mode, dim):
183
+ ctx.mode = parallel_mode
184
+ ctx.dim = dim
185
+ return _split(input_, parallel_mode, dim)
186
+
187
+ @staticmethod
188
+ def backward(ctx, grad_output):
189
+ return _gather(grad_output, ctx.mode, ctx.dim), None, None
190
+
191
+
192
+ def split_forward_gather_backward(input_, parallel_mode, dim):
193
+ return _SplitForwardGatherBackward.apply(input_, parallel_mode, dim)
194
+
195
+
196
+ def try_import_RMSNorm():
197
+ """
198
+ Try import MixFusedRMSNorm from apex, if failed, return our RMSNorm
199
+
200
+ """
201
+ try:
202
+ from apex.normalization.fused_layer_norm import MixedFusedRMSNorm as RMSNorm
203
+
204
+ return RMSNorm
205
+ except ModuleNotFoundError:
206
+ logger.warning("The torch implementation for MixFusedRMSNorm is slower than apex. Please note this!")
207
+ from internlm.model.norm import RMSNormTorch as RMSNorm
208
+
209
+ return RMSNorm
210
+
211
+
212
+ def try_import_LayerNorm():
213
+ """
214
+ Try import MixFusedRMSNorm from apex, if failed, return our RMSNorm
215
+
216
+ """
217
+ try:
218
+ from apex.normalization.fused_layer_norm import MixedFusedLayerNorm as LayerNorm
219
+
220
+ return LayerNorm
221
+ except ModuleNotFoundError:
222
+ import torch.nn as nn
223
+
224
+ return nn.LayerNorm
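Finally, a usage sketch of the two fallback helpers, assuming this repo (and its flash_attn dependency, which `internlm.model.utils` imports at module level) is on the Python path:

```python
from internlm.model.utils import try_import_LayerNorm, try_import_RMSNorm

RMSNorm = try_import_RMSNorm()      # apex MixedFusedRMSNorm if available, else RMSNormTorch
LayerNorm = try_import_LayerNorm()  # apex MixedFusedLayerNorm if available, else torch.nn.LayerNorm

norm1 = RMSNorm(4096, eps=1e-5)
norm2 = LayerNorm(4096)
print(type(norm1).__name__, type(norm2).__name__)
```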