goodjava
diff --git a/‎deepspeed/__init__.py‎
Lines changed: 27 additions & 15 deletions b/‎deepspeed/__init__.py‎
Lines changed: 27 additions & 15 deletions
diff --git a/‎deepspeed/runtime/config_utils.py‎
Lines changed: 17 additions & 0 deletions b/‎deepspeed/runtime/config_utils.py‎
Lines changed: 17 additions & 0 deletions
diff --git a/‎deepspeed/runtime/zero/config.py‎
Lines changed: 144 additions & 13 deletions b/‎deepspeed/runtime/zero/config.py‎
Lines changed: 144 additions & 13 deletions
diff --git a/‎deepspeed/runtime/zero/offload_config.py‎
Lines changed: 60 additions & 6 deletions b/‎deepspeed/runtime/zero/offload_config.py‎
Lines changed: 60 additions & 6 deletions
@@ -235,27 +235,39 @@ def init_inference(model, config=None, **kwargs):
     Description: all four cases are valid and supported in DS init_inference() API.
 
     # Case 1: user provides no config and no kwargs. Default config will be used.
-    generator.model = deepspeed.init_inference(generator.model)
-    string = generator("DeepSpeed is")
-    print(string)
+
+    .. code-block:: python
+
+        generator.model = deepspeed.init_inference(generator.model)
+        string = generator("DeepSpeed is")
+        print(string)
 
     # Case 2: user provides a config and no kwargs. User supplied config will be used.
-    generator.model = deepspeed.init_inference(generator.model, config=config)
-    string = generator("DeepSpeed is")
-    print(string)
+
+    .. code-block:: python
+
+        generator.model = deepspeed.init_inference(generator.model, config=config)
+        string = generator("DeepSpeed is")
+        print(string)
 
     # Case 3: user provides no config and uses keyword arguments (kwargs) only.
-    generator.model = deepspeed.init_inference(generator.model,
-                                                mp_size=world_size,
-                                                dtype=torch.half,
-                                                replace_with_kernel_inject=True)
-    string = generator("DeepSpeed is")
-    print(string)
+
+    .. code-block:: python
+
+        generator.model = deepspeed.init_inference(generator.model,
+                                                    mp_size=world_size,
+                                                    dtype=torch.half,
+                                                    replace_with_kernel_inject=True)
+        string = generator("DeepSpeed is")
+        print(string)
 
     # Case 4: user provides config and keyword arguments (kwargs). Both config and kwargs are merged and kwargs take precedence.
-    generator.model = deepspeed.init_inference(generator.model, config={"dtype": torch.half}, replace_with_kernel_inject=True)
-    string = generator("DeepSpeed is")
-    print(string)
+
+    .. code-block:: python
+
+        generator.model = deepspeed.init_inference(generator.model, config={"dtype": torch.half}, replace_with_kernel_inject=True)
+        string = generator("DeepSpeed is")
+        print(string)
 
     Arguments:
         model: Required: original nn.module object without any wrappers
 
@@ -100,6 +100,23 @@ class Config:
         extra = "forbid"
 
 
+class pp_int(int):
+    """
+    A wrapper for integers whose repr is a custom string or a comma-formatted
+    string of the integer. For example, print(pp_int(1e5)) will print
+    "100,000". This is useful mainly for auto-generated documentation purposes.
+    """
+    def __new__(cls, val, custom_print_str=None):
+        inst = super().__new__(cls, val)
+        inst.custom_print_str = custom_print_str
+        return inst
+
+    def __repr__(self):
+        if self.custom_print_str:
+            return self.custom_print_str
+        return f"{self.real:,}"
+
+
 # adapted from https://stackoverflow.com/a/50701137/9201239
 class ScientificNotationEncoder(json.JSONEncoder):
     """
 
@@ -7,7 +7,7 @@
 import sys
 from typing import Optional
 from enum import Enum
-from deepspeed.runtime.config_utils import get_scalar_param, DeepSpeedConfigModel
+from deepspeed.runtime.config_utils import get_scalar_param, pp_int, DeepSpeedConfigModel
 from deepspeed.utils import logger
 from .offload_config import DeepSpeedZeroOffloadParamConfig, DeepSpeedZeroOffloadOptimizerConfig, OffloadDeviceEnum
 
@@ -67,6 +67,7 @@ def get_zero_config(param_dict):
 
 
 class ZeroStageEnum(int, Enum):
+    """ Enum class for possible zero stages """
     disabled = 0
     optimizer_states = 1
     gradients = 2
@@ -75,21 +76,86 @@ class ZeroStageEnum(int, Enum):
 
 
 class DeepSpeedZeroConfig(DeepSpeedConfigModel):
-    stage: ZeroStageEnum = ZeroStageEnum.disabled
+    """
+    Sets parameters for ZeRO optimizations.
+    """
+
+    stage: ZeroStageEnum = 0
+    """
+    Chooses different stages of ZeRO Optimizer. Stages 0, 1, 2, and 3 refer
+    to disabled, optimizer state partitioning, optimizer+gradient state
+    partitioning, and optimizer+gradient+parameter partitioning, respectively.
+    """
+
     contiguous_gradients: bool = True
+    """
+    Copies the gradients to a contiguous buffer as they are produced. Avoids
+    memory fragmentation during backward pass.
+    """
+
     reduce_scatter: bool = True
-    reduce_bucket_size: int = Field(5e8, ge=0)
+    """
+    Uses reduce or reduce scatter instead of allreduce to average gradients
+    """
+
+    reduce_bucket_size: int = Field(pp_int(5e8), ge=0)
+    """
+    Number of elements reduced/allreduced at a time. Limits the memory required
+    for the allgather for large model sizes
+    """
+
     allgather_partitions: bool = True
-    allgather_bucket_size: int = Field(5e8, ge=0)
-    overlap_comm: bool = None  # None for dynamic default value
+    """
+    Chooses between allgather collective or a series of broadcast collectives
+    to gather updated parameters from all the GPUs at the end of each step
+    """
+
+    allgather_bucket_size: int = Field(pp_int(5e8), ge=0)
+    """
+    Number of elements allgathered at a time. Limits the memory required for
+    the allgather for large model sizes
+    """
+
+    overlap_comm: bool = None  # None for dynamic default value (see validator `overlap_comm_valid` below)
+    """
+    Attempts to overlap the reduction of the gradients with backward computation
+    """
+
     load_from_fp32_weights: bool = True
+    """
+    Boolean indicating whether to initialize fp32 master weights from fp32
+    copies in checkpoint (no precision loss) or from model's fp16 copies (with
+    precision loss). This can be used to initialize optimizer state even when
+    checkpoint is missing optimizer state.
+    """
 
     elastic_checkpoint: bool = False
+    """
+    Enable loading a checkpoint that was saved by a job with a different GPU
+    count. No longer supported.
+    """
 
-    # Offload Specific Parameters
     offload_param: Optional[DeepSpeedZeroOffloadParamConfig] = None
+    """
+    Enable offloading of model parameters to CPU or NVMe. This frees up GPU
+    memory for larger models or batch sizes. Valid only with stage 3. Expects a
+    dictionary containing values for `DeepSpeedZeroOffloadParamConfig`_.
+    """
+
     offload_optimizer: Optional[DeepSpeedZeroOffloadOptimizerConfig] = None
-    sub_group_size: int = Field(1e9, ge=0)
+    """
+    Enable offloading of optimizer state to CPU or NVMe, and optimizer
+    computation to CPU. This frees up GPU memory for larger models or batch
+    sizes. Valid for ZeRO stage 1, 2, 3. Expects a dictionary containing values
+    for `DeepSpeedZeroOffloadOptimizerConfig`_.
+    """
+
+    sub_group_size: int = Field(pp_int(1e9), ge=0)
+    """
+    Tile size for parameter processing to fit massive models (with trillions of
+    parameters). Used by ZeRO3-Offload and ZeRO-Infinity
+    """
+
     cpu_offload_param: bool = Field(
         None,
         deprecated=True,
@@ -98,12 +164,16 @@ class DeepSpeedZeroConfig(DeepSpeedConfigModel):
             lambda val: DeepSpeedZeroOffloadParamConfig(device=OffloadDeviceEnum.cpu)
             if val else None),
     )
+    """ Deprecated, please use ``offload_param`` """
+
     cpu_offload_use_pin_memory: bool = Field(
         None,
         deprecated=True,
         new_param="offload_param or offload_optimizer",
         set_new_param=False,
     )
+    """ Deprecated, please use ``offload_param`` or ``offload_optimizer`` """
+
     cpu_offload: bool = Field(
         None,
         deprecated=True,
@@ -112,29 +182,90 @@ class DeepSpeedZeroConfig(DeepSpeedConfigModel):
             lambda val: DeepSpeedZeroOffloadOptimizerConfig(device=OffloadDeviceEnum.cpu)
             if val else None),
     )
+    """ Deprecated, please use ``offload_optimizer`` """
+
+    prefetch_bucket_size: int = Field(pp_int(5e7),
+                                      ge=0,
+                                      alias="stage3_prefetch_bucket_size")
+    """
+    Maximum number of parameter elements to fetch ahead of use. Used by ZeRO3,
+    ZeRO3-Offload, ZeRO-Infinity, and ZeRO-Inference.
+    """
 
-    # Stage3 Specific Parameters
-    prefetch_bucket_size: int = Field(5e7, ge=0, alias="stage3_prefetch_bucket_size")
-    param_persistence_threshold: int = Field(1e5,
+    param_persistence_threshold: int = Field(pp_int(1e5),
                                              ge=0,
                                              alias="stage3_param_persistence_threshold")
-    model_persistence_threshold: int = Field(sys.maxsize,
+    """
+    Do not partition parameters smaller than this threshold. Smaller values use
+    less memory, but can greatly increase communication (especially
+    latency-bound messages).
+    """
+
+    model_persistence_threshold: int = Field(pp_int(sys.maxsize,
+                                                    "sys.maxsize"),
                                              ge=0,
                                              alias="stage3_model_persistence_threshold")
-    max_live_parameters: int = Field(1e9, ge=0, alias="stage3_max_live_parameters")
-    max_reuse_distance: int = Field(1e9, ge=0, alias="stage3_max_reuse_distance")
+    """
+    Maximum number of parameter elements that can be persisted in GPU and not
+    partitioned. This imposes an upper bound on the number of unpartitioned
+    parameters resulting from param_persistence_threshold setting. Used by
+    ZeRO3-Offload, ZeRO-Infinity and ZeRO-Inference.
+    """
+
+    max_live_parameters: int = Field(pp_int(1e9),
+                                     ge=0,
+                                     alias="stage3_max_live_parameters")
+    """
+    The maximum number of parameters resident per GPU before releasing. Smaller
+    values use less memory, but perform more communication.
+    """
+
+    max_reuse_distance: int = Field(pp_int(1e9), ge=0, alias="stage3_max_reuse_distance")
+    """
+    Do not release a parameter if it will be reused within this threshold of
+    parameters. Smaller values use less memory, but perform more communication.
+    """
+
     gather_16bit_weights_on_model_save: bool = Field(
         False,
         alias="stage3_gather_16bit_weights_on_model_save")
+    """
+    Consolidate the weights before saving the model by ``save_16bit_model()``.
+    Since the weights are partitioned across GPUs, they aren’t part of
+    ``state_dict``, so this function automatically gathers the weights when
+    this option is enabled and then saves the fp16 model weights.
+    """
+
     stage3_gather_fp16_weights_on_model_save: bool = Field(
         False,
         deprecated=True,
         new_param="gather_16bit_weights_on_model_save")
+    """ Deprecated, please use ``gather_16bit_weights_on_model_save`` """
 
     ignore_unused_parameters: bool = True
+    """
+    Unused parameters in modules may be unexpected in static networks, but
+    could be normal in dynamic networks. This controls whether or not training
+    should terminate with an error message when unused parameters are detected.
+    This is set to ``True`` by default, which means unused parameters are
+    ignored and training continues. Currently only used in stage 2.
+    """
+
     legacy_stage1: bool = False
+    """
+    For backward-compatibility enable old ZeRO stage 1 implementation. Use at
+    your own risk, will be deprecated soon.
+    """
+
     round_robin_gradients: bool = False
+    """
+    Stage 1 and 2 optimization for CPU offloading that parallelizes gradient
+    copying to CPU memory among ranks by fine-grained gradient partitioning.
+    Performance benefit grows with gradient accumulation steps (more copying
+    between optimizer steps) or GPU count (increased parallelism).
+    """
 
+    # Validators
     @validator("overlap_comm")
     def overlap_comm_valid(cls, field_value, values):
         if field_value is None:
 
@@ -6,33 +6,87 @@
 from pydantic import Field, validator
 from enum import Enum
 from pathlib import Path
-from deepspeed.runtime.config_utils import DeepSpeedConfigModel
+from deepspeed.runtime.config_utils import DeepSpeedConfigModel, pp_int
 
 
 class OffloadDeviceEnum(str, Enum):
+    """ Enum for valid offload devices """
     none = "none"
     cpu = "cpu"
     nvme = "nvme"
 
 
 class DeepSpeedZeroOffloadParamConfig(DeepSpeedConfigModel):
-    device: OffloadDeviceEnum = OffloadDeviceEnum.none
+    """ Set options for parameter offload. Valid only with stage 3. """
+
+    device: OffloadDeviceEnum = "none"
+    """
+    Device memory to offload model parameters. Supported options are `cpu` and
+    `nvme`.
+    """
+
     nvme_path: Path = None
+    """ Filesystem path for NVMe device for parameter offloading. """
+
     buffer_count: int = Field(5, ge=0)
-    buffer_size: int = Field(1e8, ge=0)
-    max_in_cpu: int = Field(1e9, ge=0)
+    """ Number of buffers in buffer pool for parameter offloading to NVMe. """
+
+    buffer_size: int = Field(pp_int(1e8), ge=0)
+    """ Size of buffers in buffer pool for parameter offloading to NVMe. """
+
+    max_in_cpu: int = Field(pp_int(1e9), ge=0)
+    """
+    Number of parameter elements to maintain in CPU memory when offloading to
+    NVMe is enabled.
+    """
+
     pin_memory: bool = False
+    """
+    Offload to page-locked CPU memory. This could boost throughput at the cost
+    of extra memory overhead.
+    """
 
 
 class DeepSpeedZeroOffloadOptimizerConfig(DeepSpeedConfigModel):
-    device: OffloadDeviceEnum = OffloadDeviceEnum.none
+    """ Set options for optimizer offload. Valid with stage 1, 2, and 3. """
+
+    device: OffloadDeviceEnum = "none"
+    """
+    Device memory to offload optimizer state. Supported options are `cpu` and
+    `nvme`. Optimizer computation is offloaded to CPU regardless of device option.
+    """
+
     nvme_path: Path = None
+    """ Filesystem path for NVMe device for optimizer state offloading. """
+
     buffer_count: int = Field(4, ge=0)
+    """
+    Number of buffers in buffer pool for optimizer state offloading to NVMe.
+    This should be at least the number of states maintained per parameter by
+    the optimizer. For example, Adam optimizer has 4 states (parameter,
+    gradient, momentum, and variance).
+    """
+
     pin_memory: bool = False
+    """
+    Offload to page-locked CPU memory. This could boost throughput at the cost
+    of extra memory overhead.
+    """
+
     pipeline_read: bool = False
+    """
+    For tile-based optimizer step processing, overlap read of next tile with
+    computation of current tile. Used in ZeRO-Infinity.
+    """
+
     pipeline_write: bool = False
-    fast_init: bool = False
+    """
+    For tile-based optimizer step processing, overlap write of previous tile
+    with computation of current tile.
+    """
 
+    fast_init: bool = False
+    """ Enable fast optimizer initialization when offloading to NVMe. """
     @validator("pipeline_read", "pipeline_write", always=True)
     def set_pipeline(cls, field_value, values):
         values["pipeline"] = field_value or values.get("pipeline", False)