Skip to content

Commit 43bf035

Browse files
authored
Update docs to autogenerate pydantic config model docs (deepspeedai#2509)
* update zero config docs * add autogenerated docs for pydantic models used in ZeRO and Inference configs
1 parent b5d18a6 commit 43bf035

9 files changed

Lines changed: 307 additions & 39 deletions

File tree

deepspeed/__init__.py

Lines changed: 27 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -235,27 +235,39 @@ def init_inference(model, config=None, **kwargs):
235235
Description: all four cases are valid and supported in DS init_inference() API.
236236
237237
# Case 1: user provides no config and no kwargs. Default config will be used.
238-
generator.model = deepspeed.init_inference(generator.model)
239-
string = generator("DeepSpeed is")
240-
print(string)
238+
239+
.. code-block:: python
240+
241+
generator.model = deepspeed.init_inference(generator.model)
242+
string = generator("DeepSpeed is")
243+
print(string)
241244
242245
# Case 2: user provides a config and no kwargs. User supplied config will be used.
243-
generator.model = deepspeed.init_inference(generator.model, config=config)
244-
string = generator("DeepSpeed is")
245-
print(string)
246+
247+
.. code-block:: python
248+
249+
generator.model = deepspeed.init_inference(generator.model, config=config)
250+
string = generator("DeepSpeed is")
251+
print(string)
246252
247253
# Case 3: user provides no config and uses keyword arguments (kwargs) only.
248-
generator.model = deepspeed.init_inference(generator.model,
249-
mp_size=world_size,
250-
dtype=torch.half,
251-
replace_with_kernel_inject=True)
252-
string = generator("DeepSpeed is")
253-
print(string)
254+
255+
.. code-block:: python
256+
257+
generator.model = deepspeed.init_inference(generator.model,
258+
mp_size=world_size,
259+
dtype=torch.half,
260+
replace_with_kernel_inject=True)
261+
string = generator("DeepSpeed is")
262+
print(string)
254263
255264
# Case 4: user provides config and keyword arguments (kwargs). Both config and kwargs are merged and kwargs take precedence.
256-
generator.model = deepspeed.init_inference(generator.model, config={"dtype": torch.half}, replace_with_kernel_inject=True)
257-
string = generator("DeepSpeed is")
258-
print(string)
265+
266+
.. code-block:: python
267+
268+
generator.model = deepspeed.init_inference(generator.model, config={"dtype": torch.half}, replace_with_kernel_inject=True)
269+
string = generator("DeepSpeed is")
270+
print(string)
259271
260272
Arguments:
261273
model: Required: original nn.module object without any wrappers

deepspeed/runtime/config_utils.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,23 @@ class Config:
100100
extra = "forbid"
101101

102102

103+
class pp_int(int):
104+
"""
105+
A wrapper for integers that will return a custom string or comma-formatted
106+
string of the integer. For example, print(pp_int(1e5)) will return
107+
"100,000". This is useful mainly for auto-generated documentation purposes.
108+
"""
109+
def __new__(cls, val, custom_print_str=None):
110+
inst = super().__new__(cls, val)
111+
inst.custom_print_str = custom_print_str
112+
return inst
113+
114+
def __repr__(self):
115+
if self.custom_print_str:
116+
return self.custom_print_str
117+
return f"{self.real:,}"
118+
119+
103120
# adapted from https://stackoverflow.com/a/50701137/9201239
104121
class ScientificNotationEncoder(json.JSONEncoder):
105122
"""

deepspeed/runtime/zero/config.py

Lines changed: 144 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
import sys
88
from typing import Optional
99
from enum import Enum
10-
from deepspeed.runtime.config_utils import get_scalar_param, DeepSpeedConfigModel
10+
from deepspeed.runtime.config_utils import get_scalar_param, pp_int, DeepSpeedConfigModel
1111
from deepspeed.utils import logger
1212
from .offload_config import DeepSpeedZeroOffloadParamConfig, DeepSpeedZeroOffloadOptimizerConfig, OffloadDeviceEnum
1313

@@ -67,6 +67,7 @@ def get_zero_config(param_dict):
6767

6868

6969
class ZeroStageEnum(int, Enum):
70+
""" Enum class for possible zero stages """
7071
disabled = 0
7172
optimizer_states = 1
7273
gradients = 2
@@ -75,21 +76,86 @@ class ZeroStageEnum(int, Enum):
7576

7677

7778
class DeepSpeedZeroConfig(DeepSpeedConfigModel):
78-
stage: ZeroStageEnum = ZeroStageEnum.disabled
79+
"""
80+
Sets parameters for ZeRO optimizations.
81+
"""
82+
83+
stage: ZeroStageEnum = 0
84+
"""
85+
Chooses different stages of ZeRO Optimizer. Stage 0, 1, 2, and 3 refer
86+
to disabled, optimizer state partitioning, optimizer+gradient state
87+
partitioning, and optimizer+gradient+parameter partitioning, respectively.
88+
"""
89+
7990
contiguous_gradients: bool = True
91+
"""
92+
Copies the gradients to a contiguous buffer as they are produced. Avoids
93+
memory fragmentation during backward pass.
94+
"""
95+
8096
reduce_scatter: bool = True
81-
reduce_bucket_size: int = Field(5e8, ge=0)
97+
"""
98+
Uses reduce or reduce scatter instead of allreduce to average gradients
99+
"""
100+
101+
reduce_bucket_size: int = Field(pp_int(5e8), ge=0)
102+
"""
103+
Number of elements reduced/allreduced at a time. Limits the memory required
104+
for the reduce/allreduce for large model sizes
105+
"""
106+
82107
allgather_partitions: bool = True
83-
allgather_bucket_size: int = Field(5e8, ge=0)
84-
overlap_comm: bool = None # None for dynamic default value
108+
"""
109+
Chooses between allgather collective or a series of broadcast collectives
110+
to gather updated parameters from all the GPUs at the end of each step
111+
"""
112+
113+
allgather_bucket_size: int = Field(pp_int(5e8), ge=0)
114+
"""
115+
Number of elements allgathered at a time. Limits the memory required for
116+
the allgather for large model sizes
117+
"""
118+
119+
overlap_comm: bool = None # None for dynamic default value (see validator `overlap_comm_valid` below)
120+
"""
121+
Attempts to overlap the reduction of the gradients with backward computation
122+
"""
123+
85124
load_from_fp32_weights: bool = True
125+
"""
126+
Boolean indicating whether to initialize fp32 master weights from fp32
127+
copies in checkpoint (no precision loss) or from model's fp16 copies (with
128+
precision loss). This can be used to initialize optimizer state even when
129+
checkpoint is missing optimizer state.
130+
"""
86131

87132
elastic_checkpoint: bool = False
133+
"""
134+
Enable loading checkpoint that was saved by job with different GPU count.
135+
No longer supported.
136+
"""
88137

89-
# Offload Specific Parameters
90138
offload_param: Optional[DeepSpeedZeroOffloadParamConfig] = None
139+
"""
140+
Enable offloading of model parameters to CPU or NVMe. This frees up GPU
141+
memory for larger models or batch sizes. Valid only with stage 3. Expects a
142+
dictionary containing values for `DeepSpeedZeroOffloadParamConfig`_.
143+
"""
144+
91145
offload_optimizer: Optional[DeepSpeedZeroOffloadOptimizerConfig] = None
92-
sub_group_size: int = Field(1e9, ge=0)
146+
"""
147+
Enable offloading of optimizer state to CPU or NVMe, and optimizer
148+
computation to CPU. This frees up GPU memory for larger models or batch
149+
sizes. Valid for ZeRO stage 1, 2, 3. Expects a dictionary containing values
150+
for `DeepSpeedZeroOffloadOptimizerConfig`_.
151+
"""
152+
153+
sub_group_size: int = Field(pp_int(1e9), ge=0)
154+
"""
155+
Tile size for parameter processing to fit massive models (with trillions of
156+
parameters). Used by ZeRO3-Offload and ZeRO-Infinity
157+
"""
158+
93159
cpu_offload_param: bool = Field(
94160
None,
95161
deprecated=True,
@@ -98,12 +164,16 @@ class DeepSpeedZeroConfig(DeepSpeedConfigModel):
98164
lambda val: DeepSpeedZeroOffloadParamConfig(device=OffloadDeviceEnum.cpu)
99165
if val else None),
100166
)
167+
""" Deprecated, please use ``offload_param`` """
168+
101169
cpu_offload_use_pin_memory: bool = Field(
102170
None,
103171
deprecated=True,
104172
new_param="offload_param or offload_optimizer",
105173
set_new_param=False,
106174
)
175+
""" Deprecated, please use ``offload_param`` or ``offload_optimizer`` """
176+
107177
cpu_offload: bool = Field(
108178
None,
109179
deprecated=True,
@@ -112,29 +182,90 @@ class DeepSpeedZeroConfig(DeepSpeedConfigModel):
112182
lambda val: DeepSpeedZeroOffloadOptimizerConfig(device=OffloadDeviceEnum.cpu)
113183
if val else None),
114184
)
185+
""" Deprecated, please use ``offload_optimizer`` """
186+
187+
prefetch_bucket_size: int = Field(pp_int(5e7),
188+
ge=0,
189+
alias="stage3_prefetch_bucket_size")
190+
"""
191+
Maximum number of parameter elements to fetch ahead of use. Used by ZeRO3,
192+
ZeRO3-Offload, ZeRO-Infinity, and ZeRO-Inference.
193+
"""
115194

116-
# Stage3 Specific Parameters
117-
prefetch_bucket_size: int = Field(5e7, ge=0, alias="stage3_prefetch_bucket_size")
118-
param_persistence_threshold: int = Field(1e5,
195+
param_persistence_threshold: int = Field(pp_int(1e5),
119196
ge=0,
120197
alias="stage3_param_persistence_threshold")
121-
model_persistence_threshold: int = Field(sys.maxsize,
198+
"""
199+
Do not partition parameters smaller than this threshold. Smaller values use
200+
less memory, but can greatly increase communication (especially
201+
latency-bound messages).
202+
"""
203+
204+
model_persistence_threshold: int = Field(pp_int(sys.maxsize,
205+
"sys.maxsize"),
122206
ge=0,
123207
alias="stage3_model_persistence_threshold")
124-
max_live_parameters: int = Field(1e9, ge=0, alias="stage3_max_live_parameters")
125-
max_reuse_distance: int = Field(1e9, ge=0, alias="stage3_max_reuse_distance")
208+
"""
209+
Maximum number of parameter elements that can be persisted in GPU and not
210+
partitioned. This imposes an upper bound on the number of unpartitioned
211+
parameters resulting from param_persistence_threshold setting. Used by
212+
ZeRO3-Offload, ZeRO-Infinity and ZeRO-Inference.
213+
"""
214+
215+
max_live_parameters: int = Field(pp_int(1e9),
216+
ge=0,
217+
alias="stage3_max_live_parameters")
218+
"""
219+
The maximum number of parameters resident per GPU before releasing. Smaller
220+
values use less memory, but perform more communication.
221+
"""
222+
223+
max_reuse_distance: int = Field(pp_int(1e9), ge=0, alias="stage3_max_reuse_distance")
224+
"""
225+
Do not release a parameter if it will be reused within this threshold of
226+
parameters. Smaller values use less memory, but perform more communication.
227+
"""
228+
126229
gather_16bit_weights_on_model_save: bool = Field(
127230
False,
128231
alias="stage3_gather_16bit_weights_on_model_save")
232+
"""
233+
Consolidate the weights before saving the model by ``save_16bit_model()``.
234+
Since the weights are partitioned across GPUs, they aren’t part of
235+
``state_dict``, so this function automatically gathers the weights when
236+
this option is enabled and then saves the fp16 model weights.
237+
"""
238+
129239
stage3_gather_fp16_weights_on_model_save: bool = Field(
130240
False,
131241
deprecated=True,
132242
new_param="gather_16bit_weights_on_model_save")
243+
""" Deprecated, please use ``gather_16bit_weights_on_model_save`` """
133244

134245
ignore_unused_parameters: bool = True
246+
"""
247+
Unused parameters in modules may be unexpected in static networks, but
248+
could be normal in dynamic networks. This controls whether or not training
249+
should terminate with an error message when unused parameters are detected.
250+
This is set to ``True`` by default, which means unused parameters are
251+
ignored and training continues. Currently this is only used in stage 2.
252+
"""
253+
135254
legacy_stage1: bool = False
255+
"""
256+
For backward compatibility, enable the old ZeRO stage 1 implementation. Use at
257+
your own risk, will be deprecated soon.
258+
"""
259+
136260
round_robin_gradients: bool = False
261+
"""
262+
Stage 1 and 2 optimization for CPU offloading that parallelizes gradient
263+
copying to CPU memory among ranks by fine-grained gradient partitioning.
264+
Performance benefit grows with gradient accumulation steps (more copying
265+
between optimizer steps) or GPU count (increased parallelism).
266+
"""
137267

268+
# Validators
138269
@validator("overlap_comm")
139270
def overlap_comm_valid(cls, field_value, values):
140271
if field_value is None:

deepspeed/runtime/zero/offload_config.py

Lines changed: 60 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,33 +6,87 @@
66
from pydantic import Field, validator
77
from enum import Enum
88
from pathlib import Path
9-
from deepspeed.runtime.config_utils import DeepSpeedConfigModel
9+
from deepspeed.runtime.config_utils import DeepSpeedConfigModel, pp_int
1010

1111

1212
class OffloadDeviceEnum(str, Enum):
13+
""" Enum for valid offload devices """
1314
none = "none"
1415
cpu = "cpu"
1516
nvme = "nvme"
1617

1718

1819
class DeepSpeedZeroOffloadParamConfig(DeepSpeedConfigModel):
19-
device: OffloadDeviceEnum = OffloadDeviceEnum.none
20+
""" Set options for parameter offload. Valid only with stage 3. """
21+
22+
device: OffloadDeviceEnum = "none"
23+
"""
24+
Device memory to offload model parameters. Supported options are `cpu` and
25+
`nvme`.
26+
"""
27+
2028
nvme_path: Path = None
29+
""" Filesystem path for NVMe device for parameter offloading. """
30+
2131
buffer_count: int = Field(5, ge=0)
22-
buffer_size: int = Field(1e8, ge=0)
23-
max_in_cpu: int = Field(1e9, ge=0)
32+
""" Number of buffers in buffer pool for parameter offloading to NVMe. """
33+
34+
buffer_size: int = Field(pp_int(1e8), ge=0)
35+
""" Size of buffers in buffer pool for parameter offloading to NVMe. """
36+
37+
max_in_cpu: int = Field(pp_int(1e9), ge=0)
38+
"""
39+
Number of parameter elements to maintain in CPU memory when offloading to
40+
NVMe is enabled.
41+
"""
42+
2443
pin_memory: bool = False
44+
"""
45+
Offload to page-locked CPU memory. This could boost throughput at the cost
46+
of extra memory overhead.
47+
"""
2548

2649

2750
class DeepSpeedZeroOffloadOptimizerConfig(DeepSpeedConfigModel):
28-
device: OffloadDeviceEnum = OffloadDeviceEnum.none
51+
""" Set options for optimizer offload. Valid with stage 1, 2, and 3. """
52+
53+
device: OffloadDeviceEnum = "none"
54+
"""
55+
Device memory to offload optimizer state. Supported options are `cpu` and
56+
`nvme`. Optimizer computation is offloaded to CPU regardless of device option.
57+
"""
58+
2959
nvme_path: Path = None
60+
""" Filesystem path for NVMe device for optimizer state offloading. """
61+
3062
buffer_count: int = Field(4, ge=0)
63+
"""
64+
Number of buffers in buffer pool for optimizer state offloading to NVMe.
65+
This should be at least the number of states maintained per parameter by
66+
the optimizer. For example, Adam optimizer has 4 states (parameter,
67+
gradient, momentum, and variance).
68+
"""
69+
3170
pin_memory: bool = False
71+
"""
72+
Offload to page-locked CPU memory. This could boost throughput at the cost
73+
of extra memory overhead.
74+
"""
75+
3276
pipeline_read: bool = False
77+
"""
78+
For tile-based optimizer step processing, overlap read of next tile with
79+
computation of current tile. Used in ZeRO-Infinity.
80+
"""
81+
3382
pipeline_write: bool = False
34-
fast_init: bool = False
83+
"""
84+
For tile-based optimizer step processing, overlap write of previous tile
85+
with computation of current tile.
86+
"""
3587

88+
fast_init: bool = False
89+
""" Enable fast optimizer initialization when offloading to NVMe. """
3690
@validator("pipeline_read", "pipeline_write", always=True)
3791
def set_pipeline(cls, field_value, values):
3892
values["pipeline"] = field_value or values.get("pipeline", False)

0 commit comments

Comments
 (0)