# Source: maxdiffusion/src/maxdiffusion/configs/base_2_base.yml at main · AI-Hypercomputer/maxdiffusion
#
# 270 lines (234 loc) · 9.5 KB
# Copyright 2023 Google LLC
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#      https://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# The empty default is a sentinel: a reminder to choose a real run name.
run_name: ''

# For testing: local file that stores scalar metrics.
# If empty, no metrics are written.
metrics_file: ''
# If true, save metrics such as loss and TFLOPS to GCS in
# {base_output_directory}/{run_name}/metrics/
write_metrics: True
gcs_metrics: False

# For testing: local file that stores function timing metrics such as state
# creation and compilation. If empty, no metrics are written.
timing_metrics_file: ''
write_timing_metrics: True

# If true, save the config to GCS in {base_output_directory}/{run_name}/
save_config_to_gcs: False

# Flushes Tensorboard
log_period: 10000000000
pretrained_model_name_or_path: 'stabilityai/stable-diffusion-2-base'
unet_checkpoint: ''
revision: 'main'

# The weights are converted to this dtype.
weights_dtype: 'float32'
# Dtype set on the model's layers, e.g. nn.Dense(dtype=activations_dtype).
activations_dtype: 'bfloat16'
# Matmul and conv precision, see
# https://jax.readthedocs.io/en/latest/jax.lax.html#jax.lax.Precision
# Options are "DEFAULT", "HIGH", "HIGHEST".
# fp32 activations and fp32 weights with HIGHEST provide the best precision
# at the cost of time.
precision: 'DEFAULT'

# If False, state is not jitted and replicate is called instead, which is good
# for debugging on a single host. It must be True for multi-host.
jit_initializers: True
# Set to True to load weights from pytorch.
from_pt: True

split_head_dim: True
# Supported attention: dot_product, flash
attention: 'flash'
# If mask_padding_tokens is True, segment ids are passed to splash attention to
# avoid attending to padding tokens. Otherwise segment ids are not passed, which
# is faster on VPU-bound hardware like Trillium. However, when padding tokens
# are significant this leads to worse quality, so it should be set to True.
mask_padding_tokens: True
# Maxdiffusion has 2 types of attention sharding strategies:
# 1. attention_sharding_uniform = True : the same sequence sharding rules are
#    applied to q in both self and cross attention.
# 2. attention_sharding_uniform = False : heads are sharded uniformly across
#    devices for self attention, while sequence is sharded in cross attention q.
attention_sharding_uniform: True
flash_block_sizes: {}
# To override the default block sizes for flash attention, replace the empty
# mapping above with, for example:
# flash_block_sizes:
#   block_q: 64
#   block_kv_compute: 64
#   block_kv: 64
#   block_q_dkv: 64
#   block_kv_dkv: 64
#   block_kv_dkv_compute: 64
#   block_q_dq: 64
#   block_kv_dq: 64

# GroupNorm groups
norm_num_groups: 32

# If train_new_unet is True, unet weights are randomly initialized to train the
# unet from scratch; otherwise they are loaded from
# pretrained_model_name_or_path.
train_new_unet: False
# Train the text encoder.
train_text_encoder: False
text_encoder_learning_rate: 4.25e-6
# https://arxiv.org/pdf/2305.08891.pdf
snr_gamma: -1.0
# Timestep bias settings. Written as a block mapping so that no closing brace
# is needed for the file to parse.
timestep_bias:
  # A value of "later" will increase the frequency of the model's final
  # training steps. Options: none, earlier, later, range
  strategy: 'none'
  # Multiplier for bias; a value of 2.0 will double the weight of the bias,
  # 0.5 will halve it.
  multiplier: 1.0
  # When using strategy=range, the beginning (inclusive) timestep to bias.
  begin: 0
  # When using strategy=range, the final step (inclusive) to bias.
  end: 1000
  # Portion of timesteps to bias.
  # 0.5 will bias one half of the timesteps. The value of strategy determines
  # whether the biased portions are in the earlier or later timesteps.
  portion: 0.25
# Override parameters from the checkpoint's scheduler.
# Written as a block mapping so that no closing brace is needed for the file
# to parse.
diffusion_scheduler_config:
  _class_name: ''
  # Values are v_prediction, or leave empty to use the scheduler's default.
  prediction_type: ''
  rescale_zero_terminal_snr: False
  timestep_spacing: ''
# Supported hardware types are 'tpu', 'gpu'
hardware: 'tpu'
skip_jax_distributed_system: False

# Output directory
# Create a GCS bucket, e.g. my-maxtext-outputs, and set this to
# "gs://my-maxtext-outputs/"
base_output_directory: ''
# Parallelism
mesh_axes: ['data', 'fsdp', 'context', 'tensor']

# Logical axis names:
# batch : batch dimension of data and activations
# embed : attention qkv dense layer hidden dim named as embed
# heads : attention head dim = num_heads * head_dim
# length : attention sequence length
# temb_in : dense.shape[0] of resnet dense before conv
# out_c : dense.shape[1] of resnet dense before conv
# out_channels : conv.shape[-1] activation
# keep_1 : conv.shape[0] weight
# keep_2 : conv.shape[1] weight
# conv_in : conv.shape[2] weight
# conv_out : conv.shape[-1] weight
#
# Each entry pairs a logical axis name with one or more of the mesh_axes names.
# Written as a block sequence so that no closing bracket is needed and
# data_sharding below stays a separate top-level key.
logical_axis_rules:
  - ['batch', 'data']
  - ['activation_batch', ['data', 'fsdp']]
  - ['activation_heads', 'tensor']
  - ['activation_kv', 'tensor']
  - ['embed', 'fsdp']
  - ['heads', 'tensor']
  - ['conv_batch', ['data', 'fsdp']]
  - ['out_channels', 'tensor']
  - ['conv_out', 'fsdp']
data_sharding:
  - ['data', 'fsdp', 'context', 'tensor']
# One axis for each parallelism type may hold a placeholder (-1) value to
# auto-shard based on available slices and devices.
# By default, the product of the DCN axes should equal the number of slices
# and the product of the ICI axes should equal the number of devices per slice.

# recommended DCN axis to be auto-sharded
dcn_data_parallelism: -1
dcn_fsdp_parallelism: 1
dcn_context_parallelism: 1
dcn_tensor_parallelism: 1

# recommended ICI axis to be auto-sharded for TPUv5e
ici_data_parallelism: -1
# recommended ICI axis to be auto-sharded
ici_fsdp_parallelism: 1
ici_context_parallelism: 1
ici_tensor_parallelism: 1

allow_split_physical_axes: False
# Replace with a dataset path, or set train_data_dir. One of them has to be set.
dataset_name: 'diffusers/pokemon-gpt4-captions'
train_data_dir: ''
dataset_config_name: ''
train_split: 'train'
dataset_type: 'tf'
hf_data_dir: ''
hf_train_files: ''
hf_access_token: ''
grain_train_files: ''
grain_worker_count: 4

# cache_latents_text_encoder_outputs only applies to dataset_type="tf" and
# only to a small dataset that fits in memory.
# It prepares image latents and text encoder outputs, which reduces memory
# consumption and step time during training.
# The transformed dataset is saved at dataset_save_location.
cache_latents_text_encoder_outputs: True
dataset_save_location: '/tmp/pokemon-gpt4-captions'
jax_cache_dir: ''

image_column: 'image'
caption_column: 'text'
resolution: 512
center_crop: False
random_flip: False
# If cache_latents_text_encoder_outputs is True, num_proc is set to 1.
tokenize_captions_num_proc: 4
transform_images_num_proc: 4
reuse_example_batch: False
enable_data_shuffling: True

# Checkpoint every this number of samples; -1 means don't checkpoint.
checkpoint_every: -1
# Enables one replica to read the ckpt and then broadcast it to the rest.
enable_single_replica_ckpt_restoring: False
# Training loop
learning_rate: 1.e-7
scale_lr: False
max_train_samples: -1
max_train_steps: 20
output_dir: 'sd-model-finetuned'
tensorboard_dir: ''
per_device_batch_size: 1
warmup_steps_fraction: 0.0
# By default the length of the schedule is set to the number of steps.
# However you may choose a longer schedule
# (learning_rate_schedule_steps > steps), in which case the training will end
# before dropping fully down. Or you may choose a shorter schedule, where the
# unspecified steps will have a learning rate of 0.
learning_rate_schedule_steps: -1

# AdamW optimizer parameters
# Exponential decay rate to track the first moment of past gradients.
adam_b1: 0.9
# Exponential decay rate to track the second moment of past gradients.
adam_b2: 0.999
# A small constant applied to denominator outside of the square root.
adam_eps: 1.e-8
# AdamW Weight decay
adam_weight_decay: 1.e-2

opt_enable_grad_clipping: False
max_grad_value: 1.0
opt_enable_grad_global_norm_clipping: False
max_grad_norm: 1.0
enable_profiler: False
# Skip the first n steps for profiling, to omit things like compilation and to
# give the iteration time a chance to stabilize.
skip_first_n_steps_for_profiler: 1
profiler_steps: 5

# Generation parameters
prompt: 'A magical castle in the middle of a forest, artistic drawing'
negative_prompt: 'purple, red'
guidance_scale: 7.5
# Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
guidance_rescale: 0.0
num_inference_steps: 30
# For preprocessing data to tfrecords
data_files_pattern: ''
extracted_files_dir: ''
tfrecords_dir: ''
no_records_per_shard: 1000

enable_mllog: False

# dreambooth - this script always uses prior preservation.
instance_data_dir: ''
class_data_dir: ''
instance_prompt: ''
class_prompt: ''
prior_loss_weight: 1.0
num_class_images: 100
# If true, set dataset_save_location.
cache_dreambooth_dataset: False
quantization: ''
# Shard the range finding operation for quantization.
# By default this is set to the number of slices.
quantization_local_shard_count: -1
use_qwix_quantization: False

# Number of target slices, set to a positive integer.
compile_topology_num_slices: -1

# ML Diagnostics settings
enable_ml_diagnostics: False
profiler_gcs_path: ''
enable_ondemand_xprof: False
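# --- Web page text that trailed the scraped file; not part of the configuration. ---
# Provide feedback
# Saved searches
# Use saved searches to filter your results more quickly
# Files / Expand file tree
# base_2_base.yml
# Latest commit
# History
# base_2_base.yml
# File metadata and controls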