NVIDIA · nvpstr · Nov 19, 2019 · Nov 11, 2019
diff --git a/TensorFlow/LanguageModeling/BERT/optimization.py b/TensorFlow/LanguageModeling/BERT/optimization.py
@@ -356,7 +356,7 @@ def apply_gradients(self, grads_and_vars, global_step=None, name=None,
       # the correct way of using L2 regularization/weight decay with Adam,
       # since that will interact with the m and v parameters in strange ways.
       #
-      # Instead we want ot decay the weights in a manner that doesn't interact
+      # Instead we want to decay the weights in a manner that doesn't interact
       # with the m/v parameters. This is equivalent to adding the square
       # of the weights to the loss with plain (non-momentum) SGD.
       if self._do_use_weight_decay(param_name):