Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions bing_bert/glue_bert_base.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
{
"train_batch_size": 32,
"train_micro_batch_size_per_gpu": 32,
"steps_per_print": 10,
"optimizer": {
"type": "Adam",
"params": {
"lr": 3e-5,
"weight_decay": 0.0,
"bias_correction": false
}
},
"gradient_clipping": 1.0,
"fp16": {
"enabled": true
}

}

19 changes: 19 additions & 0 deletions bing_bert/glue_bert_large.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
{
"train_batch_size": 32,
"train_micro_batch_size_per_gpu": 4,
"steps_per_print": 10,
"optimizer": {
"type": "Adam",
"params": {
"lr": 3e-5,
"weight_decay": 0.0,
"bias_correction": false
}
},
"gradient_clipping": 1.0,
"fp16": {
"enabled": true
}

}

2 changes: 2 additions & 0 deletions bing_bert/run_glue_bert_base_finetune.sh
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ run_cmd="python3.6 -m torch.distributed.launch \
--task_name $TASK \
--do_train \
--do_eval \
--deepspeed \
--deepspeed_config ${base_dir}/glue_bert_base.json \
--do_lower_case \
--data_dir $GLUE_DIR/$TASK/ \
--bert_model bert-large-uncased \
Expand Down
57 changes: 57 additions & 0 deletions bing_bert/run_glue_bert_large_finetune.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#!/bin/bash
# Fine-tune a BERT-large checkpoint on a single GLUE task with DeepSpeed.
#
# Usage:
#   run_glue_bert_large_finetune.sh NGPU TASK EFFECTIVE_BATCH_SIZE LR \
#       NUM_EPOCH JOBNAME CHECKPOINT_PATH
#
#   NGPU                  number of processes/GPUs passed to --nproc_per_node
#   TASK                  GLUE task name; data is read from $GLUE_DIR/$TASK/
#   EFFECTIVE_BATCH_SIZE  global batch size, split evenly across NGPU
#   LR                    learning rate forwarded as --learning_rate
#   NUM_EPOCH             forwarded as --num_train_epochs
#   JOBNAME               tag used in the output directory and log file names
#   CHECKPOINT_PATH       checkpoint forwarded as --model_file
#
# stdout and stderr of the training run are written to
#   $LOG_DIR/bert_large/<JOBNAME>_<TASK>_..._deepspeed-kernel.txt

# All seven positional arguments are required: the arithmetic below and the
# launcher command line are malformed if any of them is empty.
if [ "$#" -lt 7 ]; then
    echo "Usage: $0 NGPU TASK EFFECTIVE_BATCH_SIZE LR NUM_EPOCH JOBNAME CHECKPOINT_PATH" >&2
    exit 1
fi

LOG_DIR="log"
model_name="bert_large"

# The log file lives in $LOG_DIR/$model_name, so the nested directory has to
# exist before the redirect in run_cmd is evaluated; otherwise the redirect
# fails and the training command never starts. -p also makes this a no-op
# when the directories are already present.
mkdir -p "${LOG_DIR}/${model_name}"

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"

NGPU=$1

echo "Started scripts"

TASK=$2
EFFECTIVE_BATCH_SIZE=$3
LR=$4
NUM_EPOCH=$5
base_dir=`pwd`
JOBNAME=$6
CHECKPOINT_PATH=$7
OUTPUT_DIR="${SCRIPT_DIR}/outputs/${model_name}/${JOBNAME}_bsz${EFFECTIVE_BATCH_SIZE}_lr${LR}_epoch${NUM_EPOCH}"

GLUE_DIR="/data/GlueData"

# Split the effective batch across GPUs. When the per-GPU share is at least
# MAX_GPU_BATCH_SIZE, the gradient accumulation step count is scaled up by
# the ratio (integer division); otherwise a single step is used.
MAX_GPU_BATCH_SIZE=32
PER_GPU_BATCH_SIZE=$((EFFECTIVE_BATCH_SIZE/NGPU))
if [[ $PER_GPU_BATCH_SIZE -lt 1 ]]; then
    # Integer division yields 0 when NGPU exceeds the effective batch size;
    # fail early instead of launching with --train_batch_size 0.
    echo "EFFECTIVE_BATCH_SIZE (${EFFECTIVE_BATCH_SIZE}) must be >= NGPU (${NGPU})" >&2
    exit 1
fi
if [[ $PER_GPU_BATCH_SIZE -lt $MAX_GPU_BATCH_SIZE ]]; then
    GRAD_ACCUM_STEPS=1
else
    GRAD_ACCUM_STEPS=$((PER_GPU_BATCH_SIZE/MAX_GPU_BATCH_SIZE))
fi

echo "Fine Tuning $CHECKPOINT_PATH"
# The command is assembled as a string so it can be echoed for the record
# before being evaluated; the trailing "&>" redirect is part of the string
# and only takes effect inside eval.
# NOTE: the "bzs" spelling in the log file name is kept as-is so the log
# file names stay the same as before.
run_cmd="python3.6 -m torch.distributed.launch \
    --nproc_per_node=${NGPU} \
    --master_port=12346 \
    run_glue_classifier_bert_large.py \
    --task_name $TASK \
    --do_train \
    --do_eval \
    --deepspeed \
    --deepspeed_transformer_kernel \
    --fp16 \
    --preln \
    --deepspeed_config ${base_dir}/glue_bert_large.json \
    --do_lower_case \
    --data_dir $GLUE_DIR/$TASK/ \
    --bert_model bert-large-uncased \
    --max_seq_length 128 \
    --train_batch_size ${PER_GPU_BATCH_SIZE} \
    --gradient_accumulation_steps ${GRAD_ACCUM_STEPS} \
    --learning_rate ${LR} \
    --num_train_epochs ${NUM_EPOCH} \
    --output_dir ${OUTPUT_DIR}_${TASK} \
    --model_file $CHECKPOINT_PATH &> $LOG_DIR/${model_name}/${JOBNAME}_${TASK}_bzs${EFFECTIVE_BATCH_SIZE}_lr${LR}_epoch${NUM_EPOCH}_${NGPU}_deepspeed-kernel.txt
"
echo ${run_cmd}
eval ${run_cmd}
57 changes: 32 additions & 25 deletions bing_bert/run_glue_classifier_bert_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import random
import sys

import deepspeed
import numpy as np
import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
Expand Down Expand Up @@ -690,7 +691,15 @@ def main():
default=False,
action='store_true',
help="Whether to enable progressive layer dropping or not")

parser.add_argument(
'--preln',
action='store_true',
default=False,
help=
"Switching to the variant of Transformer blocks that use pre-LayerNorm."
)

parser = deepspeed.add_config_arguments(parser)
args = parser.parse_args()

if args.server_ip and args.server_port:
Expand Down Expand Up @@ -809,8 +818,10 @@ def main():
if args.progressive_layer_drop:
print("BertBaseConfigPreLnLayerDrop")
from nvidia.modelingpreln_layerdrop import BertForSequenceClassification, BertConfig
elif args.preln:
from nvidia.modelingpreln import BertForSequenceClassification, BertConfig, BertLayer
else:
from nvidia.modelingpreln import BertForSequenceClassification, BertConfig
from nvidia.modeling import BertForSequenceClassification, BertConfig, BertLayer

bert_config = BertConfig(**bert_base_model_config)
bert_config.vocab_size = len(tokenizer.vocab)
Expand Down Expand Up @@ -859,6 +870,19 @@ def main():
elif n_gpu > 1:
model = torch.nn.DataParallel(model)

# Patch model with deepspeed transformer kernel
if not args.deepspeed_transformer_kernel:
from deepspeed import replace_transformer_layer
model = deepspeed.module_inject.replace_transformer_layer(
orig_layer_impl=BertLayer,
model=model,
micro_batch_size=args.train_batch_size,
bert_config=bert_config,
seed=args.seed,
preln=True,
fp16=args.fp16,
huggingface=False)

# Prepare optimizer
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
Expand All @@ -871,29 +895,12 @@ def main():
{'params': [p for n, p in param_optimizer if any(
nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
if args.fp16:
try:
from apex.optimizers import FP16_Optimizer
from apex.optimizers import FusedAdam
except ImportError:
raise ImportError(
"Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

optimizer = FusedAdam(optimizer_grouped_parameters,
lr=args.learning_rate,
bias_correction=False,
max_grad_norm=1.0)
if args.loss_scale == 0:
optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
else:
optimizer = FP16_Optimizer(
optimizer, static_loss_scale=args.loss_scale)

else:
optimizer = BertAdam(optimizer_grouped_parameters,
lr=args.learning_rate,
warmup=args.warmup_proportion,
t_total=num_train_optimization_steps)

model, optimizer, _, _ = deepspeed.initialize(
args=args,
model=model,
model_parameters=optimizer_grouped_parameters,
dist_init_required=True)

global_step = 0
nb_tr_steps = 0
Expand Down
Loading