Skip to content

Commit 400cd1b

Browse files
Add deepspeed for the glue finetuning tasks & bert-large scripts (deepspeedai#77)
* Add deepspeed for the glue finetuning tasks & add bert-large scripts and configs
* change preln argument description
1 parent 2d4282b commit 400cd1b

6 files changed

Lines changed: 1411 additions & 25 deletions

bing_bert/glue_bert_base.json

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
{
2+
"train_batch_size": 32,
3+
"train_micro_batch_size_per_gpu": 32,
4+
"steps_per_print": 10,
5+
"optimizer": {
6+
"type": "Adam",
7+
"params": {
8+
"lr": 3e-5,
9+
"weight_decay": 0.0,
10+
"bias_correction": false
11+
}
12+
},
13+
"gradient_clipping": 1.0,
14+
"fp16": {
15+
"enabled": true
16+
}
17+
18+
}
19+

bing_bert/glue_bert_large.json

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
{
2+
"train_batch_size": 32,
3+
"train_micro_batch_size_per_gpu": 4,
4+
"steps_per_print": 10,
5+
"optimizer": {
6+
"type": "Adam",
7+
"params": {
8+
"lr": 3e-5,
9+
"weight_decay": 0.0,
10+
"bias_correction": false
11+
}
12+
},
13+
"gradient_clipping": 1.0,
14+
"fp16": {
15+
"enabled": true
16+
}
17+
18+
}
19+

bing_bert/run_glue_bert_base_finetune.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,8 @@ run_cmd="python3.6 -m torch.distributed.launch \
3737
--task_name $TASK \
3838
--do_train \
3939
--do_eval \
40+
--deepspeed \
41+
--deepspeed_config ${base_dir}/glue_bert_base.json \
4042
--do_lower_case \
4143
--data_dir $GLUE_DIR/$TASK/ \
4244
--bert_model bert-large-uncased \
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
# Fine-tune a BERT-large checkpoint on one GLUE task with DeepSpeed and the
# DeepSpeed transformer kernel.
#
# Positional arguments:
#   $1  NGPU                  value passed to --nproc_per_node
#   $2  TASK                  GLUE task name; also the sub-directory of GLUE_DIR
#   $3  EFFECTIVE_BATCH_SIZE  batch size summed over all GPUs
#   $4  LR                    learning rate
#   $5  NUM_EPOCH             number of training epochs
#   $6  JOBNAME               label used in the output directory and log name
#   $7  CHECKPOINT_PATH       checkpoint passed to --model_file

LOG_DIR="log"
model_name="bert_large"

# The training command redirects its output to a file under
# $LOG_DIR/$model_name, so that sub-directory (not just $LOG_DIR) has to
# exist before the command is evaluated; otherwise the redirect fails and
# the training never starts.
mkdir -p "$LOG_DIR/${model_name}"

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"

NGPU=$1

echo "Started scripts"

TASK=$2
EFFECTIVE_BATCH_SIZE=$3
LR=$4
NUM_EPOCH=$5
# The DeepSpeed config is looked up relative to the directory the script is
# invoked from, not relative to the script's own location.
base_dir=$(pwd)
JOBNAME=$6
CHECKPOINT_PATH=$7
OUTPUT_DIR="${SCRIPT_DIR}/outputs/${model_name}/${JOBNAME}_bsz${EFFECTIVE_BATCH_SIZE}_lr${LR}_epoch${NUM_EPOCH}"

GLUE_DIR="/data/GlueData"

# Split the effective batch evenly over the GPUs (integer division). When the
# per-GPU share reaches MAX_GPU_BATCH_SIZE, accumulate gradients over
# PER_GPU_BATCH_SIZE / MAX_GPU_BATCH_SIZE steps; otherwise use a single step.
MAX_GPU_BATCH_SIZE=32
PER_GPU_BATCH_SIZE=$((EFFECTIVE_BATCH_SIZE/NGPU))
if [[ $PER_GPU_BATCH_SIZE -lt $MAX_GPU_BATCH_SIZE ]]; then
    GRAD_ACCUM_STEPS=1
else
    GRAD_ACCUM_STEPS=$((PER_GPU_BATCH_SIZE/MAX_GPU_BATCH_SIZE))
fi

echo "Fine Tuning $CHECKPOINT_PATH"
# NOTE: the log file name spells the batch-size tag "bzs" (OUTPUT_DIR uses
# "bsz"); it is kept as-is so existing log paths do not change.
run_cmd="python3.6 -m torch.distributed.launch \
--nproc_per_node=${NGPU} \
--master_port=12346 \
run_glue_classifier_bert_large.py \
--task_name $TASK \
--do_train \
--do_eval \
--deepspeed \
--deepspeed_transformer_kernel \
--fp16 \
--preln \
--deepspeed_config ${base_dir}/glue_bert_large.json \
--do_lower_case \
--data_dir $GLUE_DIR/$TASK/ \
--bert_model bert-large-uncased \
--max_seq_length 128 \
--train_batch_size ${PER_GPU_BATCH_SIZE} \
--gradient_accumulation_steps ${GRAD_ACCUM_STEPS} \
--learning_rate ${LR} \
--num_train_epochs ${NUM_EPOCH} \
--output_dir ${OUTPUT_DIR}_${TASK} \
--model_file $CHECKPOINT_PATH &> $LOG_DIR/${model_name}/${JOBNAME}_${TASK}_bzs${EFFECTIVE_BATCH_SIZE}_lr${LR}_epoch${NUM_EPOCH}_${NGPU}_deepspeed-kernel.txt
"
echo ${run_cmd}
eval ${run_cmd}

bing_bert/run_glue_classifier_bert_base.py

Lines changed: 32 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
import random
2525
import sys
2626

27+
import deepspeed
2728
import numpy as np
2829
import torch
2930
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
@@ -690,7 +691,15 @@ def main():
690691
default=False,
691692
action='store_true',
692693
help="Whether to enable progressive layer dropping or not")
693-
694+
parser.add_argument(
695+
'--preln',
696+
action='store_true',
697+
default=False,
698+
help=
699+
"Switching to the variant of Transformer blocks that use pre-LayerNorm."
700+
)
701+
702+
parser = deepspeed.add_config_arguments(parser)
694703
args = parser.parse_args()
695704

696705
if args.server_ip and args.server_port:
@@ -809,8 +818,10 @@ def main():
809818
if args.progressive_layer_drop:
810819
print("BertBaseConfigPreLnLayerDrop")
811820
from nvidia.modelingpreln_layerdrop import BertForSequenceClassification, BertConfig
821+
elif args.preln:
822+
from nvidia.modelingpreln import BertForSequenceClassification, BertConfig, BertLayer
812823
else:
813-
from nvidia.modelingpreln import BertForSequenceClassification, BertConfig
824+
from nvidia.modeling import BertForSequenceClassification, BertConfig, BertLayer
814825

815826
bert_config = BertConfig(**bert_base_model_config)
816827
bert_config.vocab_size = len(tokenizer.vocab)
@@ -859,6 +870,19 @@ def main():
859870
elif n_gpu > 1:
860871
model = torch.nn.DataParallel(model)
861872

873+
# Patch model with deepspeed transformer kernel
874+
if not args.deepspeed_transformer_kernel:
875+
from deepspeed import replace_transformer_layer
876+
model = deepspeed.module_inject.replace_transformer_layer(
877+
orig_layer_impl=BertLayer,
878+
model=model,
879+
micro_batch_size=args.train_batch_size,
880+
bert_config=bert_config,
881+
seed=args.seed,
882+
preln=True,
883+
fp16=args.fp16,
884+
huggingface=False)
885+
862886
# Prepare optimizer
863887
param_optimizer = list(model.named_parameters())
864888
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
@@ -871,29 +895,12 @@ def main():
871895
{'params': [p for n, p in param_optimizer if any(
872896
nd in n for nd in no_decay)], 'weight_decay': 0.0}
873897
]
874-
if args.fp16:
875-
try:
876-
from apex.optimizers import FP16_Optimizer
877-
from apex.optimizers import FusedAdam
878-
except ImportError:
879-
raise ImportError(
880-
"Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
881-
882-
optimizer = FusedAdam(optimizer_grouped_parameters,
883-
lr=args.learning_rate,
884-
bias_correction=False,
885-
max_grad_norm=1.0)
886-
if args.loss_scale == 0:
887-
optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
888-
else:
889-
optimizer = FP16_Optimizer(
890-
optimizer, static_loss_scale=args.loss_scale)
891-
892-
else:
893-
optimizer = BertAdam(optimizer_grouped_parameters,
894-
lr=args.learning_rate,
895-
warmup=args.warmup_proportion,
896-
t_total=num_train_optimization_steps)
898+
899+
model, optimizer, _, _ = deepspeed.initialize(
900+
args=args,
901+
model=model,
902+
model_parameters=optimizer_grouped_parameters,
903+
dist_init_required=True)
897904

898905
global_step = 0
899906
nb_tr_steps = 0

0 commit comments

Comments
 (0)