diff --git a/bing_bert/glue_bert_base.json b/bing_bert/glue_bert_base.json new file mode 100755 index 000000000..16c73092e --- /dev/null +++ b/bing_bert/glue_bert_base.json @@ -0,0 +1,19 @@ +{ + "train_batch_size": 32, + "train_micro_batch_size_per_gpu": 32, + "steps_per_print": 10, + "optimizer": { + "type": "Adam", + "params": { + "lr": 3e-5, + "weight_decay": 0.0, + "bias_correction": false + } + }, + "gradient_clipping": 1.0, + "fp16": { + "enabled": true + } + + } + \ No newline at end of file diff --git a/bing_bert/glue_bert_large.json b/bing_bert/glue_bert_large.json new file mode 100755 index 000000000..abf4b391d --- /dev/null +++ b/bing_bert/glue_bert_large.json @@ -0,0 +1,19 @@ +{ + "train_batch_size": 32, + "train_micro_batch_size_per_gpu": 4, + "steps_per_print": 10, + "optimizer": { + "type": "Adam", + "params": { + "lr": 3e-5, + "weight_decay": 0.0, + "bias_correction": false + } + }, + "gradient_clipping": 1.0, + "fp16": { + "enabled": true + } + + } + \ No newline at end of file diff --git a/bing_bert/run_glue_bert_base_finetune.sh b/bing_bert/run_glue_bert_base_finetune.sh index e62ff8773..7a1c12cfe 100755 --- a/bing_bert/run_glue_bert_base_finetune.sh +++ b/bing_bert/run_glue_bert_base_finetune.sh @@ -37,6 +37,8 @@ run_cmd="python3.6 -m torch.distributed.launch \ --task_name $TASK \ --do_train \ --do_eval \ + --deepspeed \ + --deepspeed_config ${base_dir}/glue_bert_base.json \ --do_lower_case \ --data_dir $GLUE_DIR/$TASK/ \ --bert_model bert-large-uncased \ diff --git a/bing_bert/run_glue_bert_large_finetune.sh b/bing_bert/run_glue_bert_large_finetune.sh new file mode 100755 index 000000000..4a63adb57 --- /dev/null +++ b/bing_bert/run_glue_bert_large_finetune.sh @@ -0,0 +1,57 @@ +LOG_DIR="log" +if [ ! -d "$LOG_DIR" ]; then + mkdir $LOG_DIR +fi + +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" + +NGPU=$1 + +echo "Started scripts" + +TASK=$2 +EFFECTIVE_BATCH_SIZE=$3 +LR=$4 +NUM_EPOCH=$5 +base_dir=`pwd` +model_name="bert_large" +JOBNAME=$6 +CHECKPOINT_PATH=$7 +OUTPUT_DIR="${SCRIPT_DIR}/outputs/${model_name}/${JOBNAME}_bsz${EFFECTIVE_BATCH_SIZE}_lr${LR}_epoch${NUM_EPOCH}" + +GLUE_DIR="/data/GlueData" + +MAX_GPU_BATCH_SIZE=32 +PER_GPU_BATCH_SIZE=$((EFFECTIVE_BATCH_SIZE/NGPU)) +if [[ $PER_GPU_BATCH_SIZE -lt $MAX_GPU_BATCH_SIZE ]]; then + GRAD_ACCUM_STEPS=1 +else + GRAD_ACCUM_STEPS=$((PER_GPU_BATCH_SIZE/MAX_GPU_BATCH_SIZE)) +fi + +echo "Fine Tuning $CHECKPOINT_PATH" +run_cmd="python3.6 -m torch.distributed.launch \ + --nproc_per_node=${NGPU} \ + --master_port=12346 \ + run_glue_classifier_bert_large.py \ + --task_name $TASK \ + --do_train \ + --do_eval \ + --deepspeed \ + --deepspeed_transformer_kernel \ + --fp16 \ + --preln \ + --deepspeed_config ${base_dir}/glue_bert_large.json \ + --do_lower_case \ + --data_dir $GLUE_DIR/$TASK/ \ + --bert_model bert-large-uncased \ + --max_seq_length 128 \ + --train_batch_size ${PER_GPU_BATCH_SIZE} \ + --gradient_accumulation_steps ${GRAD_ACCUM_STEPS} \ + --learning_rate ${LR} \ + --num_train_epochs ${NUM_EPOCH} \ + --output_dir ${OUTPUT_DIR}_${TASK} \ + --model_file $CHECKPOINT_PATH &> $LOG_DIR/${model_name}/${JOBNAME}_${TASK}_bzs${EFFECTIVE_BATCH_SIZE}_lr${LR}_epoch${NUM_EPOCH}_${NGPU}_deepspeed-kernel.txt + " +echo ${run_cmd} +eval ${run_cmd} diff --git a/bing_bert/run_glue_classifier_bert_base.py b/bing_bert/run_glue_classifier_bert_base.py index 36561f1b8..9f46c8a6d 100755 --- a/bing_bert/run_glue_classifier_bert_base.py +++ b/bing_bert/run_glue_classifier_bert_base.py @@ -24,6 +24,7 @@ import random import sys +import deepspeed import numpy as np import torch from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, @@ -690,7 +691,15 @@ def main(): default=False, action='store_true', help="Whether to enable progressive layer dropping or not") - + parser.add_argument( + '--preln', + action='store_true', + default=False, + help= + "Switching to the variant of Transformer blocks that use pre-LayerNorm." + ) + + parser = deepspeed.add_config_arguments(parser) args = parser.parse_args() if args.server_ip and args.server_port: @@ -809,8 +818,10 @@ def main(): if args.progressive_layer_drop: print("BertBaseConfigPreLnLayerDrop") from nvidia.modelingpreln_layerdrop import BertForSequenceClassification, BertConfig + elif args.preln: + from nvidia.modelingpreln import BertForSequenceClassification, BertConfig, BertLayer else: - from nvidia.modelingpreln import BertForSequenceClassification, BertConfig + from nvidia.modeling import BertForSequenceClassification, BertConfig, BertLayer bert_config = BertConfig(**bert_base_model_config) bert_config.vocab_size = len(tokenizer.vocab) @@ -859,6 +870,19 @@ def main(): elif n_gpu > 1: model = torch.nn.DataParallel(model) + # Patch model with deepspeed transformer kernel + if not args.deepspeed_transformer_kernel: + from deepspeed import replace_transformer_layer + model = deepspeed.module_inject.replace_transformer_layer( + orig_layer_impl=BertLayer, + model=model, + micro_batch_size=args.train_batch_size, + bert_config=bert_config, + seed=args.seed, + preln=True, + fp16=args.fp16, + huggingface=False) + # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] @@ -871,29 +895,12 @@ def main(): {'params': [p for n, p in param_optimizer if any( nd in n for nd in no_decay)], 'weight_decay': 0.0} ] - if args.fp16: - try: - from apex.optimizers import FP16_Optimizer - from apex.optimizers import FusedAdam - except ImportError: - raise ImportError( - "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") - - optimizer = FusedAdam(optimizer_grouped_parameters, - lr=args.learning_rate, - bias_correction=False, - max_grad_norm=1.0) - if args.loss_scale == 0: - optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) - else: - optimizer = FP16_Optimizer( - optimizer, static_loss_scale=args.loss_scale) - - else: - optimizer = BertAdam(optimizer_grouped_parameters, - lr=args.learning_rate, - warmup=args.warmup_proportion, - t_total=num_train_optimization_steps) + + model, optimizer, _, _ = deepspeed.initialize( + args=args, + model=model, + model_parameters=optimizer_grouped_parameters, + dist_init_required=True) global_step = 0 nb_tr_steps = 0 diff --git a/bing_bert/run_glue_classifier_bert_large.py b/bing_bert/run_glue_classifier_bert_large.py new file mode 100755 index 000000000..7d2352d61 --- /dev/null +++ b/bing_bert/run_glue_classifier_bert_large.py @@ -0,0 +1,1282 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""BERT finetuning runner.""" + +from __future__ import absolute_import, division, print_function + +import argparse +import csv +import logging +import os +import random +import sys +import deepspeed +import numpy as np +import torch +from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, + TensorDataset) +from torch.utils.data.distributed import DistributedSampler +from tqdm import tqdm, trange + +from torch.nn import CrossEntropyLoss, MSELoss +from scipy.stats import pearsonr, spearmanr +from sklearn.metrics import matthews_corrcoef, f1_score + +from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE +from pytorch_pretrained_bert.modeling import WEIGHTS_NAME, CONFIG_NAME +from pytorch_pretrained_bert.tokenization import BertTokenizer +from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear +from turing.loss import FocalLoss + +logging.basicConfig( + format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', + datefmt='%m/%d/%Y %H:%M:%S', + level=logging.INFO) +logger = logging.getLogger(__name__) + +def checkpoint_model(PATH, ckpt_id, model, epoch, last_global_step, + last_global_data_samples, **kwargs): + """Utility function for checkpointing model + optimizer dictionaries + The main purpose for this is to be able to resume training from that instant again + """ + checkpoint_state_dict = { + 'epoch': epoch, + 'last_global_step': last_global_step, + 'last_global_data_samples': last_global_data_samples + } + # Add extra kwargs too + checkpoint_state_dict.update(kwargs) + + #success = model.network.save_checkpoint(PATH, ckpt_id, + success = model.save_checkpoint(PATH, ckpt_id, + checkpoint_state_dict) + status_msg = 'checkpointing: PATH={}, ckpt_id={}'.format(PATH, ckpt_id) + if success: + logging.info(f"Success {status_msg}") + else: + logging.warning(f"Failure {status_msg}") + return + +def load_checkpoint(model, PATH, ckpt_id): + """Utility function for checkpointing model + optimizer dictionaries + The main purpose for this is to be able to resume training from that instant again + """ + model.load_checkpoint(PATH, ckpt_id) + return + +class InputExample(object): + """A single training/test example for simple sequence classification.""" + def __init__(self, guid, text_a, text_b=None, label=None): + """Constructs a InputExample. + + Args: + guid: Unique id for the example. + text_a: string. The untokenized text of the first sequence. For single + sequence tasks, only this sequence must be specified. + text_b: (Optional) string. The untokenized text of the second sequence. + Only must be specified for sequence pair tasks. + label: (Optional) string. The label of the example. This should be + specified for train and dev examples, but not for test examples. + """ + self.guid = guid + self.text_a = text_a + self.text_b = text_b + self.label = label + + +class InputFeatures(object): + """A single set of features of data.""" + def __init__(self, input_ids, input_mask, segment_ids, label_id): + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.label_id = label_id + + +class DataProcessor(object): + """Base class for data converters for sequence classification data sets.""" + def get_train_examples(self, data_dir): + """Gets a collection of `InputExample`s for the train set.""" + raise NotImplementedError() + + def get_dev_examples(self, data_dir): + """Gets a collection of `InputExample`s for the dev set.""" + raise NotImplementedError() + + def get_labels(self): + """Gets the list of labels for this data set.""" + raise NotImplementedError() + + @classmethod + def _read_tsv(cls, input_file, quotechar=None): + """Reads a tab separated value file.""" + with open(input_file, "r", encoding='utf-8') as f: + reader = csv.reader(f, delimiter="\t", quotechar=quotechar) + lines = [] + for line in reader: + if sys.version_info[0] == 2: + line = list(unicode(cell, 'utf-8') for cell in line) + lines.append(line) + return lines + + +class MrpcProcessor(DataProcessor): + """Processor for the MRPC data set (GLUE version).""" + def get_train_examples(self, data_dir): + """See base class.""" + logger.info("LOOKING AT {}".format(os.path.join(data_dir, + "train.tsv"))) + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + text_a = line[3] + text_b = line[4] + label = line[0] + examples.append( + InputExample(guid=guid, + text_a=text_a, + text_b=text_b, + label=label)) + return examples + + +class MnliProcessor(DataProcessor): + """Processor for the MultiNLI data set (GLUE version).""" + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), + "dev_matched") + + def get_labels(self): + """See base class.""" + return ["contradiction", "entailment", "neutral"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + text_a = line[8] + text_b = line[9] + label = line[-1] + examples.append( + InputExample(guid=guid, + text_a=text_a, + text_b=text_b, + label=label)) + return examples + + +class MnliMismatchedProcessor(MnliProcessor): + """Processor for the MultiNLI Mismatched data set (GLUE version).""" + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev_mismatched.tsv")), + "dev_matched") + + +class ColaProcessor(DataProcessor): + """Processor for the CoLA data set (GLUE version).""" + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + guid = "%s-%s" % (set_type, i) + text_a = line[3] + label = line[1] + examples.append( + InputExample(guid=guid, + text_a=text_a, + text_b=None, + label=label)) + return examples + + +class Sst2Processor(DataProcessor): + """Processor for the SST-2 data set (GLUE version).""" + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + text_a = line[0] + label = line[1] + examples.append( + InputExample(guid=guid, + text_a=text_a, + text_b=None, + label=label)) + return examples + + +class StsbProcessor(DataProcessor): + """Processor for the STS-B data set (GLUE version).""" + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return [None] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + text_a = line[7] + text_b = line[8] + label = line[-1] + examples.append( + InputExample(guid=guid, + text_a=text_a, + text_b=text_b, + label=label)) + return examples + + +class QqpProcessor(DataProcessor): + """Processor for the STS-B data set (GLUE version).""" + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + try: + text_a = line[3] + text_b = line[4] + label = line[5] + except IndexError: + continue + examples.append( + InputExample(guid=guid, + text_a=text_a, + text_b=text_b, + label=label)) + return examples + + +class QnliProcessor(DataProcessor): + """Processor for the STS-B data set (GLUE version).""" + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev_matched") + + def get_labels(self): + """See base class.""" + return ["entailment", "not_entailment"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + text_a = line[1] + text_b = line[2] + label = line[-1] + examples.append( + InputExample(guid=guid, + text_a=text_a, + text_b=text_b, + label=label)) + return examples + + +class RteProcessor(DataProcessor): + """Processor for the RTE data set (GLUE version).""" + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["entailment", "not_entailment"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + text_a = line[1] + text_b = line[2] + label = line[-1] + examples.append( + InputExample(guid=guid, + text_a=text_a, + text_b=text_b, + label=label)) + return examples + + +class WnliProcessor(DataProcessor): + """Processor for the WNLI data set (GLUE version).""" + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + text_a = line[1] + text_b = line[2] + label = line[-1] + examples.append( + InputExample(guid=guid, + text_a=text_a, + text_b=text_b, + label=label)) + return examples + + +def convert_examples_to_features(examples, label_list, max_seq_length, + tokenizer, output_mode): + """Loads a data file into a list of `InputBatch`s.""" + + label_map = {label: i for i, label in enumerate(label_list)} + + features = [] + for (ex_index, example) in enumerate(examples): + if ex_index % 10000 == 0: + logger.info("Writing example %d of %d" % (ex_index, len(examples))) + + tokens_a = tokenizer.tokenize(example.text_a) + + tokens_b = None + if example.text_b: + tokens_b = tokenizer.tokenize(example.text_b) + # Modifies `tokens_a` and `tokens_b` in place so that the total + # length is less than the specified length. + # Account for [CLS], [SEP], [SEP] with "- 3" + _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) + else: + # Account for [CLS] and [SEP] with "- 2" + if len(tokens_a) > max_seq_length - 2: + tokens_a = tokens_a[:(max_seq_length - 2)] + + # The convention in BERT is: + # (a) For sequence pairs: + # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] + # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 + # (b) For single sequences: + # tokens: [CLS] the dog is hairy . [SEP] + # type_ids: 0 0 0 0 0 0 0 + # + # Where "type_ids" are used to indicate whether this is the first + # sequence or the second sequence. The embedding vectors for `type=0` and + # `type=1` were learned during pre-training and are added to the wordpiece + # embedding vector (and position vector). This is not *strictly* necessary + # since the [SEP] token unambiguously separates the sequences, but it makes + # it easier for the model to learn the concept of sequences. + # + # For classification tasks, the first vector (corresponding to [CLS]) is + # used as as the "sentence vector". Note that this only makes sense because + # the entire model is fine-tuned. + tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + segment_ids = [0] * len(tokens) + + if tokens_b: + tokens += tokens_b + ["[SEP]"] + segment_ids += [1] * (len(tokens_b) + 1) + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1] * len(input_ids) + + # Zero-pad up to the sequence length. + padding = [0] * (max_seq_length - len(input_ids)) + input_ids += padding + input_mask += padding + segment_ids += padding + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + if output_mode == "classification": + label_id = label_map[example.label] + elif output_mode == "regression": + label_id = float(example.label) + else: + raise KeyError(output_mode) + + if ex_index < 5: + logger.info("*** Example ***") + logger.info("guid: %s" % (example.guid)) + logger.info("tokens: %s" % " ".join([str(x) for x in tokens])) + logger.info("input_ids: %s" % " ".join([str(x) + for x in input_ids])) + logger.info("input_mask: %s" % + " ".join([str(x) for x in input_mask])) + logger.info("segment_ids: %s" % + " ".join([str(x) for x in segment_ids])) + logger.info("label: %s (id = %d)" % (example.label, label_id)) + + features.append( + InputFeatures(input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + label_id=label_id)) + return features + + +def _truncate_seq_pair(tokens_a, tokens_b, max_length): + """Truncates a sequence pair in place to the maximum length.""" + + # This is a simple heuristic which will always truncate the longer sequence + # one token at a time. This makes more sense than truncating an equal percent + # of tokens from each, since if one sequence is very short then each token + # that's truncated likely contains more information than a longer sequence. + while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_length: + break + if len(tokens_a) > len(tokens_b): + tokens_a.pop() + else: + tokens_b.pop() + + +def simple_accuracy(preds, labels): + return (preds == labels).mean() + + +def acc_and_f1(preds, labels): + acc = simple_accuracy(preds, labels) + f1 = f1_score(y_true=labels, y_pred=preds) + return { + "acc": acc, + "f1": f1, + "acc_and_f1": (acc + f1) / 2, + } + + +def pearson_and_spearman(preds, labels): + pearson_corr = pearsonr(preds, labels)[0] + spearman_corr = spearmanr(preds, labels)[0] + return { + "pearson": pearson_corr, + "spearmanr": spearman_corr, + "corr": (pearson_corr + spearman_corr) / 2, + } + + +def compute_metrics(task_name, preds, labels): + assert len(preds) == len(labels) + if task_name == "cola": + return {"mcc": matthews_corrcoef(labels, preds)} + elif task_name == "sst-2": + return {"acc": simple_accuracy(preds, labels)} + elif task_name == "mrpc": + return acc_and_f1(preds, labels) + elif task_name == "sts-b": + return pearson_and_spearman(preds, labels) + elif task_name == "qqp": + return acc_and_f1(preds, labels) + elif task_name == "mnli": + return {"acc": simple_accuracy(preds, labels)} + elif task_name == "mnli-mm": + return {"acc": simple_accuracy(preds, labels)} + elif task_name == "qnli": + return {"acc": simple_accuracy(preds, labels)} + elif task_name == "rte": + return {"acc": simple_accuracy(preds, labels)} + elif task_name == "wnli": + return {"acc": simple_accuracy(preds, labels)} + else: + raise KeyError(task_name) + + +def main(): + parser = argparse.ArgumentParser() + + # Required parameters + parser.add_argument( + "--data_dir", + default=None, + type=str, + required=True, + help= + "The input data dir. Should contain the .tsv files (or other data files) for the task." + ) + parser.add_argument( + "--bert_model", + default=None, + type=str, + required=True, + help="Bert pre-trained model selected in the list: bert-base-uncased, " + "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " + "bert-base-multilingual-cased, bert-base-chinese.") + parser.add_argument("--task_name", + default=None, + type=str, + required=True, + help="The name of the task to train.") + parser.add_argument( + "--output_dir", + default=None, + type=str, + required=True, + help= + "The output directory where the model predictions and checkpoints will be written." + ) + + # Other parameters + parser.add_argument( + "--cache_dir", + default="", + type=str, + help= + "Where do you want to store the pre-trained models downloaded from s3") + parser.add_argument( + "--max_seq_length", + default=128, + type=int, + help= + "The maximum total input sequence length after WordPiece tokenization. \n" + "Sequences longer than this will be truncated, and sequences shorter \n" + "than this will be padded.") + parser.add_argument("--do_train", + action='store_true', + help="Whether to run training.") + parser.add_argument("--do_eval", + action='store_true', + help="Whether to run eval on the dev set.") + parser.add_argument( + "--do_lower_case", + action='store_true', + help="Set this flag if you are using an uncased model.") + parser.add_argument("--train_batch_size", + default=32, + type=int, + help="Total batch size for training.") + parser.add_argument("--eval_batch_size", + default=8, + type=int, + help="Total batch size for eval.") + parser.add_argument("--learning_rate", + default=5e-5, + type=float, + help="The initial learning rate for Adam.") + parser.add_argument("--num_train_epochs", + default=3.0, + type=float, + help="Total number of training epochs to perform.") + parser.add_argument( + "--warmup_proportion", + default=0.1, + type=float, + help= + "Proportion of training to perform linear learning rate warmup for. " + "E.g., 0.1 = 10%% of training.") + parser.add_argument("--no_cuda", + action='store_true', + help="Whether not to use CUDA when available") + parser.add_argument("--local_rank", + type=int, + default=-1, + help="local_rank for distributed training on gpus") + parser.add_argument('--seed', + type=int, + default=42, + help="random seed for initialization") + parser.add_argument( + '--gradient_accumulation_steps', + type=int, + default=1, + help= + "Number of updates steps to accumulate before performing a backward/update pass." + ) + parser.add_argument( + '--fp16', + action='store_true', + help="Whether to use 16-bit float precision instead of 32-bit") + #parser.add_argument( + # '--deepscale', + # default=False, + # action='store_true', + # help="Whether to use 16-bit float precision instead of 32-bit") + parser.add_argument( + '--loss_scale', + type=float, + default=0, + help= + "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" + "0 (default value): dynamic loss scaling.\n" + "Positive power of 2: static loss scaling value.\n") + parser.add_argument('--server_ip', + type=str, + default='', + help="Can be used for distant debugging.") + parser.add_argument('--server_port', + type=str, + default='', + help="Can be used for distant debugging.") + parser.add_argument("--model_file", + type=str, + default="0", + help="Path to the Pretrained BERT Encoder File.") + parser.add_argument('--random', + default=False, + action='store_true', + help="Whether to fientune for random initialization") + parser.add_argument('--focal', + default=False, + action='store_true', + help="Whether to use Focal Loss for finetuning.") + parser.add_argument('--gamma', + type=float, + default=0.5, + help="Gamma parameter to be used in focal loss.") + parser.add_argument('--deepspeed_sparse_attention', + default=False, + action='store_true', + help='Use DeepSpeed sparse self attention.') + parser.add_argument( + '--preln', + action='store_true', + default=False, + help= + "Switching to the variant of Transformer blocks that use pre-LayerNorm." + ) + parser.add_argument('--deepspeed_transformer_kernel', + default=False, + action='store_true', + help='Use DeepSpeed transformer kernel to accelerate.') + parser.add_argument( + '--progressive_layer_drop', + default=False, + action='store_true', + help="Whether to enable progressive layer dropping or not") + parser = deepspeed.add_config_arguments(parser) + + args = parser.parse_args() + + if args.server_ip and args.server_port: + # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script + import ptvsd + print("Waiting for debugger attach") + ptvsd.enable_attach(address=(args.server_ip, args.server_port), + redirect_output=True) + ptvsd.wait_for_attach() + + processors = { + "cola": ColaProcessor, + "mnli": MnliProcessor, + "mnli-mm": MnliMismatchedProcessor, + "mrpc": MrpcProcessor, + "sst-2": Sst2Processor, + "sts-b": StsbProcessor, + "qqp": QqpProcessor, + "qnli": QnliProcessor, + "rte": RteProcessor, + "wnli": WnliProcessor, + } + + output_modes = { + "cola": "classification", + "mnli": "classification", + "mrpc": "classification", + "sst-2": "classification", + "sts-b": "regression", + "qqp": "classification", + "qnli": "classification", + "rte": "classification", + "wnli": "classification", + } + + if args.local_rank == -1 or args.no_cuda: + device = torch.device("cuda" if torch.cuda.is_available() + and not args.no_cuda else "cpu") + n_gpu = torch.cuda.device_count() + else: + torch.cuda.set_device(args.local_rank) + device = torch.device("cuda", args.local_rank) + n_gpu = 1 + # Initializes the distributed backend which will take care of sychronizing nodes/GPUs + torch.distributed.init_process_group(backend='nccl') + logger.info( + "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". + format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) + + if args.gradient_accumulation_steps < 1: + raise ValueError( + "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" + .format(args.gradient_accumulation_steps)) + + args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps + + args.seed = random.randint(1, 1000) + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + if n_gpu > 0: + torch.cuda.manual_seed_all(args.seed) + + if not args.do_train and not args.do_eval: + raise ValueError( + "At least one of `do_train` or `do_eval` must be True.") + + if (torch.distributed.get_rank() == 0): + # if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train: + # raise ValueError( + # "Output directory ({}) already exists and is not empty.".format(args.output_dir)) + if not os.path.exists(args.output_dir): + os.makedirs(args.output_dir) + + torch.distributed.barrier() + + task_name = args.task_name.lower() + + if task_name not in processors: + raise ValueError("Task not found: %s" % (task_name)) + + processor = processors[task_name]() + output_mode = output_modes[task_name] + + label_list = processor.get_labels() + num_labels = len(label_list) + + tokenizer = BertTokenizer.from_pretrained(args.bert_model, + do_lower_case=args.do_lower_case) + + train_examples = None + num_train_optimization_steps = None + if args.do_train: + train_examples = processor.get_train_examples(args.data_dir) + num_train_optimization_steps = int( + len(train_examples) / args.train_batch_size / + args.gradient_accumulation_steps) * args.num_train_epochs + if args.local_rank != -1: + num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( + ) + + # Prepare model + cache_dir = args.cache_dir if args.cache_dir else os.path.join( + str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format( + args.local_rank)) + + bert_base_model_config = { + "vocab_size_or_config_json_file": 119547, + "hidden_size": 1024, + "num_hidden_layers": 24, + "num_attention_heads": 16, + "intermediate_size": 4096, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 512, + "type_vocab_size": 2, + "initializer_range": 0.02 + } + + if args.progressive_layer_drop: + print("BertBaseConfigPreLnLayerDrop") + from nvidia.modelingpreln_layerdrop import BertForSequenceClassification, BertConfig, BertLayer + elif args.preln: + from nvidia.modelingpreln import BertForSequenceClassification, BertConfig, BertLayer + else: + from nvidia.modeling import BertForSequenceClassification, BertConfig, BertLayer + + bert_config = BertConfig(**bert_base_model_config) + bert_config.vocab_size = len(tokenizer.vocab) + # Padding for divisibility by 8 + if bert_config.vocab_size % 8 != 0: + bert_config.vocab_size += 8 - (bert_config.vocab_size % 8) + + model = BertForSequenceClassification(args, + bert_config, + num_labels=num_labels) + + if args.model_file is not "0": + logger.info(f"Loading Pretrained Bert Encoder from: {args.model_file}") + # bert_state_dict = torch.load(args.model_file) + # model.bert.load_state_dict(bert_state_dict) + checkpoint_state_dict = torch.load(args.model_file, + map_location=torch.device("cpu")) + if 'module' in checkpoint_state_dict: + logger.info('Loading DeepSpeed v2.0 style checkpoint') + model.load_state_dict(checkpoint_state_dict['module'], + strict=False) + elif 'model_state_dict' in checkpoint_state_dict: + model.load_state_dict(checkpoint_state_dict['model_state_dict'], + strict=False) + else: + raise ValueError("Unable to find model state in checkpoint") + + logger.info(f"Pretrained Bert Encoder Loaded from: {args.model_file}") + + if args.random: + logger.info("USING RANDOM INITIALISATION FOR FINETUNING") + model.apply(model.init_bert_weights) + + if args.fp16: + model.half() + model.to(device) + if args.local_rank != -1: + try: + if args.deepscale: + print("Enabling DeepScale") + from deepscale.distributed_apex import DistributedDataParallel as DDP + else: + from apex.parallel import DistributedDataParallel as DDP + except ImportError: + raise ImportError( + "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." + ) + + model = DDP(model) + elif n_gpu > 1: + model = torch.nn.DataParallel(model) + + # Patch model with deepspeed transformer kernel + if not args.deepspeed_transformer_kernel: + from deepspeed import replace_transformer_layer + model = deepspeed.module_inject.replace_transformer_layer( + orig_layer_impl=BertLayer, + model=model, + micro_batch_size=args.train_batch_size, + bert_config=bert_config, + seed=args.seed, + preln=arg.preln, + fp16=args.fp16, + huggingface=False) + + # Prepare optimizer + param_optimizer = list(model.named_parameters()) + no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] + optimizer_grouped_parameters = [{ + 'params': + [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], + 'weight_decay': + 0.01 + }, { + 'params': + [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], + 'weight_decay': + 0.0 + }] + + model, optimizer, _, _ = deepspeed.initialize( + args=args, + model=model, + model_parameters=optimizer_grouped_parameters, + dist_init_required=True) + + global_step = 0 + nb_tr_steps = 0 + tr_loss = 0 + if args.do_train: + train_features = convert_examples_to_features(train_examples, + label_list, + args.max_seq_length, + tokenizer, output_mode) + logger.info("***** Running training *****") + logger.info(" Num examples = %d", len(train_examples)) + logger.info(" Batch size = %d", args.train_batch_size) + logger.info(" Num steps = %d", num_train_optimization_steps) + all_input_ids = torch.tensor([f.input_ids for f in train_features], + dtype=torch.long) + all_input_mask = torch.tensor([f.input_mask for f in train_features], + dtype=torch.long) + all_segment_ids = torch.tensor([f.segment_ids for f in train_features], + dtype=torch.long) + + if output_mode == "classification": + all_label_ids = torch.tensor([f.label_id for f in train_features], + dtype=torch.long) + elif output_mode == "regression": + if args.fp16: + all_label_ids = torch.tensor( + [f.label_id for f in train_features], dtype=torch.half) + else: + all_label_ids = torch.tensor( + [f.label_id for f in train_features], dtype=torch.float) + + train_data = TensorDataset(all_input_ids, all_input_mask, + all_segment_ids, all_label_ids) + if args.local_rank == -1: + train_sampler = RandomSampler(train_data) + else: + train_sampler = DistributedSampler(train_data) + train_dataloader = DataLoader(train_data, + sampler=train_sampler, + batch_size=args.train_batch_size) + + model.train() + nb_tr_examples = 0 + for _ in trange(int(args.num_train_epochs), desc="Epoch"): + tr_loss = 0 + nb_tr_examples, nb_tr_steps = 0, 0 + for step, batch in enumerate( + tqdm(train_dataloader, desc="Iteration")): + batch = tuple(t.to(device) for t in batch) + input_ids, input_mask, segment_ids, label_ids = batch + + # define a new function to compute loss values for both output_modes + logits = model(input_ids, segment_ids, input_mask, labels=None) + + if output_mode == "classification": + if args.focal: + loss_fct = FocalLoss(class_num=num_labels, + gamma=args.gamma) + else: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, num_labels), + label_ids.view(-1)) + elif output_mode == "regression": + loss_fct = MSELoss() + loss = loss_fct(logits.view(-1), label_ids.view(-1)) + + if n_gpu > 1: + loss = loss.mean() # mean() to average on multi-gpu. + if args.gradient_accumulation_steps > 1: + loss = loss / args.gradient_accumulation_steps + + if args.deepscale and args.local_rank != -1: + model.disable_need_reduction() + if (step + 1) % args.gradient_accumulation_steps == 0: + model.enable_need_reduction() + + if args.fp16: + optimizer.backward(loss) + else: + loss.backward() + + tr_loss += loss.item() + nb_tr_examples += input_ids.size(0) + nb_tr_steps += 1 + if (step + 1) % args.gradient_accumulation_steps == 0: + if args.fp16: + # modify learning rate with special warm up BERT uses + # if args.fp16 is False, BertAdam is used that handles this automatically + lr_this_step = args.learning_rate * \ + warmup_linear( + global_step/num_train_optimization_steps, args.warmup_proportion) + for param_group in optimizer.param_groups: + param_group['lr'] = lr_this_step + optimizer.step() + optimizer.zero_grad() + global_step += 1 + + saved_path = os.path.join(args.output_dir, "finetuned_quantized_checkpoints") + + checkpoint_model(PATH=saved_path, + ckpt_id='epoch{}_step{}'.format( + args.num_train_epochs, global_step), + model=model, + epoch=args.num_train_epochs, + last_global_step=global_step, + last_global_data_samples=nb_tr_examples * torch.distributed.get_world_size()) + if args.do_eval and (args.local_rank == -1 + or torch.distributed.get_rank() == 0): + eval_examples = processor.get_dev_examples(args.data_dir) + eval_features = convert_examples_to_features(eval_examples, label_list, + args.max_seq_length, + tokenizer, output_mode) + logger.info("***** Running evaluation *****") + logger.info(" Num examples = %d", len(eval_examples)) + logger.info(" Batch size = %d", args.eval_batch_size) + all_input_ids = torch.tensor([f.input_ids for f in eval_features], + dtype=torch.long) + all_input_mask = torch.tensor([f.input_mask for f in eval_features], + dtype=torch.long) + all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], + dtype=torch.long) + + if output_mode == "classification": + all_label_ids = torch.tensor([f.label_id for f in eval_features], + dtype=torch.long) + elif output_mode == "regression": + all_label_ids = torch.tensor([f.label_id for f in eval_features], + dtype=torch.float) + + eval_data = TensorDataset(all_input_ids, all_input_mask, + all_segment_ids, all_label_ids) + # Run prediction for full data + eval_sampler = SequentialSampler(eval_data) + eval_dataloader = DataLoader(eval_data, + sampler=eval_sampler, + batch_size=args.eval_batch_size) + + model.eval() + eval_loss = 0 + nb_eval_steps = 0 + preds = [] + + for input_ids, input_mask, segment_ids, label_ids in tqdm( + eval_dataloader, desc="Evaluating"): + input_ids = input_ids.to(device) + input_mask = input_mask.to(device) + segment_ids = segment_ids.to(device) + label_ids = label_ids.to(device) + + with torch.no_grad(): + logits = model(input_ids, segment_ids, input_mask, labels=None) + + # create eval loss and other metric required by the task + if output_mode == "classification": + if args.focal: + loss_fct = FocalLoss(class_num=num_labels, + gamma=args.gamma) + else: + loss_fct = CrossEntropyLoss() + tmp_eval_loss = loss_fct(logits.view(-1, num_labels), + label_ids.view(-1)) + elif output_mode == "regression": + loss_fct = MSELoss() + print(logits.type()) + print(label_ids.type()) + if task_name == "sts-b": + tmp_eval_loss = loss_fct(logits.float().view(-1), + label_ids.view(-1)) + else: + tmp_eval_loss = loss_fct(logits.view(-1), + label_ids.view(-1)) + + eval_loss += tmp_eval_loss.mean().item() + nb_eval_steps += 1 + if len(preds) == 0: + preds.append(logits.detach().cpu().numpy()) + else: + preds[0] = np.append(preds[0], + logits.detach().cpu().numpy(), + axis=0) + + eval_loss = eval_loss / nb_eval_steps + preds = preds[0] + if output_mode == "classification": + preds = np.argmax(preds, axis=1) + elif output_mode == "regression": + preds = np.squeeze(preds) + result = compute_metrics(task_name, preds, all_label_ids.numpy()) + loss = tr_loss / nb_tr_steps if args.do_train else None + + result['eval_loss'] = eval_loss + result['global_step'] = global_step + result['loss'] = loss + + output_eval_file = os.path.join(args.output_dir, "eval_results.txt") + with open(output_eval_file, "w") as writer: + logger.info("***** Eval results *****") + for key in sorted(result.keys()): + logger.info(" %s = %s", key, str(result[key])) + writer.write("%s = %s\n" % (key, str(result[key]))) + + # hack for MNLI-MM + if task_name == "mnli": + task_name = "mnli-mm" + processor = processors[task_name]() + + if os.path.exists(args.output_dir + + '-MM') and os.listdir(args.output_dir + + '-MM') and args.do_train: + raise ValueError( + "Output directory ({}{}) already exists and is not empty.". + format(args.output_dir, '-MM')) + if not os.path.exists(args.output_dir + '-MM'): + os.makedirs(args.output_dir + '-MM') + + eval_examples = processor.get_dev_examples(args.data_dir) + eval_features = convert_examples_to_features( + eval_examples, label_list, args.max_seq_length, tokenizer, + output_mode) + logger.info("***** Running evaluation *****") + logger.info(" Num examples = %d", len(eval_examples)) + logger.info(" Batch size = %d", args.eval_batch_size) + all_input_ids = torch.tensor([f.input_ids for f in eval_features], + dtype=torch.long) + all_input_mask = torch.tensor( + [f.input_mask for f in eval_features], dtype=torch.long) + all_segment_ids = torch.tensor( + [f.segment_ids for f in eval_features], dtype=torch.long) + all_label_ids = torch.tensor([f.label_id for f in eval_features], + dtype=torch.long) + + eval_data = TensorDataset(all_input_ids, all_input_mask, + all_segment_ids, all_label_ids) + # Run prediction for full data + eval_sampler = SequentialSampler(eval_data) + eval_dataloader = DataLoader(eval_data, + sampler=eval_sampler, + batch_size=args.eval_batch_size) + + model.eval() + eval_loss = 0 + nb_eval_steps = 0 + preds = [] + + for input_ids, input_mask, segment_ids, label_ids in tqdm( + eval_dataloader, desc="Evaluating"): + input_ids = input_ids.to(device) + input_mask = input_mask.to(device) + segment_ids = segment_ids.to(device) + label_ids = label_ids.to(device) + + with torch.no_grad(): + logits = model(input_ids, + segment_ids, + input_mask, + labels=None) + + if args.focal: + loss_fct = FocalLoss(class_num=num_labels, + gamma=args.gamma) + else: + loss_fct = CrossEntropyLoss() + tmp_eval_loss = loss_fct(logits.view(-1, num_labels), + label_ids.view(-1)) + + eval_loss += tmp_eval_loss.mean().item() + nb_eval_steps += 1 + if len(preds) == 0: + preds.append(logits.detach().cpu().numpy()) + else: + preds[0] = np.append(preds[0], + logits.detach().cpu().numpy(), + axis=0) + + eval_loss = eval_loss / nb_eval_steps + preds = preds[0] + preds = np.argmax(preds, axis=1) + result = compute_metrics(task_name, preds, all_label_ids.numpy()) + loss = tr_loss / nb_tr_steps if args.do_train else None + + result['eval_loss'] = eval_loss + result['global_step'] = global_step + result['loss'] = loss + + output_eval_file = os.path.join(args.output_dir + '-MM', + "eval_results.txt") + with open(output_eval_file, "w") as writer: + logger.info("***** Eval results *****") + for key in sorted(result.keys()): + logger.info(" %s = %s", key, str(result[key])) + writer.write("%s = %s\n" % (key, str(result[key]))) + + +if __name__ == "__main__": + main()