diff --git a/bing_bert/glue_bert_base.json b/bing_bert/glue_bert_base.json
new file mode 100755
index 000000000..16c73092e
--- /dev/null
+++ b/bing_bert/glue_bert_base.json
@@ -0,0 +1,19 @@
+{
+    "train_batch_size": 32,
+    "train_micro_batch_size_per_gpu": 32,
+    "steps_per_print": 10,
+    "optimizer": {
+      "type": "Adam",
+      "params": {
+        "lr": 3e-5,
+        "weight_decay": 0.0,
+        "bias_correction": false
+      }
+    },
+    "gradient_clipping": 1.0,
+    "fp16": {
+      "enabled": true
+    }
+  
+  }
+  
\ No newline at end of file
diff --git a/bing_bert/glue_bert_large.json b/bing_bert/glue_bert_large.json
new file mode 100755
index 000000000..abf4b391d
--- /dev/null
+++ b/bing_bert/glue_bert_large.json
@@ -0,0 +1,19 @@
+{
+    "train_batch_size": 32,
+    "train_micro_batch_size_per_gpu": 4,
+    "steps_per_print": 10,
+    "optimizer": {
+      "type": "Adam",
+      "params": {
+        "lr": 3e-5,
+        "weight_decay": 0.0,
+        "bias_correction": false
+      }
+    },
+    "gradient_clipping": 1.0,
+    "fp16": {
+      "enabled": true
+    }
+  
+  }
+  
\ No newline at end of file
diff --git a/bing_bert/run_glue_bert_base_finetune.sh b/bing_bert/run_glue_bert_base_finetune.sh
index e62ff8773..7a1c12cfe 100755
--- a/bing_bert/run_glue_bert_base_finetune.sh
+++ b/bing_bert/run_glue_bert_base_finetune.sh
@@ -37,6 +37,8 @@ run_cmd="python3.6 -m torch.distributed.launch \
        --task_name $TASK \
        --do_train \
        --do_eval \
+       --deepspeed \
+       --deepspeed_config ${base_dir}/glue_bert_base.json \
        --do_lower_case \
        --data_dir $GLUE_DIR/$TASK/ \
        --bert_model bert-large-uncased \
diff --git a/bing_bert/run_glue_bert_large_finetune.sh b/bing_bert/run_glue_bert_large_finetune.sh
new file mode 100755
index 000000000..4a63adb57
--- /dev/null
+++ b/bing_bert/run_glue_bert_large_finetune.sh
@@ -0,0 +1,57 @@
+# Usage: NGPU TASK EFFECTIVE_BATCH_SIZE LR NUM_EPOCH JOBNAME CHECKPOINT_PATH (positional, in this order)
+LOG_DIR="log"
+model_name="bert_large"
+# The run log below is redirected into $LOG_DIR/$model_name, so create the full path up front.
+mkdir -p "$LOG_DIR/$model_name"
+
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+
+NGPU=$1
+
+echo "Started scripts"
+
+TASK=$2
+EFFECTIVE_BATCH_SIZE=$3
+LR=$4
+NUM_EPOCH=$5
+base_dir=`pwd`
+JOBNAME=$6
+CHECKPOINT_PATH=$7
+OUTPUT_DIR="${SCRIPT_DIR}/outputs/${model_name}/${JOBNAME}_bsz${EFFECTIVE_BATCH_SIZE}_lr${LR}_epoch${NUM_EPOCH}"
+
+GLUE_DIR="/data/GlueData"
+# Per-GPU batch = EFFECTIVE_BATCH_SIZE/NGPU (integer division); accumulate only once it reaches MAX_GPU_BATCH_SIZE.
+MAX_GPU_BATCH_SIZE=32
+PER_GPU_BATCH_SIZE=$((EFFECTIVE_BATCH_SIZE/NGPU))
+if [[ $PER_GPU_BATCH_SIZE -lt $MAX_GPU_BATCH_SIZE ]]; then
+       GRAD_ACCUM_STEPS=1
+else
+       GRAD_ACCUM_STEPS=$((PER_GPU_BATCH_SIZE/MAX_GPU_BATCH_SIZE))
+fi
+
+echo "Fine Tuning $CHECKPOINT_PATH"
+run_cmd="python3.6 -m torch.distributed.launch \
+       --nproc_per_node=${NGPU} \
+       --master_port=12346 \
+       run_glue_classifier_bert_large.py \
+       --task_name $TASK \
+       --do_train \
+       --do_eval \
+       --deepspeed \
+       --deepspeed_transformer_kernel \
+       --fp16 \
+       --preln \
+       --deepspeed_config ${base_dir}/glue_bert_large.json \
+       --do_lower_case \
+       --data_dir $GLUE_DIR/$TASK/ \
+       --bert_model bert-large-uncased \
+       --max_seq_length 128 \
+       --train_batch_size ${PER_GPU_BATCH_SIZE} \
+       --gradient_accumulation_steps ${GRAD_ACCUM_STEPS} \
+       --learning_rate ${LR} \
+       --num_train_epochs ${NUM_EPOCH} \
+       --output_dir ${OUTPUT_DIR}_${TASK} \
+       --model_file $CHECKPOINT_PATH &> $LOG_DIR/${model_name}/${JOBNAME}_${TASK}_bzs${EFFECTIVE_BATCH_SIZE}_lr${LR}_epoch${NUM_EPOCH}_${NGPU}_deepspeed-kernel.txt
+       "
+echo ${run_cmd}
+eval ${run_cmd}
diff --git a/bing_bert/run_glue_classifier_bert_base.py b/bing_bert/run_glue_classifier_bert_base.py
index 36561f1b8..9f46c8a6d 100755
--- a/bing_bert/run_glue_classifier_bert_base.py
+++ b/bing_bert/run_glue_classifier_bert_base.py
@@ -24,6 +24,7 @@
 import random
 import sys
 
+import deepspeed
 import numpy as np
 import torch
 from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
@@ -690,7 +691,15 @@ def main():
                         default=False,
                         action='store_true',
                         help="Whether to enable progressive layer dropping or not")
-
+    parser.add_argument(
+        '--preln',
+        action='store_true',
+        default=False,
+        help=
+        "Switching to the variant of Transformer blocks that use pre-LayerNorm."
+    )
+
+    parser = deepspeed.add_config_arguments(parser)
     args = parser.parse_args()
 
     if args.server_ip and args.server_port:
@@ -809,8 +818,10 @@ def main():
     if args.progressive_layer_drop:
         print("BertBaseConfigPreLnLayerDrop")
         from nvidia.modelingpreln_layerdrop import BertForSequenceClassification, BertConfig
+    elif args.preln:
+        from nvidia.modelingpreln import BertForSequenceClassification, BertConfig, BertLayer
     else:
-        from nvidia.modelingpreln import BertForSequenceClassification, BertConfig
+        from nvidia.modeling import BertForSequenceClassification, BertConfig, BertLayer
 
     bert_config = BertConfig(**bert_base_model_config)
     bert_config.vocab_size = len(tokenizer.vocab)
@@ -859,6 +870,19 @@ def main():
     elif n_gpu > 1:
         model = torch.nn.DataParallel(model)
 
+    # Patch model with deepspeed transformer kernel
+    if not args.deepspeed_transformer_kernel:
+        from deepspeed import replace_transformer_layer
+        model = deepspeed.module_inject.replace_transformer_layer(
+               orig_layer_impl=BertLayer,
+               model=model,
+               micro_batch_size=args.train_batch_size,
+               bert_config=bert_config,
+               seed=args.seed,
+               preln=True,
+               fp16=args.fp16,
+               huggingface=False)
+
     # Prepare optimizer
     param_optimizer = list(model.named_parameters())
     no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
@@ -871,29 +895,12 @@ def main():
         {'params': [p for n, p in param_optimizer if any(
             nd in n for nd in no_decay)], 'weight_decay': 0.0}
     ]
-    if args.fp16:
-        try:
-            from apex.optimizers import FP16_Optimizer
-            from apex.optimizers import FusedAdam
-        except ImportError:
-            raise ImportError(
-                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
-
-        optimizer = FusedAdam(optimizer_grouped_parameters,
-                              lr=args.learning_rate,
-                              bias_correction=False,
-                              max_grad_norm=1.0)
-        if args.loss_scale == 0:
-            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
-        else:
-            optimizer = FP16_Optimizer(
-                optimizer, static_loss_scale=args.loss_scale)
-
-    else:
-        optimizer = BertAdam(optimizer_grouped_parameters,
-                             lr=args.learning_rate,
-                             warmup=args.warmup_proportion,
-                             t_total=num_train_optimization_steps)
+    
+    model, optimizer, _, _ = deepspeed.initialize(
+        args=args,
+        model=model,
+        model_parameters=optimizer_grouped_parameters,
+        dist_init_required=True)
 
     global_step = 0
     nb_tr_steps = 0
diff --git a/bing_bert/run_glue_classifier_bert_large.py b/bing_bert/run_glue_classifier_bert_large.py
new file mode 100755
index 000000000..7d2352d61
--- /dev/null
+++ b/bing_bert/run_glue_classifier_bert_large.py
@@ -0,0 +1,1282 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""BERT finetuning runner."""
+
+from __future__ import absolute_import, division, print_function
+
+import argparse
+import csv
+import logging
+import os
+import random
+import sys
+import deepspeed
+import numpy as np
+import torch
+from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
+                              TensorDataset)
+from torch.utils.data.distributed import DistributedSampler
+from tqdm import tqdm, trange
+
+from torch.nn import CrossEntropyLoss, MSELoss
+from scipy.stats import pearsonr, spearmanr
+from sklearn.metrics import matthews_corrcoef, f1_score
+
+from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
+from pytorch_pretrained_bert.modeling import WEIGHTS_NAME, CONFIG_NAME
+from pytorch_pretrained_bert.tokenization import BertTokenizer
+from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear
+from turing.loss import FocalLoss
+
+logging.basicConfig(
+    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
+    datefmt='%m/%d/%Y %H:%M:%S',
+    level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+def checkpoint_model(PATH, ckpt_id, model, epoch, last_global_step,
+                     last_global_data_samples, **kwargs):
+    """Save a checkpoint via `model.save_checkpoint(PATH, ckpt_id, state)` and log the outcome.
+       `state` bundles epoch, last_global_step, last_global_data_samples and any extra kwargs.
+    """
+    checkpoint_state_dict = {
+        'epoch': epoch,
+        'last_global_step': last_global_step,
+        'last_global_data_samples': last_global_data_samples
+    }
+    # Caller-supplied extras are merged in and override the defaults above on key clash
+    checkpoint_state_dict.update(kwargs)
+
+    # `model` must expose save_checkpoint itself (presumably a DeepSpeed engine -- confirm with caller)
+    success = model.save_checkpoint(PATH, ckpt_id,
+                                            checkpoint_state_dict)
+    status_msg = 'checkpointing: PATH={}, ckpt_id={}'.format(PATH, ckpt_id)
+    if success:
+        logging.info(f"Success {status_msg}")
+    else:
+        logging.warning(f"Failure {status_msg}")
+    return
+
+def load_checkpoint(model, PATH, ckpt_id):
+    """Restore a saved checkpoint by delegating to `model.load_checkpoint(PATH, ckpt_id)`.
+       Loading counterpart of `checkpoint_model`; whatever the model call returns is discarded.
+    """
+    model.load_checkpoint(PATH, ckpt_id)
+    return
+
+class InputExample(object):
+    """A single training/test example for simple sequence classification."""
+    def __init__(self, guid, text_a, text_b=None, label=None):
+        """Constructs a InputExample.
+
+        Args:
+            guid: Unique id for the example.
+            text_a: string. The untokenized text of the first sequence. For single
+            sequence tasks, only this sequence must be specified.
+            text_b: (Optional) string. The untokenized text of the second sequence.
+            Only must be specified for sequence pair tasks.
+            label: (Optional) string. The label of the example. This should be
+            specified for train and dev examples, but not for test examples.
+        """
+        self.guid = guid
+        self.text_a = text_a
+        self.text_b = text_b
+        self.label = label
+
+
+class InputFeatures(object):
+    """A single set of features of data."""
+    def __init__(self, input_ids, input_mask, segment_ids, label_id):
+        self.input_ids = input_ids
+        self.input_mask = input_mask
+        self.segment_ids = segment_ids
+        self.label_id = label_id
+
+
+class DataProcessor(object):
+    """Base class for data converters for sequence classification data sets."""
+    def get_train_examples(self, data_dir):
+        """Gets a collection of `InputExample`s for the train set."""
+        raise NotImplementedError()
+
+    def get_dev_examples(self, data_dir):
+        """Gets a collection of `InputExample`s for the dev set."""
+        raise NotImplementedError()
+
+    def get_labels(self):
+        """Gets the list of labels for this data set."""
+        raise NotImplementedError()
+
+    @classmethod
+    def _read_tsv(cls, input_file, quotechar=None):
+        """Reads a tab separated value file."""
+        with open(input_file, "r", encoding='utf-8') as f:
+            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
+            lines = []
+            for line in reader:
+                if sys.version_info[0] == 2:
+                    line = list(unicode(cell, 'utf-8') for cell in line)
+                lines.append(line)
+            return lines
+
+
+class MrpcProcessor(DataProcessor):
+    """Processor for the MRPC data set (GLUE version)."""
+    def get_train_examples(self, data_dir):
+        """See base class."""
+        logger.info("LOOKING AT {}".format(os.path.join(data_dir,
+                                                        "train.tsv")))
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+
+    def get_dev_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
+
+    def get_labels(self):
+        """See base class."""
+        return ["0", "1"]
+
+    def _create_examples(self, lines, set_type):
+        """Creates examples for the training and dev sets."""
+        examples = []
+        for (i, line) in enumerate(lines):
+            if i == 0:
+                continue
+            guid = "%s-%s" % (set_type, i)
+            text_a = line[3]
+            text_b = line[4]
+            label = line[0]
+            examples.append(
+                InputExample(guid=guid,
+                             text_a=text_a,
+                             text_b=text_b,
+                             label=label))
+        return examples
+
+
+class MnliProcessor(DataProcessor):
+    """Processor for the MultiNLI data set (GLUE version)."""
+    def get_train_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+
+    def get_dev_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")),
+            "dev_matched")
+
+    def get_labels(self):
+        """See base class."""
+        return ["contradiction", "entailment", "neutral"]
+
+    def _create_examples(self, lines, set_type):
+        """Creates examples for the training and dev sets."""
+        examples = []
+        for (i, line) in enumerate(lines):
+            if i == 0:
+                continue
+            guid = "%s-%s" % (set_type, line[0])
+            text_a = line[8]
+            text_b = line[9]
+            label = line[-1]
+            examples.append(
+                InputExample(guid=guid,
+                             text_a=text_a,
+                             text_b=text_b,
+                             label=label))
+        return examples
+
+
+class MnliMismatchedProcessor(MnliProcessor):
+    """Processor for the MultiNLI Mismatched data set (GLUE version)."""
+    def get_dev_examples(self, data_dir):
+        """Read `dev_mismatched.tsv`; guids are prefixed with the mismatched split name."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "dev_mismatched.tsv")),
+            "dev_mismatched")
+
+
+class ColaProcessor(DataProcessor):
+    """Processor for the CoLA data set (GLUE version)."""
+    def get_train_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+
+    def get_dev_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
+
+    def get_labels(self):
+        """See base class."""
+        return ["0", "1"]
+
+    def _create_examples(self, lines, set_type):
+        """Creates examples for the training and dev sets."""
+        examples = []
+        for (i, line) in enumerate(lines):
+            guid = "%s-%s" % (set_type, i)
+            text_a = line[3]
+            label = line[1]
+            examples.append(
+                InputExample(guid=guid,
+                             text_a=text_a,
+                             text_b=None,
+                             label=label))
+        return examples
+
+
+class Sst2Processor(DataProcessor):
+    """Processor for the SST-2 data set (GLUE version)."""
+    def get_train_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+
+    def get_dev_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
+
+    def get_labels(self):
+        """See base class."""
+        return ["0", "1"]
+
+    def _create_examples(self, lines, set_type):
+        """Creates examples for the training and dev sets."""
+        examples = []
+        for (i, line) in enumerate(lines):
+            if i == 0:
+                continue
+            guid = "%s-%s" % (set_type, i)
+            text_a = line[0]
+            label = line[1]
+            examples.append(
+                InputExample(guid=guid,
+                             text_a=text_a,
+                             text_b=None,
+                             label=label))
+        return examples
+
+
+class StsbProcessor(DataProcessor):
+    """Processor for the STS-B data set (GLUE version)."""
+    def get_train_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+
+    def get_dev_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
+
+    def get_labels(self):
+        """See base class."""
+        return [None]
+
+    def _create_examples(self, lines, set_type):
+        """Creates examples for the training and dev sets."""
+        examples = []
+        for (i, line) in enumerate(lines):
+            if i == 0:
+                continue
+            guid = "%s-%s" % (set_type, line[0])
+            text_a = line[7]
+            text_b = line[8]
+            label = line[-1]
+            examples.append(
+                InputExample(guid=guid,
+                             text_a=text_a,
+                             text_b=text_b,
+                             label=label))
+        return examples
+
+
+class QqpProcessor(DataProcessor):
+    """Processor for the QQP data set (GLUE version)."""
+    def get_train_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+
+    def get_dev_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
+
+    def get_labels(self):
+        """See base class."""
+        return ["0", "1"]
+
+    def _create_examples(self, lines, set_type):
+        """Creates examples for the training and dev sets."""
+        examples = []
+        for (i, line) in enumerate(lines):
+            if i == 0:  # first row is skipped (treated as the header)
+                continue
+            guid = "%s-%s" % (set_type, line[0])
+            try:
+                text_a = line[3]
+                text_b = line[4]
+                label = line[5]
+            except IndexError:
+                continue  # rows with fewer than six columns are silently dropped
+            examples.append(
+                InputExample(guid=guid,
+                             text_a=text_a,
+                             text_b=text_b,
+                             label=label))
+        return examples
+
+
+class QnliProcessor(DataProcessor):
+    """Processor for the QNLI data set (GLUE version)."""
+    def get_train_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+
+    def get_dev_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
+
+    def get_labels(self):
+        """See base class."""
+        return ["entailment", "not_entailment"]
+
+    def _create_examples(self, lines, set_type):
+        """Creates examples for the training and dev sets."""
+        examples = []
+        for (i, line) in enumerate(lines):
+            if i == 0:  # first row is skipped (treated as the header)
+                continue
+            guid = "%s-%s" % (set_type, line[0])
+            text_a = line[1]
+            text_b = line[2]
+            label = line[-1]
+            examples.append(
+                InputExample(guid=guid,
+                             text_a=text_a,
+                             text_b=text_b,
+                             label=label))
+        return examples
+
+
+class RteProcessor(DataProcessor):
+    """Processor for the RTE data set (GLUE version)."""
+    def get_train_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+
+    def get_dev_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
+
+    def get_labels(self):
+        """See base class."""
+        return ["entailment", "not_entailment"]
+
+    def _create_examples(self, lines, set_type):
+        """Creates examples for the training and dev sets."""
+        examples = []
+        for (i, line) in enumerate(lines):
+            if i == 0:
+                continue
+            guid = "%s-%s" % (set_type, line[0])
+            text_a = line[1]
+            text_b = line[2]
+            label = line[-1]
+            examples.append(
+                InputExample(guid=guid,
+                             text_a=text_a,
+                             text_b=text_b,
+                             label=label))
+        return examples
+
+
+class WnliProcessor(DataProcessor):
+    """Processor for the WNLI data set (GLUE version)."""
+    def get_train_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+
+    def get_dev_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
+
+    def get_labels(self):
+        """See base class."""
+        return ["0", "1"]
+
+    def _create_examples(self, lines, set_type):
+        """Creates examples for the training and dev sets."""
+        examples = []
+        for (i, line) in enumerate(lines):
+            if i == 0:
+                continue
+            guid = "%s-%s" % (set_type, line[0])
+            text_a = line[1]
+            text_b = line[2]
+            label = line[-1]
+            examples.append(
+                InputExample(guid=guid,
+                             text_a=text_a,
+                             text_b=text_b,
+                             label=label))
+        return examples
+
+
+def convert_examples_to_features(examples, label_list, max_seq_length,
+                                 tokenizer, output_mode):
+    """Converts a list of `InputExample`s into a list of padded `InputFeatures`."""
+
+    label_map = {label: i for i, label in enumerate(label_list)}
+
+    features = []
+    for (ex_index, example) in enumerate(examples):
+        if ex_index % 10000 == 0:
+            logger.info("Writing example %d of %d" % (ex_index, len(examples)))
+
+        tokens_a = tokenizer.tokenize(example.text_a)
+
+        tokens_b = None
+        if example.text_b:
+            tokens_b = tokenizer.tokenize(example.text_b)
+            # Modifies `tokens_a` and `tokens_b` in place so that the total
+            # length does not exceed the specified length.
+            # Account for [CLS], [SEP], [SEP] with "- 3"
+            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
+        else:
+            # Account for [CLS] and [SEP] with "- 2"
+            if len(tokens_a) > max_seq_length - 2:
+                tokens_a = tokens_a[:(max_seq_length - 2)]
+
+        # The convention in BERT is:
+        # (a) For sequence pairs:
+        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
+        #  type_ids: 0   0  0    0    0     0       0 0    1  1  1  1   1 1
+        # (b) For single sequences:
+        #  tokens:   [CLS] the dog is hairy . [SEP]
+        #  type_ids: 0   0   0   0  0     0 0
+        #
+        # Where "type_ids" are used to indicate whether this is the first
+        # sequence or the second sequence. The embedding vectors for `type=0` and
+        # `type=1` were learned during pre-training and are added to the wordpiece
+        # embedding vector (and position vector). This is not *strictly* necessary
+        # since the [SEP] token unambiguously separates the sequences, but it makes
+        # it easier for the model to learn the concept of sequences.
+        #
+        # For classification tasks, the first vector (corresponding to [CLS]) is
+        # used as the "sentence vector". Note that this only makes sense because
+        # the entire model is fine-tuned.
+        tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
+        segment_ids = [0] * len(tokens)
+
+        if tokens_b:
+            tokens += tokens_b + ["[SEP]"]
+            segment_ids += [1] * (len(tokens_b) + 1)
+
+        input_ids = tokenizer.convert_tokens_to_ids(tokens)
+
+        # The mask has 1 for real tokens and 0 for padding tokens. Only real
+        # tokens are attended to.
+        input_mask = [1] * len(input_ids)
+
+        # Zero-pad up to the sequence length.
+        padding = [0] * (max_seq_length - len(input_ids))
+        input_ids += padding
+        input_mask += padding
+        segment_ids += padding
+
+        assert len(input_ids) == max_seq_length
+        assert len(input_mask) == max_seq_length
+        assert len(segment_ids) == max_seq_length
+
+        if output_mode == "classification":
+            label_id = label_map[example.label]
+        elif output_mode == "regression":
+            label_id = float(example.label)
+        else:
+            raise KeyError(output_mode)
+
+        if ex_index < 5:
+            logger.info("*** Example ***")
+            logger.info("guid: %s" % (example.guid))
+            logger.info("tokens: %s" % " ".join([str(x) for x in tokens]))
+            logger.info("input_ids: %s" % " ".join([str(x)
+                                                    for x in input_ids]))
+            logger.info("input_mask: %s" %
+                        " ".join([str(x) for x in input_mask]))
+            logger.info("segment_ids: %s" %
+                        " ".join([str(x) for x in segment_ids]))
+            logger.info("label: %s (id = %d)" % (example.label, label_id))
+
+        features.append(
+            InputFeatures(input_ids=input_ids,
+                          input_mask=input_mask,
+                          segment_ids=segment_ids,
+                          label_id=label_id))
+    return features
+
+
+def _truncate_seq_pair(tokens_a, tokens_b, max_length):
+    """Truncates a sequence pair in place to the maximum length."""
+
+    # This is a simple heuristic which will always truncate the longer sequence
+    # one token at a time. This makes more sense than truncating an equal percent
+    # of tokens from each, since if one sequence is very short then each token
+    # that's truncated likely contains more information than a longer sequence.
+    while True:
+        total_length = len(tokens_a) + len(tokens_b)
+        if total_length <= max_length:
+            break
+        if len(tokens_a) > len(tokens_b):
+            tokens_a.pop()
+        else:
+            tokens_b.pop()
+
+
+def simple_accuracy(preds, labels):
+    return (preds == labels).mean()
+
+
+def acc_and_f1(preds, labels):
+    acc = simple_accuracy(preds, labels)
+    f1 = f1_score(y_true=labels, y_pred=preds)
+    return {
+        "acc": acc,
+        "f1": f1,
+        "acc_and_f1": (acc + f1) / 2,
+    }
+
+
+def pearson_and_spearman(preds, labels):
+    pearson_corr = pearsonr(preds, labels)[0]
+    spearman_corr = spearmanr(preds, labels)[0]
+    return {
+        "pearson": pearson_corr,
+        "spearmanr": spearman_corr,
+        "corr": (pearson_corr + spearman_corr) / 2,
+    }
+
+
+def compute_metrics(task_name, preds, labels):
+    assert len(preds) == len(labels)
+    if task_name == "cola":
+        return {"mcc": matthews_corrcoef(labels, preds)}
+    elif task_name == "sst-2":
+        return {"acc": simple_accuracy(preds, labels)}
+    elif task_name == "mrpc":
+        return acc_and_f1(preds, labels)
+    elif task_name == "sts-b":
+        return pearson_and_spearman(preds, labels)
+    elif task_name == "qqp":
+        return acc_and_f1(preds, labels)
+    elif task_name == "mnli":
+        return {"acc": simple_accuracy(preds, labels)}
+    elif task_name == "mnli-mm":
+        return {"acc": simple_accuracy(preds, labels)}
+    elif task_name == "qnli":
+        return {"acc": simple_accuracy(preds, labels)}
+    elif task_name == "rte":
+        return {"acc": simple_accuracy(preds, labels)}
+    elif task_name == "wnli":
+        return {"acc": simple_accuracy(preds, labels)}
+    else:
+        raise KeyError(task_name)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+
+    # Required parameters
+    parser.add_argument(
+        "--data_dir",
+        default=None,
+        type=str,
+        required=True,
+        help=
+        "The input data dir. Should contain the .tsv files (or other data files) for the task."
+    )
+    parser.add_argument(
+        "--bert_model",
+        default=None,
+        type=str,
+        required=True,
+        help="Bert pre-trained model selected in the list: bert-base-uncased, "
+        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
+        "bert-base-multilingual-cased, bert-base-chinese.")
+    parser.add_argument("--task_name",
+                        default=None,
+                        type=str,
+                        required=True,
+                        help="The name of the task to train.")
+    parser.add_argument(
+        "--output_dir",
+        default=None,
+        type=str,
+        required=True,
+        help=
+        "The output directory where the model predictions and checkpoints will be written."
+    )
+
+    # Other parameters
+    parser.add_argument(
+        "--cache_dir",
+        default="",
+        type=str,
+        help=
+        "Where do you want to store the pre-trained models downloaded from s3")
+    parser.add_argument(
+        "--max_seq_length",
+        default=128,
+        type=int,
+        help=
+        "The maximum total input sequence length after WordPiece tokenization. \n"
+        "Sequences longer than this will be truncated, and sequences shorter \n"
+        "than this will be padded.")
+    parser.add_argument("--do_train",
+                        action='store_true',
+                        help="Whether to run training.")
+    parser.add_argument("--do_eval",
+                        action='store_true',
+                        help="Whether to run eval on the dev set.")
+    parser.add_argument(
+        "--do_lower_case",
+        action='store_true',
+        help="Set this flag if you are using an uncased model.")
+    parser.add_argument("--train_batch_size",
+                        default=32,
+                        type=int,
+                        help="Total batch size for training.")
+    parser.add_argument("--eval_batch_size",
+                        default=8,
+                        type=int,
+                        help="Total batch size for eval.")
+    parser.add_argument("--learning_rate",
+                        default=5e-5,
+                        type=float,
+                        help="The initial learning rate for Adam.")
+    parser.add_argument("--num_train_epochs",
+                        default=3.0,
+                        type=float,
+                        help="Total number of training epochs to perform.")
+    parser.add_argument(
+        "--warmup_proportion",
+        default=0.1,
+        type=float,
+        help=
+        "Proportion of training to perform linear learning rate warmup for. "
+        "E.g., 0.1 = 10%% of training.")
+    parser.add_argument("--no_cuda",
+                        action='store_true',
+                        help="Whether not to use CUDA when available")
+    parser.add_argument("--local_rank",
+                        type=int,
+                        default=-1,
+                        help="local_rank for distributed training on gpus")
+    parser.add_argument('--seed',
+                        type=int,
+                        default=42,
+                        help="random seed for initialization")
+    parser.add_argument(
+        '--gradient_accumulation_steps',
+        type=int,
+        default=1,
+        help=
+        "Number of updates steps to accumulate before performing a backward/update pass."
+    )
+    parser.add_argument(
+        '--fp16',
+        action='store_true',
+        help="Whether to use 16-bit float precision instead of 32-bit")
+    #parser.add_argument(
+    #    '--deepscale',
+    #    default=False,
+    #    action='store_true',
+    #    help="Whether to use 16-bit float precision instead of 32-bit")
+    parser.add_argument(
+        '--loss_scale',
+        type=float,
+        default=0,
+        help=
+        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
+        "0 (default value): dynamic loss scaling.\n"
+        "Positive power of 2: static loss scaling value.\n")
+    parser.add_argument('--server_ip',
+                        type=str,
+                        default='',
+                        help="Can be used for distant debugging.")
+    parser.add_argument('--server_port',
+                        type=str,
+                        default='',
+                        help="Can be used for distant debugging.")
+    parser.add_argument("--model_file",
+                        type=str,
+                        default="0",
+                        help="Path to the Pretrained BERT Encoder File.")
+    parser.add_argument('--random',
+                        default=False,
+                        action='store_true',
+                        help="Whether to fientune for random initialization")
+    parser.add_argument('--focal',
+                        default=False,
+                        action='store_true',
+                        help="Whether to use Focal Loss for finetuning.")
+    parser.add_argument('--gamma',
+                        type=float,
+                        default=0.5,
+                        help="Gamma parameter to be used in focal loss.")
+    parser.add_argument('--deepspeed_sparse_attention',
+                        default=False,
+                        action='store_true',
+                        help='Use DeepSpeed sparse self attention.')
+    parser.add_argument(
+        '--preln',
+        action='store_true',
+        default=False,
+        help=
+        "Switching to the variant of Transformer blocks that use pre-LayerNorm."
+    )
+    parser.add_argument('--deepspeed_transformer_kernel',
+                        default=False,
+                        action='store_true',
+                        help='Use DeepSpeed transformer kernel to accelerate.')
+    parser.add_argument(
+        '--progressive_layer_drop',
+        default=False,
+        action='store_true',
+        help="Whether to enable progressive layer dropping or not")
+    parser = deepspeed.add_config_arguments(parser)
+    
+    args = parser.parse_args()
+
+    if args.server_ip and args.server_port:
+        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
+        import ptvsd
+        print("Waiting for debugger attach")
+        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
+                            redirect_output=True)
+        ptvsd.wait_for_attach()
+
+    processors = {
+        "cola": ColaProcessor,
+        "mnli": MnliProcessor,
+        "mnli-mm": MnliMismatchedProcessor,
+        "mrpc": MrpcProcessor,
+        "sst-2": Sst2Processor,
+        "sts-b": StsbProcessor,
+        "qqp": QqpProcessor,
+        "qnli": QnliProcessor,
+        "rte": RteProcessor,
+        "wnli": WnliProcessor,
+    }
+
+    output_modes = {
+        "cola": "classification",
+        "mnli": "classification",
+        "mrpc": "classification",
+        "sst-2": "classification",
+        "sts-b": "regression",
+        "qqp": "classification",
+        "qnli": "classification",
+        "rte": "classification",
+        "wnli": "classification",
+    }
+
+    if args.local_rank == -1 or args.no_cuda:
+        device = torch.device("cuda" if torch.cuda.is_available()
+                              and not args.no_cuda else "cpu")
+        n_gpu = torch.cuda.device_count()
+    else:
+        torch.cuda.set_device(args.local_rank)
+        device = torch.device("cuda", args.local_rank)
+        n_gpu = 1
+        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
+        torch.distributed.init_process_group(backend='nccl')
+    logger.info(
+        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
+        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))
+
+    if args.gradient_accumulation_steps < 1:
+        raise ValueError(
+            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
+            .format(args.gradient_accumulation_steps))
+
+    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
+
+    args.seed = random.randint(1, 1000)
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+    if n_gpu > 0:
+        torch.cuda.manual_seed_all(args.seed)
+
+    if not args.do_train and not args.do_eval:
+        raise ValueError(
+            "At least one of `do_train` or `do_eval` must be True.")
+
+    if (torch.distributed.get_rank() == 0):
+        # if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
+        #     raise ValueError(
+        #         "Output directory ({}) already exists and is not empty.".format(args.output_dir))
+        if not os.path.exists(args.output_dir):
+            os.makedirs(args.output_dir)
+
+    torch.distributed.barrier()
+
+    task_name = args.task_name.lower()
+
+    if task_name not in processors:
+        raise ValueError("Task not found: %s" % (task_name))
+
+    processor = processors[task_name]()
+    output_mode = output_modes[task_name]
+
+    label_list = processor.get_labels()
+    num_labels = len(label_list)
+
+    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
+                                              do_lower_case=args.do_lower_case)
+
+    train_examples = None
+    num_train_optimization_steps = None
+    if args.do_train:
+        train_examples = processor.get_train_examples(args.data_dir)
+        num_train_optimization_steps = int(
+            len(train_examples) / args.train_batch_size /
+            args.gradient_accumulation_steps) * args.num_train_epochs
+        if args.local_rank != -1:
+            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
+            )
+
+    # Prepare model
+    cache_dir = args.cache_dir if args.cache_dir else os.path.join(
+        str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(
+            args.local_rank))
+
+    bert_base_model_config = {
+        "vocab_size_or_config_json_file": 119547,
+        "hidden_size": 1024,
+        "num_hidden_layers": 24,
+        "num_attention_heads": 16,
+        "intermediate_size": 4096,
+        "hidden_act": "gelu",
+        "hidden_dropout_prob": 0.1,
+        "attention_probs_dropout_prob": 0.1,
+        "max_position_embeddings": 512,
+        "type_vocab_size": 2,
+        "initializer_range": 0.02
+    }
+
+    if args.progressive_layer_drop:
+        print("BertBaseConfigPreLnLayerDrop")
+        from nvidia.modelingpreln_layerdrop import BertForSequenceClassification, BertConfig, BertLayer
+    elif args.preln:
+        from nvidia.modelingpreln import BertForSequenceClassification, BertConfig, BertLayer
+    else:
+        from nvidia.modeling import BertForSequenceClassification, BertConfig, BertLayer
+
+    bert_config = BertConfig(**bert_base_model_config)
+    bert_config.vocab_size = len(tokenizer.vocab)
+    # Padding for divisibility by 8
+    if bert_config.vocab_size % 8 != 0:
+        bert_config.vocab_size += 8 - (bert_config.vocab_size % 8)
+
+    model = BertForSequenceClassification(args,
+                                          bert_config,
+                                          num_labels=num_labels)
+
+    if args.model_file is not "0":
+        logger.info(f"Loading Pretrained Bert Encoder from: {args.model_file}")
+        # bert_state_dict = torch.load(args.model_file)
+        # model.bert.load_state_dict(bert_state_dict)
+        checkpoint_state_dict = torch.load(args.model_file,
+                                           map_location=torch.device("cpu"))
+        if 'module' in checkpoint_state_dict:
+            logger.info('Loading DeepSpeed v2.0 style checkpoint')
+            model.load_state_dict(checkpoint_state_dict['module'],
+                                  strict=False)
+        elif 'model_state_dict' in checkpoint_state_dict:
+            model.load_state_dict(checkpoint_state_dict['model_state_dict'],
+                                  strict=False)
+        else:
+            raise ValueError("Unable to find model state in checkpoint")
+
+        logger.info(f"Pretrained Bert Encoder Loaded from: {args.model_file}")
+
+    if args.random:
+        logger.info("USING RANDOM INITIALISATION FOR FINETUNING")
+        model.apply(model.init_bert_weights)
+
+    if args.fp16:
+        model.half()
+    model.to(device)
+    if args.local_rank != -1:
+        try:
+            if args.deepscale:
+                print("Enabling DeepScale")
+                from deepscale.distributed_apex import DistributedDataParallel as DDP
+            else:
+                from apex.parallel import DistributedDataParallel as DDP
+        except ImportError:
+            raise ImportError(
+                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
+            )
+
+        model = DDP(model)
+    elif n_gpu > 1:
+        model = torch.nn.DataParallel(model)
+
+    # Patch model with deepspeed transformer kernel
+    if not args.deepspeed_transformer_kernel:
+        from deepspeed import replace_transformer_layer
+        model = deepspeed.module_inject.replace_transformer_layer(
+               orig_layer_impl=BertLayer,
+               model=model,
+               micro_batch_size=args.train_batch_size,
+               bert_config=bert_config,
+               seed=args.seed,
+               preln=arg.preln,
+               fp16=args.fp16,
+               huggingface=False)
+
+    # Prepare optimizer
+    param_optimizer = list(model.named_parameters())
+    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
+    optimizer_grouped_parameters = [{
+        'params':
+        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
+        'weight_decay':
+        0.01
+    }, {
+        'params':
+        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
+        'weight_decay':
+        0.0
+    }]
+
+    model, optimizer, _, _ = deepspeed.initialize(
+        args=args,
+        model=model,
+        model_parameters=optimizer_grouped_parameters,
+        dist_init_required=True)
+
+    global_step = 0
+    nb_tr_steps = 0
+    tr_loss = 0
+    if args.do_train:
+        train_features = convert_examples_to_features(train_examples,
+                                                      label_list,
+                                                      args.max_seq_length,
+                                                      tokenizer, output_mode)
+        logger.info("***** Running training *****")
+        logger.info("  Num examples = %d", len(train_examples))
+        logger.info("  Batch size = %d", args.train_batch_size)
+        logger.info("  Num steps = %d", num_train_optimization_steps)
+        all_input_ids = torch.tensor([f.input_ids for f in train_features],
+                                     dtype=torch.long)
+        all_input_mask = torch.tensor([f.input_mask for f in train_features],
+                                      dtype=torch.long)
+        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
+                                       dtype=torch.long)
+
+        if output_mode == "classification":
+            all_label_ids = torch.tensor([f.label_id for f in train_features],
+                                         dtype=torch.long)
+        elif output_mode == "regression":
+            if args.fp16:
+                all_label_ids = torch.tensor(
+                    [f.label_id for f in train_features], dtype=torch.half)
+            else:
+                all_label_ids = torch.tensor(
+                    [f.label_id for f in train_features], dtype=torch.float)
+
+        train_data = TensorDataset(all_input_ids, all_input_mask,
+                                   all_segment_ids, all_label_ids)
+        if args.local_rank == -1:
+            train_sampler = RandomSampler(train_data)
+        else:
+            train_sampler = DistributedSampler(train_data)
+        train_dataloader = DataLoader(train_data,
+                                      sampler=train_sampler,
+                                      batch_size=args.train_batch_size)
+
+        model.train()
+        nb_tr_examples = 0
+        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
+            tr_loss = 0
+            nb_tr_examples, nb_tr_steps = 0, 0
+            for step, batch in enumerate(
+                    tqdm(train_dataloader, desc="Iteration")):
+                batch = tuple(t.to(device) for t in batch)
+                input_ids, input_mask, segment_ids, label_ids = batch
+
+                # define a new function to compute loss values for both output_modes
+                logits = model(input_ids, segment_ids, input_mask, labels=None)
+
+                if output_mode == "classification":
+                    if args.focal:
+                        loss_fct = FocalLoss(class_num=num_labels,
+                                             gamma=args.gamma)
+                    else:
+                        loss_fct = CrossEntropyLoss()
+                    loss = loss_fct(logits.view(-1, num_labels),
+                                    label_ids.view(-1))
+                elif output_mode == "regression":
+                    loss_fct = MSELoss()
+                    loss = loss_fct(logits.view(-1), label_ids.view(-1))
+
+                if n_gpu > 1:
+                    loss = loss.mean()  # mean() to average on multi-gpu.
+                if args.gradient_accumulation_steps > 1:
+                    loss = loss / args.gradient_accumulation_steps
+
+                if args.deepscale and args.local_rank != -1:
+                    model.disable_need_reduction()
+                    if (step + 1) % args.gradient_accumulation_steps == 0:
+                        model.enable_need_reduction()
+
+                if args.fp16:
+                    optimizer.backward(loss)
+                else:
+                    loss.backward()
+
+                tr_loss += loss.item()
+                nb_tr_examples += input_ids.size(0)
+                nb_tr_steps += 1
+                if (step + 1) % args.gradient_accumulation_steps == 0:
+                    if args.fp16:
+                        # modify learning rate with special warm up BERT uses
+                        # if args.fp16 is False, BertAdam is used that handles this automatically
+                        lr_this_step = args.learning_rate * \
+                            warmup_linear(
+                                global_step/num_train_optimization_steps, args.warmup_proportion)
+                        for param_group in optimizer.param_groups:
+                            param_group['lr'] = lr_this_step
+                    optimizer.step()
+                    optimizer.zero_grad()
+                    global_step += 1
+           
+        saved_path = os.path.join(args.output_dir, "finetuned_quantized_checkpoints")
+
+        checkpoint_model(PATH=saved_path,
+                             ckpt_id='epoch{}_step{}'.format(
+                                 args.num_train_epochs, global_step),
+                             model=model,
+                             epoch=args.num_train_epochs,
+                             last_global_step=global_step,
+                             last_global_data_samples=nb_tr_examples * torch.distributed.get_world_size())
+    if args.do_eval and (args.local_rank == -1
+                         or torch.distributed.get_rank() == 0):
+        eval_examples = processor.get_dev_examples(args.data_dir)
+        eval_features = convert_examples_to_features(eval_examples, label_list,
+                                                     args.max_seq_length,
+                                                     tokenizer, output_mode)
+        logger.info("***** Running evaluation *****")
+        logger.info("  Num examples = %d", len(eval_examples))
+        logger.info("  Batch size = %d", args.eval_batch_size)
+        all_input_ids = torch.tensor([f.input_ids for f in eval_features],
+                                     dtype=torch.long)
+        all_input_mask = torch.tensor([f.input_mask for f in eval_features],
+                                      dtype=torch.long)
+        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
+                                       dtype=torch.long)
+
+        if output_mode == "classification":
+            all_label_ids = torch.tensor([f.label_id for f in eval_features],
+                                         dtype=torch.long)
+        elif output_mode == "regression":
+            all_label_ids = torch.tensor([f.label_id for f in eval_features],
+                                         dtype=torch.float)
+
+        eval_data = TensorDataset(all_input_ids, all_input_mask,
+                                  all_segment_ids, all_label_ids)
+        # Run prediction for full data
+        eval_sampler = SequentialSampler(eval_data)
+        eval_dataloader = DataLoader(eval_data,
+                                     sampler=eval_sampler,
+                                     batch_size=args.eval_batch_size)
+
+        model.eval()
+        eval_loss = 0
+        nb_eval_steps = 0
+        preds = []
+
+        for input_ids, input_mask, segment_ids, label_ids in tqdm(
+                eval_dataloader, desc="Evaluating"):
+            input_ids = input_ids.to(device)
+            input_mask = input_mask.to(device)
+            segment_ids = segment_ids.to(device)
+            label_ids = label_ids.to(device)
+
+            with torch.no_grad():
+                logits = model(input_ids, segment_ids, input_mask, labels=None)
+
+            # create eval loss and other metric required by the task
+            if output_mode == "classification":
+                if args.focal:
+                    loss_fct = FocalLoss(class_num=num_labels,
+                                         gamma=args.gamma)
+                else:
+                    loss_fct = CrossEntropyLoss()
+                tmp_eval_loss = loss_fct(logits.view(-1, num_labels),
+                                         label_ids.view(-1))
+            elif output_mode == "regression":
+                loss_fct = MSELoss()
+                print(logits.type())
+                print(label_ids.type())
+                if task_name == "sts-b":
+                    tmp_eval_loss = loss_fct(logits.float().view(-1),
+                                             label_ids.view(-1))
+                else:
+                    tmp_eval_loss = loss_fct(logits.view(-1),
+                                             label_ids.view(-1))
+
+            eval_loss += tmp_eval_loss.mean().item()
+            nb_eval_steps += 1
+            if len(preds) == 0:
+                preds.append(logits.detach().cpu().numpy())
+            else:
+                preds[0] = np.append(preds[0],
+                                     logits.detach().cpu().numpy(),
+                                     axis=0)
+
+        eval_loss = eval_loss / nb_eval_steps
+        preds = preds[0]
+        if output_mode == "classification":
+            preds = np.argmax(preds, axis=1)
+        elif output_mode == "regression":
+            preds = np.squeeze(preds)
+        result = compute_metrics(task_name, preds, all_label_ids.numpy())
+        loss = tr_loss / nb_tr_steps if args.do_train else None
+
+        result['eval_loss'] = eval_loss
+        result['global_step'] = global_step
+        result['loss'] = loss
+
+        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
+        with open(output_eval_file, "w") as writer:
+            logger.info("***** Eval results *****")
+            for key in sorted(result.keys()):
+                logger.info("  %s = %s", key, str(result[key]))
+                writer.write("%s = %s\n" % (key, str(result[key])))
+
+        # hack for MNLI-MM
+        if task_name == "mnli":
+            task_name = "mnli-mm"
+            processor = processors[task_name]()
+
+            if os.path.exists(args.output_dir +
+                              '-MM') and os.listdir(args.output_dir +
+                                                    '-MM') and args.do_train:
+                raise ValueError(
+                    "Output directory ({}{}) already exists and is not empty.".
+                    format(args.output_dir, '-MM'))
+            if not os.path.exists(args.output_dir + '-MM'):
+                os.makedirs(args.output_dir + '-MM')
+
+            eval_examples = processor.get_dev_examples(args.data_dir)
+            eval_features = convert_examples_to_features(
+                eval_examples, label_list, args.max_seq_length, tokenizer,
+                output_mode)
+            logger.info("***** Running evaluation *****")
+            logger.info("  Num examples = %d", len(eval_examples))
+            logger.info("  Batch size = %d", args.eval_batch_size)
+            all_input_ids = torch.tensor([f.input_ids for f in eval_features],
+                                         dtype=torch.long)
+            all_input_mask = torch.tensor(
+                [f.input_mask for f in eval_features], dtype=torch.long)
+            all_segment_ids = torch.tensor(
+                [f.segment_ids for f in eval_features], dtype=torch.long)
+            all_label_ids = torch.tensor([f.label_id for f in eval_features],
+                                         dtype=torch.long)
+
+            eval_data = TensorDataset(all_input_ids, all_input_mask,
+                                      all_segment_ids, all_label_ids)
+            # Run prediction for full data
+            eval_sampler = SequentialSampler(eval_data)
+            eval_dataloader = DataLoader(eval_data,
+                                         sampler=eval_sampler,
+                                         batch_size=args.eval_batch_size)
+
+            model.eval()
+            eval_loss = 0
+            nb_eval_steps = 0
+            preds = []
+
+            for input_ids, input_mask, segment_ids, label_ids in tqdm(
+                    eval_dataloader, desc="Evaluating"):
+                input_ids = input_ids.to(device)
+                input_mask = input_mask.to(device)
+                segment_ids = segment_ids.to(device)
+                label_ids = label_ids.to(device)
+
+                with torch.no_grad():
+                    logits = model(input_ids,
+                                   segment_ids,
+                                   input_mask,
+                                   labels=None)
+
+                if args.focal:
+                    loss_fct = FocalLoss(class_num=num_labels,
+                                         gamma=args.gamma)
+                else:
+                    loss_fct = CrossEntropyLoss()
+                tmp_eval_loss = loss_fct(logits.view(-1, num_labels),
+                                         label_ids.view(-1))
+
+                eval_loss += tmp_eval_loss.mean().item()
+                nb_eval_steps += 1
+                if len(preds) == 0:
+                    preds.append(logits.detach().cpu().numpy())
+                else:
+                    preds[0] = np.append(preds[0],
+                                         logits.detach().cpu().numpy(),
+                                         axis=0)
+
+            eval_loss = eval_loss / nb_eval_steps
+            preds = preds[0]
+            preds = np.argmax(preds, axis=1)
+            result = compute_metrics(task_name, preds, all_label_ids.numpy())
+            loss = tr_loss / nb_tr_steps if args.do_train else None
+
+            result['eval_loss'] = eval_loss
+            result['global_step'] = global_step
+            result['loss'] = loss
+
+            output_eval_file = os.path.join(args.output_dir + '-MM',
+                                            "eval_results.txt")
+            with open(output_eval_file, "w") as writer:
+                logger.info("***** Eval results *****")
+                for key in sorted(result.keys()):
+                    logger.info("  %s = %s", key, str(result[key]))
+                    writer.write("%s = %s\n" % (key, str(result[key])))
+
+
+# Run the fine-tuning / evaluation driver only when executed as a script.
+if __name__ == "__main__":
+    main()