Skip to content

Commit 47766e0

Browse files
authored
Support nvidia bert dataset (deepspeedai#27)
* Support nvidia bert dataset * Format fixes * E2E run of Nvidia Data with SQUAD 90.6 F1 * Minor fixes * Update README * Update README
1 parent fd6fb51 commit 47766e0

10 files changed

Lines changed: 494 additions & 106 deletions

bing_bert/README.md

100644100755
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,12 @@ the same number and generation of GPUs.
1111
* Detailed technology deep dive, see our [blog post](https://www.deepspeed.ai/news/2020/05/27/fastest-bert-training.html).
1212
* Tutorial on how to reproduce our results, see our [BERT pre-training tutorial](https://www.deepspeed.ai/tutorials/bert-pretraining/).
1313
* The source code for our transformer kernels can be found in the [DeepSpeed repo](https://github.com/microsoft/deepspeed).
14+
15+
16+
The fastest BERT training record reported above was achieved using internal datasets, which were not publicly available at the time of this release. However, the DeepSpeed BERT model can also be pre-trained using publicly available datasets from [Nvidia](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/LanguageModeling/BERT). Instructions for preparing the datasets are available [here](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/LanguageModeling/BERT#dataset-guidelines). In addition, the following three files are provided in this repo to perform the complete pre-training of DeepSpeed BERT using the Nvidia datasets.
17+
18+
1. <code>ds_train_bert_nvidia_data_bsz64k_seq128.sh</code> script for phase 1 training
19+
2. <code>ds_train_bert_nvidia_data_bsz32k_seq512.sh</code> script for phase 2 training
20+
3. <code>bert_large_lamb_nvidia_data.json</code> for configuring the different parameters relating to the model, datasets, hyper-parameters, etc.
21+
22+
The scripts assume that the datasets are available under the path <code>/workspace/bert</code>. For reference, the default settings of these scripts and the configuration file will pre-train the model to achieve EM/F1 fine-tuning scores of 83.57/90.62 on SQuAD.

bing_bert/bert_dataset_provider.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
class BertDatasetProviderInterface:
    """Method set that a BERT pre-training dataset provider must supply.

    Every method here is a placeholder that raises ``NotImplementedError``;
    concrete providers are expected to override each of them.
    """

    def get_shard(self, index, shuffle=True):
        """Shard-loading hook for shard ``index``; must be overridden."""
        raise NotImplementedError()

    def release_shard(self, index):
        """Shard-release hook for shard ``index``; must be overridden."""
        raise NotImplementedError()

    def prefetch_shard(self, index):
        """Shard-prefetch hook for shard ``index``; must be overridden."""
        raise NotImplementedError()

    def get_batch(self, batch_iter):
        """Batch-fetch hook keyed by ``batch_iter``; must be overridden."""
        raise NotImplementedError()

    def prefetch_batch(self):
        """Batch-prefetch hook; must be overridden."""
        raise NotImplementedError()
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
{
2+
"name": "bing_bert_large_lamb_seq",
3+
"bert_token_file": "bert-large-uncased",
4+
"bert_model_file": "bert-large-uncased",
5+
"bert_model_config": {
6+
"vocab_size_or_config_json_file": 119547,
7+
"hidden_size": 1024,
8+
"num_hidden_layers": 24,
9+
"num_attention_heads": 16,
10+
"intermediate_size": 4096,
11+
"hidden_act": "gelu",
12+
"hidden_dropout_prob": 0.1,
13+
"attention_probs_dropout_prob": 0.1,
14+
"max_position_embeddings": 512,
15+
"type_vocab_size": 2,
16+
"initializer_range": 0.02
17+
},
18+
"data": {
19+
"flags": {
20+
"pretrain_dataset": true,
21+
"pretrain_type": "wiki_bc"
22+
},
23+
"mixed_seq_datasets": {
24+
"128": {
25+
"pretrain_dataset": "data/128"
26+
},
27+
"512": {
28+
"pretrain_dataset": "data/512"
29+
}
30+
}
31+
},
32+
"mixed_seq_training": {
33+
"128": {
34+
"num_epochs": 16,
35+
"warmup_proportion": 0.06,
36+
"learning_rate": 11e-3,
37+
"num_workers": 4,
38+
"async_worker": true,
39+
"decay_rate": 0.90,
40+
"decay_step": 250,
41+
"total_training_steps": 7500
42+
},
43+
"512": {
44+
"num_epochs": 20,
45+
"warmup_proportion": 0.02,
46+
"learning_rate": 2e-3,
47+
"num_workers": 4,
48+
"async_worker": true,
49+
"decay_rate": 0.90,
50+
"decay_step": 150,
51+
"total_training_steps": 7500
52+
}
53+
},
54+
"validation": {
55+
"path": "validation_set/"
56+
}
57+
}
Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
import os
2+
import random
3+
4+
import torch.distributed as dist
5+
from torch.utils.data import DataLoader, Dataset
6+
from torch.utils.data.sampler import RandomSampler, SequentialSampler
7+
from torch.utils.data.distributed import DistributedSampler
8+
9+
from bert_dataset_provider import BertDatasetProviderInterface
10+
from turing.dataset import PreTrainingDataset, PretrainDataType
11+
from data_worker import AsyncWorker
12+
13+
14+
class BingBertDatasetProvider(BertDatasetProviderInterface):
    """Dataset provider backed by the ``wiki_pretrain_dataset`` and
    ``bc_pretrain_dataset`` folders named in the configuration.

    For each shard index it builds one ``PreTrainingDataset`` and one
    dataloader per dataset folder, and produces a schedule
    (``dataset_iterator``) of dataloader keys telling the caller which
    dataloader to draw each micro-batch from. Batches are served either
    directly from the dataloaders or through an ``AsyncWorker`` when
    asynchronous loading is enabled in the training configuration.
    """

    def __init__(self, args):
        """Read provider settings from ``args``.

        Args:
            args: Object exposing ``tokenizer``, ``refresh_bucket_size``,
                ``local_rank``, ``config``, ``data_path_prefix``,
                ``max_seq_length``, ``max_predictions_per_seq``,
                ``gradient_accumulation_steps``,
                ``train_micro_batch_size_per_gpu`` and ``logger``.
        """
        self.tokenizer = args.tokenizer
        self.refresh_bucket_size = args.refresh_bucket_size
        # local_rank == -1 is treated as the non-distributed case.
        self.datasampler = RandomSampler if args.local_rank == -1 else DistributedSampler
        training_config = args.config['training']
        self.num_workers = training_config['num_workers']

        # Initialize dataset paths
        self.dataset_paths = [
            os.path.join(args.data_path_prefix,
                         args.config["data"]["datasets"][dataset])
            for dataset in ('wiki_pretrain_dataset', 'bc_pretrain_dataset')
        ]

        self.max_seq_length = args.max_seq_length
        self.max_predictions_per_seq = args.max_predictions_per_seq

        self.gradient_accumulation_steps = args.gradient_accumulation_steps
        self.train_micro_batch_size_per_gpu = args.train_micro_batch_size_per_gpu
        self.local_rank = args.local_rank
        # Only query the rank when a process group exists; without one
        # dist.get_rank() raises, so a single-process run falls back to rank 0.
        distributed_ready = dist.is_available() and dist.is_initialized()
        self.global_rank = dist.get_rank() if distributed_ready else 0
        self.world_size = 1 if self.local_rank == -1 else dist.get_world_size()
        self.logger = args.logger

        self.dataloaders = {}
        self.dataset_iterator = []

        # Configure asynchronous data loading.
        # Honour the configured value rather than mere key presence, so that
        # "async_worker": false really disables the async path.
        self.async_dataloading = bool(training_config.get('async_worker', False))
        self.async_worker = None

        if self.global_rank == 0:
            self.logger.info(
                f"BingBertDatasetProvider - Initialization: async data loading {self.async_dataloading}"
            )

    def get_shard(self, index, shuffle=True):
        """Build the dataloaders and batch schedule for shard ``index``.

        Args:
            index: Shard index forwarded to ``PreTrainingDataset``.
            shuffle: When true, the order in which the datasets are
                interleaved is randomized with ``random.shuffle``.

        Returns:
            A tuple ``(dataset_iterator, total_length)``:
            ``dataset_iterator`` is a list of dataloader keys, one entry per
            micro-batch; ``total_length`` is the summed ``len()`` of all
            datasets in the shard.
        """
        datalengths = []
        batches_per_dataset = []

        for i, dataset_path in enumerate(self.dataset_paths):
            pretrain_dataset = PreTrainingDataset(
                tokenizer=self.tokenizer,
                folder=dataset_path,
                logger=self.logger,
                max_seq_length=self.max_seq_length,
                index=index,
                data_type=PretrainDataType.NUMPY,
                max_predictions_per_seq=self.max_predictions_per_seq)

            dataset_length = len(pretrain_dataset)
            datalengths.append(dataset_length)
            batches_per_dataset.append(
                self._get_effective_batch(dataset_length))
            self.dataloaders[i] = self._get_dataloader(pretrain_dataset)

        # One entry per effective batch, tagged with the dataset it comes from.
        dataset_batches = []
        for i, batch_count in enumerate(batches_per_dataset):
            dataset_batches.extend([i] * batch_count)

        # shuffle
        if shuffle:
            random.shuffle(dataset_batches)

        # Expand every effective batch into its micro-batches so that all
        # micro-batches of one effective batch come from the same dataset.
        repeats = self.gradient_accumulation_steps * self.refresh_bucket_size
        self.dataset_iterator = []
        for dataset_batch_type in dataset_batches:
            self.dataset_iterator.extend([dataset_batch_type] * repeats)

        if self.async_dataloading:
            self.async_worker = AsyncWorker(self.dataloaders,
                                            self.dataset_iterator)
            self.async_worker.start()

        return self.dataset_iterator, sum(datalengths)

    def release_shard(self, index):
        """Stop the async worker, if one was started for the current shard.

        ``index`` is accepted for interface compatibility and is not used.
        """
        # The worker only exists after get_shard() has run; releasing before
        # that must not fail on a None worker.
        if self.async_dataloading and self.async_worker is not None:
            self.async_worker.stop()

    def prefetch_shard(self, index):
        """No-op: this provider does not prefetch shards."""
        pass

    def get_batch(self, batch_iter):
        """Return the next batch.

        Args:
            batch_iter: Key into ``self.dataloaders`` (an entry of the
                schedule returned by ``get_shard``). Ignored when async
                loading is enabled, in which case the worker supplies
                the batch.
        """
        if self.async_dataloading:
            return self.async_worker.get()
        return next(self.dataloaders[batch_iter])

    def prefetch_batch(self):
        """Ask the async worker to prefetch; no-op when async loading is off."""
        if self.async_dataloading:
            self.async_worker.prefetch()

    def _get_dataloader(self, dataset: Dataset):
        """Wrap ``dataset`` in a ``DataLoader`` and return a generator over it.

        A generator (rather than the ``DataLoader`` itself) is returned so
        that callers can pull batches with ``next()``.
        """
        return (
            x
            for x in DataLoader(dataset,
                                batch_size=self.train_micro_batch_size_per_gpu,
                                sampler=self.datasampler(dataset),
                                num_workers=self.num_workers))

    def _get_effective_batch(self, total):
        """Return how many effective batches ``total`` samples yield.

        ``total`` is floor-divided in turn by the world size, the micro-batch
        size per GPU, the gradient accumulation steps and the refresh bucket
        size.
        """
        return (total
                // self.world_size
                // self.train_micro_batch_size_per_gpu
                // self.gradient_accumulation_steps
                // self.refresh_bucket_size)

0 commit comments

Comments
 (0)