Skip to content

Commit e8d879e

Browse files
mosheisland (Moshe Island) and tjruwase
authored
deepspeed-chat: add end-of-text special token (deepspeedai#775)
Stages 1 & 2 append the '<|endoftext|>' text marker to all samples. However, some tokenizers (e.g. OPT, Bloom) encode this marker as a sequence of subword tokens rather than as a single special token. This commit adds optional support for registering the EOT marker as a special token, which forces the tokenizer to encode it as a single token. Note that using the EOT special token may change the dynamics of stage 3 training. Therefore, to remain backward compatible, this commit makes the behavior optional (enabled with the --add_eot_token flag). Change-Id: If98d348fcaa7d6685e755aabe305e23e7649c367 Signed-off-by: Moshe Island <misland@habana.ai> Co-authored-by: Moshe Island <misland@habana.ai> Co-authored-by: Olatunji Ruwase <olruwase@microsoft.com>
1 parent f7ff9dd commit e8d879e

6 files changed

Lines changed: 80 additions & 22 deletions

File tree

applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,11 @@ def parse_args():
191191
parser.add_argument('--tensorboard_path',
192192
type=str,
193193
default="step1_tensorboard")
194+
## Tokenizer
195+
parser.add_argument(
196+
"--add_eot_token",
197+
action='store_true',
198+
help="Add <|endoftext|> as additional special token to tokenizer")
194199
## Print loss
195200
parser.add_argument('--print_loss',
196201
action='store_true',
@@ -233,7 +238,12 @@ def main():
233238
torch.distributed.barrier()
234239

235240
# load_hf_tokenizer will get the correct tokenizer and set padding tokens based on the model family
236-
tokenizer = load_hf_tokenizer(args.model_name_or_path, fast_tokenizer=True)
241+
args.end_of_conversation_token = "<|endoftext|>"
242+
additional_special_tokens = args.end_of_conversation_token if args.add_eot_token else None
243+
tokenizer = load_hf_tokenizer(args.model_name_or_path,
244+
fast_tokenizer=True,
245+
add_special_tokens=additional_special_tokens)
246+
237247
model = create_hf_model(AutoModelForCausalLM,
238248
args.model_name_or_path,
239249
tokenizer,

applications/DeepSpeed-Chat/training/step1_supervised_finetuning/prompt_eval.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,10 @@ def parse_args():
7474
type=str,
7575
default="English",
7676
choices=["English", "Chinese", "Japanese"])
77+
parser.add_argument(
78+
"--add_eot_token",
79+
action='store_true',
80+
help="Add <|endoftext|> as additional special token to tokenizer")
7781

7882
args = parser.parse_args()
7983

@@ -197,8 +201,11 @@ def main():
197201

198202
device = torch.device(get_accelerator().device_name(0))
199203

204+
args.end_of_conversation_token = "<|endoftext|>"
205+
additional_special_tokens = args.end_of_conversation_token if args.add_eot_token else None
200206
tokenizer = load_hf_tokenizer(args.model_name_or_path_baseline,
201-
fast_tokenizer=True)
207+
fast_tokenizer=True,
208+
add_special_tokens=additional_special_tokens)
202209

203210
model_baseline = create_hf_model(AutoModelForCausalLM,
204211
args.model_name_or_path_baseline,

applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/main.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,11 @@ def parse_args():
201201
parser.add_argument('--tensorboard_path',
202202
type=str,
203203
default="step2_tensorboard")
204+
## Tokenizer
205+
parser.add_argument(
206+
"--add_eot_token",
207+
action='store_true',
208+
help="Add <|endoftext|> as additional special token to tokenizer")
204209
parser = deepspeed.add_config_arguments(parser)
205210
args = parser.parse_args()
206211

@@ -238,7 +243,11 @@ def main():
238243
torch.distributed.barrier()
239244

240245
# load_hf_tokenizer will get the correct tokenizer and set padding tokens based on the model family
241-
tokenizer = load_hf_tokenizer(args.model_name_or_path, fast_tokenizer=True)
246+
args.end_of_conversation_token = "<|endoftext|>"
247+
additional_special_tokens = args.end_of_conversation_token if args.add_eot_token else None
248+
tokenizer = load_hf_tokenizer(args.model_name_or_path,
249+
fast_tokenizer=True,
250+
add_special_tokens=additional_special_tokens)
242251
rm_model = create_critic_model(args.model_name_or_path,
243252
tokenizer,
244253
ds_config,

applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/rw_eval.py

Lines changed: 32 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -35,13 +35,20 @@ def parse_args():
3535
"OPT model has a fixed number (1) of padding tokens at the beginning of the input. "
3636
"We did not see this in other models but keep it as an option for now.",
3737
)
38+
parser.add_argument(
39+
"--add_eot_token",
40+
action='store_true',
41+
help="Add <|endoftext|> as additional special token to tokenizer")
3842
args = parser.parse_args()
3943
return args
4044

4145

42-
def load_stuff(model_name_or_path, num_padding_at_beginning):
46+
def load_stuff(model_name_or_path, num_padding_at_beginning,
47+
additional_special_tokens):
4348

44-
tokenizer = load_hf_tokenizer(model_name_or_path, fast_tokenizer=True)
49+
tokenizer = load_hf_tokenizer(model_name_or_path,
50+
fast_tokenizer=True,
51+
add_special_tokens=additional_special_tokens)
4552
tokenizer.pad_token = tokenizer.eos_token
4653
model = create_critic_model(model_name_or_path,
4754
tokenizer,
@@ -106,8 +113,12 @@ def run_pair_comparison():
106113

107114
device = torch.device(get_accelerator().device_name(0))
108115

116+
args.end_of_conversation_token = "<|endoftext|>"
117+
additional_special_tokens = args.end_of_conversation_token if args.add_eot_token else None
118+
109119
rm_model, tokenizer = load_stuff(args.model_name_or_path,
110-
args.num_padding_at_beginning)
120+
args.num_padding_at_beginning,
121+
additional_special_tokens)
111122
rm_model.to(device)
112123
rm_model.eval()
113124

@@ -126,12 +137,13 @@ def run_pair_comparison():
126137

127138
for prompt, good_ans, bad_ans in zip(prompt_list, good_ans_list,
128139
bad_ans_list):
129-
batch = prepare_datapair(prompt,
130-
good_ans,
131-
bad_ans,
132-
tokenizer,
133-
max_seq_len=512,
134-
end_of_conversation_token="<|endoftext|>")
140+
batch = prepare_datapair(
141+
prompt,
142+
good_ans,
143+
bad_ans,
144+
tokenizer,
145+
max_seq_len=512,
146+
end_of_conversation_token=args.end_of_conversation_token)
135147
batch = to_device(batch, device)
136148
# Run inference
137149
with torch.no_grad():
@@ -150,18 +162,23 @@ def run_single_sample():
150162
args = parse_args()
151163
device = torch.device(get_accelerator().device_name())
152164

165+
args.end_of_conversation_token = "<|endoftext|>"
166+
additional_special_tokens = args.end_of_conversation_token if args.add_eot_token else None
167+
153168
rm_model, tokenizer = load_stuff(args.model_name_or_path,
154-
args.num_padding_at_beginning)
169+
args.num_padding_at_beginning,
170+
additional_special_tokens)
155171
rm_model.to(device)
156172

157173
prompt = "Human: Explain the moon landing to a 6 year old in a few sentences."
158174
my_ans = "Assistant: The moon landing was a major milestone in the history of human exploration of the solar system. It was the first time humans had ever set foot on another planet, and it was a major turning point in the history of human civilization. The astronauts, Neil Armstrong, Buzz Aldrin, and Michael Collins, successfully landed the Apollo 11 spacecraft on the moon, marking the first time humans had ever set foot on another"
159175

160-
batch = prepare_singlesample(prompt,
161-
my_ans,
162-
tokenizer,
163-
max_seq_len=512,
164-
end_of_conversation_token="<|endoftext|>")
176+
batch = prepare_singlesample(
177+
prompt,
178+
my_ans,
179+
tokenizer,
180+
max_seq_len=512,
181+
end_of_conversation_token=args.end_of_conversation_token)
165182
batch = to_device(batch, device)
166183

167184
rm_model.eval()

applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/main.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -339,6 +339,11 @@ def parse_args():
339339
parser.add_argument('--tensorboard_path',
340340
type=str,
341341
default="step3_tensorboard")
342+
## Tokenizer
343+
parser.add_argument(
344+
"--add_eot_token",
345+
action='store_true',
346+
help="Add <|endoftext|> as additional special token to tokenizer")
342347
## Actor/critic model overflow alignment
343348
parser.add_argument(
344349
'--align_overflow',
@@ -459,8 +464,12 @@ def main():
459464
torch.distributed.barrier()
460465

461466
# load_hf_tokenizer will get the correct tokenizer and set padding tokens based on the model family
467+
args.end_of_conversation_token = "<|endoftext|>"
468+
additional_special_tokens = args.end_of_conversation_token if args.add_eot_token else None
462469
tokenizer = load_hf_tokenizer(args.actor_model_name_or_path,
463-
fast_tokenizer=True)
470+
fast_tokenizer=True,
471+
add_special_tokens=additional_special_tokens)
472+
464473
prompt_train_dataloader, unsupervised_train_dataloader, num_total_iters = create_datasets(
465474
args=args, tokenizer=tokenizer, train_phase=3)
466475

@@ -479,8 +488,6 @@ def main():
479488
rlhf_engine.actor.optimizer.quantize_nontrainable_params()
480489
print_rank_0("Mixed Precision ZeRO++ enabled")
481490

482-
args.end_of_conversation_token = "<|endoftext|>"
483-
484491
ppo_trainer = DeepSpeedPPOTrainerUnsupervised if unsupervised_training_enabled else DeepSpeedPPOTrainer
485492
trainer = ppo_trainer(rlhf_engine, args)
486493

applications/DeepSpeed-Chat/training/utils/utils.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,9 @@ def get_tokenizer(model_name_or_path, fast_tokenizer=True):
7676
return tokenizer
7777

7878

79-
def load_hf_tokenizer(model_name_or_path, fast_tokenizer=True):
79+
def load_hf_tokenizer(model_name_or_path,
80+
fast_tokenizer=True,
81+
add_special_tokens=None):
8082
if os.path.exists(model_name_or_path):
8183
# Locally tokenizer loading has some issue, so we need to force download
8284
model_json = os.path.join(model_name_or_path, "config.json")
@@ -90,6 +92,12 @@ def load_hf_tokenizer(model_name_or_path, fast_tokenizer=True):
9092
tokenizer = get_tokenizer(model_name_or_path,
9193
fast_tokenizer=fast_tokenizer)
9294

95+
if add_special_tokens is not None:
96+
add_special_tokens = [add_special_tokens] if isinstance(add_special_tokens, str) \
97+
else add_special_tokens
98+
tokenizer.add_special_tokens(
99+
{'additional_special_tokens': add_special_tokens})
100+
93101
return tokenizer
94102

95103

0 commit comments

Comments
 (0)