Skip to content

Commit fb01b11

Browse files
author
李闯
committed
add seq2seq_patch
1 parent 5a377da commit fb01b11

File tree

6 files changed

+202
-1329
lines changed

6 files changed

+202
-1329
lines changed

chatbotv4/data_utils.py

Lines changed: 29 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -231,26 +231,26 @@ def prepare_wmt_data(data_dir, en_vocabulary_size, fr_vocabulary_size, tokenizer
231231
train_path = get_wmt_enfr_train_set(data_dir)
232232
dev_path = get_wmt_enfr_dev_set(data_dir)
233233

234-
from_train_path = train_path + ".input"
235-
to_train_path = train_path + ".output"
236-
from_dev_path = dev_path + ".input"
237-
to_dev_path = dev_path + ".output"
238-
return prepare_data(data_dir, from_train_path, to_train_path, from_dev_path, to_dev_path, en_vocabulary_size,
234+
input_train_path = train_path + ".input"
235+
output_train_path = train_path + ".output"
236+
input_dev_path = dev_path + ".input"
237+
output_dev_path = dev_path + ".output"
238+
return prepare_data(data_dir, input_train_path, output_train_path, input_dev_path, output_dev_path, en_vocabulary_size,
239239
fr_vocabulary_size, tokenizer)
240240

241241

242-
def prepare_data(data_dir, from_train_path, to_train_path, from_dev_path, to_dev_path, from_vocabulary_size,
243-
to_vocabulary_size, tokenizer=None):
242+
def prepare_data(data_dir, input_train_path, output_train_path, input_dev_path, output_dev_path, input_vocabulary_size,
243+
output_vocabulary_size, tokenizer=None):
244244
"""Prepare all necessary files that are required for the training.
245245
246246
Args:
247247
data_dir: directory in which the data sets will be stored.
248-
from_train_path: path to the file that includes "from" training samples.
249-
to_train_path: path to the file that includes "to" training samples.
250-
from_dev_path: path to the file that includes "from" dev samples.
251-
to_dev_path: path to the file that includes "to" dev samples.
252-
from_vocabulary_size: size of the "from language" vocabulary to create and use.
253-
to_vocabulary_size: size of the "to language" vocabulary to create and use.
248+
input_train_path: path to the file that includes "from" training samples.
249+
output_train_path: path to the file that includes "to" training samples.
250+
input_dev_path: path to the file that includes "from" dev samples.
251+
output_dev_path: path to the file that includes "to" dev samples.
252+
input_vocabulary_size: size of the "from language" vocabulary to create and use.
253+
output_vocabulary_size: size of the "to language" vocabulary to create and use.
254254
tokenizer: a function to use to tokenize each data sentence;
255255
if None, basic_tokenizer will be used.
256256
@@ -264,23 +264,23 @@ def prepare_data(data_dir, from_train_path, to_train_path, from_dev_path, to_dev
264264
(6) path to the "to language" vocabulary file.
265265
"""
266266
# Create vocabularies of the appropriate sizes.
267-
to_vocab_path = os.path.join(data_dir, "vocab%d.output" % to_vocabulary_size)
268-
from_vocab_path = os.path.join(data_dir, "vocab%d.input" % from_vocabulary_size)
269-
create_vocabulary(to_vocab_path, to_train_path , to_vocabulary_size, tokenizer)
270-
create_vocabulary(from_vocab_path, from_train_path , from_vocabulary_size, tokenizer)
267+
output_vocab_path = os.path.join(data_dir, "vocab%d.output" % output_vocabulary_size)
268+
input_vocab_path = os.path.join(data_dir, "vocab%d.input" % input_vocabulary_size)
269+
create_vocabulary(output_vocab_path, output_train_path , output_vocabulary_size, tokenizer)
270+
create_vocabulary(input_vocab_path, input_train_path , input_vocabulary_size, tokenizer)
271271

272272
# Create token ids for the training data.
273-
to_train_ids_path = to_train_path + (".ids%d" % to_vocabulary_size)
274-
from_train_ids_path = from_train_path + (".ids%d" % from_vocabulary_size)
275-
data_to_token_ids(to_train_path, to_train_ids_path, to_vocab_path, tokenizer)
276-
data_to_token_ids(from_train_path, from_train_ids_path, from_vocab_path, tokenizer)
273+
output_train_ids_path = output_train_path + (".ids%d" % output_vocabulary_size)
274+
input_train_ids_path = input_train_path + (".ids%d" % input_vocabulary_size)
275+
data_to_token_ids(output_train_path, output_train_ids_path, output_vocab_path, tokenizer)
276+
data_to_token_ids(input_train_path, input_train_ids_path, input_vocab_path, tokenizer)
277277

278278
# Create token ids for the development data.
279-
to_dev_ids_path = to_dev_path + (".ids%d" % to_vocabulary_size)
280-
from_dev_ids_path = from_dev_path + (".ids%d" % from_vocabulary_size)
281-
data_to_token_ids(to_dev_path, to_dev_ids_path, to_vocab_path, tokenizer)
282-
data_to_token_ids(from_dev_path, from_dev_ids_path, from_vocab_path, tokenizer)
283-
284-
return (from_train_ids_path, to_train_ids_path,
285-
from_dev_ids_path, to_dev_ids_path,
286-
from_vocab_path, to_vocab_path)
279+
output_dev_ids_path = output_dev_path + (".ids%d" % output_vocabulary_size)
280+
input_dev_ids_path = input_dev_path + (".ids%d" % input_vocabulary_size)
281+
data_to_token_ids(output_dev_path, output_dev_ids_path, output_vocab_path, tokenizer)
282+
data_to_token_ids(input_dev_path, input_dev_ids_path, input_vocab_path, tokenizer)
283+
284+
return (input_train_ids_path, output_train_ids_path,
285+
input_dev_ids_path, output_dev_ids_path,
286+
input_vocab_path, output_vocab_path)

chatbotv4/readme.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,4 @@
22
python ./translate.py
33

44
预测:
5-
python translate.py --decode True
5+
python translate.py --decode

0 commit comments

Comments
 (0)