@@ -231,26 +231,26 @@ def prepare_wmt_data(data_dir, en_vocabulary_size, fr_vocabulary_size, tokenizer
231231 train_path = get_wmt_enfr_train_set (data_dir )
232232 dev_path = get_wmt_enfr_dev_set (data_dir )
233233
234- from_train_path = train_path + ".input"
235- to_train_path = train_path + ".output"
236- from_dev_path = dev_path + ".input"
237- to_dev_path = dev_path + ".output"
238- return prepare_data (data_dir , from_train_path , to_train_path , from_dev_path , to_dev_path , en_vocabulary_size ,
234+ input_train_path = train_path + ".input"
235+ output_train_path = train_path + ".output"
236+ input_dev_path = dev_path + ".input"
237+ output_dev_path = dev_path + ".output"
238+ return prepare_data (data_dir , input_train_path , output_train_path , input_dev_path , output_dev_path , en_vocabulary_size ,
239239 fr_vocabulary_size , tokenizer )
240240
241241
242- def prepare_data (data_dir , from_train_path , to_train_path , from_dev_path , to_dev_path , from_vocabulary_size ,
243- to_vocabulary_size , tokenizer = None ):
242+ def prepare_data (data_dir , input_train_path , output_train_path , input_dev_path , output_dev_path , input_vocabulary_size ,
243+ output_vocabulary_size , tokenizer = None ):
 244244	    """Prepare all necessary files that are required for the training.
245245
246246 Args:
247247 data_dir: directory in which the data sets will be stored.
248- from_train_path : path to the file that includes "from" training samples.
249- to_train_path : path to the file that includes "to" training samples.
250- from_dev_path : path to the file that includes "from" dev samples.
251- to_dev_path : path to the file that includes "to" dev samples.
252- from_vocabulary_size : size of the "from language" vocabulary to create and use.
253- to_vocabulary_size : size of the "to language" vocabulary to create and use.
248+ input_train_path : path to the file that includes "from" training samples.
249+ output_train_path : path to the file that includes "to" training samples.
250+ input_dev_path : path to the file that includes "from" dev samples.
251+ output_dev_path : path to the file that includes "to" dev samples.
252+ input_vocabulary_size : size of the "from language" vocabulary to create and use.
253+ output_vocabulary_size : size of the "to language" vocabulary to create and use.
254254 tokenizer: a function to use to tokenize each data sentence;
255255 if None, basic_tokenizer will be used.
256256
@@ -264,23 +264,23 @@ def prepare_data(data_dir, from_train_path, to_train_path, from_dev_path, to_dev
264264 (6) path to the "to language" vocabulary file.
265265 """
266266 # Create vocabularies of the appropriate sizes.
267- to_vocab_path = os .path .join (data_dir , "vocab%d.output" % to_vocabulary_size )
268- from_vocab_path = os .path .join (data_dir , "vocab%d.input" % from_vocabulary_size )
269- create_vocabulary (to_vocab_path , to_train_path , to_vocabulary_size , tokenizer )
270- create_vocabulary (from_vocab_path , from_train_path , from_vocabulary_size , tokenizer )
267+ output_vocab_path = os .path .join (data_dir , "vocab%d.output" % output_vocabulary_size )
268+ input_vocab_path = os .path .join (data_dir , "vocab%d.input" % input_vocabulary_size )
269+ create_vocabulary (output_vocab_path , output_train_path , output_vocabulary_size , tokenizer )
270+ create_vocabulary (input_vocab_path , input_train_path , input_vocabulary_size , tokenizer )
271271
272272 # Create token ids for the training data.
273- to_train_ids_path = to_train_path + (".ids%d" % to_vocabulary_size )
274- from_train_ids_path = from_train_path + (".ids%d" % from_vocabulary_size )
275- data_to_token_ids (to_train_path , to_train_ids_path , to_vocab_path , tokenizer )
276- data_to_token_ids (from_train_path , from_train_ids_path , from_vocab_path , tokenizer )
273+ output_train_ids_path = output_train_path + (".ids%d" % output_vocabulary_size )
274+ input_train_ids_path = input_train_path + (".ids%d" % input_vocabulary_size )
275+ data_to_token_ids (output_train_path , output_train_ids_path , output_vocab_path , tokenizer )
276+ data_to_token_ids (input_train_path , input_train_ids_path , input_vocab_path , tokenizer )
277277
278278 # Create token ids for the development data.
279- to_dev_ids_path = to_dev_path + (".ids%d" % to_vocabulary_size )
280- from_dev_ids_path = from_dev_path + (".ids%d" % from_vocabulary_size )
281- data_to_token_ids (to_dev_path , to_dev_ids_path , to_vocab_path , tokenizer )
282- data_to_token_ids (from_dev_path , from_dev_ids_path , from_vocab_path , tokenizer )
283-
284- return (from_train_ids_path , to_train_ids_path ,
285- from_dev_ids_path , to_dev_ids_path ,
286- from_vocab_path , to_vocab_path )
279+ output_dev_ids_path = output_dev_path + (".ids%d" % output_vocabulary_size )
280+ input_dev_ids_path = input_dev_path + (".ids%d" % input_vocabulary_size )
281+ data_to_token_ids (output_dev_path , output_dev_ids_path , output_vocab_path , tokenizer )
282+ data_to_token_ids (input_dev_path , input_dev_ids_path , input_vocab_path , tokenizer )
283+
284+ return (input_train_ids_path , output_train_ids_path ,
285+ input_dev_ids_path , output_dev_ids_path ,
286+ input_vocab_path , output_vocab_path )
0 commit comments