#!/bin/bash

# Load the job-submission settings. $train_cmd and $decode_cmd, used by the
# steps in this script, are presumably defined there.  The explicit "./" makes
# the shell read the file from the current directory instead of searching
# $PATH for it, and a missing file now aborts the script.
. ./cmd.sh || exit 1

# Align data/train_30k_nodup with the exp/tri3b system; the alignments are
# written to exp/tri3b_ali_30k_nodup.
steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
  data/train_30k_nodup data/lang exp/tri3b exp/tri3b_ali_30k_nodup || exit 1

# Train the exp/tri4b_seg system on those alignments.  The empty
# --realign-iters value is passed through as-is (presumably it disables
# realignment during training).
steps/train_lda_mllt.sh --cmd "$train_cmd" --realign-iters "" \
  1000 10000 data/train_30k_nodup data/lang exp/tri3b_ali_30k_nodup exp/tri4b_seg || exit 1

# Make the phone decoding-graph.
# The alignment directory is exp/tri3b_ali_30k_nodup, i.e. the one produced by
# the align_fmllr.sh call at the top of this script; nothing in this script
# creates exp/tri3b_ali_all.
steps/make_phone_graph.sh data/lang exp/tri3b_ali_30k_nodup exp/tri4b_seg || exit 1

mkdir -p data_reseg

mfccdir=mfcc_reseg  # don't use mfcc: because of the way names are assigned
                    # within that dir, we'd overwrite the old data.
mkdir -p "$mfccdir"

# Create data_reseg/{train,eval2000}_orig: copies of the original data
# directories with the segmentation-dependent files removed, so that every
# wav.scp entry becomes a single utterance.  The loop iterates over the plain
# names ("train", not "train_orig") because the "_orig" suffix is appended
# below and the later steps read data_reseg/train_orig.
for data in train eval2000; do
  orig_dir="data_reseg/${data}_orig"
  mfcc_log="exp/make_mfcc/${data}_orig"

  cp -rT "data/${data}" "$orig_dir" || exit 1
  # Stale per-job split directories may or may not exist; -f keeps this quiet.
  rm -rf "$orig_dir"/split*
  for f in text utt2spk spk2utt feats.scp cmvn.scp segments; do
    rm -f "$orig_dir/$f"
  done

  # Each wav.scp key is emitted as "<key> <key>"; the same lines are written
  # to both spk2utt and utt2spk.
  awk '{print $1, $1;}' "$orig_dir/wav.scp" \
    | tee "$orig_dir/spk2utt" > "$orig_dir/utt2spk"

  steps/make_mfcc.sh --compress true --nj 20 --cmd "$train_cmd" \
    "$orig_dir" "$mfcc_log" "$mfccdir" || exit 1
  # caution: the new speakers don't correspond to the old ones, since they now
  # have "sw0" at the start..
  steps/compute_cmvn_stats.sh --two-channel \
    "$orig_dir" "$mfcc_log" "$mfccdir" || exit 1
done

# Decode the unsegmented data with the phone graph of exp/tri4b_seg.
# The output directories must be the ones the resegmentation step reads:
# exp/tri4b_seg/decode_train_orig and exp/tri4b_seg/decode_eval2000_orig.
steps/decode_si_ali.sh --cmd "$decode_cmd" --nj 60 --beam 7.0 --max-active 1000 \
  exp/tri4b_seg/phone_graph data_reseg/train_orig exp/tri4b_seg/decode_train_orig || exit 1

steps/decode_si_ali.sh --cmd "$decode_cmd" --nj 10 --beam 7.0 --max-active 1000 \
  exp/tri4b_seg/phone_graph data_reseg/eval2000_orig exp/tri4b_seg/decode_eval2000_orig || exit 1

# Here: resegment.
# Note: it would be perfectly possible to use exp/tri3b_ali_train here instead
# of exp/tri4b_seg/decode_train_orig.  In that case we'd be relying on the
# transcripts.  I chose not to do this, for more consistency with what happens
# at test time.

steps/resegment_data.sh --cmd "$train_cmd" data_reseg/train_orig data/lang \
  exp/tri4b_seg/decode_train_orig data_reseg/train exp/tri4b_resegment_train || exit 1

steps/resegment_data.sh --cmd "$train_cmd" data_reseg/eval2000_orig data/lang \
  exp/tri4b_seg/decode_eval2000_orig data_reseg/eval2000 exp/tri4b_resegment_eval2000 || exit 1

# We need all the training data to be aligned (not just "train_nodup"), in
# order to get the resegmented "text".
steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
  data/train data/lang exp/tri3b exp/tri3b_ali_train || exit 1

# Get the file data_reseg/train/text.  Abort on failure: the steps that follow
# work on data_reseg/train.
steps/resegment_text.sh --cmd "$train_cmd" data/train data/lang \
  exp/tri3b_ali_train data_reseg/train exp/tri4b_resegment_train || exit 1


# Clean up the resegmented data directories, then compute MFCCs and CMVN
# statistics for them.
mfccdir=mfcc_reseg  # don't use mfcc: because of the way names are assigned
                    # within that dir, we'd overwrite the old data.
for data in train eval2000; do
  reseg_dir="data_reseg/${data}"
  mfcc_log="exp/make_mfcc/${data}_reseg"

  # First pass: fix and validate what the resegmentation produced (no feats or
  # text are required at this point).
  utils/fix_data_dir.sh "$reseg_dir"
  utils/validate_data_dir.sh --no-feats --no-text "$reseg_dir"

  steps/make_mfcc.sh --compress true --nj 40 --cmd "$train_cmd" \
    "$reseg_dir" "$mfcc_log" "$mfccdir" || exit 1
  steps/compute_cmvn_stats.sh "$reseg_dir" "$mfcc_log" "$mfccdir" || exit 1

  # Second pass, now that the feature files exist.
  utils/fix_data_dir.sh "$reseg_dir" || exit 1
done


# Note: tri4b (trained on train_nodup) will be compared against tri4c_reseg
# (trained on *all* the resegmented data).  The comparison is still fair,
# because tri4c_reseg actually sees fewer hours of data: 265h, versus 284h in
# the nodup data.
#   cat data/train_nodup/segments | awk '{nf += $4 - $3; } END{print nf /3600;}'
#   284.433
#   cat data_reseg/train/segments | awk '{nf += $4 - $3; } END{print nf /3600;}'
#   265.154

reseg_train=data_reseg/train
reseg_ali=exp/tri3b_ali_reseg

# Align the resegmented training data with exp/tri3b ...
steps/align_fmllr.sh --nj 40 --cmd "$train_cmd" \
  "$reseg_train" data/lang exp/tri3b "$reseg_ali" || exit 1

# ... and train exp/tri4c_reseg on top of those alignments.
steps/train_sat.sh --cmd "$train_cmd" 11500 200000 \
  "$reseg_train" data/lang "$reseg_ali" exp/tri4c_reseg || exit 1


# Build a decoding graph and decode the resegmented eval2000 for each LM.
# Each LM runs in its own background subshell so the two run in parallel.
for lm_suffix in tg fsh_tgpr; do
  (
    graph_dir="exp/tri4c_reseg/graph_sw1_${lm_suffix}"
    # $train_cmd is deliberately unquoted: it may hold a command plus options.
    # Skip decoding if the graph could not be built.
    # shellcheck disable=SC2086
    $train_cmd "$graph_dir/mkgraph.log" \
      utils/mkgraph.sh "data/lang_sw1_${lm_suffix}" exp/tri4c_reseg "$graph_dir" || exit 1
    steps/decode_fmllr.sh --nj 30 --cmd "$decode_cmd" --config conf/decode.config \
      "$graph_dir" data_reseg/eval2000 "exp/tri4c_reseg/decode_eval2000_sw1_${lm_suffix}"
  ) &
done

# Do not report completion while the background decodes are still running.
wait

exit 0

# Below is experimental.
# I'm figuring out whether we should keep the segments that the 1st pass
# designated as noise.
steps/resegment_data.sh --cmd "$train_cmd" \
  --segmentation-opts "--remove-noise-only-segments false" \
  data_reseg/eval2000_orig data/lang \
  exp/tri4b_seg/decode_eval2000_orig data_reseg/eval2000_with_noise \
  exp/tri4b_resegment_eval2000_with_noise || exit 1

# Same feature pipeline as for the regular resegmented sets, applied to the
# variant that keeps the noise-only segments.
mfccdir=mfcc_reseg  # don't use mfcc: because of the way names are assigned
                    # within that dir, we'd overwrite the old data.
for data in eval2000_with_noise; do
  reseg_dir="data_reseg/${data}"
  mfcc_log="exp/make_mfcc/${data}_reseg"

  utils/fix_data_dir.sh "$reseg_dir"
  utils/validate_data_dir.sh --no-feats --no-text "$reseg_dir"

  steps/make_mfcc.sh --compress true --nj 40 --cmd "$train_cmd" \
    "$reseg_dir" "$mfcc_log" "$mfccdir" || exit 1
  steps/compute_cmvn_stats.sh "$reseg_dir" "$mfcc_log" "$mfccdir" || exit 1

  # Re-run the fixer now that the feature files exist.
  utils/fix_data_dir.sh "$reseg_dir" || exit 1
done

# Decode the with-noise variant of eval2000 with the tri4c_reseg system, one
# background subshell per LM.
for lm_suffix in tg fsh_tgpr; do
  (
    graph_dir="exp/tri4c_reseg/graph_sw1_${lm_suffix}"
    # $train_cmd is deliberately unquoted: it may hold a command plus options.
    # Skip decoding if the graph could not be built.
    # shellcheck disable=SC2086
    $train_cmd "$graph_dir/mkgraph.log" \
      utils/mkgraph.sh "data/lang_sw1_${lm_suffix}" exp/tri4c_reseg "$graph_dir" || exit 1
    steps/decode_fmllr.sh --nj 30 --cmd "$decode_cmd" --config conf/decode.config \
      "$graph_dir" data_reseg/eval2000_with_noise \
      "exp/tri4c_reseg/decode_eval2000_with_noise_sw1_${lm_suffix}"
  ) &
done

# Block until both background decodes have finished.
wait