#!/bin/bash
# Copyright 2013-2014  Johns Hopkins University (authors: Yenda Trmal, Daniel Povey)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# Simple utility script to convert an ARPA LM (gzipped, bzip2-ed or plain
# text) into a G.fst file.
#
# Positional arguments:
#   <arpa-lm-file>  ARPA language model to convert
#   <lang-dir>      directory containing words.txt (used as the symbol table)
#   <dest-dir>      output directory; G.fst is written here
#
# When --oov-prob-file is given, every word listed in that file is added to
# the unigram section of the LM before conversion (see below).

# Begin configuration section (values can be overridden via parse_options.sh).
oov_prob_file=   # file with one "<word> <prob>" pair per line
unk_fraction=    # each OOV prob is multiplied by this before taking log10
cleanup=true     # if true, remove <dest-dir>/lm_tmp.gz at the end
#end configuration section.



echo "$0" "$@"

[ -f ./path.sh ] && . ./path.sh
[ -f ./cmd.sh ] && . ./cmd.sh
. parse_options.sh || exit 1;

if [ $# -ne 3 ]; then
  echo "Usage: $0 [options] <arpa-lm-file> <lang-dir> <dest-dir>"
  echo "Options: --oov-prob-file <oov-prob-file>   # e.g. data/local/oov2prob"
  echo "    # with this option it will replace <unk> with OOVs in G.fst."
  echo "         --unk-fraction <fraction>          # required with --oov-prob-file;"
  echo "    # each OOV probability is scaled by this value."
  echo "         --cleanup <true|false>             # remove <dest-dir>/lm_tmp.gz when done (default: true)"
  exit 1;
fi

set -e           #Exit on non-zero return code from any command
set -o pipefail  #Exit if any of the commands in the pipeline will
                 #return non-zero return code

lmfile=$1
langdir=$2
destdir=$3

mkdir -p "$destdir"

# Writes the (decompressed) contents of the file currently named by $lmfile
# to stdout.  The variable is read at call time, so this follows the later
# re-assignment of lmfile to the temporary LM.
decompress_lm() {
  case "$lmfile" in
    *.bz2) bunzip2 -c -- "$lmfile" ;;
    *.gz)  gunzip -c -- "$lmfile" ;;
    # With -f, gzip decompresses gzip data and copies anything else through
    # unchanged, so both plain-text LMs and gzipped LMs without a .gz suffix
    # are handled.
    *)     gzip -cdf -- "$lmfile" ;;
  esac
}

if [ -n "$oov_prob_file" ]; then
  if [ ! -s "$oov_prob_file" ]; then
    echo "$0: oov-prob file $oov_prob_file does not exist or is empty"
    exit 1;
  fi
  if [ -z "$unk_fraction" ]; then
    echo "$0: --oov-prob-file option requires --unk-fraction option";
    exit 1;
  fi

  # Find the smallest log10-prob among the unigrams.  Entries equal to -99
  # are ignored (presumably the placeholder value used for <s>).
  min_prob=$(decompress_lm | perl -e '
    $minlogprob = 0.0;
    while (<STDIN>) {
      if (m/\\(\d)-grams:/) { $order = $1; }
      if ($order == 1) {
        @A = split;
        if ($A[0] < $minlogprob && $A[0] != -99) { $minlogprob = $A[0]; }
      }
    }
    print $minlogprob;')
  echo "Minimum prob in LM file is $min_prob"

  echo "$0: creating LM file with unk words, using $oov_prob_file, in $destdir/lm_tmp.gz"
  # Copy the LM, inserting one unigram line per OOV word right after the
  # "\1-grams:" header and increasing the "ngram 1=" count accordingly.
  # Each inserted log-prob is log10(prob * unk_fraction), capped so that it
  # never exceeds the minimum unigram log-prob found above.
  decompress_lm | \
    perl -e '
      ($oov_prob_file, $min_prob, $unk_fraction) = @ARGV;
      $ceilinged = 0;
      $min_prob < 0.0 || die "Bad min_prob";          # this is a log-prob
      $unk_fraction > 0.0 || die "Bad unk_fraction";  # this is a prob
      open(F, "<", $oov_prob_file) || die "opening oov file";
      while (<F>) { push @OOVS, $_; }
      close(F);
      $num_oovs = @OOVS;  # number of OOV entries (array in scalar context)
      while (<STDIN>) {
        if (m/^ngram 1=(\d+)/) { $n = $1 + $num_oovs; print "ngram 1=$n\n"; }
        else { print; }  # print all lines unchanged except the one that says ngram 1=X.
        if (m/^\\1-grams:$/) {
          foreach $l (@OOVS) {
            @A = split(" ", $l);
            @A == 2 || die "bad line in oov2prob: $l";
            ($word, $prob) = @A;
            # log() of a non-positive number is a fatal error in Perl; report
            # the offending line instead.
            $prob > 0.0 || die "bad (non-positive) prob in oov2prob: $l";
            $log10prob = (log($prob * $unk_fraction) / log(10.0));
            if ($log10prob > $min_prob) { $log10prob = $min_prob; $ceilinged++; }
            print "$log10prob $word\n";
          }
        }
      }
      print STDERR "Ceilinged $ceilinged unk-probs\n";' \
    "$oov_prob_file" "$min_prob" "$unk_fraction" | gzip -c > "$destdir/lm_tmp.gz"
  lmfile=$destdir/lm_tmp.gz
fi

# Drop n-gram lines containing "<s> <s>", "</s> <s>" or "</s> </s>", convert
# the ARPA LM to an FST, post-process its text form with the two utils/
# scripts (presumably mapping backoff epsilons to a disambiguation symbol and
# <s>/</s> to epsilon -- see those scripts), then recompile it against
# words.txt, remove epsilons and sort arcs on the output label.
decompress_lm | \
  grep -v '<s> <s>' | grep -v '</s> <s>' | grep -v '</s> </s>' | \
  arpa2fst - | \
  fstprint | \
  utils/eps2disambig.pl | \
  utils/s2eps.pl | \
  fstcompile --isymbols="$langdir/words.txt" \
    --osymbols="$langdir/words.txt" --keep_isymbols=false --keep_osymbols=false | \
  fstrmepsilon | fstarcsort --sort_type=olabel > "$destdir/G.fst" || exit 1
# Informational only: a non-zero status here must not abort the script.
fstisstochastic "$destdir/G.fst" || true;

if [ "$cleanup" = true ]; then
  # Best effort: the temporary LM only exists when --oov-prob-file was used.
  rm -f -- "$destdir/lm_tmp.gz" || true;
fi

exit 0