11// bin/compute-gop.cc
22
33// Copyright 2019 Junbo Zhang
4+ // 2024 Jiun-Ting Li (National Taiwan Normal University)
45
56// See ../../COPYING for clarification regarding multiple authors
67//
@@ -107,11 +108,14 @@ int main(int argc, char *argv[]) {
107108 const char *usage =
108109 " Compute Goodness Of Pronunciation (GOP) from a matrix of "
109110 " probabilities (e.g. from nnet3-compute).\n "
110- " Usage: compute-gop [options] <model> <alignments-rspecifier> "
111+ " Usage: compute-gop [options] <model> "
112+ " <transition-alignments-respecifier> "
113+ " <phoneme-alignments-rspecifier> "
111114 " <prob-matrix-rspecifier> <gop-wspecifier> "
112- " [ <phone-feature-wspecifier>] \n "
115+ " <phone-feature-wspecifier>\n "
113116 " e.g.:\n "
114- " nnet3-compute [args] | compute-gop 1.mdl ark:ali-phone.1 ark:-"
117+ " nnet3-compute [args] | compute-gop 1.mdl ark:ali.1 ark:ali-phone.1 "
118+ " ark:output.1.ark "
115119 " ark:gop.1 ark:phone-feat.1\n " ;
116120
117121 ParseOptions po (usage);
@@ -130,16 +134,17 @@ int main(int argc, char *argv[]) {
130134
131135 po.Read (argc, argv);
132136
133- if (po.NumArgs () != 4 && po. NumArgs () != 5 ) {
137+ if (po.NumArgs () != 6 ) {
134138 po.PrintUsage ();
135139 exit (1 );
136140 }
137141
138142 std::string model_filename = po.GetArg (1 ),
139- alignments_rspecifier = po.GetArg (2 ),
140- prob_rspecifier = po.GetArg (3 ),
141- gop_wspecifier = po.GetArg (4 ),
142- feat_wspecifier = po.GetArg (5 );
143+ transition_alignments_rspecifier = po.GetArg (2 ),
144+ phoneme_alignments_rspecifier = po.GetArg (3 ),
145+ prob_rspecifier = po.GetArg (4 ),
146+ gop_wspecifier = po.GetArg (5 ),
147+ feat_wspecifier = po.GetArg (6 );
143148
144149 TransitionModel trans_model;
145150 {
@@ -174,33 +179,50 @@ int main(int argc, char *argv[]) {
174179 }
175180 }
176181
177- RandomAccessInt32VectorReader alignment_reader (alignments_rspecifier);
182+ RandomAccessInt32VectorReader phoneme_alignments_reader (phoneme_alignments_rspecifier);
183+ RandomAccessInt32VectorReader transition_alignments_reader (transition_alignments_rspecifier);
178184 SequentialBaseFloatMatrixReader prob_reader (prob_rspecifier);
179185 PosteriorWriter gop_writer (gop_wspecifier);
180186 BaseFloatVectorWriter feat_writer (feat_wspecifier);
181187
182188 int32 num_done = 0 ;
183189 for (; !prob_reader.Done (); prob_reader.Next ()) {
184190 std::string key = prob_reader.Key ();
185- if (!alignment_reader .HasKey (key)) {
186- KALDI_WARN << " No alignment for utterance " << key;
191+ if (!phoneme_alignments_reader .HasKey (key)) {
192+ KALDI_WARN << " No phoneme alignment for utterance " << key;
187193 continue ;
188194 }
189- auto alignment = alignment_reader.Value (key);
195+ if (!transition_alignments_reader.HasKey (key)) {
196+ KALDI_WARN << " No transition alignment for utterance " << key;
197+ continue ;
198+ }
199+ auto phoneme_alignment = phoneme_alignments_reader.Value (key);
200+ auto transition_alignment = transition_alignments_reader.Value (key);
190201 Matrix<BaseFloat> &probs = prob_reader.Value ();
191202 if (log_applied) probs.ApplyExp ();
192203
204+ std::vector<std::vector<int32> > split;
205+ SplitToPhones (trans_model, transition_alignment, &split);
206+
207+ std::vector<int32> phone_boundary;
208+ for (int32 i = 0 ; i < split.size (); i++) {
209+ for (int32 j = 0 ; j < split[i].size (); j++) {
210+ phone_boundary.push_back (i);
211+ }
212+ }
213+
193214 Matrix<BaseFloat> lpps;
194215 ComputeLpps (probs, pdf2phones, &lpps);
195216
196- int32 frame_num = alignment .size ();
197- if (alignment .size () != probs.NumRows ()) {
217+ int32 frame_num = phoneme_alignment .size ();
218+ if (phoneme_alignment .size () != probs.NumRows ()) {
198219 KALDI_WARN << " The frame numbers of alignment and prob are not equal." ;
199220 if (frame_num > probs.NumRows ()) frame_num = probs.NumRows ();
200221 }
201222
202223 KALDI_ASSERT (frame_num > 0 );
203- int32 cur_phone_id = alignment[0 ];
224+ int32 cur_phone_id = phoneme_alignment[0 ];
225+ int32 cur_phone_pos = phone_boundary[0 ];
204226 int32 duration = 0 ;
205227 Vector<BaseFloat> phone_level_feat (1 + phone_num * 2 ); // [phone LPPs LPRs]
206228 SubVector<BaseFloat> lpp_part (phone_level_feat, 1 , phone_num);
@@ -220,8 +242,9 @@ int main(int argc, char *argv[]) {
220242 lpp_part.AddVec (1 , frame_level_lpp);
221243 duration++;
222244
223- int32 next_phone_id = (i < frame_num - 1 ) ? alignment[i + 1 ]: -1 ;
224- if (next_phone_id != cur_phone_id) {
245+ int32 next_phone_id = (i < frame_num - 1 ) ? phoneme_alignment[i + 1 ]: -1 ;
246+ int32 next_phone_pos = (i < frame_num - 1 ) ? phone_boundary[i + 1 ]: -1 ;
247+ if (next_phone_pos != cur_phone_pos) {
225248 int32 phone_id = phone_map.empty () ? cur_phone_id : phone_map[cur_phone_id];
226249
227250 // The current phone's features are now ready
@@ -248,6 +271,7 @@ int main(int argc, char *argv[]) {
248271 duration = 0 ;
249272 }
250273 cur_phone_id = next_phone_id;
274+ cur_phone_pos = next_phone_pos;
251275 }
252276
253277 // Write GOPs and the GOP-based features
0 commit comments