 
 #include <cub/cub.cuh>
 #include <limits.h>
+#include <iostream>
 
 namespace {
 constexpr int kNumThreads =
@@ -119,6 +120,7 @@ void forced_align_impl(
     const Tensor& targets,
     const int64_t blank,
     Tensor& paths) {
+  std::cout << "forced_align_impl: entering" << std::endl;
   auto defaultStream = at::cuda::getCurrentCUDAStream();
   auto cpuDataTranferStream = at::cuda::getStreamFromPool();
   const scalar_t kNegInfinity = -std::numeric_limits<scalar_t>::infinity();
@@ -132,23 +134,28 @@ void forced_align_impl(
   const int L = targets.size(1); // label length
   const int S = 2 * L + 1;
 
+  std::cout << "forced_align_impl: 1" << std::endl;
   auto targetsCpu = torchaudio::stable::cpu(targets);
   // backPtrBuffer stores the index offset of the best path at current position
   // We copy the values to CPU after running every kBackPtrBufferSize of
   // frames.
+  std::cout << "forced_align_impl: 2" << std::endl;
   Tensor backPtrBuffer = torch::stable::new_empty(logProbs, {min(kBackPtrBufferSize, T), S}, ScalarType::Char);
   torch::stable::fill_(backPtrBuffer, -1);
 
+  std::cout << "forced_align_impl: 3" << std::endl;
   Tensor backPtrCpu = torch::stable::new_empty(targetsCpu, {T, S}, ScalarType::Char);
   torch::stable::fill_(backPtrCpu, -1);
 
   // we store only two time frames for alphas
   // alphas for the current time frame can be computed only from the previous
   // time frame.
+  std::cout << "forced_align_impl: 4" << std::endl;
   Tensor alphas = torch::stable::new_empty(logProbs, {2, S});
   torch::stable::fill_(alphas, kNegInfinity);
 
   // CPU accessors
+  std::cout << "forced_align_impl: 5" << std::endl;
   auto targetsCpu_a = torchaudio::stable::accessor<target_t, 2>(targetsCpu);
   auto backPtrCpu_a = torchaudio::stable::accessor<int8_t, 2>(backPtrCpu);
   // count the number of repeats in label
@@ -170,6 +177,7 @@ void forced_align_impl(
   int end = (S == 1) ? 1 : 2;
   int backPtrBufferLen = 0;
   Tensor bufferCopy;
+  std::cout << "forced_align_impl: 6" << std::endl;
   for (int t = 0; t < T; ++t) {
     if (t > 0) {
       if (T - t <= L + R) {
@@ -189,6 +197,7 @@ void forced_align_impl(
         end = end + 1;
       }
     }
+    std::cout << "forced_align_impl: t=" << t << std::endl;
     falign_cuda_step_kernel<scalar_t, target_t>
         <<<1, kNumThreads, 0, defaultStream>>>(
             packed_accessor32<scalar_t, 3>(logProbs),
@@ -227,6 +236,7 @@ void forced_align_impl(
       backPtrBufferLen = 0;
     }
   }
+  std::cout << "forced_align_impl: 7" << std::endl;
   cpuDataTranferStream.synchronize();
   auto alphasCpu = torchaudio::stable::cpu(alphas);
   auto alphasCpu_a = torchaudio::stable::accessor<scalar_t, 2>(alphasCpu);
@@ -235,12 +245,14 @@ void forced_align_impl(
       alphasCpu_a[curIdxOffset][S - 1] > alphasCpu_a[curIdxOffset][S - 2]
       ? S - 1
       : S - 2;
+  std::cout << "forced_align_impl: 8" << std::endl;
   for (int t = T - 1; t >= 0; --t) {
     auto lbl_idx =
         ltrIdx % 2 == 0 ? blank : targetsCpu_a[batchIndex][ltrIdx / 2];
     paths_a[batchIndex][t] = lbl_idx;
     ltrIdx -= backPtrCpu_a[t][ltrIdx];
   }
+  std::cout << "forced_align_impl: leaving" << std::endl;
 }
 
 template <typename scalar_t>
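The added prints go to std::cout while falign_cuda_step_kernel is launched asynchronously on defaultStream, so an error raised inside the kernel can surface well after the last message was printed. Below is a minimal sketch of a trace helper that also synchronizes the stream and reports the last CUDA error at each checkpoint; the TRACE_SYNC name and the fprintf format are illustrative assumptions, not part of this patch.

// Hypothetical helper for this kind of tracing (assumed names, not part of
// the patch): synchronize the stream, then print the checkpoint tag together
// with the last CUDA error so a failing kernel is attributed to the right step.
#include <cstdio>
#include <cuda_runtime.h>

#define TRACE_SYNC(stream, tag)                              \
  do {                                                       \
    cudaError_t err_ = cudaStreamSynchronize(stream);        \
    if (err_ == cudaSuccess) {                               \
      err_ = cudaGetLastError();                             \
    }                                                        \
    std::fprintf(                                            \
        stderr,                                              \
        "forced_align_impl: %s (%s)\n",                      \
        (tag),                                               \
        cudaGetErrorString(err_));                           \
  } while (0)

A call site inside the time loop might look like TRACE_SYNC(defaultStream.stream(), "after falign_cuda_step_kernel"), assuming the at::cuda::CUDAStream wrapper exposes its raw cudaStream_t via stream().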