Skip to content

Commit 7808f57

Browse files
committed
debug: 1
1 parent c2546fc commit 7808f57

File tree

1 file changed

+12
-0
lines changed

1 file changed

+12
-0
lines changed

src/libtorchaudio/forced_align/gpu/compute.cu

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
#include <cub/cub.cuh>
77
#include <limits.h>
8+
#include <iostream>
89

910
namespace {
1011
constexpr int kNumThreads =
@@ -119,6 +120,7 @@ void forced_align_impl(
119120
const Tensor& targets,
120121
const int64_t blank,
121122
Tensor& paths) {
123+
std::cout << "forced_align_impl: entering" << std::endl;
122124
auto defaultStream = at::cuda::getCurrentCUDAStream();
123125
auto cpuDataTranferStream = at::cuda::getStreamFromPool();
124126
const scalar_t kNegInfinity = -std::numeric_limits<scalar_t>::infinity();
@@ -132,23 +134,28 @@ void forced_align_impl(
132134
const int L = targets.size(1); // label length
133135
const int S = 2 * L + 1;
134136

137+
std::cout << "forced_align_impl: 1" << std::endl;
135138
auto targetsCpu = torchaudio::stable::cpu(targets);
136139
// backPtrBuffer stores the index offset of the best path at the current position
137140
// We copy the values to CPU after running every kBackPtrBufferSize of
138141
// frames.
142+
std::cout << "forced_align_impl: 2" << std::endl;
139143
Tensor backPtrBuffer = torch::stable::new_empty(logProbs, {min(kBackPtrBufferSize, T), S}, ScalarType::Char);
140144
torch::stable::fill_(backPtrBuffer, -1);
141145

146+
std::cout << "forced_align_impl: 3" << std::endl;
142147
Tensor backPtrCpu = torch::stable::new_empty(targetsCpu, {T, S}, ScalarType::Char);
143148
torch::stable::fill_(backPtrCpu, -1);
144149

145150
// we store only two time frames for alphas
146151
// alphas for the current time frame can be computed only from the previous
147152
// time frame.
153+
std::cout << "forced_align_impl: 4" << std::endl;
148154
Tensor alphas = torch::stable::new_empty(logProbs, {2, S});
149155
torch::stable::fill_(alphas, kNegInfinity);
150156

151157
// CPU accessors
158+
std::cout << "forced_align_impl: 5" << std::endl;
152159
auto targetsCpu_a = torchaudio::stable::accessor<target_t, 2>(targetsCpu);
153160
auto backPtrCpu_a = torchaudio::stable::accessor<int8_t, 2>(backPtrCpu);
154161
// count the number of repeats in label
@@ -170,6 +177,7 @@ void forced_align_impl(
170177
int end = (S == 1) ? 1 : 2;
171178
int backPtrBufferLen = 0;
172179
Tensor bufferCopy;
180+
std::cout << "forced_align_impl: 6" << std::endl;
173181
for (int t = 0; t < T; ++t) {
174182
if (t > 0) {
175183
if (T - t <= L + R) {
@@ -189,6 +197,7 @@ void forced_align_impl(
189197
end = end + 1;
190198
}
191199
}
200+
std::cout << "forced_align_impl: t=" << t << std::endl;
192201
falign_cuda_step_kernel<scalar_t, target_t>
193202
<<<1, kNumThreads, 0, defaultStream>>>(
194203
packed_accessor32<scalar_t, 3>(logProbs),
@@ -227,6 +236,7 @@ void forced_align_impl(
227236
backPtrBufferLen = 0;
228237
}
229238
}
239+
std::cout << "forced_align_impl: 7" << std::endl;
230240
cpuDataTranferStream.synchronize();
231241
auto alphasCpu = torchaudio::stable::cpu(alphas);
232242
auto alphasCpu_a = torchaudio::stable::accessor<scalar_t, 2>(alphasCpu);
@@ -235,12 +245,14 @@ void forced_align_impl(
235245
alphasCpu_a[curIdxOffset][S - 1] > alphasCpu_a[curIdxOffset][S - 2]
236246
? S - 1
237247
: S - 2;
248+
std::cout << "forced_align_impl: 8" << std::endl;
238249
for (int t = T - 1; t >= 0; --t) {
239250
auto lbl_idx =
240251
ltrIdx % 2 == 0 ? blank : targetsCpu_a[batchIndex][ltrIdx / 2];
241252
paths_a[batchIndex][t] = lbl_idx;
242253
ltrIdx -= backPtrCpu_a[t][ltrIdx];
243254
}
255+
std::cout << "forced_align_impl: leaving" << std::endl;
244256
}
245257

246258
template <typename scalar_t>

0 commit comments

Comments
 (0)