Skip to content

Commit 21fe287

Browse files
committed
Try to get a better timestamp match via libobs composition timestamps
1 parent fd391f7 commit 21fe287

File tree

3 files changed

+71
-9
lines changed

3 files changed

+71
-9
lines changed

src/transcription-filter-callbacks.cpp

Lines changed: 58 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -255,14 +255,24 @@ void send_caption_to_webvtt(uint64_t possible_end_ts_ms, DetectionResultWithText
255255
if (!muxer)
256256
continue;
257257

258+
auto &anchor = output.time_anchors[i];
259+
if (!anchor.anchor)
260+
return;
261+
262+
auto output_start_ts = anchor.anchor->composition_timestamp / 1'000'000;
263+
if (possible_end_ts_ms < output_start_ts)
264+
return;
265+
258266
auto duration = result.end_timestamp_ms - result.start_timestamp_ms;
259-
auto segment_start_ts = possible_end_ts_ms - duration;
260-
if (segment_start_ts < output.start_timestamp_ms) {
261-
duration -= output.start_timestamp_ms - segment_start_ts;
262-
segment_start_ts = output.start_timestamp_ms;
267+
auto segment_start_ts = possible_end_ts_ms / 1'000'000 - duration;
268+
269+
if (segment_start_ts < output_start_ts) {
270+
duration -= output_start_ts - segment_start_ts;
271+
segment_start_ts = output_start_ts;
263272
}
273+
264274
webvtt_muxer_add_cue(muxer.get(), lang_to_track->second,
265-
segment_start_ts - output.start_timestamp_ms, duration,
275+
(segment_start_ts - output_start_ts), duration,
266276
str_copy.c_str());
267277
}
268278
}
@@ -500,8 +510,50 @@ void output_packet_added_callback(obs_output_t *output, struct encoder_packet *p
500510
if (!muxer)
501511
return;
502512

513+
auto &time_anchor = it->time_anchors[pkt->track_idx];
514+
if (!time_anchor.anchor) {
515+
// CTS can repeat if there are
516+
// 1) lagged frames (composition thread wasn't fast enough)
517+
// 2) duplicated frames (encoder couldn't keep up)
518+
// this is trying to find a frame that is neither lagged nor duplicated, to ensure a stable mapping from composition time to PTS
519+
auto new_end = std::remove_if(
520+
time_anchor.last_two_if_not_initialized.begin(),
521+
time_anchor.last_two_if_not_initialized.end(),
522+
[&](auto &val) { return val.composition_timestamp == pkt_time->cts; });
523+
if (new_end != time_anchor.last_two_if_not_initialized.end())
524+
time_anchor.last_two_if_not_initialized.erase(
525+
new_end, time_anchor.last_two_if_not_initialized.end());
526+
527+
if (time_anchor.last_two_if_not_initialized.size() == 2) {
528+
time_anchor.anchor = time_anchor.last_two_if_not_initialized.back();
529+
time_anchor.last_two_if_not_initialized.clear();
530+
} else {
531+
time_anchor.last_two_if_not_initialized.push_back({
532+
pkt->pts,
533+
pkt_time->cts,
534+
});
535+
}
536+
}
537+
538+
auto encoder = obs_output_get_video_encoder2(output, pkt->track_idx);
539+
if (!encoder)
540+
return;
541+
542+
auto video = obs_encoder_video(encoder);
543+
auto voi = video_output_get_info(video);
544+
if (!voi)
545+
return;
546+
547+
uint64_t packet_absolute_timestamp = 0;
548+
// time for subtitles only starts progressing once we have an anchor point
549+
if (time_anchor.anchor && time_anchor.anchor->pts <= pkt->pts) {
550+
packet_absolute_timestamp =
551+
util_mul_div64(1000000000ULL, voi->fps_den, voi->fps_num) *
552+
(pkt->pts - time_anchor.anchor->pts);
553+
}
554+
503555
std::unique_ptr<WebvttBuffer, webvtt_buffer_deleter> buffer{
504-
webvtt_muxer_try_mux_into_bytestream(muxer.get(), pkt_time->cts, pkt->keyframe,
556+
webvtt_muxer_try_mux_into_bytestream(muxer.get(), packet_absolute_timestamp, pkt->keyframe,
505557
it->codec_flavor[pkt->track_idx])};
506558

507559
if (!buffer)
@@ -547,7 +599,6 @@ void add_webvtt_output(transcription_filter_data &gf, obs_output_t *output,
547599
auto &entry = gf.active_outputs.back();
548600
entry.output = obs_output_get_weak_output(output);
549601
entry.output_type = output_type;
550-
entry.start_timestamp_ms = start_ms;
551602
obs_output_add_packet_callback_(output, output_packet_added_callback, &gf);
552603
}
553604

src/transcription-filter-data.h

Lines changed: 11 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -212,15 +212,25 @@ struct transcription_filter_data {
212212
Recording,
213213
};
214214

215+
struct webvtt_to_video_timestamp {
216+
int64_t pts;
217+
uint64_t composition_timestamp;
218+
};
219+
220+
struct webvtt_video_time_anchor {
221+
std::deque<webvtt_to_video_timestamp> last_two_if_not_initialized;
222+
std::optional<webvtt_to_video_timestamp> anchor;
223+
};
224+
215225
struct webvtt_output {
216226
OBSWeakOutputAutoRelease output;
217227
webvtt_output_type output_type;
218-
uint64_t start_timestamp_ms;
219228

220229
bool initialized = false;
221230
std::map<std::string, uint8_t> language_to_track;
222231
std::unique_ptr<WebvttMuxer, webvtt_muxer_deleter>
223232
webvtt_muxer[MAX_OUTPUT_VIDEO_ENCODERS];
233+
webvtt_video_time_anchor time_anchors[MAX_OUTPUT_VIDEO_ENCODERS];
224234
CodecFlavor codec_flavor[MAX_OUTPUT_VIDEO_ENCODERS] = {};
225235
};
226236

src/whisper-utils/whisper-processing.cpp

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22

33
#include <obs-module.h>
44

5+
#include <util/platform.h>
56
#include <util/profiler.hpp>
67

78
#include "plugin-support.h"
@@ -341,7 +342,7 @@ void run_inference_and_callbacks(transcription_filter_data *gf, uint64_t start_o
341342
pcm32f_size * sizeof(float));
342343
}
343344

344-
auto inference_start_ts = now_ms();
345+
auto inference_start_ts = os_gettime_ns();
345346

346347
struct DetectionResultWithText inference_result =
347348
run_whisper_inference(gf, pcm32f_data, pcm32f_size_with_silence, start_offset_ms,

0 commit comments

Comments
 (0)