@@ -255,14 +255,24 @@ void send_caption_to_webvtt(uint64_t possible_end_ts_ms, DetectionResultWithText
255255 if (!muxer)
256256 continue ;
257257
258+ auto &anchor = output.time_anchors [i];
259+ if (!anchor.anchor )
260+ return ;
261+
262+ auto output_start_ts = anchor.anchor ->composition_timestamp / 1'000'000 ;
263+ if (possible_end_ts_ms < output_start_ts)
264+ return ;
265+
258266 auto duration = result.end_timestamp_ms - result.start_timestamp_ms ;
259- auto segment_start_ts = possible_end_ts_ms - duration;
260- if (segment_start_ts < output.start_timestamp_ms ) {
261- duration -= output.start_timestamp_ms - segment_start_ts;
262- segment_start_ts = output.start_timestamp_ms ;
267+ auto segment_start_ts = possible_end_ts_ms / 1'000'000 - duration;
268+
269+ if (segment_start_ts < output_start_ts) {
270+ duration -= output_start_ts - segment_start_ts;
271+ segment_start_ts = output_start_ts;
263272 }
273+
264274 webvtt_muxer_add_cue (muxer.get (), lang_to_track->second ,
265- segment_start_ts - output. start_timestamp_ms , duration,
275+ ( segment_start_ts - output_start_ts) , duration,
266276 str_copy.c_str ());
267277 }
268278 }
@@ -500,8 +510,50 @@ void output_packet_added_callback(obs_output_t *output, struct encoder_packet *p
500510 if (!muxer)
501511 return ;
502512
513+ auto &time_anchor = it->time_anchors [pkt->track_idx ];
514+ if (!time_anchor.anchor ) {
515+ // CTS can repeat if there are
516+ // 1) lagged frames (composition thread wasn't fast enough)
517+ // 2) duplicated frames (encoder couldn't keep up)
518+ // this is trying to find a frame that is neither lagged nor duplicated, to ensure a stable mapping from composition time to PTS
519+ auto new_end = std::remove_if (
520+ time_anchor.last_two_if_not_initialized .begin (),
521+ time_anchor.last_two_if_not_initialized .end (),
522+ [&](auto &val) { return val.composition_timestamp == pkt_time->cts ; });
523+ if (new_end != time_anchor.last_two_if_not_initialized .end ())
524+ time_anchor.last_two_if_not_initialized .erase (
525+ new_end, time_anchor.last_two_if_not_initialized .end ());
526+
527+ if (time_anchor.last_two_if_not_initialized .size () == 2 ) {
528+ time_anchor.anchor = time_anchor.last_two_if_not_initialized .back ();
529+ time_anchor.last_two_if_not_initialized .clear ();
530+ } else {
531+ time_anchor.last_two_if_not_initialized .push_back ({
532+ pkt->pts ,
533+ pkt_time->cts ,
534+ });
535+ }
536+ }
537+
538+ auto encoder = obs_output_get_video_encoder2 (output, pkt->track_idx );
539+ if (!encoder)
540+ return ;
541+
542+ auto video = obs_encoder_video (encoder);
543+ auto voi = video_output_get_info (video);
544+ if (!voi)
545+ return ;
546+
547+ uint64_t packet_absolute_timestamp = 0 ;
548+ // time for subtitles only starts progressing once we have an anchor point
549+ if (time_anchor.anchor && time_anchor.anchor ->pts <= pkt->pts ) {
550+ packet_absolute_timestamp =
551+ util_mul_div64 (1000000000ULL , voi->fps_den , voi->fps_num ) *
552+ (pkt->pts - time_anchor.anchor ->pts );
553+ }
554+
503555 std::unique_ptr<WebvttBuffer, webvtt_buffer_deleter> buffer{
504- webvtt_muxer_try_mux_into_bytestream (muxer.get (), pkt_time-> cts , pkt->keyframe ,
556+ webvtt_muxer_try_mux_into_bytestream (muxer.get (), packet_absolute_timestamp , pkt->keyframe ,
505557 it->codec_flavor [pkt->track_idx ])};
506558
507559 if (!buffer)
@@ -547,7 +599,6 @@ void add_webvtt_output(transcription_filter_data &gf, obs_output_t *output,
547599 auto &entry = gf.active_outputs .back ();
548600 entry.output = obs_output_get_weak_output (output);
549601 entry.output_type = output_type;
550- entry.start_timestamp_ms = start_ms;
551602 obs_output_add_packet_callback_ (output, output_packet_added_callback, &gf);
552603}
553604
0 commit comments