@@ -46,14 +46,20 @@ const (
4646//////////////////////////////////////////////////////////////////////////////
4747// LIFECYCLE
4848
49- // Create a new segmenter with a reader r which segments into raw audio of 'dur'
50- // length. If dur is zero then no segmenting is performed, the whole
51- // audio file is read and output in one go, which could cause some memory issues.
49+ // Create a new segmenter with a reader r which segments into raw audio.
5250// The sample rate is the number of samples per second.
5351//
52+ // Setting option WithSegmentSize will cause the segmenter to segment the audio
53+ // into fixed-size chunks approximately of the specified duration.
54+ //
55+ // Setting option WithDefaultSilenceThreshold will cause the segmenter to break
56+ // into smaller chunks, if silence is detected. The length of the silence is
57+ // specified by the WithSilenceDuration option, which defaults to 2 seconds.
58+ //
5459// At the moment, the audio format is auto-detected, but there should be
55- // a way to specify the audio format.
56- func NewReader (r io.Reader , dur time.Duration , sample_rate int , opts ... Opt ) (* Segmenter , error ) {
60+ // a way to specify the audio format. The output samples are always single-channel
61+ // (mono).
62+ func NewReader (r io.Reader , sample_rate int , opts ... Opt ) (* Segmenter , error ) {
5763 segmenter := new (Segmenter )
5864
5965 // Apply options
@@ -64,14 +70,14 @@ func NewReader(r io.Reader, dur time.Duration, sample_rate int, opts ...Opt) (*S
6470 }
6571
6672 // Check arguments
67- if dur < 0 || sample_rate <= 0 {
73+ if sample_rate <= 0 {
6874 return nil , media .ErrBadParameter .With ("invalid duration or sample rate arguments" )
6975 } else {
7076 segmenter .sample_rate = sample_rate
7177 }
7278
7379 // Sample buffer is duration * sample rate, assuming mono
74- segmenter .n = int (dur .Seconds () * float64 (sample_rate ))
80+ segmenter .n = int (segmenter . opts . SegmentSize .Seconds () * float64 (sample_rate ))
7581
7682 // Open the file
7783 media , err := ffmpeg .NewReader (r )
@@ -143,33 +149,20 @@ func (s *Segmenter) DecodeFloat32(ctx context.Context, fn SegmentFuncFloat32) er
143149 return nil
144150 }
145151
146- // Calculate the energy of the frame - root mean squared and normalize between 0 and 1
147- var sum float32
148- var energy float64
149- for _ , sample := range data {
150- sum += float32 (sample ) * float32 (sample )
151- }
152- energy = math .Sqrt (float64 (sum )/ float64 (len (data ))) / float64 (math .MaxInt16 )
153-
154- // If silence detection is enabled, check if the energy is below the threshold
155- var cut bool
156- if s .SilenceThreshold > 0 && energy < s .SilenceThreshold {
157- // If the energy is below the threshold, we consider it silence
158- if s .sts == - 1 {
159- // If this is the first silence, set the timestamp
160- s .sts = frame .Ts ()
161- } else if frame .Ts ()- s .sts >= s .SilenceDuration .Seconds () {
162- // Cut when the buffer size is greater than 10 seconds
163- if len (s .buf_flt ) >= s .sample_rate * 10 {
164- cut = true
165- }
166- s .sts = - 1 // Reset the silence timestamp
152+ // Calculate the energy of the frame and determine if we should "cut" the segment
153+ _ , cut := s .detect_silence (frame .Ts (), func () float64 {
154+ var sum float32
155+ for _ , sample := range data {
156+ sum += float32 (sample ) * float32 (sample )
167157 }
168- }
158+ return math .Sqrt (float64 (sum )/ float64 (len (data ))) / float64 (math .MaxInt16 )
159+ })
169160
170161 // Append float32 samples from plane 0 to buffer
171162 s .buf_flt = append (s .buf_flt , frame .Float32 (0 )... )
172163
164+ // TODO: If we don't have enough samples for a segment, or we are not cutting,
165+
173166 // n != 0 and len(buf) >= n we have a segment to process
174167 if (s .n != 0 && len (s .buf_flt ) >= s .n ) || cut {
175168 if err := s .segment_flt (fn ); err != nil {
@@ -203,6 +196,32 @@ func (s *Segmenter) DecodeFloat32(ctx context.Context, fn SegmentFuncFloat32) er
203196 return nil
204197}
205198
199+ func (s * Segmenter ) detect_silence (ts float64 , energy_fn func () float64 ) (float64 , bool ) {
200+ energy := energy_fn ()
201+
202+ // Segmenting or Silence detection is not enabled
203+ if s .SegmentSize == 0 || s .SilenceThreshold == 0 {
204+ return energy , false
205+ }
206+
207+ // If energy is above the threshold, reset the silence timestamp
208+ if energy >= s .SilenceThreshold {
209+ s .sts = - 1
210+ return energy , false
211+ }
212+
213+ // Set the first frame of silence
214+ if s .sts == - 1 {
215+ s .sts = ts
216+ return energy , false
217+ }
218+
219+ // Calculate the silence duration, and consider whether we consider this
220+ // a segment boundary.
221+ silence_duration := ts - s .sts
222+ return energy , silence_duration >= s .SilenceSize .Seconds ()
223+ }
224+
206225// Segments are output through a callback, with the samples and a timestamp
207226// At the moment the "best" audio stream is used, based on ffmpeg heuristic.
208227func (s * Segmenter ) DecodeInt16 (ctx context.Context , fn SegmentFuncInt16 ) error {
@@ -239,33 +258,23 @@ func (s *Segmenter) DecodeInt16(ctx context.Context, fn SegmentFuncInt16) error
239258 return nil
240259 }
241260
242- // Calculate the energy of the frame - root mean squared and normalize between 0 and 1
243- var sum float32
244- var energy float64
245- for _ , sample := range data {
246- sum += float32 (sample ) * float32 (sample )
247- }
248- energy = math .Sqrt (float64 (sum )/ float64 (len (data ))) / float64 (math .MaxInt16 )
249-
250- // If silence detection is enabled, check if the energy is below the threshold
251- var cut bool
252- if s .SilenceThreshold > 0 && energy < s .SilenceThreshold {
253- // If the energy is below the threshold, we consider it silence
254- if s .sts == - 1 {
255- // If this is the first silence, set the timestamp
256- s .sts = frame .Ts ()
257- } else if frame .Ts ()- s .sts >= s .SilenceDuration .Seconds () {
258- // Cut when the buffer size is greater than 10 seconds
259- if len (s .buf_s16 ) >= s .sample_rate * 10 {
260- cut = true
261- }
262- s .sts = - 1 // Reset the silence timestamp
261+ // Calculate the energy of the frame and determine if we should "cut" the segment
262+ _ , cut := s .detect_silence (frame .Ts (), func () float64 {
263+ var sum float32
264+ for _ , sample := range data {
265+ sum += float32 (sample ) * float32 (sample )
263266 }
264- }
267+ return math .Sqrt (float64 (sum )/ float64 (len (data ))) / float64 (math .MaxInt16 )
268+ })
265269
266270 // Append int16 samples from plane 0 to buffer
267271 s .buf_s16 = append (s .buf_s16 , data ... )
268272
273+ // TODO: If we don't have enough samples for a segment, or we are not cutting
274+ if cut && len (s .buf_s16 ) < (s .n >> 1 ) {
275+ cut = false
276+ }
277+
269278 // n != 0 and len(buf) >= n we have a segment to process
270279 if (s .n != 0 && len (s .buf_s16 ) >= s .n ) || cut {
271280 if err := s .segment_s16 (fn ); err != nil {
@@ -303,11 +312,9 @@ func (s *Segmenter) DecodeInt16(ctx context.Context, fn SegmentFuncInt16) error
303312// PRIVATE METHODS
304313
305314func (s * Segmenter ) segment_flt (fn SegmentFuncFloat32 ) error {
306- // TODO: Pad any remaining samples with zeros if the buffer is not full
307315 return fn (s .ts , s .buf_flt )
308316}
309317
310318func (s * Segmenter ) segment_s16 (fn SegmentFuncInt16 ) error {
311- // TODO: Pad any remaining samples with zeros if the buffer is not full
312319 return fn (s .ts , s .buf_s16 )
313320}
0 commit comments