Skip to content

Commit c138a39

Browse files
authored
Merge pull request #49 from mutablelogic/avfilter
Updated segmenter
2 parents fd88d40 + ee4ce12 commit c138a39

File tree

2 files changed

+83
-63
lines changed

2 files changed

+83
-63
lines changed

pkg/segmenter/opt.go

Lines changed: 23 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -13,16 +13,18 @@ import (
1313
type Opt func(*opts) error
1414

1515
type opts struct {
16+
SegmentSize time.Duration // Segment size, zero means no segmenting
17+
SilenceSize time.Duration // Size of silence to consider a segment boundary
1618
SilenceThreshold float64 // Silence threshold
17-
SilenceDuration time.Duration // Duration of silence to consider a segment boundary
1819
}
1920

2021
///////////////////////////////////////////////////////////////////////////////////
2122
// GLOBALS
2223

2324
const (
24-
DefaultSilenceThreshold = 0.0005 // Default silence threshold
25-
DefaultSilenceDuration = time.Second * 2 // Default silence duration
25+
DefaultSilenceThreshold = 0.01 // Default silence threshold
26+
DefaultSilenceDuration = time.Millisecond * 500 // Default silence duration
27+
MinDuration = time.Millisecond * 250 // Minimum duration
2628
)
2729

2830
///////////////////////////////////////////////////////////////////////////////////
@@ -41,21 +43,32 @@ func applyOpts(opt ...Opt) (*opts, error) {
4143
///////////////////////////////////////////////////////////////////////////////////
4244
// TYPES
4345

44-
func WithDefaultSilenceThreshold() Opt {
46+
func WithSegmentSize(v time.Duration) Opt {
4547
return func(o *opts) error {
46-
o.SilenceThreshold = DefaultSilenceThreshold
47-
o.SilenceDuration = DefaultSilenceDuration
48+
if v < MinDuration {
49+
return media.ErrBadParameter.Withf("segment duration is too short, must be at least %v", MinDuration)
50+
} else {
51+
o.SegmentSize = v
52+
}
4853
return nil
4954
}
5055
}
5156

52-
func WithSilenceDuration(v time.Duration) Opt {
57+
func WithSilenceSize(v time.Duration) Opt {
5358
return func(o *opts) error {
54-
if v < time.Millisecond*100 {
55-
return media.ErrBadParameter.Withf("silence duration %s is too short, must be at least 100ms", v)
59+
if v < MinDuration {
60+
return media.ErrBadParameter.Withf("silence duration is too short, must be at least %v", MinDuration)
5661
} else {
57-
o.SilenceDuration = v
62+
o.SilenceSize = v
5863
}
5964
return nil
6065
}
6166
}
67+
68+
func WithDefaultSilenceThreshold() Opt {
69+
return func(o *opts) error {
70+
o.SilenceThreshold = DefaultSilenceThreshold
71+
o.SilenceSize = DefaultSilenceDuration
72+
return nil
73+
}
74+
}

pkg/segmenter/segmenter.go

Lines changed: 60 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -46,14 +46,20 @@ const (
4646
//////////////////////////////////////////////////////////////////////////////
4747
// LIFECYCLE
4848

49-
// Create a new segmenter with a reader r which segments into raw audio of 'dur'
50-
// length. If dur is zero then no segmenting is performed, the whole
51-
// audio file is read and output in one go, which could cause some memory issues.
49+
// Create a new segmenter with a reader r which segments into raw audio.
5250
// The sample rate is the number of samples per second.
5351
//
52+
// Setting option WithSegmentSize will cause the segmenter to segment the audio
53+
// into fixed-size chunks approximately of the specified duration.
54+
//
55+
// Setting option WithDefaultSilenceThreshold will cause the segmenter to break
56+
// into smaller chunks, if silence is detected. The length of the silence is
57+
// specified by the WithSilenceSize option, which defaults to DefaultSilenceDuration (500 milliseconds).
58+
//
5459
// At the moment, the audio format is auto-detected, but there should be
55-
// a way to specify the audio format.
56-
func NewReader(r io.Reader, dur time.Duration, sample_rate int, opts ...Opt) (*Segmenter, error) {
60+
// a way to specify the audio format. The output samples are always single-channel
61+
// (mono).
62+
func NewReader(r io.Reader, sample_rate int, opts ...Opt) (*Segmenter, error) {
5763
segmenter := new(Segmenter)
5864

5965
// Apply options
@@ -64,14 +70,14 @@ func NewReader(r io.Reader, dur time.Duration, sample_rate int, opts ...Opt) (*S
6470
}
6571

6672
// Check arguments
67-
if dur < 0 || sample_rate <= 0 {
73+
if sample_rate <= 0 {
6874
return nil, media.ErrBadParameter.With("invalid duration or sample rate arguments")
6975
} else {
7076
segmenter.sample_rate = sample_rate
7177
}
7278

7379
// Sample buffer is duration * sample rate, assuming mono
74-
segmenter.n = int(dur.Seconds() * float64(sample_rate))
80+
segmenter.n = int(segmenter.opts.SegmentSize.Seconds() * float64(sample_rate))
7581

7682
// Open the file
7783
media, err := ffmpeg.NewReader(r)
@@ -143,33 +149,20 @@ func (s *Segmenter) DecodeFloat32(ctx context.Context, fn SegmentFuncFloat32) er
143149
return nil
144150
}
145151

146-
// Calculate the energy of the frame - root mean squared and normalize between 0 and 1
147-
var sum float32
148-
var energy float64
149-
for _, sample := range data {
150-
sum += float32(sample) * float32(sample)
151-
}
152-
energy = math.Sqrt(float64(sum)/float64(len(data))) / float64(math.MaxInt16)
153-
154-
// If silence detection is enabled, check if the energy is below the threshold
155-
var cut bool
156-
if s.SilenceThreshold > 0 && energy < s.SilenceThreshold {
157-
// If the energy is below the threshold, we consider it silence
158-
if s.sts == -1 {
159-
// If this is the first silence, set the timestamp
160-
s.sts = frame.Ts()
161-
} else if frame.Ts()-s.sts >= s.SilenceDuration.Seconds() {
162-
// Cut when the buffer size is greater than 10 seconds
163-
if len(s.buf_flt) >= s.sample_rate*10 {
164-
cut = true
165-
}
166-
s.sts = -1 // Reset the silence timestamp
152+
// Calculate the energy of the frame and determine if we should "cut" the segment
153+
_, cut := s.detect_silence(frame.Ts(), func() float64 {
154+
var sum float32
155+
for _, sample := range data {
156+
sum += float32(sample) * float32(sample)
167157
}
168-
}
158+
return math.Sqrt(float64(sum)/float64(len(data))) / float64(math.MaxInt16)
159+
})
169160

170161
// Append float32 samples from plane 0 to buffer
171162
s.buf_flt = append(s.buf_flt, frame.Float32(0)...)
172163

164+
// TODO: Ignore the cut when we don't have enough samples for a segment (DecodeInt16 ignores it below half a segment)
165+
173166
// If n != 0 and len(buf) >= n, or we are cutting on silence, we have a segment to process
174167
if (s.n != 0 && len(s.buf_flt) >= s.n) || cut {
175168
if err := s.segment_flt(fn); err != nil {
@@ -203,6 +196,32 @@ func (s *Segmenter) DecodeFloat32(ctx context.Context, fn SegmentFuncFloat32) er
203196
return nil
204197
}
205198

199+
func (s *Segmenter) detect_silence(ts float64, energy_fn func() float64) (float64, bool) {
200+
energy := energy_fn()
201+
202+
// Segmenting or Silence detection is not enabled
203+
if s.SegmentSize == 0 || s.SilenceThreshold == 0 {
204+
return energy, false
205+
}
206+
207+
// If energy is above the threshold, reset the silence timestamp
208+
if energy >= s.SilenceThreshold {
209+
s.sts = -1
210+
return energy, false
211+
}
212+
213+
// Set the first frame of silence
214+
if s.sts == -1 {
215+
s.sts = ts
216+
return energy, false
217+
}
218+
219+
// Calculate the silence duration, and consider whether we consider this
220+
// a segment boundary.
221+
silence_duration := ts - s.sts
222+
return energy, silence_duration >= s.SilenceSize.Seconds()
223+
}
224+
206225
// Segments are output through a callback, with the samples and a timestamp
207226
// At the moment the "best" audio stream is used, based on ffmpeg heuristic.
208227
func (s *Segmenter) DecodeInt16(ctx context.Context, fn SegmentFuncInt16) error {
@@ -239,33 +258,23 @@ func (s *Segmenter) DecodeInt16(ctx context.Context, fn SegmentFuncInt16) error
239258
return nil
240259
}
241260

242-
// Calculate the energy of the frame - root mean squared and normalize between 0 and 1
243-
var sum float32
244-
var energy float64
245-
for _, sample := range data {
246-
sum += float32(sample) * float32(sample)
247-
}
248-
energy = math.Sqrt(float64(sum)/float64(len(data))) / float64(math.MaxInt16)
249-
250-
// If silence detection is enabled, check if the energy is below the threshold
251-
var cut bool
252-
if s.SilenceThreshold > 0 && energy < s.SilenceThreshold {
253-
// If the energy is below the threshold, we consider it silence
254-
if s.sts == -1 {
255-
// If this is the first silence, set the timestamp
256-
s.sts = frame.Ts()
257-
} else if frame.Ts()-s.sts >= s.SilenceDuration.Seconds() {
258-
// Cut when the buffer size is greater than 10 seconds
259-
if len(s.buf_s16) >= s.sample_rate*10 {
260-
cut = true
261-
}
262-
s.sts = -1 // Reset the silence timestamp
261+
// Calculate the energy of the frame and determine if we should "cut" the segment
262+
_, cut := s.detect_silence(frame.Ts(), func() float64 {
263+
var sum float32
264+
for _, sample := range data {
265+
sum += float32(sample) * float32(sample)
263266
}
264-
}
267+
return math.Sqrt(float64(sum)/float64(len(data))) / float64(math.MaxInt16)
268+
})
265269

266270
// Append int16 samples from plane 0 to buffer
267271
s.buf_s16 = append(s.buf_s16, data...)
268272

273+
// Ignore the cut if the buffer holds less than half a segment of samples (TODO: review this limit)
274+
if cut && len(s.buf_s16) < (s.n>>1) {
275+
cut = false
276+
}
277+
269278
// If n != 0 and len(buf) >= n, or we are cutting on silence, we have a segment to process
270279
if (s.n != 0 && len(s.buf_s16) >= s.n) || cut {
271280
if err := s.segment_s16(fn); err != nil {
@@ -303,11 +312,9 @@ func (s *Segmenter) DecodeInt16(ctx context.Context, fn SegmentFuncInt16) error
303312
// PRIVATE METHODS
304313

305314
func (s *Segmenter) segment_flt(fn SegmentFuncFloat32) error {
306-
// TODO: Pad any remaining samples with zeros if the buffer is not full
307315
return fn(s.ts, s.buf_flt)
308316
}
309317

310318
func (s *Segmenter) segment_s16(fn SegmentFuncInt16) error {
311-
// TODO: Pad any remaining samples with zeros if the buffer is not full
312319
return fn(s.ts, s.buf_s16)
313320
}

0 commit comments

Comments
 (0)