1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
// media/filters/audio_renderer_algorithm.h
// Copyright 2012 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// AudioRendererAlgorithm buffers and transforms audio data. The owner of
// this object provides audio data to the object through EnqueueBuffer() and
// requests data from the buffer via FillBuffer().
//
// This class is *not* thread-safe. Calls to enqueue and retrieve data must be
// locked if called from multiple threads.
//
// AudioRendererAlgorithm uses the Waveform Similarity Overlap and Add (WSOLA)
// algorithm to stretch or compress audio data to meet playback speeds less than
// or greater than the natural playback of the audio stream. The algorithm
// preserves local properties of the audio; therefore, pitch and harmonics are
// preserved. See audio_renderer_algorithm.cc for a more elaborate description
// of the algorithm.
//
// Audio at very low or very high playback rates is muted to preserve quality.
#ifndef MEDIA_FILTERS_AUDIO_RENDERER_ALGORITHM_H_
#define MEDIA_FILTERS_AUDIO_RENDERER_ALGORITHM_H_
#include <stdint.h>
#include <memory>
#include <optional>
#include <vector>
#include "base/memory/raw_ptr.h"
#include "base/memory/scoped_refptr.h"
#include "base/time/time.h"
#include "media/base/audio_buffer.h"
#include "media/base/audio_buffer_queue.h"
#include "media/base/audio_parameters.h"
#include "media/base/media_log.h"
#include "media/base/multi_channel_resampler.h"
namespace media {
class AudioBus;
class MEDIA_EXPORT AudioRendererAlgorithm {
public:
enum class FillBufferMode {
kPassthrough,
kResampler,
kWSOLA,
};
AudioRendererAlgorithm(MediaLog* media_log);
AudioRendererAlgorithm(MediaLog* media_log,
AudioRendererAlgorithmParameters params);
AudioRendererAlgorithm(const AudioRendererAlgorithm&) = delete;
AudioRendererAlgorithm& operator=(const AudioRendererAlgorithm&) = delete;
~AudioRendererAlgorithm();
// Initializes this object with information about the audio stream.
void Initialize(const AudioParameters& params, bool is_encrypted);
// Allows clients to specify which channels will be considered by the
// algorithm when adapting for playback rate, other channels will be muted.
// Useful to avoid performance overhead of the adapatation algorithm. Must
// only be called after Initialize(); may be called multiple times if the
// mask changes.
//
// E.g., If |channel_mask| is [true, false] only the first channel will be
// used to construct the playback rate adapted signal. This is useful if
// channel upmixing has been performed prior to this point.
void SetChannelMask(std::vector<bool> channel_mask);
// Tries to fill |requested_frames| frames into |dest| with possibly scaled
// data from our |audio_buffer_|. Data is scaled based on |playback_rate|,
// using a variation of the Overlap-Add method to combine sample windows.
//
// Data from |audio_buffer_| is consumed in proportion to the playback rate.
//
// |dest_offset| is the offset in frames for writing into |dest|.
//
// Returns the number of frames copied into |dest|.
int FillBuffer(AudioBus* dest,
int dest_offset,
int requested_frames,
double playback_rate);
// Clears |audio_buffer_|.
void FlushBuffers();
// Enqueues a buffer. It is called from the owner of the algorithm after a
// read completes.
void EnqueueBuffer(scoped_refptr<AudioBuffer> buffer_in);
// Sets a target queue latency. This target will be clamped and stored in
// |playback_threshold_|. It may also cause an increase in |capacity_|. A
// value of nullopt indicates the algorithm should restore the default value.
void SetLatencyHint(std::optional<base::TimeDelta> latency_hint);
// Sets a flag indicating whether apply pitch adjustments when playing back
// at rates other than 1.0. Concretely, we use WSOLA when this is true, and
// resampling when this is false.
void SetPreservesPitch(bool preserves_pitch);
// Returns true if the |audio_buffer_| is >= |playback_threshold_|.
bool IsQueueAdequateForPlayback();
// Returns the required size for |audio_buffer_| to be "adequate for
// playback". See IsQueueAdequateForPlayback().
int QueuePlaybackThreshold() const { return playback_threshold_; }
// Returns true if |audio_buffer_| is >= |capacity_|.
bool IsQueueFull();
// Returns the capacity of |audio_buffer_| in frames.
int QueueCapacity() const { return capacity_; }
// Increase the |playback_threshold_| and |capacity_| of |audio_buffer_| if
// possible. Should not be called if a custom |playback_threshold_| was
// specified.
void IncreasePlaybackThreshold();
// Sets a flag to bypass underflow detection, to read out all remaining data.
void MarkEndOfStream();
// Returns an estimate of the amount of memory (in bytes) used for frames.
int64_t GetMemoryUsage() const;
// Returns the total number of frames in |audio_buffer_| as well as
// unconsumed input frames in the |resampler_|. The returned value may be
// larger than QueueCapacity() in the event that EnqueueBuffer() delivered
// more data than |audio_buffer_| was intending to hold.
int BufferedFrames() const;
// Returns the effective delay in output frames at the given |playback rate|.
// Effectively this tells the caller, if new audio is enqueued via
// EnqueueBuffer(), how many frames must be read via FillBuffer() at the
// |playback_rate| before the new audio is read out. Note that this is
// approximate, since due to WSOLA the audio output doesn't always directly
// correspond to the audio input (some samples may be duplicated or skipped).
double DelayInFrames(double playback_rate) const;
// Returns the timestamp of the first AudioBuffer in `audio_buffer_` if any
// buffers exist.
std::optional<base::TimeDelta> FrontTimestamp() const;
// Returns the samples per second for this audio stream.
int samples_per_second() const { return samples_per_second_; }
std::vector<bool> channel_mask_for_testing() { return channel_mask_; }
FillBufferMode last_mode_for_testing() { return last_mode_; }
// WSOLA is a non-linear operation, so in order for AudioClock to be correct
// we need to expose the actual rate of input frames consumed. This is updated
// after every call to FillBuffer().
double effective_playback_rate() const { return effective_playback_rate_; }
private:
FillBufferMode ChooseBufferMode(double playback_rate);
// Remove buffered data that will be outdated if we switch fill mode.
void SetFillBufferMode(FillBufferMode mode);
// Within |search_block_|, find the block of data that is most similar to
// |target_block_|, and write it in |optimal_block_|. This method assumes that
// there is enough data to perform a search, i.e. |search_block_| and
// |target_block_| can be extracted from the available frames.
void GetOptimalBlock();
// Read a maximum of |requested_frames| frames from |wsola_output_|. Returns
// number of frames actually read.
int WriteCompletedFramesTo(
int requested_frames, int output_offset, AudioBus* dest);
// Fill |dest| with frames from |audio_buffer_| starting from frame
// |read_offset_frames|. |dest| is expected to have the same number of
// channels as |audio_buffer_|. A negative offset, i.e.
// |read_offset_frames| < 0, is accepted assuming that |audio_buffer| is zero
// for negative indices. This might happen for few first frames. This method
// assumes there is enough frames to fill |dest|, i.e. |read_offset_frames| +
// |dest->frames()| does not extend to future.
void PeekAudioWithZeroPrepend(int read_offset_frames, AudioBus* dest);
// Run one iteration of WSOLA, if there are sufficient frames. This will
// overlap-and-add one block to |wsola_output_|, hence, |num_complete_frames_|
// is incremented by |ola_hop_size_|.
bool RunOneWsolaIteration(double playback_rate);
// Seek |audio_buffer_| forward to remove frames from input that are not used
// any more. State of the WSOLA will be updated accordingly.
void RemoveOldInputFrames(double playback_rate);
// Update |output_time_| by |time_change|. In turn |search_block_index_| is
// updated.
void UpdateOutputTime(double playback_rate, double time_change);
// Is |target_block_| fully within |search_block_|? If so, we don't need to
// perform the search.
bool TargetIsWithinSearchRegion() const;
// Do we have enough data to perform one round of WSOLA?
bool CanPerformWsola() const;
// Creates or recreates |target_block_wrapper_| and |search_block_wrapper_|
// after a |channel_mask_| change. May be called at anytime after a channel
// mask has been specified.
void CreateSearchWrappers();
// Uses |resampler_| to speed up or slowdown audio, by using a resampling
// ratio of |playback_rate|.
int ResampleAndFill(AudioBus* dest,
int dest_offset,
int requested_frames,
double playback_rate);
// Uses the WSOLA algorithm to speed up or slowdown audio.
int RunWsolaAndFill(AudioBus* dest,
int dest_offset,
int requested_frames,
double playback_rate);
// Called by |resampler_| to get more audio data.
void OnResamplerRead(int frame_delay, AudioBus* audio_bus);
raw_ptr<MediaLog> media_log_;
// Parameters.
AudioRendererAlgorithmParameters audio_renderer_algorithm_params_;
// Number of channels in audio stream.
int channels_;
// Sample rate of audio stream.
int samples_per_second_;
// Is compressed audio output
bool is_bitstream_format_;
// Buffered audio data.
AudioBufferQueue audio_buffer_;
// Hint to adjust |playback_threshold_| as a means of controlling playback
// start latency. See SetLatencyHint();
std::optional<base::TimeDelta> latency_hint_;
// Whether to apply pitch adjusments or not when playing back at rates other
// than 1.0. In other words, we use WSOLA to preserve pitch when this is on,
// and resampling when this
bool preserves_pitch_ = true;
// How many frames to have in queue before beginning playback.
int64_t playback_threshold_;
// Minimum allowed value for |plabyack_threshold_| calculated by Initialize().
int64_t min_playback_threshold_;
// How many frames to have in the queue before we report the queue is full.
int64_t capacity_;
// Book keeping of the current time of generated audio, in frames. This
// should be appropriately updated when out samples are generated, regardless
// of whether we push samples out when FillBuffer() is called or we store
// audio in |wsola_output_| for the subsequent calls to FillBuffer().
// Furthermore, if samples from |audio_buffer_| are evicted then this
// member variable should be updated based on |playback_rate_|.
// Note that this member should be updated ONLY by calling UpdateOutputTime(),
// so that |search_block_index_| is update accordingly.
double output_time_;
// The offset of the center frame of |search_block_| w.r.t. its first frame.
int search_block_center_offset_;
// Index of the beginning of the |search_block_|, in frames.
int search_block_index_;
// Number of Blocks to search to find the most similar one to the target
// frame.
int num_candidate_blocks_;
// Index of the beginning of the target block, counted in frames.
int target_block_index_;
// Overlap-and-add window size in frames.
int ola_window_size_;
// The hop size of overlap-and-add in frames. This implementation assumes 50%
// overlap-and-add.
int ola_hop_size_;
// Number of frames in |wsola_output_| that overlap-and-add is completed for
// them and can be copied to output if FillBuffer() is called. It also
// specifies the index where the next WSOLA window has to overlap-and-add.
int num_complete_frames_;
bool reached_end_of_stream_ = false;
// Used to replace WSOLA algorithm at playback speeds close to 1.0. This is to
// prevent noticeable audio artifacts introduced by WSOLA, at the expense of
// changing the pitch of the audio.
std::unique_ptr<MultiChannelResampler> resampler_;
// True when the last call to OnResamplerRead() only gave silence to
// |resampler_|. Used to determine whether or not we have played out all the
// valid audio from |resampler.BufferedFrames()|.
bool resampler_only_has_silence_ = false;
// This stores a part of the output that is created but couldn't be rendered.
// Output is generated frame-by-frame which at some point might exceed the
// number of requested samples. Furthermore, due to overlap-and-add,
// the last half-window of the output is incomplete, which is stored in this
// buffer.
std::unique_ptr<AudioBus> wsola_output_;
// Overlap-and-add window.
std::unique_ptr<float[]> ola_window_;
// Transition window, used to update |optimal_block_| by a weighted sum of
// |optimal_block_| and |target_block_|.
std::unique_ptr<float[]> transition_window_;
// Auxiliary variables to avoid allocation in every iteration.
// Stores the optimal block in every iteration. This is the most
// similar block to |target_block_| within |search_block_| and it is
// overlap-and-added to |wsola_output_|.
std::unique_ptr<AudioBus> optimal_block_;
// A block of data that search is performed over to find the |optimal_block_|.
std::unique_ptr<AudioBus> search_block_;
// Stores the target block, denoted as |target| above. |search_block_| is
// searched for a block (|optimal_block_|) that is most similar to
// |target_block_|.
std::unique_ptr<AudioBus> target_block_;
// Active channels to consider while searching. Used to speed up WSOLA
// processing by ignoring always muted channels. Wrappers are always
// constructed during Initialize() and have <= |channels_|.
std::vector<bool> channel_mask_;
std::unique_ptr<AudioBus> search_block_wrapper_;
std::unique_ptr<AudioBus> target_block_wrapper_;
// The initial and maximum capacity calculated by Initialize().
int64_t initial_capacity_;
int64_t max_capacity_;
double effective_playback_rate_ = 0;
FillBufferMode last_mode_ = FillBufferMode::kPassthrough;
};
} // namespace media
#endif // MEDIA_FILTERS_AUDIO_RENDERER_ALGORITHM_H_