1
    2
    3
    4
    5
    6
    7
    8
    9
   10
   11
   12
   13
   14
   15
   16
   17
   18
   19
   20
   21
   22
   23
   24
   25
   26
   27
   28
   29
   30
   31
   32
   33
   34
   35
   36
   37
   38
   39
   40
   41
   42
   43
   44
   45
   46
   47
   48
   49
   50
   51
   52
   53
   54
   55
   56
   57
   58
   59
   60
   61
   62
   63
   64
   65
   66
   67
   68
   69
   70
   71
   72
   73
   74
   75
   76
   77
   78
   79
   80
   81
   82
   83
   84
   85
   86
   87
   88
   89
   90
   91
   92
   93
   94
   95
   96
   97
   98
   99
  100
  101
  102
  103
  104
  105
  106
  107
  108
  109
  110
  111
  112
  113
  114
  115
  116
  117
  118
  119
  120
  121
  122
  123
  124
  125
  126
  127
  128
  129
  130
  131
  132
  133
  134
  135
  136
  137
  138
  139
  140
  141
  142
  143
  144
  145
  146
  147
  148
  149
  150
  151
  152
  153
  154
  155
  156
  157
  158
  159
  160
  161
  162
  163
  164
  165
  166
  167
  168
  169
  170
  171
  172
  173
  174
  175
  176
  177
  178
  179
  180
  181
  182
  183
  184
  185
  186
  187
  188
  189
  190
  191
  192
  193
  194
  195
  196
  197
  198
  199
  200
  201
  202
  203
  204
  205
  206
  207
  208
  209
  210
  211
  212
  213
  214
  215
  216
  217
  218
  219
  220
  221
  222
  223
  224
  225
  226
  227
  228
  229
  230
  231
  232
  233
  234
  235
  236
  237
  238
  239
  240
  241
  242
  243
  244
  245
  246
  247
  248
  249
  250
  251
  252
  253
  254
  255
  256
  257
  258
  259
  260
  261
  262
  263
  264
  265
  266
  267
  268
  269
  270
  271
  272
  273
  274
  275
  276
  277
  278
  279
  280
  281
  282
  283
  284
  285
  286
  287
  288
  289
  290
  291
  292
  293
  294
  295
  296
  297
  298
  299
  300
  301
  302
  303
  304
  305
  306
  307
  308
  309
  310
  311
  312
  313
  314
  315
  316
  317
  318
  319
  320
  321
  322
  323
  324
  325
  326
  327
  328
  329
  330
  331
  332
  333
  334
  335
  336
  337
  338
  339
  340
  341
  342
  343
  344
  345
  346
  347
  348
  349
  350
  351
  352
  353
  354
  355

media / filters / audio_renderer_algorithm.h [blame]

// Copyright 2012 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// AudioRendererAlgorithm buffers and transforms audio data. The owner of
// this object provides audio data to the object through EnqueueBuffer() and
// requests data from the buffer via FillBuffer().
//
// This class is *not* thread-safe. Calls to enqueue and retrieve data must be
// locked if called from multiple threads.
//
// AudioRendererAlgorithm uses the Waveform Similarity Overlap and Add (WSOLA)
// algorithm to stretch or compress audio data to meet playback speeds less than
// or greater than the natural playback of the audio stream. The algorithm
// preserves local properties of the audio; therefore, pitch and harmonics are
// preserved. See audio_renderer_algorithm.cc for a more elaborate
// description of the algorithm.
//
// Audio at very low or very high playback rates is muted to preserve quality.

#ifndef MEDIA_FILTERS_AUDIO_RENDERER_ALGORITHM_H_
#define MEDIA_FILTERS_AUDIO_RENDERER_ALGORITHM_H_

#include <stdint.h>

#include <memory>
#include <optional>
#include <vector>

#include "base/memory/raw_ptr.h"
#include "base/memory/scoped_refptr.h"
#include "base/time/time.h"
#include "media/base/audio_buffer.h"
#include "media/base/audio_buffer_queue.h"
#include "media/base/audio_parameters.h"
#include "media/base/media_log.h"
#include "media/base/multi_channel_resampler.h"

namespace media {

// Forward declaration: this header only refers to AudioBus through raw
// pointers and std::unique_ptr members, so the full definition is not needed
// here.
class AudioBus;

class MEDIA_EXPORT AudioRendererAlgorithm {
 public:
  enum class FillBufferMode {
    kPassthrough,
    kResampler,
    kWSOLA,
  };

  AudioRendererAlgorithm(MediaLog* media_log);
  AudioRendererAlgorithm(MediaLog* media_log,
                         AudioRendererAlgorithmParameters params);

  AudioRendererAlgorithm(const AudioRendererAlgorithm&) = delete;
  AudioRendererAlgorithm& operator=(const AudioRendererAlgorithm&) = delete;

  ~AudioRendererAlgorithm();

  // Initializes this object with information about the audio stream.
  void Initialize(const AudioParameters& params, bool is_encrypted);

  // Allows clients to specify which channels will be considered by the
  // algorithm when adapting for playback rate, other channels will be muted.
  // Useful to avoid performance overhead of the adapatation algorithm. Must
  // only be called after Initialize(); may be called multiple times if the
  // mask changes.
  //
  // E.g., If |channel_mask| is [true, false] only the first channel will be
  // used to construct the playback rate adapted signal. This is useful if
  // channel upmixing has been performed prior to this point.
  void SetChannelMask(std::vector<bool> channel_mask);

  // Tries to fill |requested_frames| frames into |dest| with possibly scaled
  // data from our |audio_buffer_|. Data is scaled based on |playback_rate|,
  // using a variation of the Overlap-Add method to combine sample windows.
  //
  // Data from |audio_buffer_| is consumed in proportion to the playback rate.
  //
  // |dest_offset| is the offset in frames for writing into |dest|.
  //
  // Returns the number of frames copied into |dest|.
  int FillBuffer(AudioBus* dest,
                 int dest_offset,
                 int requested_frames,
                 double playback_rate);

  // Clears |audio_buffer_|.
  void FlushBuffers();

  // Enqueues a buffer. It is called from the owner of the algorithm after a
  // read completes.
  void EnqueueBuffer(scoped_refptr<AudioBuffer> buffer_in);

  // Sets a target queue latency. This target will be clamped and stored in
  // |playback_threshold_|. It may also cause an increase in |capacity_|. A
  // value of nullopt indicates the algorithm should restore the default value.
  void SetLatencyHint(std::optional<base::TimeDelta> latency_hint);

  // Sets a flag indicating whether apply pitch adjustments when playing back
  // at rates other than 1.0. Concretely, we use WSOLA when this is true, and
  // resampling when this is false.
  void SetPreservesPitch(bool preserves_pitch);

  // Returns true if the |audio_buffer_| is >= |playback_threshold_|.
  bool IsQueueAdequateForPlayback();

  // Returns the required size for |audio_buffer_| to be "adequate for
  // playback". See IsQueueAdequateForPlayback().
  int QueuePlaybackThreshold() const { return playback_threshold_; }

  // Returns true if |audio_buffer_| is >= |capacity_|.
  bool IsQueueFull();

  // Returns the capacity of |audio_buffer_| in frames.
  int QueueCapacity() const { return capacity_; }

  // Increase the |playback_threshold_| and |capacity_| of |audio_buffer_| if
  // possible. Should not be called if a custom |playback_threshold_| was
  // specified.
  void IncreasePlaybackThreshold();

  // Sets a flag to bypass underflow detection, to read out all remaining data.
  void MarkEndOfStream();

  // Returns an estimate of the amount of memory (in bytes) used for frames.
  int64_t GetMemoryUsage() const;

  // Returns the total number of frames in |audio_buffer_| as well as
  // unconsumed input frames in the |resampler_|. The returned value may be
  // larger than QueueCapacity() in the event that EnqueueBuffer() delivered
  // more data than |audio_buffer_| was intending to hold.
  int BufferedFrames() const;

  // Returns the effective delay in output frames at the given |playback rate|.
  // Effectively this tells the caller, if new audio is enqueued via
  // EnqueueBuffer(), how many frames must be read via FillBuffer() at the
  // |playback_rate| before the new audio is read out. Note that this is
  // approximate, since due to WSOLA the audio output doesn't always directly
  // correspond to the audio input (some samples may be duplicated or skipped).
  double DelayInFrames(double playback_rate) const;

  // Returns the timestamp of the first AudioBuffer in `audio_buffer_` if any
  // buffers exist.
  std::optional<base::TimeDelta> FrontTimestamp() const;

  // Returns the samples per second for this audio stream.
  int samples_per_second() const { return samples_per_second_; }

  std::vector<bool> channel_mask_for_testing() { return channel_mask_; }

  FillBufferMode last_mode_for_testing() { return last_mode_; }

  // WSOLA is a non-linear operation, so in order for AudioClock to be correct
  // we need to expose the actual rate of input frames consumed. This is updated
  // after every call to FillBuffer().
  double effective_playback_rate() const { return effective_playback_rate_; }

 private:
  FillBufferMode ChooseBufferMode(double playback_rate);

  // Remove buffered data that will be outdated if we switch fill mode.
  void SetFillBufferMode(FillBufferMode mode);

  // Within |search_block_|, find the block of data that is most similar to
  // |target_block_|, and write it in |optimal_block_|. This method assumes that
  // there is enough data to perform a search, i.e. |search_block_| and
  // |target_block_| can be extracted from the available frames.
  void GetOptimalBlock();

  // Read a maximum of |requested_frames| frames from |wsola_output_|. Returns
  // number of frames actually read.
  int WriteCompletedFramesTo(
      int requested_frames, int output_offset, AudioBus* dest);

  // Fill |dest| with frames from |audio_buffer_| starting from frame
  // |read_offset_frames|. |dest| is expected to have the same number of
  // channels as |audio_buffer_|. A negative offset, i.e.
  // |read_offset_frames| < 0, is accepted assuming that |audio_buffer| is zero
  // for negative indices. This might happen for few first frames. This method
  // assumes there is enough frames to fill |dest|, i.e. |read_offset_frames| +
  // |dest->frames()| does not extend to future.
  void PeekAudioWithZeroPrepend(int read_offset_frames, AudioBus* dest);

  // Run one iteration of WSOLA, if there are sufficient frames. This will
  // overlap-and-add one block to |wsola_output_|, hence, |num_complete_frames_|
  // is incremented by |ola_hop_size_|.
  bool RunOneWsolaIteration(double playback_rate);

  // Seek |audio_buffer_| forward to remove frames from input that are not used
  // any more. State of the WSOLA will be updated accordingly.
  void RemoveOldInputFrames(double playback_rate);

  // Update |output_time_| by |time_change|. In turn |search_block_index_| is
  // updated.
  void UpdateOutputTime(double playback_rate, double time_change);

  // Is |target_block_| fully within |search_block_|? If so, we don't need to
  // perform the search.
  bool TargetIsWithinSearchRegion() const;

  // Do we have enough data to perform one round of WSOLA?
  bool CanPerformWsola() const;

  // Creates or recreates |target_block_wrapper_| and |search_block_wrapper_|
  // after a |channel_mask_| change. May be called at anytime after a channel
  // mask has been specified.
  void CreateSearchWrappers();

  // Uses |resampler_| to speed up or slowdown audio, by using a resampling
  // ratio of |playback_rate|.
  int ResampleAndFill(AudioBus* dest,
                      int dest_offset,
                      int requested_frames,
                      double playback_rate);

  // Uses the WSOLA algorithm to speed up or slowdown audio.
  int RunWsolaAndFill(AudioBus* dest,
                      int dest_offset,
                      int requested_frames,
                      double playback_rate);

  // Called by |resampler_| to get more audio data.
  void OnResamplerRead(int frame_delay, AudioBus* audio_bus);

  raw_ptr<MediaLog> media_log_;

  // Parameters.
  AudioRendererAlgorithmParameters audio_renderer_algorithm_params_;

  // Number of channels in audio stream.
  int channels_;

  // Sample rate of audio stream.
  int samples_per_second_;

  // Is compressed audio output
  bool is_bitstream_format_;

  // Buffered audio data.
  AudioBufferQueue audio_buffer_;

  // Hint to adjust |playback_threshold_| as a means of controlling playback
  // start latency. See SetLatencyHint();
  std::optional<base::TimeDelta> latency_hint_;

  // Whether to apply pitch adjusments or not when playing back at rates other
  // than 1.0. In other words, we use WSOLA to preserve pitch when this is on,
  // and resampling when this
  bool preserves_pitch_ = true;

  // How many frames to have in queue before beginning playback.
  int64_t playback_threshold_;

  // Minimum allowed value for |plabyack_threshold_| calculated by Initialize().
  int64_t min_playback_threshold_;

  // How many frames to have in the queue before we report the queue is full.
  int64_t capacity_;

  // Book keeping of the current time of generated audio, in frames. This
  // should be appropriately updated when out samples are generated, regardless
  // of whether we push samples out when FillBuffer() is called or we store
  // audio in |wsola_output_| for the subsequent calls to FillBuffer().
  // Furthermore, if samples from |audio_buffer_| are evicted then this
  // member variable should be updated based on |playback_rate_|.
  // Note that this member should be updated ONLY by calling UpdateOutputTime(),
  // so that |search_block_index_| is update accordingly.
  double output_time_;

  // The offset of the center frame of |search_block_| w.r.t. its first frame.
  int search_block_center_offset_;

  // Index of the beginning of the |search_block_|, in frames.
  int search_block_index_;

  // Number of Blocks to search to find the most similar one to the target
  // frame.
  int num_candidate_blocks_;

  // Index of the beginning of the target block, counted in frames.
  int target_block_index_;

  // Overlap-and-add window size in frames.
  int ola_window_size_;

  // The hop size of overlap-and-add in frames. This implementation assumes 50%
  // overlap-and-add.
  int ola_hop_size_;

  // Number of frames in |wsola_output_| that overlap-and-add is completed for
  // them and can be copied to output if FillBuffer() is called. It also
  // specifies the index where the next WSOLA window has to overlap-and-add.
  int num_complete_frames_;

  bool reached_end_of_stream_ = false;

  // Used to replace WSOLA algorithm at playback speeds close to 1.0. This is to
  // prevent noticeable audio artifacts introduced by WSOLA, at the expense of
  // changing the pitch of the audio.
  std::unique_ptr<MultiChannelResampler> resampler_;

  // True when the last call to OnResamplerRead() only gave silence to
  // |resampler_|. Used to determine whether or not we have played out all the
  // valid audio from |resampler.BufferedFrames()|.
  bool resampler_only_has_silence_ = false;

  // This stores a part of the output that is created but couldn't be rendered.
  // Output is generated frame-by-frame which at some point might exceed the
  // number of requested samples. Furthermore, due to overlap-and-add,
  // the last half-window of the output is incomplete, which is stored in this
  // buffer.
  std::unique_ptr<AudioBus> wsola_output_;

  // Overlap-and-add window.
  std::unique_ptr<float[]> ola_window_;

  // Transition window, used to update |optimal_block_| by a weighted sum of
  // |optimal_block_| and |target_block_|.
  std::unique_ptr<float[]> transition_window_;

  // Auxiliary variables to avoid allocation in every iteration.

  // Stores the optimal block in every iteration. This is the most
  // similar block to |target_block_| within |search_block_| and it is
  // overlap-and-added to |wsola_output_|.
  std::unique_ptr<AudioBus> optimal_block_;

  // A block of data that search is performed over to find the |optimal_block_|.
  std::unique_ptr<AudioBus> search_block_;

  // Stores the target block, denoted as |target| above. |search_block_| is
  // searched for a block (|optimal_block_|) that is most similar to
  // |target_block_|.
  std::unique_ptr<AudioBus> target_block_;

  // Active channels to consider while searching. Used to speed up WSOLA
  // processing by ignoring always muted channels. Wrappers are always
  // constructed during Initialize() and have <= |channels_|.
  std::vector<bool> channel_mask_;
  std::unique_ptr<AudioBus> search_block_wrapper_;
  std::unique_ptr<AudioBus> target_block_wrapper_;

  // The initial and maximum capacity calculated by Initialize().
  int64_t initial_capacity_;
  int64_t max_capacity_;

  double effective_playback_rate_ = 0;

  FillBufferMode last_mode_ = FillBufferMode::kPassthrough;
};

}  // namespace media

#endif  // MEDIA_FILTERS_AUDIO_RENDERER_ALGORITHM_H_