content / browser / speech / speech_recognizer_impl_unittest.cc [blame]

// Copyright 2013 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "content/browser/speech/speech_recognizer_impl.h"

#include <stddef.h>
#include <stdint.h>

#include <memory>
#include <vector>

#include "base/containers/span.h"
#include "base/functional/bind.h"
#include "base/functional/callback_helpers.h"
#include "base/memory/ref_counted.h"
#include "base/numerics/byte_conversions.h"
#include "base/run_loop.h"
#include "base/synchronization/waitable_event.h"
#include "base/test/scoped_feature_list.h"
#include "base/threading/thread.h"
#include "content/browser/speech/network_speech_recognition_engine_impl.h"
#include "content/public/browser/google_streaming_api.pb.h"
#include "content/public/browser/speech_recognition_audio_forwarder_config.h"
#include "content/public/browser/speech_recognition_event_listener.h"
#include "content/public/common/content_features.h"
#include "content/public/test/browser_task_environment.h"
#include "media/audio/audio_device_description.h"
#include "media/audio/audio_system_impl.h"
#include "media/audio/fake_audio_input_stream.h"
#include "media/audio/fake_audio_output_stream.h"
#include "media/audio/mock_audio_manager.h"
#include "media/audio/test_audio_thread.h"
#include "media/base/audio_bus.h"
#include "media/base/audio_glitch_info.h"
#include "media/base/test_helpers.h"
#include "mojo/public/cpp/bindings/remote.h"
#include "mojo/public/cpp/system/data_pipe.h"
#include "mojo/public/cpp/system/data_pipe_utils.h"
#include "net/base/net_errors.h"
#include "net/http/http_response_headers.h"
#include "net/http/http_util.h"
#include "services/network/public/cpp/url_loader_completion_status.h"
#include "services/network/public/cpp/weak_wrapper_shared_url_loader_factory.h"
#include "services/network/public/mojom/url_response_head.mojom.h"
#include "services/network/test/test_url_loader_factory.h"
#include "testing/gtest/include/gtest/gtest.h"

using media::AudioInputStream;
using media::AudioOutputStream;
using media::AudioParameters;

namespace content {

namespace {

class MockCapturerSource : public media::AudioCapturerSource {
 public:
  MockCapturerSource() = default;
  MOCK_METHOD2(Initialize,
               void(const media::AudioParameters& params,
                    CaptureCallback* callback));
  MOCK_METHOD0(Start, void());
  MOCK_METHOD0(Stop, void());
  MOCK_METHOD1(SetAutomaticGainControl, void(bool enable));
  MOCK_METHOD1(SetVolume, void(double volume));
  MOCK_METHOD1(SetOutputDeviceForAec,
               void(const std::string& output_device_id));

 protected:
  ~MockCapturerSource() override = default;
};

}  // namespace

class SpeechRecognizerImplTest : public SpeechRecognitionEventListener,
                                 public testing::Test {
 public:
  SpeechRecognizerImplTest()
      : audio_capturer_source_(new testing::NiceMock<MockCapturerSource>()),
        recognition_started_(false),
        recognition_ended_(false),
        result_received_(false),
        audio_started_(false),
        audio_ended_(false),
        sound_started_(false),
        sound_ended_(false),
        error_(media::mojom::SpeechRecognitionErrorCode::kNone),
        volume_(-1.0f) {
    // This test environment is not set up to support out-of-process services.
    feature_list_.InitWithFeatures(
        /*enabled_features=*/{},
        /*disabled_features=*/{features::kAudioServiceOutOfProcess});

    // SpeechRecognizer takes ownership of sr_engine.
    std::unique_ptr<NetworkSpeechRecognitionEngineImpl> sr_engine =
        std::make_unique<NetworkSpeechRecognitionEngineImpl>(
            base::MakeRefCounted<network::WeakWrapperSharedURLLoaderFactory>(
                &url_loader_factory_),
            "" /* accept_language */);
    NetworkSpeechRecognitionEngineImpl::Config config;
    config.audio_num_bits_per_sample =
        SpeechRecognizerImpl::kNumBitsPerAudioSample;
    config.audio_sample_rate = SpeechRecognizerImpl::kAudioSampleRate;
    config.filter_profanities = false;
    sr_engine->SetConfig(config);

    const int kTestingSessionId = 1;

    audio_manager_ = std::make_unique<media::MockAudioManager>(
        std::make_unique<media::TestAudioThread>(true));
    audio_manager_->SetInputStreamParameters(
        media::AudioParameters::UnavailableDeviceParams());
    audio_system_ =
        std::make_unique<media::AudioSystemImpl>(audio_manager_.get());
    SpeechRecognizerImpl::SetAudioEnvironmentForTesting(
        audio_system_.get(), audio_capturer_source_.get());
    recognizer_ = new SpeechRecognizerImpl(this, audio_system_.get(),
                                           kTestingSessionId, false, false,
                                           std::move(sr_engine), std::nullopt);

    int audio_packet_length_bytes =
        (SpeechRecognizerImpl::kAudioSampleRate *
         NetworkSpeechRecognitionEngineImpl::kAudioPacketIntervalMs *
         ChannelLayoutToChannelCount(SpeechRecognizerImpl::kChannelLayout) *
         SpeechRecognizerImpl::kNumBitsPerAudioSample) /
        (8 * 1000);
    audio_packet_.resize(audio_packet_length_bytes);

    const int channels =
        ChannelLayoutToChannelCount(SpeechRecognizerImpl::kChannelLayout);
    int bytes_per_sample = SpeechRecognizerImpl::kNumBitsPerAudioSample / 8;
    const int frames = audio_packet_length_bytes / channels / bytes_per_sample;
    audio_bus_ = media::AudioBus::Create(channels, frames);
    audio_bus_->Zero();
  }

  ~SpeechRecognizerImplTest() override {
    SpeechRecognizerImpl::SetAudioEnvironmentForTesting(nullptr, nullptr);
    audio_manager_->Shutdown();
  }

  [[nodiscard]] bool GetUpstreamRequest(
      const network::TestURLLoaderFactory::PendingRequest**
          pending_request_out) {
    return GetPendingRequest(pending_request_out, "/up");
  }

  [[nodiscard]] bool GetDownstreamRequest(
      const network::TestURLLoaderFactory::PendingRequest**
          pending_request_out) {
    return GetPendingRequest(pending_request_out, "/down");
  }

  [[nodiscard]] bool GetPendingRequest(
      const network::TestURLLoaderFactory::PendingRequest** pending_request_out,
      const char* url_substring) {
    for (const auto& pending_request :
         *url_loader_factory_.pending_requests()) {
      if (pending_request.request.url.spec().find(url_substring) !=
          std::string::npos) {
        *pending_request_out = &pending_request;
        return true;
      }
    }
    return false;
  }

  void CheckEventsConsistency() {
    // Note: "!x || y" == "x implies y".
    EXPECT_TRUE(!recognition_ended_ || recognition_started_);
    EXPECT_TRUE(!audio_ended_ || audio_started_);
    EXPECT_TRUE(!sound_ended_ || sound_started_);
    EXPECT_TRUE(!audio_started_ || recognition_started_);
    EXPECT_TRUE(!sound_started_ || audio_started_);
    EXPECT_TRUE(!audio_ended_ || (sound_ended_ || !sound_started_));
    EXPECT_TRUE(!recognition_ended_ || (audio_ended_ || !audio_started_));
  }

  void CheckFinalEventsConsistency() {
    // Note: "!(x ^ y)" == "(x && y) || (!x && !x)".
    EXPECT_FALSE(recognition_started_ ^ recognition_ended_);
    EXPECT_FALSE(audio_started_ ^ audio_ended_);
    EXPECT_FALSE(sound_started_ ^ sound_ended_);
  }

  // Overridden from SpeechRecognitionEventListener:
  void OnAudioStart(int session_id) override {
    audio_started_ = true;
    CheckEventsConsistency();
  }

  void OnAudioEnd(int session_id) override {
    audio_ended_ = true;
    CheckEventsConsistency();
  }

  void OnRecognitionResults(
      int session_id,
      const std::vector<media::mojom::WebSpeechRecognitionResultPtr>& results)
      override {
    result_received_ = true;
  }

  void OnRecognitionError(
      int session_id,
      const media::mojom::SpeechRecognitionError& error) override {
    EXPECT_TRUE(recognition_started_);
    EXPECT_FALSE(recognition_ended_);
    error_ = error.code;
  }

  void OnAudioLevelsChange(int session_id,
                           float volume,
                           float noise_volume) override {
    volume_ = volume;
    noise_volume_ = noise_volume;
  }

  void OnRecognitionEnd(int session_id) override {
    recognition_ended_ = true;
    CheckEventsConsistency();
  }

  void OnRecognitionStart(int session_id) override {
    recognition_started_ = true;
    CheckEventsConsistency();
  }

  void OnSoundStart(int session_id) override {
    sound_started_ = true;
    CheckEventsConsistency();
  }

  void OnSoundEnd(int session_id) override {
    sound_ended_ = true;
    CheckEventsConsistency();
  }

  void CopyPacketToAudioBus() {
    static_assert(SpeechRecognizerImpl::kNumBitsPerAudioSample == 16,
                  "FromInterleaved expects 2 bytes.");
    // Copy the created signal into an audio bus in a deinterleaved format.
    audio_bus_->FromInterleaved<media::SignedInt16SampleTypeTraits>(
        reinterpret_cast<int16_t*>(audio_packet_.data()), audio_bus_->frames());
  }

  void FillPacketWithTestWaveform() {
    // Fill the input with a simple pattern, a 125Hz sawtooth waveform.
    for (size_t i = 0; i < audio_packet_.size(); ++i)
      audio_packet_[i] = static_cast<uint8_t>(i);
    CopyPacketToAudioBus();
  }

  void FillPacketWithNoise() {
    int value = 0;
    int factor = 175;
    for (size_t i = 0; i < audio_packet_.size(); ++i) {
      value += factor;
      audio_packet_[i] = value % 100;
    }
    CopyPacketToAudioBus();
  }

  void Capture(media::AudioBus* data) {
    auto* capture_callback =
        static_cast<media::AudioCapturerSource::CaptureCallback*>(
            recognizer_.get());
    capture_callback->Capture(data, base::TimeTicks::Now(), {}, 0.0, false);
  }

  void OnCaptureError() {
    auto* capture_callback =
        static_cast<media::AudioCapturerSource::CaptureCallback*>(
            recognizer_.get());
    capture_callback->OnCaptureError(
        media::AudioCapturerSource::ErrorCode::kUnknown, "");
  }

  void WaitForAudioThreadToPostDeviceInfo() {
    media::WaitableMessageLoopEvent event;
    audio_manager_->GetTaskRunner()->PostTaskAndReply(
        FROM_HERE, base::DoNothing(), event.GetClosure());
    // Runs the loop and waits for the audio thread to call event's closure,
    // which means AudioSystem reply containing device parameters is already
    // queued on the main thread.
    event.RunAndWait();
  }

 protected:
  base::test::ScopedFeatureList feature_list_;
  BrowserTaskEnvironment task_environment_;
  network::TestURLLoaderFactory url_loader_factory_;
  scoped_refptr<SpeechRecognizerImpl> recognizer_;
  std::unique_ptr<media::MockAudioManager> audio_manager_;
  std::unique_ptr<media::AudioSystem> audio_system_;
  scoped_refptr<MockCapturerSource> audio_capturer_source_;
  bool recognition_started_;
  bool recognition_ended_;
  bool result_received_;
  bool audio_started_;
  bool audio_ended_;
  bool sound_started_;
  bool sound_ended_;
  media::mojom::SpeechRecognitionErrorCode error_;
  std::vector<uint8_t> audio_packet_;
  std::unique_ptr<media::AudioBus> audio_bus_;
  float volume_;
  float noise_volume_;
};

TEST_F(SpeechRecognizerImplTest, StartNoInputDevices) {
  // Check for callbacks when stopping record before any audio gets recorded.
  audio_manager_->SetHasInputDevices(false);
  recognizer_->StartRecognition(
      media::AudioDeviceDescription::kDefaultDeviceId);
  base::RunLoop().RunUntilIdle();  // EVENT_PREPARE processing.
  WaitForAudioThreadToPostDeviceInfo();
  base::RunLoop().RunUntilIdle();  // EVENT_START processing.
  EXPECT_TRUE(recognition_started_);
  EXPECT_FALSE(audio_started_);
  EXPECT_FALSE(result_received_);
  OnCaptureError();
  base::RunLoop().RunUntilIdle();
  EXPECT_EQ(media::mojom::SpeechRecognitionErrorCode::kAudioCapture, error_);
  CheckFinalEventsConsistency();
}

TEST_F(SpeechRecognizerImplTest, StartFakeInputDevice) {
  // Check for callbacks when stopping record before any audio gets recorded.
  audio_manager_->SetHasInputDevices(false);
  recognizer_->StartRecognition(
      media::AudioDeviceDescription::kDefaultDeviceId);
  base::RunLoop().RunUntilIdle();  // EVENT_PREPARE processing.
  WaitForAudioThreadToPostDeviceInfo();
  base::RunLoop().RunUntilIdle();  // EVENT_START processing.
  Capture(audio_bus_.get());
  recognizer_->StopAudioCapture();
  base::RunLoop().RunUntilIdle();
  EXPECT_TRUE(recognition_started_);
  EXPECT_TRUE(audio_started_);
  EXPECT_FALSE(result_received_);
  EXPECT_EQ(media::mojom::SpeechRecognitionErrorCode::kNone, error_);
  recognizer_->AbortRecognition();
  base::RunLoop().RunUntilIdle();
  CheckFinalEventsConsistency();
}

TEST_F(SpeechRecognizerImplTest, StopBeforeDeviceInfoReceived) {
  // Check for callbacks when stopping record before reply is received from
  // AudioSystem.
  base::WaitableEvent event(base::WaitableEvent::ResetPolicy::AUTOMATIC,
                            base::WaitableEvent::InitialState::NOT_SIGNALED);

  // Block audio thread.
  audio_manager_->GetTaskRunner()->PostTask(
      FROM_HERE,
      base::BindOnce(&base::WaitableEvent::Wait, base::Unretained(&event)));

  recognizer_->StartRecognition(
      media::AudioDeviceDescription::kDefaultDeviceId);
  recognizer_->StopAudioCapture();
  base::RunLoop().RunUntilIdle();

  // Release audio thread and receive a callback from it.
  event.Signal();
  WaitForAudioThreadToPostDeviceInfo();
  base::RunLoop().RunUntilIdle();

  EXPECT_TRUE(recognition_started_);
  EXPECT_FALSE(audio_started_);
  EXPECT_FALSE(result_received_);
  EXPECT_EQ(media::mojom::SpeechRecognitionErrorCode::kNone, error_);
  CheckFinalEventsConsistency();
}

TEST_F(SpeechRecognizerImplTest, CancelBeforeDeviceInfoReceived) {
  // Check for callbacks when stopping record before reply is received from
  // AudioSystem.
  base::WaitableEvent event(base::WaitableEvent::ResetPolicy::AUTOMATIC,
                            base::WaitableEvent::InitialState::NOT_SIGNALED);

  // Block audio thread.
  audio_manager_->GetTaskRunner()->PostTask(
      FROM_HERE,
      base::BindOnce(&base::WaitableEvent::Wait, base::Unretained(&event)));

  recognizer_->StartRecognition(
      media::AudioDeviceDescription::kDefaultDeviceId);
  recognizer_->AbortRecognition();
  base::RunLoop().RunUntilIdle();

  // Release audio thread and receive a callback from it.
  event.Signal();
  WaitForAudioThreadToPostDeviceInfo();
  base::RunLoop().RunUntilIdle();

  EXPECT_TRUE(recognition_started_);
  EXPECT_FALSE(audio_started_);
  EXPECT_FALSE(result_received_);
  EXPECT_EQ(media::mojom::SpeechRecognitionErrorCode::kNone, error_);
  CheckFinalEventsConsistency();
}

TEST_F(SpeechRecognizerImplTest, StopNoData) {
  // Check for callbacks when stopping record before any audio gets recorded.
  recognizer_->StartRecognition(
      media::AudioDeviceDescription::kDefaultDeviceId);
  base::RunLoop().RunUntilIdle();  // EVENT_PREPARE processing.
  WaitForAudioThreadToPostDeviceInfo();
  recognizer_->StopAudioCapture();
  base::RunLoop().RunUntilIdle();  // EVENT_START and EVENT_STOP processing.
  EXPECT_TRUE(recognition_started_);
  EXPECT_FALSE(audio_started_);
  EXPECT_FALSE(result_received_);
  EXPECT_EQ(media::mojom::SpeechRecognitionErrorCode::kNone, error_);
  CheckFinalEventsConsistency();
}

TEST_F(SpeechRecognizerImplTest, CancelNoData) {
  // Check for callbacks when canceling recognition before any audio gets
  // recorded.
  recognizer_->StartRecognition(
      media::AudioDeviceDescription::kDefaultDeviceId);
  base::RunLoop().RunUntilIdle();  // EVENT_PREPARE processing.
  WaitForAudioThreadToPostDeviceInfo();
  recognizer_->AbortRecognition();
  base::RunLoop().RunUntilIdle();  // EVENT_START and EVENT_ABORT processing.
  EXPECT_TRUE(recognition_started_);
  EXPECT_FALSE(audio_started_);
  EXPECT_FALSE(result_received_);
  EXPECT_EQ(media::mojom::SpeechRecognitionErrorCode::kAborted, error_);
  CheckFinalEventsConsistency();
}

TEST_F(SpeechRecognizerImplTest, StopWithData) {
  // Start recording, give some data and then stop. This should wait for the
  // network callback to arrive before completion.
  recognizer_->StartRecognition(
      media::AudioDeviceDescription::kDefaultDeviceId);
  base::RunLoop().RunUntilIdle();  // EVENT_PREPARE processing.
  WaitForAudioThreadToPostDeviceInfo();
  base::RunLoop().RunUntilIdle();  // EVENT_START processing.

  // Try sending 5 chunks of mock audio data and verify that each of them
  // resulted immediately in a packet sent out via the network. This verifies
  // that we are streaming out encoded data as chunks without waiting for the
  // full recording to complete.
  const size_t kNumChunks = 5;
  mojo::Remote<network::mojom::ChunkedDataPipeGetter> chunked_data_pipe_getter;
  mojo::ScopedDataPipeProducerHandle producer_handle;
  mojo::ScopedDataPipeConsumerHandle consumer_handle;
  ASSERT_EQ(mojo::CreateDataPipe(nullptr, producer_handle, consumer_handle),
            MOJO_RESULT_OK);
  for (size_t i = 0; i < kNumChunks; ++i) {
    Capture(audio_bus_.get());

    if (i == 0) {
      // Set up data channel to read chunked upload data. Must be done after the
      // first OnData() call.
      base::RunLoop().RunUntilIdle();
      const network::TestURLLoaderFactory::PendingRequest* upstream_request;
      ASSERT_TRUE(GetUpstreamRequest(&upstream_request));
      ASSERT_TRUE(upstream_request->request.request_body);
      ASSERT_EQ(1u, upstream_request->request.request_body->elements()->size());
      auto& element =
          (*upstream_request->request.request_body->elements_mutable())[0];
      ASSERT_EQ(network::DataElement::Tag::kChunkedDataPipe, element.type());
      chunked_data_pipe_getter.Bind(
          element.As<network::DataElementChunkedDataPipe>()
              .ReleaseChunkedDataPipeGetter());
      chunked_data_pipe_getter->StartReading(std::move(producer_handle));
    }

    std::string data;
    while (true) {
      base::RunLoop().RunUntilIdle();

      base::span<const uint8_t> buffer;
      MojoResult result =
          consumer_handle->BeginReadData(MOJO_READ_DATA_FLAG_NONE, buffer);
      if (result == MOJO_RESULT_OK) {
        data.append(base::as_string_view(buffer));
        consumer_handle->EndReadData(buffer.size());
        continue;
      }
      if (result == MOJO_RESULT_SHOULD_WAIT) {
        // Some data has already been read, assume there's no more to read.
        if (!data.empty())
          break;
        continue;
      }

      FAIL() << "Mojo pipe closed unexpectedly";
    }

    EXPECT_FALSE(data.empty());
  }

  recognizer_->StopAudioCapture();
  base::RunLoop().RunUntilIdle();
  EXPECT_TRUE(audio_started_);
  EXPECT_TRUE(audio_ended_);
  EXPECT_FALSE(recognition_ended_);
  EXPECT_FALSE(result_received_);
  EXPECT_EQ(media::mojom::SpeechRecognitionErrorCode::kNone, error_);

  // Create a response string.
  proto::SpeechRecognitionEvent proto_event;
  proto_event.set_status(proto::SpeechRecognitionEvent::STATUS_SUCCESS);
  proto::SpeechRecognitionResult* proto_result = proto_event.add_result();
  proto_result->set_final(true);
  proto::SpeechRecognitionAlternative* proto_alternative =
      proto_result->add_alternative();
  proto_alternative->set_confidence(0.5f);
  proto_alternative->set_transcript("123");
  std::string msg_string;
  proto_event.SerializeToString(&msg_string);
  msg_string.insert(0u, base::as_string_view(base::U32ToBigEndian(
                            base::checked_cast<uint32_t>(msg_string.size()))));

  // Issue the network callback to complete the process.
  const network::TestURLLoaderFactory::PendingRequest* downstream_request;
  ASSERT_TRUE(GetDownstreamRequest(&downstream_request));
  url_loader_factory_.AddResponse(downstream_request->request.url.spec(),
                                  msg_string);
  base::RunLoop().RunUntilIdle();

  EXPECT_TRUE(recognition_ended_);
  EXPECT_TRUE(result_received_);
  EXPECT_EQ(media::mojom::SpeechRecognitionErrorCode::kNone, error_);
  CheckFinalEventsConsistency();
}

TEST_F(SpeechRecognizerImplTest, CancelWithData) {
  // Start recording, give some data and then cancel.
  recognizer_->StartRecognition(
      media::AudioDeviceDescription::kDefaultDeviceId);
  base::RunLoop().RunUntilIdle();  // EVENT_PREPARE processing.
  WaitForAudioThreadToPostDeviceInfo();
  base::RunLoop().RunUntilIdle();  // EVENT_START processing.
  Capture(audio_bus_.get());
  base::RunLoop().RunUntilIdle();
  recognizer_->AbortRecognition();
  base::RunLoop().RunUntilIdle();
  // There should be both upstream and downstream pending requests.
  ASSERT_EQ(2u, url_loader_factory_.pending_requests()->size());
  EXPECT_TRUE(recognition_started_);
  EXPECT_TRUE(audio_started_);
  EXPECT_FALSE(result_received_);
  EXPECT_EQ(media::mojom::SpeechRecognitionErrorCode::kAborted, error_);
  CheckFinalEventsConsistency();
}

TEST_F(SpeechRecognizerImplTest, ConnectionError) {
  // Start recording, give some data and then stop. Issue the network callback
  // with a connection error and verify that the recognizer bubbles the error up
  recognizer_->StartRecognition(
      media::AudioDeviceDescription::kDefaultDeviceId);
  base::RunLoop().RunUntilIdle();  // EVENT_PREPARE processing.
  WaitForAudioThreadToPostDeviceInfo();
  base::RunLoop().RunUntilIdle();  // EVENT_START processing.
  Capture(audio_bus_.get());
  base::RunLoop().RunUntilIdle();
  // There should be both upstream and downstream pending requests.
  ASSERT_EQ(2u, url_loader_factory_.pending_requests()->size());

  recognizer_->StopAudioCapture();
  base::RunLoop().RunUntilIdle();
  EXPECT_TRUE(audio_started_);
  EXPECT_TRUE(audio_ended_);
  EXPECT_FALSE(recognition_ended_);
  EXPECT_FALSE(result_received_);
  EXPECT_EQ(media::mojom::SpeechRecognitionErrorCode::kNone, error_);

  // Issue the network callback to complete the process.
  const network::TestURLLoaderFactory::PendingRequest* pending_request;
  ASSERT_TRUE(GetUpstreamRequest(&pending_request));
  url_loader_factory_.AddResponse(
      pending_request->request.url, network::mojom::URLResponseHead::New(), "",
      network::URLLoaderCompletionStatus(net::ERR_CONNECTION_REFUSED));

  base::RunLoop().RunUntilIdle();
  EXPECT_TRUE(recognition_ended_);
  EXPECT_FALSE(result_received_);
  EXPECT_EQ(media::mojom::SpeechRecognitionErrorCode::kNetwork, error_);
  CheckFinalEventsConsistency();
}

TEST_F(SpeechRecognizerImplTest, ServerError) {
  // Start recording, give some data and then stop. Issue the network callback
  // with a 500 error and verify that the recognizer bubbles the error up
  recognizer_->StartRecognition(
      media::AudioDeviceDescription::kDefaultDeviceId);
  base::RunLoop().RunUntilIdle();  // EVENT_PREPARE processing.
  WaitForAudioThreadToPostDeviceInfo();
  base::RunLoop().RunUntilIdle();  // EVENT_START processing.
  Capture(audio_bus_.get());
  base::RunLoop().RunUntilIdle();
  // There should be both upstream and downstream pending requests.
  ASSERT_EQ(2u, url_loader_factory_.pending_requests()->size());

  recognizer_->StopAudioCapture();
  base::RunLoop().RunUntilIdle();
  EXPECT_TRUE(audio_started_);
  EXPECT_TRUE(audio_ended_);
  EXPECT_FALSE(recognition_ended_);
  EXPECT_FALSE(result_received_);
  EXPECT_EQ(media::mojom::SpeechRecognitionErrorCode::kNone, error_);

  const network::TestURLLoaderFactory::PendingRequest* pending_request;
  ASSERT_TRUE(GetUpstreamRequest(&pending_request));
  auto response = network::mojom::URLResponseHead::New();
  const char kHeaders[] = "HTTP/1.0 500 Internal Server Error";
  response->headers = base::MakeRefCounted<net::HttpResponseHeaders>(
      net::HttpUtil::AssembleRawHeaders(kHeaders));
  url_loader_factory_.AddResponse(pending_request->request.url,
                                  std::move(response), "",
                                  network::URLLoaderCompletionStatus());

  base::RunLoop().RunUntilIdle();
  EXPECT_TRUE(recognition_ended_);
  EXPECT_FALSE(result_received_);
  EXPECT_EQ(media::mojom::SpeechRecognitionErrorCode::kNetwork, error_);
  CheckFinalEventsConsistency();
}

TEST_F(SpeechRecognizerImplTest, OnCaptureError_PropagatesError) {
  // Check if things tear down properly if AudioInputController threw an error.
  recognizer_->StartRecognition(
      media::AudioDeviceDescription::kDefaultDeviceId);
  base::RunLoop().RunUntilIdle();  // EVENT_PREPARE processing.
  WaitForAudioThreadToPostDeviceInfo();
  base::RunLoop().RunUntilIdle();  // EVENT_START processing.

  OnCaptureError();
  base::RunLoop().RunUntilIdle();
  EXPECT_TRUE(recognition_started_);
  EXPECT_FALSE(audio_started_);
  EXPECT_FALSE(result_received_);
  EXPECT_EQ(media::mojom::SpeechRecognitionErrorCode::kAudioCapture, error_);
  CheckFinalEventsConsistency();
}

TEST_F(SpeechRecognizerImplTest, NoSpeechCallbackIssued) {
  // Start recording and give a lot of packets with audio samples set to zero.
  // This should trigger the no-speech detector and issue a callback.
  recognizer_->StartRecognition(
      media::AudioDeviceDescription::kDefaultDeviceId);
  base::RunLoop().RunUntilIdle();  // EVENT_PREPARE processing.
  WaitForAudioThreadToPostDeviceInfo();
  base::RunLoop().RunUntilIdle();  // EVENT_START processing.

  int num_packets =
      (SpeechRecognizerImpl::kNoSpeechTimeoutMs) /
          NetworkSpeechRecognitionEngineImpl::kAudioPacketIntervalMs +
      1;
  // The vector is already filled with zero value samples on create.
  for (int i = 0; i < num_packets; ++i) {
    Capture(audio_bus_.get());
  }
  base::RunLoop().RunUntilIdle();
  EXPECT_TRUE(recognition_started_);
  EXPECT_TRUE(audio_started_);
  EXPECT_FALSE(result_received_);
  EXPECT_EQ(media::mojom::SpeechRecognitionErrorCode::kNoSpeech, error_);
  CheckFinalEventsConsistency();
}

TEST_F(SpeechRecognizerImplTest, NoSpeechCallbackNotIssued) {
  // Start recording and give a lot of packets with audio samples set to zero
  // and then some more with reasonably loud audio samples. This should be
  // treated as normal speech input and the no-speech detector should not get
  // triggered.
  recognizer_->StartRecognition(
      media::AudioDeviceDescription::kDefaultDeviceId);
  base::RunLoop().RunUntilIdle();  // EVENT_PREPARE processing.
  WaitForAudioThreadToPostDeviceInfo();
  base::RunLoop().RunUntilIdle();  // EVENT_START processing.

  int num_packets = (SpeechRecognizerImpl::kNoSpeechTimeoutMs) /
                    NetworkSpeechRecognitionEngineImpl::kAudioPacketIntervalMs;

  // The vector is already filled with zero value samples on create.
  for (int i = 0; i < num_packets / 2; ++i) {
    Capture(audio_bus_.get());
  }

  FillPacketWithTestWaveform();
  for (int i = 0; i < num_packets / 2; ++i) {
    Capture(audio_bus_.get());
  }

  base::RunLoop().RunUntilIdle();
  EXPECT_EQ(media::mojom::SpeechRecognitionErrorCode::kNone, error_);
  EXPECT_TRUE(audio_started_);
  EXPECT_FALSE(audio_ended_);
  EXPECT_FALSE(recognition_ended_);
  recognizer_->AbortRecognition();
  base::RunLoop().RunUntilIdle();
  CheckFinalEventsConsistency();
}

TEST_F(SpeechRecognizerImplTest, SetInputVolumeCallback) {
  // Start recording and give a lot of packets with audio samples set to zero
  // and then some more with reasonably loud audio samples. Check that we don't
  // get the callback during estimation phase, then get zero for the silence
  // samples and proper volume for the loud audio.
  recognizer_->StartRecognition(
      media::AudioDeviceDescription::kDefaultDeviceId);
  base::RunLoop().RunUntilIdle();  // EVENT_PREPARE processing.
  WaitForAudioThreadToPostDeviceInfo();
  base::RunLoop().RunUntilIdle();  // EVENT_START processing.

  // Feed some samples to begin with for the endpointer to do noise estimation.
  int num_packets = SpeechRecognizerImpl::kEndpointerEstimationTimeMs /
                    NetworkSpeechRecognitionEngineImpl::kAudioPacketIntervalMs;
  FillPacketWithNoise();
  for (int i = 0; i < num_packets; ++i) {
    Capture(audio_bus_.get());
  }
  base::RunLoop().RunUntilIdle();
  EXPECT_EQ(-1.0f, volume_);  // No audio volume set yet.

  // The vector is already filled with zero value samples on create.
  Capture(audio_bus_.get());
  base::RunLoop().RunUntilIdle();
  EXPECT_FLOAT_EQ(0.74939233f, volume_);

  FillPacketWithTestWaveform();
  Capture(audio_bus_.get());
  base::RunLoop().RunUntilIdle();
  EXPECT_NEAR(0.89926866f, volume_, 0.00001f);
  EXPECT_FLOAT_EQ(0.75071919f, noise_volume_);

  EXPECT_EQ(media::mojom::SpeechRecognitionErrorCode::kNone, error_);
  EXPECT_FALSE(audio_ended_);
  EXPECT_FALSE(recognition_ended_);
  recognizer_->AbortRecognition();
  base::RunLoop().RunUntilIdle();
  CheckFinalEventsConsistency();
}

}  // namespace content