media/mojo/mojom/speech_recognizer.mojom
// Copyright 2024 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

module media.mojom;

import "media/mojo/mojom/speech_recognition_audio_forwarder.mojom";
import "media/mojo/mojom/speech_recognition_error.mojom";
import "media/mojo/mojom/speech_recognition_grammar.mojom";
import "media/mojo/mojom/speech_recognition_result.mojom";
import "media/mojo/mojom/speech_recognition_recognition_context.mojom";

// Created by the renderer and sent to the browser to start a speech
// recognition session.
struct StartSpeechRecognitionRequestParams {
  // Used to create a connection with the SpeechRecognitionSession
  // implementation that the browser creates for the new session.
  pending_receiver<SpeechRecognitionSession> session_receiver;

  // Used by the browser to communicate with a SpeechRecognitionSessionClient
  // implementation created for the new session.
  pending_remote<SpeechRecognitionSessionClient> client;

  // Language to use for speech recognition.
  string language;

  // Speech grammars to use.
  array<SpeechRecognitionGrammar> grammars;

  // The optional recognition context for speech recognition biasing.
  SpeechRecognitionRecognitionContext? recognition_context;

  // Maximum number of hypotheses allowed for each result.
  uint32 max_hypotheses;

  // Whether the user requested continuous recognition.
  bool continuous;

  // Whether the user requested interim results.
  bool interim_results;

  // Whether the speech recognition may happen on-device.
  bool on_device;

  // Whether the speech recognition is allowed to fall back to a cloud-based
  // speech recognition service.
  bool allow_cloud_fallback;

  // Used to pass audio from the renderer to the browser.
  pending_receiver<media.mojom.SpeechRecognitionAudioForwarder>?
      audio_forwarder;

  // The channel count of the forwarded audio.
  int32 channel_count;

  // The sample rate of the forwarded audio.
  int32 sample_rate;
};
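
// A minimal construction sketch for these params (hypothetical, not part of
// this file; assumes the C++ types generated from this mojom and a
// `client_receiver`, a mojo::Receiver bound to a
// SpeechRecognitionSessionClient implementation):
//
//   auto params = media::mojom::StartSpeechRecognitionRequestParams::New();
//   mojo::Remote<media::mojom::SpeechRecognitionSession> session;
//   params->session_receiver = session.BindNewPipeAndPassReceiver();
//   mojo::PendingRemote<media::mojom::SpeechRecognitionSessionClient> client;
//   client_receiver.Bind(client.InitWithNewPipeAndPassReceiver());
//   params->client = std::move(client);
//   params->language = "en-US";
//   params->max_hypotheses = 1;
//   params->continuous = false;
//   params->interim_results = true;
//   params->on_device = true;
//   params->allow_cloud_fallback = true;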

// API for the renderer process to manage speech recognition in the browser
// process. The remote lives in the renderer process and the receiver lives in
// the browser process.
interface SpeechRecognizer {
  // Requests the speech recognition service to start speech recognition.
  Start(StartSpeechRecognitionRequestParams params);
};
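
// A minimal renderer-side call sketch (hypothetical; assumes the params
// constructed above and a Blink frame whose BrowserInterfaceBroker can bind
// media::mojom::SpeechRecognizer in the browser process):
//
//   mojo::Remote<media::mojom::SpeechRecognizer> recognizer;
//   frame->GetBrowserInterfaceBroker().GetInterface(
//       recognizer.BindNewPipeAndPassReceiver());
//   recognizer->Start(std::move(params));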

// API for the renderer process to manage on-device speech recognition in the
// browser process. The remote lives in the renderer process and the receiver
// lives in the browser process.
interface OnDeviceSpeechRecognition {
  // Returns whether on-device speech recognition is available for a given
  // language. Takes in a BCP 47 language tag (e.g. "en-US").
  OnDeviceWebSpeechAvailable(string language) => (bool available);

  // Returns whether on-device speech recognition installation was
  // successfully initiated for the given language. Takes in a BCP 47
  // language tag (e.g. "en-US").
  InstallOnDeviceSpeechRecognition(string language) => (bool success);
};
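
// A minimal availability-check sketch (hypothetical; assumes an already
// bound mojo::Remote<media::mojom::OnDeviceSpeechRecognition> named
// `on_device`):
//
//   on_device->OnDeviceWebSpeechAvailable(
//       "en-US", base::BindOnce([](bool available) {
//         if (!available) {
//           // Optionally kick off installation for the language here.
//         }
//       }));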

// API for the renderer process to stop or abort an existing speech
// recognition session. A pending_receiver is sent to the browser process via
// SpeechRecognizer::Start, and is bound to an implementation there.
// SpeechRecognitionSession and SpeechRecognitionSessionClient are 1:1 with
// each other and with WebSpeechRecognitionHandle.
[Stable]
interface SpeechRecognitionSession {
  // Requests the speech recognition service to abort speech recognition for
  // the associated session.
  Abort@0();

  // Requests the speech recognition service to stop audio capture for the
  // associated session.
  StopCapture@1();
};
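
// A minimal control sketch (hypothetical; `session` is the
// mojo::Remote<media::mojom::SpeechRecognitionSession> bound when Start was
// called):
//
//   session->StopCapture();  // Stop capturing audio; results recognized so
//                            // far may still arrive before the "end" event.
//   session->Abort();        // Abort the session outright.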

// API for the browser process to communicate speech recognition related
// updates with the renderer and cause events to be dispatched to the
// appropriate speech recognition handle. A pending_remote for each handle is
// sent to the browser process via SpeechRecognizer::Start.
// SpeechRecognitionSession and SpeechRecognitionSessionClient are 1:1 with
// each other and with WebSpeechRecognitionHandle.
[Stable]
interface SpeechRecognitionSessionClient {
  // Called to dispatch the "result" event.
  ResultRetrieved@0(array<WebSpeechRecognitionResult> results);

  // Called to dispatch the "nomatch" event if the error code passed is of
  // type kNoMatch; otherwise dispatches an "error" event.
  ErrorOccurred@1(SpeechRecognitionError error);

  // Called to dispatch the "start" event.
  Started@2();

  // Called to dispatch the "audiostart" event.
  AudioStarted@3();

  // Called to dispatch the "soundstart" and "speechstart" events.
  SoundStarted@4();

  // Called to dispatch the "soundend" and "speechend" events.
  SoundEnded@5();

  // Called to dispatch the "audioend" event.
  AudioEnded@6();

  // Called to dispatch the "end" event.
  Ended@7();
};
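
// A minimal renderer-side implementation sketch (hypothetical; method
// signatures follow the usual C++ bindings generated from a mojom, where
// mojom structs map to `...Ptr` types):
//
//   class SpeechClient
//       : public media::mojom::SpeechRecognitionSessionClient {
//    public:
//     void ResultRetrieved(
//         std::vector<media::mojom::WebSpeechRecognitionResultPtr> results)
//         override { /* Dispatch the "result" event. */ }
//     void ErrorOccurred(
//         media::mojom::SpeechRecognitionErrorPtr error) override {
//       /* Dispatch "nomatch" or "error" depending on the error code. */
//     }
//     void Started() override {}
//     void AudioStarted() override {}
//     void SoundStarted() override {}
//     void SoundEnded() override {}
//     void AudioEnded() override {}
//     void Ended() override {}
//
//    private:
//     mojo::Receiver<media::mojom::SpeechRecognitionSessionClient>
//         receiver_{this};
//   };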