media/mojo/mojom/speech_recognition.mojom

// Copyright 2022 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

module media.mojom;

import "media/mojo/mojom/audio_data.mojom";
import "media/mojo/mojom/speech_recognition_audio_forwarder.mojom";
import "media/mojo/mojom/speech_recognition_recognition_context.mojom";
import "media/mojo/mojom/speech_recognizer.mojom";
import "mojo/public/mojom/base/time.mojom";
import "mojo/public/mojom/base/unguessable_token.mojom";
import "ui/gfx/geometry/mojom/geometry.mojom";

// Next MinVersion: 9

// Corresponds to the LangIdEvent.ConfidenceInterval defined in
// http://google3/speech/soda/public/soda_event.proto.
[Stable, Extensible]
enum ConfidenceLevel {
  [Default] kUnknown,
  kNotConfident,
  kConfident,
  kHighlyConfident,
};

// Corresponds to the LangIdEvent.AsrSwitchResult defined in
// http://google3/speech/soda/public/soda_event.proto.
[Stable, Extensible]
enum AsrSwitchResult {
  [Default] kDefaultNoSwitch,
  kSwitchSucceeded,
  kSwitchFailed,
  kSwitchSkipedNoLp,
};

// The main interface a client uses to interact with a speech
// recognition service process. For Live Caption, every renderer can own one
// or more Remote<SpeechRecognitionContext>, with the receiver bound through the
// BrowserInterfaceBroker. For the Web Speech API, the browser is the client and
// can own one or more Remote<SpeechRecognitionContext>. This is a stable
// interface that is used across the LaCrOS/Ash boundary.
[Stable]
interface SpeechRecognitionContext {
  // Bind the recognizers to the speech recognition service. Returns a flag
  // indicating whether multichannel audio is supported by the speech
  // recognition service.
  BindRecognizer@0(pending_receiver<SpeechRecognitionRecognizer> receiver,
                   pending_remote<SpeechRecognitionRecognizerClient> client,
                   SpeechRecognitionOptions options)
      => (bool is_multichannel_supported);

  // Bind the recognizer used by the Web Speech API.
  [MinVersion=6]
  BindWebSpeechRecognizer@1(
      pending_receiver<media.mojom.SpeechRecognitionSession> session_receiver,
      pending_remote<media.mojom.SpeechRecognitionSessionClient> session_client,
      pending_receiver<SpeechRecognitionAudioForwarder> audio_forwarder,
      int32 channel_count,
      int32 sample_rate,
      SpeechRecognitionOptions options,
      bool continuous);
};
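
// Illustrative (non-normative) C++ sketch of how a Live Caption client might
// bind a recognizer through SpeechRecognitionContext. The variable names and
// surrounding setup are assumptions for illustration only:
//
//   mojo::Remote<media::mojom::SpeechRecognitionContext> context;
//   // ... bind |context| through the BrowserInterfaceBroker ...
//   mojo::Remote<media::mojom::SpeechRecognitionRecognizer> recognizer;
//   mojo::PendingRemote<media::mojom::SpeechRecognitionRecognizerClient>
//       client;
//   // ... |client| is bound to an implementation of the client interface ...
//   auto options = media::mojom::SpeechRecognitionOptions::New();
//   options->recognition_mode = media::mojom::SpeechRecognitionMode::kCaption;
//   context->BindRecognizer(
//       recognizer.BindNewPipeAndPassReceiver(), std::move(client),
//       std::move(options),
//       base::BindOnce([](bool is_multichannel_supported) {
//         // Configure the audio pipeline accordingly.
//       }));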

// The interface used to pass raw audio from the renderer to the speech
// recognition service. The remote lives either in the renderer process (for web
// Live Caption) or the browser process (for CrOS features like system Live
// Caption and dictation) and the receiver lives in the speech recognition
// process.
[Stable]
interface SpeechRecognitionRecognizer {
  // Send an audio buffer to the speech recognition instance. The speech
  // recognition client will return the recognition events containing the
  // transcribed audio back to the originating media.
  SendAudioToSpeechRecognitionService@0(AudioDataS16 buffer);

  // Mark audio stream done. This informs the speech recognition client to stop
  // speech recognition after it finishes processing the audio it has received
  // already. This will eventually trigger the
  // SpeechRecognitionRecognizerClient::OnSpeechRecognitionStopped callback.
  MarkDone@1();

  // Notify the speech recognition recognizer that the language changed. Takes
  // in the locale string (e.g. "en-US").
  OnLanguageChanged@2(string language);

  // Notify the speech recognition recognizer that the mask offensive words
  // setting has changed.
  [MinVersion=2]
  OnMaskOffensiveWordsChanged@3(bool mask_offensive_words);
};
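
// Illustrative (non-normative) sketch of the expected call pattern on
// SpeechRecognitionRecognizer, assuming a bound |recognizer| remote and an
// |audio_chunk| of type media::mojom::AudioDataS16Ptr produced elsewhere:
//
//   recognizer->SendAudioToSpeechRecognitionService(std::move(audio_chunk));
//   // ... repeat for each captured buffer ...
//   recognizer->MarkDone();  // Triggers OnSpeechRecognitionStopped later.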

// The interface used to return speech recognition events from the speech
// recognition service to the client that will display the results to the user.
// The remote lives in the speech recognition process and the receiver lives in
// the browser process.
[Stable]
interface SpeechRecognitionRecognizerClient {
  // Triggered by the speech recognition process on a speech recognition
  // event.
  //
  // Returns false if the client wants to halt speech recognition e.g. in
  // response to user input or in the case of an error.
  OnSpeechRecognitionRecognitionEvent@0(SpeechRecognitionResult result)
      => (bool continue_recognition);

  // Called when speech recognition stops.
  OnSpeechRecognitionStopped@1();

  // Triggered by an error within the speech recognition service.
  OnSpeechRecognitionError@2();

  // Triggered by the speech recognition process on a language identification
  // event.
  OnLanguageIdentificationEvent@3(LanguageIdentificationEvent event);
};
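
// Illustrative (non-normative) sketch of a receiver-side implementation of
// SpeechRecognitionRecognizerClient; the class name and UpdateCaptionBubble()
// helper are assumptions for illustration only:
//
//   class CaptionClient
//       : public media::mojom::SpeechRecognitionRecognizerClient {
//    public:
//     void OnSpeechRecognitionRecognitionEvent(
//         media::mojom::SpeechRecognitionResultPtr result,
//         OnSpeechRecognitionRecognitionEventCallback callback) override {
//       UpdateCaptionBubble(result->transcription, result->is_final);
//       std::move(callback).Run(/*continue_recognition=*/true);
//     }
//     void OnSpeechRecognitionStopped() override {}
//     void OnSpeechRecognitionError() override {}
//     void OnLanguageIdentificationEvent(
//         media::mojom::LanguageIdentificationEventPtr event) override {}
//   };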

// The hypothesis parts that provide timing information for each word in
// recognized speech.
[Stable]
struct HypothesisParts {
  // A section of the final transcription text. Either an entire word or single
  // character (depending on the language) with adjacent punctuation. There will
  // usually only be one value here. If formatting is enabled in the speech
  // recognition, then the raw text will be included as the second element.
  array<string> text;

  // Time offset from this event's |audio_start_time| defined below. We enforce
  // the following invariant: 0 <= hypothesis_part_offset < |audio_end_time -
  // audio_start_time|.
  mojo_base.mojom.TimeDelta hypothesis_part_offset;
};

// The timing information for the transcript.
[Stable]
struct TimingInformation {
  // Start time in audio time from the start of the SODA session.
  // This time measures the amount of audio input into SODA.
  mojo_base.mojom.TimeDelta audio_start_time;

  // Elapsed processed audio from first frame after preamble.
  mojo_base.mojom.TimeDelta audio_end_time;

  // The timing information for each word/letter in the transcription.
  // HypothesisPartsInResult was introduced in min version 1 in
  // chromeos/services/machine_learning/public/mojom/soda.mojom. Therefore, it
  // must be optional. Hypothesis parts may be a non-empty optional containing
  // a zero-length vector if no words were spoken during the event's time span.
  array<HypothesisParts>? hypothesis_parts;
};
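
// Illustrative example of how the timing fields combine: the absolute
// position of a word within the SODA session is the event's
// |audio_start_time| plus that word's |hypothesis_part_offset|. A C++ sketch
// (variable names are assumptions):
//
//   if (timing->hypothesis_parts.has_value()) {
//     for (const auto& part : *timing->hypothesis_parts) {
//       base::TimeDelta word_time =
//           timing->audio_start_time + part->hypothesis_part_offset;
//       // Invariant: 0 <= part->hypothesis_part_offset <
//       //            timing->audio_end_time - timing->audio_start_time.
//     }
//   }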

// A speech recognition result created by the speech service and passed to the
// browser.
[Stable]
struct SpeechRecognitionResult {
  string transcription;

  // A flag indicating whether the result is final. If true, the result is
  // locked in and the next result returned will not overlap with the previous
  // final result.
  bool is_final;

  // Timing information for the current transcription. |timing_information| is
  // expected to be valid if:
  //   1. speech recognition is provided by |CrosSodaClient| and
  //   2. |is_final| is true.
  TimingInformation? timing_information;
};

// A language identification event created by the speech recognition service
// and passed to the browser and renderer.
[Stable]
struct LanguageIdentificationEvent {
  // The locale of the language with the highest confidence.
  string language;

  // The confidence level of the identified language.
  ConfidenceLevel confidence_level;

  // If multilang is enabled, describes the actions Automatic Speech Recognition
  // took as a result of this event.
  [MinVersion=1]
  AsrSwitchResult? asr_switch_result;
};

// The interface used to notify the speech recognition client of events
// triggered by the browser. The remote lives in the browser process and the
// receiver lives either in the renderer process (for web Live Caption) or
// the browser process (for CrOS system Live Caption).
[Stable]
interface SpeechRecognitionBrowserObserver {
  // Notify the speech recognition client when speech recognition availability
  // changes.
  SpeechRecognitionAvailabilityChanged@0(
      bool is_speech_recognition_available);

  // Notify the speech recognition client when the speech recognition language
  // changes.
  SpeechRecognitionLanguageChanged@1(string language);

  // Notify the speech recognition client when the mask offensive words pref
  // changes.
  [MinVersion=2]
  SpeechRecognitionMaskOffensiveWordsChanged@2(bool mask_offensive_words);
};

// The user-facing source of recognized speech; typically a tab. The remote
// lives in the Ash browser process and is used to trigger behavior in lacros
// (like focusing the tab). The receiver lives in the lacros browser process.
[Stable]
interface SpeechRecognitionSurface {
  // "Activate" the surface - i.e. bring it to the front and focus it.
  Activate@0();

  // Fetch the bounds of the surface in screen coordinates. A nullopt is
  // returned if no bounds could be fetched.
  GetBounds@1() => (gfx.mojom.Rect? bounds);
};

// The OS-side observer of a lacros-side speech surface. Used to close or
// re-render a live caption bubble based on user interaction with the
// lacros-side surface. The remote lives in the lacros browser process, and the
// receiver lives in the Ash browser process.
[Stable]
interface SpeechRecognitionSurfaceClient {
  // Called when the user navigates away or refreshes the current tab. This
  // comprises the end of a live caption "session", after which the caption
  // bubble can be shown even if it was explicitly dismissed by the user.
  OnSessionEnded@0();

  // Called when the user fullscreens or un-fullscreens the speech surface.
  OnFullscreenToggled@1();
};

// The interface between the speech recognition client and the browser.
// The remote lives in the renderer process and the receiver lives in the
// browser process. Not necessary for browser-side features (e.g. CrOS system
// Live Caption), which can access browser functionality directly.
[Stable]
interface SpeechRecognitionClientBrowserInterface {
  // Bind the speech recognition availability observer.
  BindSpeechRecognitionBrowserObserver@0(
      pending_remote<SpeechRecognitionBrowserObserver> observer);

  [MinVersion=1]
  REMOVED_1@1();

  // Similar to BindSpeechRecognitionBrowserObserver, but binds browser
  // observers that listen to events specific to BabelOrca rather than the
  // traditional LiveCaption preference.
  [MinVersion=7, EnableIf=is_chromeos_ash]
  BindBabelOrcaSpeechRecognitionBrowserObserver@2(
      pending_remote<SpeechRecognitionBrowserObserver> observer);
};
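
// Illustrative (non-normative) sketch of how a renderer might bind its
// availability observer through this interface; variable names are
// assumptions:
//
//   mojo::Remote<media::mojom::SpeechRecognitionClientBrowserInterface>
//       browser_interface;
//   // ... bind |browser_interface| to the browser process ...
//   mojo::PendingRemote<media::mojom::SpeechRecognitionBrowserObserver>
//       observer;
//   // ... |observer| is bound to an implementation reacting to pref changes ...
//   browser_interface->BindSpeechRecognitionBrowserObserver(
//       std::move(observer));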

// Corresponds to ExtendedSodaConfigMsg.RecognitionMode in
// chrome/services/speech/soda/proto/soda_api.proto and
// SodaRecognitionMode in
// chromeos/services/machine_learning/public/mojom/soda.mojom.
[Stable, Extensible]
enum SpeechRecognitionMode {
  [Default] kUnknown,
  // Intended for voice input for keyboard usage.
  kIme,
  // Intended to caption a stream of audio.
  kCaption,
};

// The Chrome/ChromeOS application that is triggering the speech recognition
// session to start.
[Stable, Extensible]
enum RecognizerClientType {
  [Default] kUnknown,
  // Dictation on ChromeOS.
  kDictation,
  // LiveCaption on Chrome/ChromeOS.
  kLiveCaption,
  // Projector on ChromeOS.
  kProjector,
  // CastModerator on ChromeOS.
  kCastModerator,
};

// Options for speech recognition.
[Stable]
struct SpeechRecognitionOptions {
  // What kind of recognition to use.
  // In the case of web fallback (not for launch, used for development only),
  // this option will be ignored.
  SpeechRecognitionMode recognition_mode;

  // Whether to enable formatting and punctuation in the recognition results.
  bool enable_formatting;

  // The BCP-47 localized language code to use (e.g. "en-US").
  // TODO(crbug.com/40162502): Language needs to be required when multiple
  // languages are supported by SODA, so that each SpeechRecognitionRecognizer
  // can use its own language.
  string? language;

  // Whether the recognition is happening on-device or remotely on a server.
  [MinVersion=1]
  bool is_server_based;

  // Which client is requesting the speech recognition session.
  [MinVersion=1]
  RecognizerClientType recognizer_client_type;

  // When true, if the incoming audio buffer is zero for an extended period
  // (e.g. 10 seconds), audio won't be fed to the captioning model until nonzero
  // audio is received.
  // When false, even empty audio is captioned indefinitely.
  // Set to false if accurate TimingInfo relative to the start of captioning is
  // needed.
  [MinVersion=4]
  bool skip_continuously_empty_audio = false;

  // The optional experiment recognizer routing key for the current request.
  [MinVersion=5]
  string? experiment_recognizer_routing_key;

  // The channel count of the forwarded audio.
  [MinVersion=6]
  int32 channel_count;

  // The sample rate of the forwarded audio.
  [MinVersion=6]
  int32 sample_rate;

  // The optional recognition context for speech recognition biasing.
  [MinVersion=8]
  SpeechRecognitionRecognitionContext? recognition_context;
};
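
// Illustrative (non-normative) sketch of populating these options for an
// on-device Live Caption session; the chosen values are assumptions:
//
//   auto options = media::mojom::SpeechRecognitionOptions::New();
//   options->recognition_mode = media::mojom::SpeechRecognitionMode::kCaption;
//   options->enable_formatting = true;
//   options->language = "en-US";
//   options->is_server_based = false;
//   options->recognizer_client_type =
//       media::mojom::RecognizerClientType::kLiveCaption;
//   options->channel_count = 1;
//   options->sample_rate = 16000;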