// Copyright 2022 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
module media.mojom;
import "media/mojo/mojom/audio_data.mojom";
import "media/mojo/mojom/speech_recognition_audio_forwarder.mojom";
import "media/mojo/mojom/speech_recognition_recognition_context.mojom";
import "media/mojo/mojom/speech_recognizer.mojom";
import "mojo/public/mojom/base/time.mojom";
import "mojo/public/mojom/base/unguessable_token.mojom";
import "ui/gfx/geometry/mojom/geometry.mojom";
// Next MinVersion: 9
// Corresponds to the LangIdEvent.ConfidenceInterval defined in
// http://google3/speech/soda/public/soda_event.proto.
[Stable, Extensible]
enum ConfidenceLevel {
[Default] kUnknown,
kNotConfident,
kConfident,
kHighlyConfident,
};
// Corresponds to the LangIdEvent.AsrSwitchResult defined in
// http://google3/speech/soda/public/soda_event.proto.
[Stable, Extensible]
enum AsrSwitchResult {
[Default] kDefaultNoSwitch,
kSwitchSucceeded,
kSwitchFailed,
kSwitchSkipedNoLp,
};
// The main interface a client uses to interact with a speech
// recognition service process. For Live Caption, every renderer can own one
// or more Remote<SpeechRecognitionContext>, with the receiver bound through the
// BrowserInterfaceBroker. For the Web Speech API, the browser is the client and
// can own one or more Remote<SpeechRecognitionContext>. This is a stable
// interface that is used across the LaCrOS/Ash boundary.
[Stable]
interface SpeechRecognitionContext {
// Bind a recognizer to the speech recognition service. Returns a flag
// indicating whether multichannel audio is supported by the speech
// recognition service.
BindRecognizer@0(pending_receiver<SpeechRecognitionRecognizer> receiver,
pending_remote<SpeechRecognitionRecognizerClient> client,
SpeechRecognitionOptions options)
=> (bool is_multichannel_supported);
// Bind the recognizer used by the Web Speech API.
[MinVersion=6]
BindWebSpeechRecognizer@1(
pending_receiver<media.mojom.SpeechRecognitionSession> session_receiver,
pending_remote<media.mojom.SpeechRecognitionSessionClient> session_client,
pending_receiver<SpeechRecognitionAudioForwarder> audio_forwarder,
int32 channel_count,
int32 sample_rate,
SpeechRecognitionOptions options,
bool continuous);
};
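// Illustrative only: a rough client-side sketch of binding a recognizer via
// this interface. The C++ below is not part of this file; `client_impl` is a
// hypothetical implementation of SpeechRecognitionRecognizerClient, and the
// broker plumbing and option values are elided.
//
//   mojo::Remote<media::mojom::SpeechRecognitionContext> context;
//   // ... bind `context` through the BrowserInterfaceBroker (renderer) or
//   // directly in the browser process ...
//
//   mojo::PendingRemote<media::mojom::SpeechRecognitionRecognizerClient>
//       client;
//   mojo::Receiver<media::mojom::SpeechRecognitionRecognizerClient>
//       client_receiver(&client_impl, client.InitWithNewPipeAndPassReceiver());
//
//   mojo::Remote<media::mojom::SpeechRecognitionRecognizer> recognizer;
//   context->BindRecognizer(
//       recognizer.BindNewPipeAndPassReceiver(), std::move(client),
//       media::mojom::SpeechRecognitionOptions::New(),
//       base::BindOnce([](bool is_multichannel_supported) { /* ... */ }));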
// The interface used to pass raw audio from the renderer to the speech
// recognition service. The remote lives either in the renderer process (for web
// Live Caption) or the browser process (for CrOS features like system Live
// Caption and dictation) and the receiver lives in the speech recognition
// process.
[Stable]
interface SpeechRecognitionRecognizer {
// Send an audio buffer to the speech recognition instance. The speech
// recognition client will receive the recognition events containing the
// transcription of this audio.
SendAudioToSpeechRecognitionService@0(AudioDataS16 buffer);
// Mark the audio stream as done. This informs the speech recognition client
// to stop speech recognition after it finishes processing the audio it has
// already received. This will eventually trigger the
// SpeechRecognitionRecognizerClient::OnSpeechRecognitionStopped callback.
MarkDone@1();
// Notify the speech recognition recognizer that the language changed. Takes
// in the locale string (e.g. "en-US").
OnLanguageChanged@2(string language);
// Notify the speech recognition recognizer that the mask offensive words
// setting has changed.
[MinVersion=2]
OnMaskOffensiveWordsChanged@3(bool mask_offensive_words);
};
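// Illustrative only: the expected calling order on the remote end of this
// interface (C++; producing `buffer` as AudioDataS16 is up to the caller).
//
//   // Forward each captured audio chunk to the service.
//   recognizer->SendAudioToSpeechRecognitionService(std::move(buffer));
//   // ...
//   // Once the media stream ends, let already-sent audio finish processing.
//   recognizer->MarkDone();
//   // The client will eventually receive OnSpeechRecognitionStopped().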
// The interface used to return speech recognition events from the speech
// recognition service to the client that will display the results to the user.
// The remote lives in the speech recognition process and the receiver lives in
// the browser process.
[Stable]
interface SpeechRecognitionRecognizerClient {
// Triggered by the speech recognition process on a speech recognition event.
//
// Returns false if the client wants to halt speech recognition e.g. in
// response to user input or in the case of an error.
OnSpeechRecognitionRecognitionEvent@0(SpeechRecognitionResult result)
=> (bool continue_recognition);
// Called when speech recognition stops.
OnSpeechRecognitionStopped@1();
// Triggered by an error within the speech recognition service.
OnSpeechRecognitionError@2();
// Triggered by the speech recognition process on a language identification
// event.
OnLanguageIdentificationEvent@3(LanguageIdentificationEvent event);
};
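// Illustrative only: a minimal sketch of the receiving side of this interface,
// assuming the default generated C++ bindings (no typemap).
// `DispatchTranscription` and the surrounding class are hypothetical.
//
//   void OnSpeechRecognitionRecognitionEvent(
//       media::mojom::SpeechRecognitionResultPtr result,
//       OnSpeechRecognitionRecognitionEventCallback callback) override {
//     // Returning false halts recognition, e.g. when the caption UI closes.
//     std::move(callback).Run(DispatchTranscription(*result));
//   }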
// The hypothesis parts that provide timing information for each word in the
// recognized speech.
[Stable]
struct HypothesisParts {
// A section of the final transcription text. Either an entire word or single
// character (depending on the language) with adjacent punctuation. There will
// usually only be one value here. If formatting is enabled in the speech
// recognition, then the raw text will be included as the second element.
array<string> text;
// Time offset from this event's |audio_start_time| defined below. We enforce
// the following invariant: 0 <= hypothesis_part_offset < |audio_end_time -
// audio_start_time|.
mojo_base.mojom.TimeDelta hypothesis_part_offset;
};
// The timing information for the transcript.
[Stable]
struct TimingInformation {
// Start time in audio time from the start of the SODA session.
// This time measures the amount of audio input into SODA.
mojo_base.mojom.TimeDelta audio_start_time;
// Elapsed processed audio from first frame after preamble.
mojo_base.mojom.TimeDelta audio_end_time;
// The timing information for each word/letter in the transcription.
// HypothesisPartsInResult was introduced in min version 1 in
// chromeos/services/machine_learning/public/mojom/soda.mojom. Therefore, it
// must be optional. Hypothesis parts may be a non-empty optional containing a
// zero-length vector if no words were spoken during the event's time span.
array<HypothesisParts>? hypothesis_parts;
};
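// Worked example of the invariant above (not part of the API): the offset of
// a hypothesis part relative to the start of the SODA session is
//
//   base::TimeDelta part_time =
//       timing.audio_start_time + part.hypothesis_part_offset;
//
// which, given 0 <= hypothesis_part_offset < audio_end_time - audio_start_time,
// always satisfies audio_start_time <= part_time < audio_end_time.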
// A speech recognition result created by the speech service and passed to the
// browser.
[Stable]
struct SpeechRecognitionResult {
string transcription;
// A flag indicating whether the result is final. If true, the result is
// locked in and the next result returned will not overlap with the previous
// final result.
bool is_final;
// Timing information for the current transcription. |timing_information| is
// expected to be valid if:
// 1. speech recognition is provided by |CrosSodaClient| and
// 2. |is_final| is true.
TimingInformation? timing_information;
};
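// Illustrative only: how a consumer might fold partial and final results into
// caption text (C++; the member names are hypothetical).
//
//   if (result.is_final) {
//     committed_text_ += result.transcription;  // Locked in; never revised.
//     pending_text_.clear();
//   } else {
//     pending_text_ = result.transcription;     // Replaced by the next event.
//   }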
// A language identification event created by the speech recognition service
// and passed to the browser and renderer.
[Stable]
struct LanguageIdentificationEvent {
// The locale of the language with the highest confidence.
string language;
// The confidence level.
ConfidenceLevel confidence_level;
// If multilang is enabled, describes the actions Automatic Speech Recognition
// took as a result of this event.
[MinVersion=1]
AsrSwitchResult? asr_switch_result;
};
// The interface used to notify the speech recognition client of events
// triggered by the browser. The remote lives in the browser process and the
// receiver lives either in the renderer process (for web Live Caption) or
// the browser process (for CrOS system Live Caption).
[Stable]
interface SpeechRecognitionBrowserObserver {
// Notify the speech recognition client when speech recognition availability
// changes.
SpeechRecognitionAvailabilityChanged@0(
bool is_speech_recognition_available);
// Notify the speech recognition client when the speech recognition language
// changes.
SpeechRecognitionLanguageChanged@1(string language);
// Notify the speech recognition client when the mask offensive words pref
// changes.
[MinVersion=2]
SpeechRecognitionMaskOffensiveWordsChanged@2(bool mask_offensive_words);
};
// The user-facing source of recognized speech; typically a tab. The remote
// lives in the Ash browser process and is used to trigger behavior in lacros
// (like focusing the tab). The receiver lives in the lacros browser process.
[Stable]
interface SpeechRecognitionSurface {
// "Activate" the surface - i.e. bring it to the front and focus it.
Activate@0();
// Fetch the bounds of the surface in screen coordinates. A nullopt is
// returned if no bounds could be fetched.
GetBounds@1() => (gfx.mojom.Rect? bounds);
};
// The OS-side observer of a lacros-side speech surface. Used to close or
// re-render a live caption bubble based on user interaction with the
// lacros-side surface. The remote lives in the lacros browser process, and the
// receiver lives in the Ash browser process.
[Stable]
interface SpeechRecognitionSurfaceClient {
// Called when the user navigates away or refreshes the current tab. This
// comprises the end of a live caption "session", after which the caption
// bubble can be shown even if it was explicitly dismissed by the user.
OnSessionEnded@0();
// Called when the user fullscreens or un-fullscreens the speech surface.
OnFullscreenToggled@1();
};
// The interface between the speech recognition client and the browser.
// The remote lives in the renderer process and the receiver lives in the
// browser process. Not necessary for browser-side features (e.g. CrOS system
// Live Caption), which can access browser functionality directly.
[Stable]
interface SpeechRecognitionClientBrowserInterface {
// Bind the speech recognition availability observer.
BindSpeechRecognitionBrowserObserver@0(
pending_remote<SpeechRecognitionBrowserObserver> observer);
[MinVersion=1]
REMOVED_1@1();
// Similar to BindSpeechRecognitionBrowserObserver, but binds browser
// observers that listen to events specific to BabelOrca rather than the
// traditional Live Caption preference.
[MinVersion=7, EnableIf=is_chromeos_ash]
BindBabelOrcaSpeechRecognitionBrowserObserver@2(
pending_remote<SpeechRecognitionBrowserObserver> observer);
};
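// Illustrative only: a rough renderer-side sketch of registering for pref
// updates through this interface (C++; `observer_receiver_` is a hypothetical
// mojo::Receiver<SpeechRecognitionBrowserObserver> member).
//
//   mojo::Remote<media::mojom::SpeechRecognitionClientBrowserInterface>
//       browser_interface;
//   // ... bind `browser_interface` through the BrowserInterfaceBroker ...
//   mojo::PendingRemote<media::mojom::SpeechRecognitionBrowserObserver>
//       observer;
//   observer_receiver_.Bind(observer.InitWithNewPipeAndPassReceiver());
//   browser_interface->BindSpeechRecognitionBrowserObserver(
//       std::move(observer));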
// Corresponds to ExtendedSodaConfigMsg.RecognitionMode in
// chrome/services/speech/soda/proto/soda_api.proto and
// SodaRecognitionMode in
// chromeos/services/machine_learning/public/mojom/soda.mojom.
[Stable, Extensible]
enum SpeechRecognitionMode {
[Default] kUnknown,
// Intended for voice input for keyboard usage.
kIme,
// Intended to caption a stream of audio.
kCaption,
};
// The Chrome/ChromeOS application that is triggering the speech recognition
// session to start.
[Stable, Extensible]
enum RecognizerClientType {
[Default] kUnknown,
// Dictation on ChromeOS.
kDictation,
// LiveCaption on Chrome/ChromeOS.
kLiveCaption,
// Projector on ChromeOS.
kProjector,
// CastModerator on ChromeOS.
kCastModerator,
};
// Options for speech recognition.
[Stable]
struct SpeechRecognitionOptions {
// What kind of recognition to use.
// In the case of web fallback (not for launch, used for development only),
// this option will be ignored.
SpeechRecognitionMode recognition_mode;
// Whether to enable formatting and punctuation in the recognition results.
bool enable_formatting;
// The BCP-47 localized language code to use (e.g. "en-US").
// TODO(crbug.com/40162502): Language needs to be required when multiple
// languages are supported by SODA, so that each SpeechRecognitionRecognizer
// can use its own language.
string? language;
// Whether the recognition is happening on-device or remotely on a server.
[MinVersion=1]
bool is_server_based;
// Which client is requesting the speech recognition session.
[MinVersion=1]
RecognizerClientType recognizer_client_type;
// When true, if the incoming audio buffer is zero for an extended period
// (e.g. 10 seconds), audio won't be fed to the captioning model until nonzero
// audio is received.
// When false, even empty audio is captioned indefinitely.
// Set to false if accurate TimingInfo relative to the start of captioning is
// needed.
[MinVersion=4]
bool skip_continuously_empty_audio = false;
// The optional experiment recognizer routing key for the current request.
[MinVersion=5]
string? experiment_recognizer_routing_key;
// The channel count of the forwarded audio.
[MinVersion=6]
int32 channel_count;
// The sample rate of the forwarded audio.
[MinVersion=6]
int32 sample_rate;
// The optional recognition context for speech recognition biasing.
[MinVersion=8]
SpeechRecognitionRecognitionContext? recognition_context;
};
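// Illustrative only: one plausible way to populate these options for Live
// Caption (C++; the exact values Chrome uses may differ).
//
//   auto options = media::mojom::SpeechRecognitionOptions::New();
//   options->recognition_mode = media::mojom::SpeechRecognitionMode::kCaption;
//   options->enable_formatting = true;
//   options->language = "en-US";
//   options->is_server_based = false;
//   options->recognizer_client_type =
//       media::mojom::RecognizerClientType::kLiveCaption;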