media/mojo/mojom/speech_recognizer.mojom
// Copyright 2024 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

module media.mojom;

import "media/mojo/mojom/speech_recognition_audio_forwarder.mojom";
import "media/mojo/mojom/speech_recognition_error.mojom";
import "media/mojo/mojom/speech_recognition_grammar.mojom";
import "media/mojo/mojom/speech_recognition_result.mojom";
import "media/mojo/mojom/speech_recognition_recognition_context.mojom";

// Created by the renderer and sent to the browser to start a speech
// recognition session.
struct StartSpeechRecognitionRequestParams {
  // Used to create a connection with the SpeechRecognitionSession
  // implementation that the browser creates for the new session.
  pending_receiver<SpeechRecognitionSession> session_receiver;

  // Used by the browser to communicate with a SpeechRecognitionSessionClient
  // implementation created for the new session.
  pending_remote<SpeechRecognitionSessionClient> client;

  // Language to use for speech recognition.
  string language;

  // Speech grammars to use.
  array<SpeechRecognitionGrammar> grammars;

  // The optional recognition context for speech recognition biasing.
  SpeechRecognitionRecognitionContext? recognition_context;

  // Maximum number of hypotheses allowed for each result.
  uint32 max_hypotheses;

  // Whether the user requested continuous recognition.
  bool continuous;

  // Whether the user requested interim results.
  bool interim_results;

  // Whether the speech recognition may happen on-device.
  bool on_device;

  // Whether the speech recognition is allowed to fall back to a cloud-based
  // speech recognition service.
  bool allow_cloud_fallback;

  // Used to pass audio from the renderer to the browser.
  pending_receiver<media.mojom.SpeechRecognitionAudioForwarder>?
      audio_forwarder;

  // The channel count of the forwarded audio.
  int32 channel_count;

  // The sample rate of the forwarded audio.
  int32 sample_rate;
};
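
// A minimal construction sketch for these params (hypothetical, not part of
// this file; assumes the C++ types generated from this mojom and a
// `client_receiver`, a mojo::Receiver bound to a
// SpeechRecognitionSessionClient implementation):
//
//   auto params = media::mojom::StartSpeechRecognitionRequestParams::New();
//   mojo::Remote<media::mojom::SpeechRecognitionSession> session;
//   params->session_receiver = session.BindNewPipeAndPassReceiver();
//   mojo::PendingRemote<media::mojom::SpeechRecognitionSessionClient> client;
//   client_receiver.Bind(client.InitWithNewPipeAndPassReceiver());
//   params->client = std::move(client);
//   params->language = "en-US";
//   params->max_hypotheses = 1;
//   params->continuous = false;
//   params->interim_results = true;
//   params->on_device = true;
//   params->allow_cloud_fallback = true;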

// API for the renderer process to manage speech recognition in the browser
// process. The remote lives in the renderer process and the receiver lives in
// the browser process.
interface SpeechRecognizer {
  // Requests the speech recognition service to start speech recognition.
  Start(StartSpeechRecognitionRequestParams params);
};
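
// A minimal renderer-side call sketch (hypothetical; assumes the params
// constructed above and a Blink frame whose BrowserInterfaceBroker can bind
// media::mojom::SpeechRecognizer in the browser process):
//
//   mojo::Remote<media::mojom::SpeechRecognizer> recognizer;
//   frame->GetBrowserInterfaceBroker().GetInterface(
//       recognizer.BindNewPipeAndPassReceiver());
//   recognizer->Start(std::move(params));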

// API for the renderer process to manage on-device speech recognition in the
// browser process. The remote lives in the renderer process and the receiver
// lives in the browser process.
interface OnDeviceSpeechRecognition {
  // Returns whether on-device speech recognition is available for a given
  // language. Takes in a BCP 47 language tag (e.g. "en-US").
  OnDeviceWebSpeechAvailable(string language) => (bool available);

  // Returns whether on-device speech recognition installation was
  // successfully initiated for the given language. Takes in a BCP 47
  // language tag (e.g. "en-US").
  InstallOnDeviceSpeechRecognition(string language) => (bool success);
};
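
// A minimal availability-check sketch (hypothetical; assumes an already
// bound mojo::Remote<media::mojom::OnDeviceSpeechRecognition> named
// `on_device`):
//
//   on_device->OnDeviceWebSpeechAvailable(
//       "en-US", base::BindOnce([](bool available) {
//         if (!available) {
//           // Optionally kick off installation for the language here.
//         }
//       }));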

// API for the renderer process to stop or abort an existing speech
// recognition session. A pending_receiver is sent to the browser process via
// SpeechRecognizer::Start, and is bound to an implementation there.
// SpeechRecognitionSession and SpeechRecognitionSessionClient are 1:1 with
// each other and with WebSpeechRecognitionHandle.
[Stable]
interface SpeechRecognitionSession {
  // Requests the speech recognition service to abort speech recognition for
  // the associated session.
  Abort@0();

  // Requests the speech recognition service to stop audio capture for the
  // associated session.
  StopCapture@1();
};
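
// A minimal control sketch (hypothetical; `session` is the
// mojo::Remote<media::mojom::SpeechRecognitionSession> bound when Start was
// called):
//
//   session->StopCapture();  // Stop capturing audio; results recognized so
//                            // far may still arrive before the "end" event.
//   session->Abort();        // Abort the session outright.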

// API for the browser process to communicate speech recognition related
// updates with the renderer and cause events to be dispatched to the
// appropriate speech recognition handle. A pending_remote for each handle is
// sent to the browser process via SpeechRecognizer::Start.
// SpeechRecognitionSession and SpeechRecognitionSessionClient are 1:1 with
// each other and with WebSpeechRecognitionHandle.
[Stable]
interface SpeechRecognitionSessionClient {
  // Called to dispatch the "result" event.
  ResultRetrieved@0(array<WebSpeechRecognitionResult> results);

  // Called to dispatch the "nomatch" event if the error code passed is of
  // type kNoMatch; otherwise dispatches an "error" event.
  ErrorOccurred@1(SpeechRecognitionError error);

  // Called to dispatch the "start" event.
  Started@2();

  // Called to dispatch the "audiostart" event.
  AudioStarted@3();

  // Called to dispatch the "soundstart" and "speechstart" events.
  SoundStarted@4();

  // Called to dispatch the "soundend" and "speechend" events.
  SoundEnded@5();

  // Called to dispatch the "audioend" event.
  AudioEnded@6();

  // Called to dispatch the "end" event.
  Ended@7();
};
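
// A minimal renderer-side implementation sketch (hypothetical; method
// signatures follow the usual C++ bindings generated from a mojom, where
// mojom structs map to `...Ptr` types):
//
//   class SpeechClient
//       : public media::mojom::SpeechRecognitionSessionClient {
//    public:
//     void ResultRetrieved(
//         std::vector<media::mojom::WebSpeechRecognitionResultPtr> results)
//         override { /* Dispatch the "result" event. */ }
//     void ErrorOccurred(
//         media::mojom::SpeechRecognitionErrorPtr error) override {
//       /* Dispatch "nomatch" or "error" depending on the error code. */
//     }
//     void Started() override {}
//     void AudioStarted() override {}
//     void SoundStarted() override {}
//     void SoundEnded() override {}
//     void AudioEnded() override {}
//     void Ended() override {}
//
//    private:
//     mojo::Receiver<media::mojom::SpeechRecognitionSessionClient>
//         receiver_{this};
//   };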