1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
content / browser / speech / network_speech_recognition_engine_impl.h [blame]
// Copyright 2024 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef CONTENT_BROWSER_SPEECH_NETWORK_SPEECH_RECOGNITION_ENGINE_IMPL_H_
#define CONTENT_BROWSER_SPEECH_NETWORK_SPEECH_RECOGNITION_ENGINE_IMPL_H_
#include <stdint.h>
#include <memory>
#include <string>
#include <string_view>
#include <vector>
#include "base/memory/raw_ptr.h"
#include "base/memory/ref_counted.h"
#include "base/sequence_checker.h"
#include "components/speech/audio_encoder.h"
#include "components/speech/chunked_byte_buffer.h"
#include "components/speech/downstream_loader.h"
#include "components/speech/downstream_loader_client.h"
#include "components/speech/upstream_loader.h"
#include "components/speech/upstream_loader_client.h"
#include "content/browser/speech/speech_recognition_engine.h"
#include "content/common/content_export.h"
#include "content/public/browser/speech_recognition_session_preamble.h"
#include "media/mojo/mojom/speech_recognition_error.mojom.h"
#include "media/mojo/mojom/speech_recognition_grammar.mojom.h"
#include "media/mojo/mojom/speech_recognition_result.mojom.h"
#include "services/network/public/cpp/simple_url_loader_stream_consumer.h"
class AudioChunk;
namespace base {
class TimeDelta;
}
namespace network {
class SharedURLLoaderFactory;
}
namespace content {
// This is the network implementation for `SpeechRecognitionEngine`, which is
// supporting continuous recognition by means of interaction with the Google
// streaming speech recognition webservice.
//
// This class establishes two HTTPS connections with the webservice for each
// session, herein called "upstream" and "downstream". Audio chunks are sent on
// the upstream by means of a chunked HTTP POST upload. Recognition results are
// retrieved in a full-duplex fashion (i.e. while pushing audio on the upstream)
// on the downstream by means of a chunked HTTP GET request. Pairing between the
// two stream is handled through a randomly generated key, unique for each
// request, which is passed in the &pair= arg to both stream request URLs. In
// the case of a regular session, the upstream is closed when the audio capture
// ends (notified through a |AudioChunksEnded| call) and the downstream waits
// for a corresponding server closure (eventually some late results can come
// after closing the upstream). Both streams are guaranteed to be closed when
// |EndRecognition| call is issued.
class CONTENT_EXPORT NetworkSpeechRecognitionEngineImpl
: public SpeechRecognitionEngine,
public speech::UpstreamLoaderClient,
public speech::DownstreamLoaderClient {
public:
// Network engine configuration.
struct CONTENT_EXPORT Config {
Config();
~Config();
std::string language;
std::vector<media::mojom::SpeechRecognitionGrammar> grammars;
bool filter_profanities = false;
bool continuous = true;
bool interim_results = true;
uint32_t max_hypotheses;
std::string origin_url;
int audio_sample_rate;
int audio_num_bits_per_sample;
std::string auth_token;
std::string auth_scope;
scoped_refptr<SpeechRecognitionSessionPreamble> preamble;
};
// Duration of each audio packet.
static const int kAudioPacketIntervalMs;
// |accept_language| is the default Accept-Language header.
NetworkSpeechRecognitionEngineImpl(
scoped_refptr<network::SharedURLLoaderFactory> shared_url_loader_factory,
const std::string& accept_language);
NetworkSpeechRecognitionEngineImpl(
const NetworkSpeechRecognitionEngineImpl&) = delete;
NetworkSpeechRecognitionEngineImpl& operator=(
const NetworkSpeechRecognitionEngineImpl&) = delete;
~NetworkSpeechRecognitionEngineImpl() override;
// Sets the URL requests are sent to for tests.
static void set_web_service_base_url_for_tests(
const char* base_url_for_tests);
void SetConfig(const Config& config);
bool IsRecognitionPending() const;
// content::SpeechRecognitionEngine:
void StartRecognition() override;
void EndRecognition() override;
void TakeAudioChunk(const AudioChunk& data) override;
void AudioChunksEnded() override;
int GetDesiredAudioChunkDurationMs() const override;
private:
friend class speech::UpstreamLoaderClient;
friend class speech::DownstreamLoader;
// Response status codes from the speech recognition webservice.
static const int kWebserviceStatusNoError;
static const int kWebserviceStatusErrorNoMatch;
// Frame type for framed POST data. Do NOT change these. They must match
// values the server expects.
enum FrameType { FRAME_PREAMBLE_AUDIO = 0, FRAME_RECOGNITION_AUDIO = 1 };
// Data types for the internal Finite State Machine (FSM).
enum FSMState {
STATE_IDLE = 0,
STATE_BOTH_STREAMS_CONNECTED,
STATE_WAITING_DOWNSTREAM_RESULTS,
STATE_MAX_VALUE = STATE_WAITING_DOWNSTREAM_RESULTS
};
enum FSMEvent {
EVENT_END_RECOGNITION = 0,
EVENT_START_RECOGNITION,
EVENT_AUDIO_CHUNK,
EVENT_AUDIO_CHUNKS_ENDED,
EVENT_UPSTREAM_ERROR,
EVENT_DOWNSTREAM_ERROR,
EVENT_DOWNSTREAM_RESPONSE,
EVENT_DOWNSTREAM_CLOSED,
EVENT_MAX_VALUE = EVENT_DOWNSTREAM_CLOSED
};
struct FSMEventArgs {
explicit FSMEventArgs(FSMEvent event_value);
FSMEventArgs(const FSMEventArgs&) = delete;
FSMEventArgs& operator=(const FSMEventArgs&) = delete;
~FSMEventArgs();
FSMEvent event;
// In case of EVENT_AUDIO_CHUNK, holds the chunk pushed by |TakeAudioChunk|.
scoped_refptr<const AudioChunk> audio_data;
// In case of EVENT_DOWNSTREAM_RESPONSE, hold the current chunk bytes.
std::unique_ptr<std::vector<uint8_t>> response;
};
// speech::UpstreamLoaderClient
void OnUpstreamDataComplete(bool success, int response_code) override;
// speech::DownstreamLoaderClient
void OnDownstreamDataReceived(std::string_view new_response_data) override;
void OnDownstreamDataComplete(bool success, int response_code) override;
// Entry point for pushing any new external event into the recognizer FSM.
void DispatchEvent(const FSMEventArgs& event_args);
// Defines the behavior of the recognizer FSM, selecting the appropriate
// transition according to the current state and event.
FSMState ExecuteTransitionAndGetNextState(const FSMEventArgs& event_args);
// The methods below handle transitions of the recognizer FSM.
FSMState ConnectBothStreams(const FSMEventArgs& event_args);
FSMState TransmitAudioUpstream(const FSMEventArgs& event_args);
FSMState ProcessDownstreamResponse(const FSMEventArgs& event_args);
FSMState RaiseNoMatchErrorIfGotNoResults(const FSMEventArgs& event_args);
FSMState CloseUpstreamAndWaitForResults(const FSMEventArgs& event_args);
FSMState CloseDownstream(const FSMEventArgs& event_args);
FSMState AbortSilently(const FSMEventArgs& event_args);
FSMState AbortWithError(const FSMEventArgs& event_args);
FSMState Abort(media::mojom::SpeechRecognitionErrorCode error);
FSMState DoNothing(const FSMEventArgs& event_args);
FSMState NotFeasible(const FSMEventArgs& event_args);
std::string GetAcceptedLanguages() const;
std::string GenerateRequestKey() const;
// Upload a single chunk of audio data. Handles both unframed and framed
// upload formats, and uses the appropriate one.
void UploadAudioChunk(const std::string& data, FrameType type, bool is_final);
// The total audio duration of the upstream request.
base::TimeDelta upstream_audio_duration_;
Config config_;
std::unique_ptr<speech::UpstreamLoader> upstream_loader_;
std::unique_ptr<speech::DownstreamLoader> downstream_loader_;
scoped_refptr<network::SharedURLLoaderFactory> shared_url_loader_factory_;
const std::string accept_language_;
std::unique_ptr<AudioEncoder> encoder_;
std::unique_ptr<AudioEncoder> preamble_encoder_;
speech::ChunkedByteBuffer chunked_byte_buffer_;
bool got_last_definitive_result_ = false;
bool is_dispatching_event_ = false;
bool use_framed_post_data_ = false;
FSMState state_ = FSMState::STATE_IDLE;
SEQUENCE_CHECKER(sequence_checker_);
};
} // namespace content
#endif // CONTENT_BROWSER_SPEECH_NETWORK_SPEECH_RECOGNITION_ENGINE_IMPL_H_