// content/browser/speech/tts_mac.mm
// Copyright 2012 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#import "content/browser/speech/tts_mac.h"

#import <AVFAudio/AVFAudio.h>
#import <AppKit/AppKit.h>
#include <objc/runtime.h>

#include <algorithm>
#include <string>

#include "base/apple/foundation_util.h"
#include "base/functional/bind.h"
#include "base/memory/raw_ptr.h"
#include "base/no_destructor.h"
#include "base/strings/sys_string_conversions.h"
#include "base/values.h"
#include "content/public/browser/tts_controller.h"

namespace {
constexpr int kNoLength = -1;
constexpr char kNoError[] = "";
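// Backing store for the cached voice list. On first use, registers an
// observer so that the cache is cleared whenever the app becomes active
// again, since the user may have changed voice or locale settings in the
// meantime.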
std::vector<content::VoiceData>& VoicesRef() {
static base::NoDestructor<std::vector<content::VoiceData>> voices([]() {
[NSNotificationCenter.defaultCenter
addObserverForName:NSApplicationWillBecomeActiveNotification
object:nil
queue:nil
usingBlock:^(NSNotification* notification) {
// The user might have switched to Settings or some other app
// to change voices or locale settings. Avoid a stale cache by
// forcing a rebuild of the voices vector after the app
// becomes active.
VoicesRef().clear();
}];
return std::vector<content::VoiceData>();
}());
return *voices;
}
AVSpeechSynthesisVoice* GetSystemDefaultVoice() {
// This should be
//
// [AVSpeechSynthesisVoice voiceWithLanguage:nil]
//
// but that has a bug (https://crbug.com/1484940#c9, FB13197951). In short,
// while passing nil to -[AVSpeechSynthesisVoice voiceWithLanguage:] does
// indeed return "the default voice for the system’s language and region",
// that's not necessarily the voice that the user selected in System Settings
// > Accessibility > Spoken Content, and that user voice selection is the only
// one that matters. There does not appear to be an AVSpeechSynthesis API that
// returns that user choice, so use the deprecated NSSpeechSynthesizer API,
// which behaves correctly.
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wdeprecated-declarations"
NSString* default_voice_identifier = NSSpeechSynthesizer.defaultVoice;
#pragma clang diagnostic pop
return [AVSpeechSynthesisVoice voiceWithIdentifier:default_voice_identifier];
}
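// Returns the cached voice list, rebuilding it from AVSpeechSynthesisVoice
// whenever the cache is empty (e.g. after being invalidated in VoicesRef()).
// The system default voice, if found, is moved to the front of the list.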
std::vector<content::VoiceData>& Voices() {
std::vector<content::VoiceData>& voices = VoicesRef();
if (!voices.empty()) {
return voices;
}
NSMutableArray* av_speech_voices =
[[AVSpeechSynthesisVoice.speechVoices sortedArrayUsingDescriptors:@[
[NSSortDescriptor sortDescriptorWithKey:@"name" ascending:YES]
]] mutableCopy];
AVSpeechSynthesisVoice* default_voice = GetSystemDefaultVoice();
if (default_voice) {
[av_speech_voices removeObject:default_voice];
[av_speech_voices insertObject:default_voice atIndex:0];
}
  // When multiple voices share a name but differ in language, the old API
  // (NSSpeechSynthesizer) appended locale information to the names; the
  // current API does not. Because returning several voices with identical
  // names isn't helpful, count how often each name is used so that locale
  // information can be appended later for disambiguation where needed.
NSMutableDictionary<NSString*, NSNumber*>* name_counts =
[NSMutableDictionary dictionary];
for (AVSpeechSynthesisVoice* av_speech_voice in av_speech_voices) {
NSString* voice_name = av_speech_voice.name;
if (!voice_name) {
      // AVSpeechSynthesisVoice.name is not a nullable property, but there
      // are crashes (https://crbug.com/1459235) in which
      // -setObject:forKeyedSubscript: is passed a nil key, and the only
      // place this function uses the name as a key is the name_counts
      // update below.
continue;
}
if (NSNumber* count = name_counts[voice_name]) {
name_counts[voice_name] = @(count.intValue + 1);
} else {
name_counts[voice_name] = @1;
}
}
voices.reserve(av_speech_voices.count);
for (AVSpeechSynthesisVoice* av_speech_voice in av_speech_voices) {
NSString* voice_name = av_speech_voice.name;
if (!voice_name) {
// AVSpeechSynthesisVoice.name is not a nullable property, but there are
// crashes (https://crbug.com/1459235) where it seems like it's returning
// nil. Without a name, a voice is useless, so skip it.
continue;
}
voices.emplace_back();
content::VoiceData& data = voices.back();
if (name_counts[voice_name].intValue > 1) {
      // The language property on a voice is a BCP 47 code (e.g. "en-US")
      // while an NSLocale locale identifier uses a different form (e.g.
      // "en_US"). However, using the BCP 47 code as if it were a locale
      // identifier works just fine (tested back to 10.15).
NSString* localized_language = [NSLocale.autoupdatingCurrentLocale
localizedStringForLocaleIdentifier:av_speech_voice.language];
voice_name = [NSString
stringWithFormat:@"%@ (%@)", voice_name, localized_language];
}
data.native = true;
data.native_voice_identifier =
base::SysNSStringToUTF8(av_speech_voice.identifier);
data.name = base::SysNSStringToUTF8(voice_name);
data.lang = base::SysNSStringToUTF8(av_speech_voice.language);
data.events.insert(content::TTS_EVENT_START);
data.events.insert(content::TTS_EVENT_END);
data.events.insert(content::TTS_EVENT_WORD);
data.events.insert(content::TTS_EVENT_PAUSE);
data.events.insert(content::TTS_EVENT_RESUME);
}
return voices;
}
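// Wraps `utterance_string` in an AVSpeechUtterance tagged, via an Objective-C
// associated object, with `utterance_id`, so that delegate callbacks (which
// receive only the utterance) can be matched back to the originating request.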
AVSpeechUtterance* MakeUtterance(int utterance_id,
const std::string& utterance_string) {
AVSpeechUtterance* utterance = [AVSpeechUtterance
speechUtteranceWithString:base::SysUTF8ToNSString(utterance_string)];
objc_setAssociatedObject(utterance, @selector(identifier), @(utterance_id),
OBJC_ASSOCIATION_RETAIN);
return utterance;
}
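// Recovers the identifier attached by MakeUtterance(), or returns
// kInvalidUtteranceId if none was attached. Using @selector(identifier) as
// the association key is a common trick: a selector has a stable,
// process-unique address, so no separate key variable is needed. A sketch of
// the round trip:
//
//   AVSpeechUtterance* utterance = MakeUtterance(42, "hello");
//   GetUtteranceId(utterance);  // Returns 42.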
int GetUtteranceId(AVSpeechUtterance* utterance) {
NSNumber* identifier = base::apple::ObjCCast<NSNumber>(
objc_getAssociatedObject(utterance, @selector(identifier)));
if (identifier) {
return identifier.intValue;
}
return TtsPlatformImplMac::kInvalidUtteranceId;
}
}  // namespace

// static
content::TtsPlatformImpl* content::TtsPlatformImpl::GetInstance() {
return TtsPlatformImplMac::GetInstance();
}
TtsPlatformImplMac::~TtsPlatformImplMac() = default;
bool TtsPlatformImplMac::PlatformImplSupported() {
return true;
}
bool TtsPlatformImplMac::PlatformImplInitialized() {
return true;
}
void TtsPlatformImplMac::Speak(
int utterance_id,
const std::string& utterance,
const std::string& lang,
const content::VoiceData& voice,
const content::UtteranceContinuousParameters& params,
base::OnceCallback<void(bool)> on_speak_finished) {
// Parse SSML and process speech. TODO(crbug.com/40273591):
// AVSpeechUtterance has an initializer -initWithSSMLRepresentation:. Should
// that be used instead?
content::TtsController::GetInstance()->StripSSML(
utterance, base::BindOnce(&TtsPlatformImplMac::ProcessSpeech,
base::Unretained(this), utterance_id, lang,
voice, params, std::move(on_speak_finished)));
}
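// Continuation of Speak(), invoked by TtsController::StripSSML() with the
// SSML-stripped utterance text.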
void TtsPlatformImplMac::ProcessSpeech(
int utterance_id,
const std::string& lang,
const content::VoiceData& voice,
const content::UtteranceContinuousParameters& params,
base::OnceCallback<void(bool)> on_speak_finished,
const std::string& parsed_utterance) {
utterance_ = parsed_utterance;
paused_ = false;
utterance_id_ = utterance_id;
AVSpeechUtterance* speech_utterance =
MakeUtterance(utterance_id, parsed_utterance);
if (!speech_utterance) {
std::move(on_speak_finished).Run(false);
return;
}
speech_utterance.voice = [AVSpeechSynthesisVoice
voiceWithIdentifier:base::SysUTF8ToNSString(
voice.native_voice_identifier)];
if (params.rate >= 0.0) {
// The two relevant APIs have different ranges:
// - Web Speech API is [.1, 10] with default 1
// - AVSpeechSynthesizer is [0, 1] with default .5
//
// Speeds in the Web Speech API other than 1 (the default rate) are meant to
// be multiples of the default speaking rate.
//
// The mapping of AVSpeechSynthesizer speeds was done experimentally, using
// the fourth paragraph of _A Tale of Two Cities_. With the "Samantha"
// voice, AVSpeechUtteranceDefaultSpeechRate takes about 80s to read the
// paragraph, while AVSpeechUtteranceMaximumSpeechRate takes about 20s.
// Therefore, map
//
// 1 → AVSpeechUtteranceDefaultSpeechRate
// 4 → AVSpeechUtteranceMaximumSpeechRate
//
// and cap anything higher.
//
// References:
//
// https://developer.mozilla.org/en-US/docs/Web/API/SpeechSynthesisUtterance/rate
// https://github.com/WebKit/WebKit/blob/main/Source/WebCore/platform/cocoa/PlatformSpeechSynthesizerCocoa.mm
// ^ This is the WebKit implementation. It appears to have a bug in
// scaling, where a Web Speech API rate of 2 is scaled to
// AVSpeechUtteranceMaximumSpeechRate and the value passed to the
// AVSpeechSynthesizer goes up from there. A bug was filed about this:
// https://bugs.webkit.org/show_bug.cgi?id=258587
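    //
    // As a worked example of the mapping below (assuming the usual values of
    // 0.5 for AVSpeechUtteranceDefaultSpeechRate and 1.0 for
    // AVSpeechUtteranceMaximumSpeechRate): a Web Speech rate of 2 exceeds the
    // default by 1, which is 1/3 of the supported excess range of 3, so it
    // maps to 0.5 + (1/3)(1.0 - 0.5) ≈ 0.67. A Web Speech rate of 0.5, being
    // below the default, is instead scaled directly: 0.5 * 0.5 = 0.25.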
float rate = params.rate;
if (rate < 1) {
// If a slower than normal rate is requested, scale the default speech
// rate down proportionally.
rate *= AVSpeechUtteranceDefaultSpeechRate;
} else {
      // Scale the AVSpeech rate headroom proportionally to match the excess
      // above 1 in the Web Speech API, capping at a Web Speech rate of 4.
const float kWebSpeechDefault = 1;
const float kWebSpeechMaxSupported = 4;
const float kAVSpeechRateHeadroom = AVSpeechUtteranceMaximumSpeechRate -
AVSpeechUtteranceDefaultSpeechRate;
const float excess = rate - kWebSpeechDefault;
const float capped_excess =
std::min(excess, (kWebSpeechMaxSupported - kWebSpeechDefault));
const float headroom_proportion =
capped_excess / (kWebSpeechMaxSupported - kWebSpeechDefault);
rate = AVSpeechUtteranceDefaultSpeechRate +
headroom_proportion * kAVSpeechRateHeadroom;
}
speech_utterance.rate = rate;
}
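  // Web Speech pitch ([0, 2], default 1) and volume ([0, 1], default 1) line
  // up with AVSpeechUtterance's pitchMultiplier and volume, so the values are
  // passed through unchanged. Apple documents pitchMultiplier's allowed range
  // as [0.5, 2]; values below that are presumably clamped by the framework.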
if (params.pitch >= 0.0) {
speech_utterance.pitchMultiplier = params.pitch;
}
if (params.volume >= 0.0) {
speech_utterance.volume = params.volume;
}
[speech_synthesizer_ speakUtterance:speech_utterance];
std::move(on_speak_finished).Run(true);
}
bool TtsPlatformImplMac::StopSpeaking() {
[speech_synthesizer_ stopSpeakingAtBoundary:AVSpeechBoundaryImmediate];
paused_ = false;
return true;
}
void TtsPlatformImplMac::Pause() {
if (!paused_) {
[speech_synthesizer_ pauseSpeakingAtBoundary:AVSpeechBoundaryImmediate];
paused_ = true;
content::TtsController::GetInstance()->OnTtsEvent(
utterance_id_, content::TTS_EVENT_PAUSE, last_char_index_, kNoLength,
kNoError);
}
}
void TtsPlatformImplMac::Resume() {
if (paused_) {
[speech_synthesizer_ continueSpeaking];
paused_ = false;
content::TtsController::GetInstance()->OnTtsEvent(
utterance_id_, content::TTS_EVENT_RESUME, last_char_index_, kNoLength,
kNoError);
}
}
bool TtsPlatformImplMac::IsSpeaking() {
return speech_synthesizer_.speaking;
}
void TtsPlatformImplMac::GetVoices(std::vector<content::VoiceData>* outVoices) {
*outVoices = Voices();
}
void TtsPlatformImplMac::OnSpeechEvent(int utterance_id,
content::TtsEventType event_type,
int char_index,
int char_length,
const std::string& error_message) {
// Don't send events from an utterance that's already completed.
if (utterance_id != utterance_id_) {
return;
}
  if (event_type == content::TTS_EVENT_END) {
    // The delegate reports end events with a char_index of 0; rewrite it to
    // the utterance length so the final reported position is the end of the
    // spoken text.
    char_index = utterance_.size();
  }
content::TtsController::GetInstance()->OnTtsEvent(
utterance_id_, event_type, char_index, char_length, error_message);
last_char_index_ = char_index;
}
TtsPlatformImplMac::TtsPlatformImplMac()
: speech_synthesizer_([[AVSpeechSynthesizer alloc] init]),
delegate_([[ChromeTtsDelegate alloc] initWithPlatformImplMac:this]) {
speech_synthesizer_.delegate = delegate_;
}
// static
TtsPlatformImplMac* TtsPlatformImplMac::GetInstance() {
static base::NoDestructor<TtsPlatformImplMac> tts_platform;
return tts_platform.get();
}
// static
std::vector<content::VoiceData>& TtsPlatformImplMac::VoicesRefForTesting() {
return VoicesRef();
}

@implementation ChromeTtsDelegate {
raw_ptr<TtsPlatformImplMac> _ttsImplMac; // weak.
}
- (id)initWithPlatformImplMac:(TtsPlatformImplMac*)ttsImplMac {
if ((self = [super init])) {
_ttsImplMac = ttsImplMac;
}
return self;
}
- (void)speechSynthesizer:(AVSpeechSynthesizer*)synthesizer
didStartSpeechUtterance:(AVSpeechUtterance*)utterance {
_ttsImplMac->OnSpeechEvent(GetUtteranceId(utterance),
content::TTS_EVENT_START, /*char_index=*/0,
kNoLength, kNoError);
}
- (void)speechSynthesizer:(AVSpeechSynthesizer*)synthesizer
didFinishSpeechUtterance:(AVSpeechUtterance*)utterance {
_ttsImplMac->OnSpeechEvent(GetUtteranceId(utterance), content::TTS_EVENT_END,
/*char_index=*/0, kNoLength, kNoError);
}
- (void)speechSynthesizer:(AVSpeechSynthesizer*)synthesizer
willSpeakRangeOfSpeechString:(NSRange)characterRange
utterance:(AVSpeechUtterance*)utterance {
// Ignore bogus ranges. The Mac speech synthesizer is a bit buggy and
// occasionally returns a number way out of range.
if (characterRange.location > utterance.speechString.length ||
characterRange.length == 0) {
return;
}
_ttsImplMac->OnSpeechEvent(GetUtteranceId(utterance), content::TTS_EVENT_WORD,
characterRange.location, characterRange.length,
kNoError);
}
@end