1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
media / mojo / mojom / speech_recognition_result.h [blame]
// Copyright 2021 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef MEDIA_MOJO_MOJOM_SPEECH_RECOGNITION_RESULT_H_
#define MEDIA_MOJO_MOJOM_SPEECH_RECOGNITION_RESULT_H_
#include <optional>
#include <string>
#include <vector>
#include "base/time/time.h"
namespace media {
// One part of a transcription hypothesis: the text of the part together with
// its time offset inside the enclosing event (see TimingInformation below,
// which stores a list of these).
struct HypothesisParts {
// Copyable and movable. All special members and operator== are only declared
// here; their definitions are not part of this header.
HypothesisParts();
// Presumably initializes |text| from |part| and |hypothesis_part_offset| from
// |offset| -- confirm against the out-of-line definition.
// NOTE(review): |part| is taken by const value, so every call copies the
// vector and the constructor cannot move from it. Switching to a const
// reference (or by value + std::move) requires changing the definition too.
HypothesisParts(const std::vector<std::string> part, base::TimeDelta offset);
HypothesisParts(const HypothesisParts&);
HypothesisParts(HypothesisParts&&);
HypothesisParts& operator=(const HypothesisParts&);
HypothesisParts& operator=(HypothesisParts&&);
~HypothesisParts();
// Equality comparison. Which fields take part is determined by the
// out-of-line definition -- presumably both |text| and
// |hypothesis_part_offset|; TODO confirm.
bool operator==(const HypothesisParts& rhs) const;
// A section of the final transcription text. Either an entire word or single
// character (depending on the language) with adjacent punctuation. There will
// usually only be one value here. If formatting is enabled in the speech
// recognition, then the raw text will be included as the second element.
std::vector<std::string> text;
// Time offset from this event's |audio_start_time| (a member of
// TimingInformation, defined below). We enforce the following invariant:
// 0 <= hypothesis_part_offset < |audio_end_time - audio_start_time|.
base::TimeDelta hypothesis_part_offset;
};
// Timing data for one speech recognition event: the audio time span it covers
// and, optionally, per-part offsets within that span.
struct TimingInformation {
// Copyable and movable. All special members and operator== are only declared
// here; their definitions are not part of this header.
TimingInformation();
TimingInformation(const TimingInformation&);
TimingInformation(TimingInformation&&);
TimingInformation& operator=(const TimingInformation&);
TimingInformation& operator=(TimingInformation&&);
~TimingInformation();
// Equality comparison. Which fields take part is determined by the
// out-of-line definition -- presumably all three members; TODO confirm.
bool operator==(const TimingInformation& rhs) const;
// Start time in audio time from the start of the SODA session.
// This time measures the amount of audio input into SODA.
base::TimeDelta audio_start_time;
// Elapsed processed audio from first frame after preamble.
base::TimeDelta audio_end_time;
// The timing information for each word/letter in the transcription.
// HypothesisPartsInResult was introduced in min version 1 in
// chromeos/services/machine_learning/public/mojom/soda.mojom. Therefore, it
// must be optional. |hypothesis_parts| may be a non-empty (engaged) optional
// that contains a zero-length vector if no words were spoken during the
// event's time span.
std::optional<std::vector<HypothesisParts>> hypothesis_parts;
};
// A speech recognition result created by the speech service and passed to the
// SpeechRecognitionRecognizerClient. Carries the transcription text, a flag
// saying whether that text is final, and optional timing information.
struct SpeechRecognitionResult {
// Copyable and movable. All special members and operator== are only declared
// here; their definitions are not part of this header.
SpeechRecognitionResult();
// Presumably initializes |transcription| from |transcript| and the
// |is_final| member from the |is_final| argument -- confirm against the
// out-of-line definition. Note that the parameter is named |transcript|
// while the member is named |transcription|.
// NOTE(review): |transcript| is taken by const value, so every call copies
// the string and the constructor cannot move from it. Switching to a const
// reference (or by value + std::move) requires changing the definition too.
SpeechRecognitionResult(const std::string transcript, bool is_final);
SpeechRecognitionResult(const SpeechRecognitionResult&);
SpeechRecognitionResult(SpeechRecognitionResult&&);
SpeechRecognitionResult& operator=(const SpeechRecognitionResult&);
SpeechRecognitionResult& operator=(SpeechRecognitionResult&&);
~SpeechRecognitionResult();
// Equality comparison. Which fields take part is determined by the
// out-of-line definition -- presumably all three members; TODO confirm.
bool operator==(const SpeechRecognitionResult& rhs) const;
// The transcription text carried by this result.
std::string transcription;
// A flag indicating whether the result is final. If true, the result is
// locked in and the next result returned will not overlap with the previous
// final result. Defaults to false.
bool is_final = false;
// Timing information for the current transcription. Optional, so it may be
// absent.
std::optional<TimingInformation> timing_information;
};
} // namespace media
#endif // MEDIA_MOJO_MOJOM_SPEECH_RECOGNITION_RESULT_H_