
media/mojo/mojom/speech_recognition_result.h

// Copyright 2021 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef MEDIA_MOJO_MOJOM_SPEECH_RECOGNITION_RESULT_H_
#define MEDIA_MOJO_MOJOM_SPEECH_RECOGNITION_RESULT_H_

#include <optional>
#include <string>
#include <vector>

#include "base/time/time.h"

namespace media {

struct HypothesisParts {
  HypothesisParts();
  HypothesisParts(const std::vector<std::string> part, base::TimeDelta offset);
  HypothesisParts(const HypothesisParts&);
  HypothesisParts(HypothesisParts&&);
  HypothesisParts& operator=(const HypothesisParts&);
  HypothesisParts& operator=(HypothesisParts&&);
  ~HypothesisParts();

  bool operator==(const HypothesisParts& rhs) const;

  // A section of the final transcription text. Either an entire word or a
  // single character (depending on the language), with adjacent punctuation.
  // There will usually be only one value here. If formatting is enabled in
  // the speech recognition, then the raw text will be included as the second
  // element.
  std::vector<std::string> text;

  // Time offset from this event's |audio_start_time| defined below. We
  // enforce the following invariant:
  // 0 <= hypothesis_part_offset < |audio_end_time - audio_start_time|.
  base::TimeDelta hypothesis_part_offset;
};
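
// Illustrative sketch (not part of the original header): constructing a
// HypothesisParts entry. Assumes formatting is enabled, so the formatted text
// is followed by the raw text; the strings and the 250ms offset are made-up
// values.
//
//   media::HypothesisParts part({"Hello,", "hello"}, base::Milliseconds(250));
//   // part.text[0] == "Hello," (formatted), part.text[1] == "hello" (raw).
//   // part.hypothesis_part_offset is relative to the event's
//   // |audio_start_time| and must stay below
//   // |audio_end_time - audio_start_time|.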

struct TimingInformation {
  TimingInformation();
  TimingInformation(const TimingInformation&);
  TimingInformation(TimingInformation&&);
  TimingInformation& operator=(const TimingInformation&);
  TimingInformation& operator=(TimingInformation&&);
  ~TimingInformation();

  bool operator==(const TimingInformation& rhs) const;

  // Start time in audio time from the start of the SODA session.
  // This time measures the amount of audio input into SODA.
  base::TimeDelta audio_start_time;

  // Elapsed processed audio from first frame after preamble.
  base::TimeDelta audio_end_time;

  // The timing information for each word/letter in the transcription.
  // HypothesisPartsInResult was introduced in min version 1 in
  // chromeos/services/machine_learning/public/mojom/soda.mojom, so it must be
  // optional. |hypothesis_parts| may be a non-empty optional containing a
  // zero-length vector if no words were spoken during the event's time span.
  std::optional<std::vector<HypothesisParts>> hypothesis_parts;
};
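
// Illustrative sketch (not part of the original header): populating a
// TimingInformation for a hypothetical event spanning 1s to 3s of audio, with
// a single one-word hypothesis part. All values are made up.
//
//   media::TimingInformation timing;
//   timing.audio_start_time = base::Seconds(1);
//   timing.audio_end_time = base::Seconds(3);
//   timing.hypothesis_parts.emplace();  // Non-empty optional, empty vector.
//   timing.hypothesis_parts->emplace_back(std::vector<std::string>{"Hi"},
//                                         base::Milliseconds(100));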

// A speech recognition result created by the speech service and passed to the
// SpeechRecognitionRecognizerClient.
struct SpeechRecognitionResult {
  SpeechRecognitionResult();
  SpeechRecognitionResult(const std::string transcript, bool is_final);
  SpeechRecognitionResult(const SpeechRecognitionResult&);
  SpeechRecognitionResult(SpeechRecognitionResult&&);
  SpeechRecognitionResult& operator=(const SpeechRecognitionResult&);
  SpeechRecognitionResult& operator=(SpeechRecognitionResult&&);
  ~SpeechRecognitionResult();

  bool operator==(const SpeechRecognitionResult& rhs) const;

  std::string transcription;

  // A flag indicating whether the result is final. If true, the result is
  // locked in and the next result returned will not overlap with the previous
  // final result.
  bool is_final = false;

  // Timing information for the current transcription.
  std::optional<TimingInformation> timing_information;
};
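
// Illustrative sketch (not part of the original header): building a final
// result and attaching timing information; |timing| refers to the sketch
// above and the transcript string is a made-up value.
//
//   media::SpeechRecognitionResult result("hi there", /*is_final=*/true);
//   result.timing_information = std::move(timing);
//   // A final result is locked in; the next result will not overlap with it.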

}  // namespace media

#endif  // MEDIA_MOJO_MOJOM_SPEECH_RECOGNITION_RESULT_H_