// content/renderer/accessibility/annotations/ax_image_stopwords.h

// Copyright 2020 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef CONTENT_RENDERER_ACCESSIBILITY_ANNOTATIONS_AX_IMAGE_STOPWORDS_H_
#define CONTENT_RENDERER_ACCESSIBILITY_ANNOTATIONS_AX_IMAGE_STOPWORDS_H_

#include <string_view>

#include "base/containers/flat_set.h"
#include "base/no_destructor.h"
#include "content/common/content_export.h"

namespace content {

// Maintains a set of image stopwords and provides a function to check
// whether or not a given word is an image stopword.
//
// A stopword in general is a word that's filtered out before doing
// natural language processing. In English, common stopwords include
// "the" or "of" - they are words that are part of grammatically correct
// sentences but don't add any useful semantics themselves.
//
// This set is used as part of an algorithm to determine whether the
// accessible label for an image (including the "alt" attribute and
// other attributes) contains a useful description or not. For this
// application, the set includes both common stopwords like "the" and
// image-related words like "image" and "photo", because an image that's
// just labeled with the word "photo" is essentially unlabeled.
//
// Stopwords from all supported languages are grouped together, because
// it's simpler to just have one set rather than to try to split by the
// element language (which is sometimes wrong). This leads to a small
// but acceptable number of false positives if a stopword in one language
// is a meaningful word in another language.
//
// The set of supported languages should include all of the languages
// that we can generate automatic image descriptions for. This will grow
// over time.
//
// Words consisting of just one or two characters made up of letters from
// Latin alphabets are always considered stopwords, but that doesn't
// generalize to all languages / character sets.
//
// The set of stopwords was obtained by extracting the alt text of images
// from billions of web pages, tokenizing, counting, and then manually
// categorizing the top words, with the help of dictionaries and language
// experts. More details in this (Google-internal) design doc:
// http://goto.google.com/augment-existing-image-descriptions
class CONTENT_EXPORT AXImageStopwords {
 public:
  static AXImageStopwords& GetInstance();

  // The input should be a word, after already splitting by punctuation and
  // whitespace. Returns true if the word is an image stopword.
  // Case-insensitive and language-neutral (includes words from all
  // languages).
  bool IsImageStopword(const char* utf8_string) const;

 private:
  friend base::NoDestructor<AXImageStopwords>;

  AXImageStopwords();
  ~AXImageStopwords();

  base::flat_set<std::string_view> stopword_set_;
};

}  // namespace content

#endif  // CONTENT_RENDERER_ACCESSIBILITY_ANNOTATIONS_AX_IMAGE_STOPWORDS_H_