1
    2
    3
    4
    5
    6
    7
    8
    9
   10
   11
   12
   13
   14
   15
   16
   17
   18
   19
   20
   21
   22
   23
   24
   25
   26
   27
   28
   29
   30
   31
   32
   33
   34
   35
   36
   37
   38
   39
   40
   41
   42
   43
   44
   45
   46
   47
   48
   49
   50
   51
   52
   53
   54
   55
   56
   57
   58
   59
   60
   61
   62
   63
   64
   65
   66
   67
   68
   69
   70
   71
   72
   73
   74
   75
   76
   77
   78
   79
   80
   81
   82
   83
   84
   85
   86
   87
   88
   89
   90
   91
   92
   93
   94
   95
   96
   97
   98
   99
  100
  101
  102
  103
  104
  105
  106
  107
  108
  109
  110
  111
  112
  113
  114
  115
  116
  117
  118
  119
  120
  121
  122
  123
  124
  125
  126
  127
  128
  129
  130
  131
  132
  133
  134
  135
  136
  137
  138
  139
  140
  141
  142
  143
  144
  145
  146
  147
  148
  149
  150
  151
  152
  153
  154
  155
  156
  157
  158
  159
  160
  161
  162
  163
  164
  165
  166
  167
  168
  169
  170
  171
  172
  173
  174
  175
  176
  177
  178
  179
  180
  181
  182
  183
  184
  185
  186
  187
  188
  189
  190
  191
  192
  193
  194
  195
  196
  197
  198
  199
  200
  201
  202
  203
  204
  205
  206
  207
  208
  209
  210
  211
  212
  213
  214
  215
  216
  217
  218
  219
  220
  221

media / learning / common / learning_task.h [blame]

// Copyright 2018 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef MEDIA_LEARNING_COMMON_LEARNING_TASK_H_
#define MEDIA_LEARNING_COMMON_LEARNING_TASK_H_

#include <initializer_list>
#include <optional>
#include <string>
#include <vector>

#include "base/component_export.h"
#include "media/learning/common/value.h"

namespace media {
namespace learning {

// Description of a learning task.  This includes both the description of the
// inputs (features) and output (target value), plus a choice of the model and
// parameters for learning.
// TODO(liberato): Consider separating the task from the choice of model.
// TODO(liberato): should this be in impl?  Probably not if we want to allow
// registering tasks.
struct COMPONENT_EXPORT(LEARNING_COMMON) LearningTask {
  // Numeric ID for this task for UKM reporting.
  using Id = uint64_t;

  // Not all models support all feature / target descriptions.  For example,
  // NaiveBayes requires kUnordered features.  Similarly, LogLinear wouldn't
  // support kUnordered features or targets.  kRandomForest might support more
  // combinations of orderings and types.
  enum class Model {
    kExtraTrees,
    kLookupTable,

    // For the fuzzer.
    kMaxValue = kLookupTable
  };

  enum class Ordering {
    // Values are not ordered; nearby values might have wildly different
    // meanings.  For example, two ints that are computed by taking the hash
    // of a string are unordered; it's categorical data.  Values of type DOUBLE
    // should almost certainly not be kUnordered; discretize them in some way
    // if you really want to make discrete, unordered buckets out of them.
    kUnordered,

    // Values may be interpreted as being in numeric order.  For example, two
    // ints that represent the number of elapsed milliseconds are numerically
    // ordered in a meaningful way.
    kNumeric,

    // For the fuzzer.
    kMaxValue = kNumeric
  };

  enum class PrivacyMode {
    // Value represents private information, such as a URL that was visited by
    // the user.
    kPrivate,

    // Value does not represent private information, such as video width.
    kPublic,

    // For the fuzzer.
    kMaxValue = kPublic
  };

  // Description of how a Value should be interpreted.
  struct ValueDescription {
    // Name of this value, such as "source_url" or "width".
    std::string name;

    // Is this value nominal or not?
    Ordering ordering = Ordering::kUnordered;

    // Should this value be treated as being private?
    PrivacyMode privacy_mode = PrivacyMode::kPublic;
  };

  LearningTask();
  LearningTask(const std::string& name,
               Model model,
               std::initializer_list<ValueDescription> feature_init_list,
               ValueDescription target_description);
  LearningTask(const LearningTask&);
  ~LearningTask();

  // Return a stable, unique numeric ID for this task.  This requires a stable,
  // unique |name| for the task.  This is used to identify this task in UKM.
  Id GetId() const;

  // Returns a reference to an empty learning task.
  static const LearningTask& Empty();

  // Unique name for this task.
  std::string name;

  Model model = Model::kExtraTrees;

  std::vector<ValueDescription> feature_descriptions;

  // Note that kUnordered targets indicate classification, while kNumeric
  // targets indicate regression.
  ValueDescription target_description;

  // TODO(liberato): add training parameters, like smoothing constants.  It's
  // okay if some of these are model-specific.
  // TODO(liberato): switch to base::DictionaryValue?

  // Maximum data set size until we start replacing examples.
  size_t max_data_set_size = 100u;

  // Fraction of examples that must be new before the task controller will train
  // a new model.  Note that this is a fraction of the number of examples that
  // we currently have, which might be less than |max_data_set_size|.
  double min_new_data_fraction = 0.1;

  // If provided, then we'll randomly select a |*feature_subset_size|-sized set
  // of features to train the model with, to allow for feature importance
  // measurement.  Note that UMA reporting only supports subsets of size one, or
  // the whole set.
  std::optional<int> feature_subset_size;

  // RandomForest parameters

  // Number of trees in the random forest.
  size_t rf_number_of_trees = 100;

  // Should ExtraTrees apply one-hot conversion automatically?  RandomTree has
  // been modified to support nominals directly, though it isn't exactly the
  // same as one-hot conversion.  It is, however, much faster.
  bool use_one_hot_conversion = false;

  // Reporting parameters

  // This is a hack for the initial media capabilities investigation. It
  // represents the threshold that we'll use to decide if a prediction would be
  // T / F.  We should not do this -- instead we should report the distribution
  // average for the prediction and the observation via UKM.
  //
  // In particular, if the percentage of dropped frames is greater than this,
  // then report "false" (not smooth), else we report true.
  //
  // A better, non-hacky approach would be to report the predictions and
  // observations directly, and do offline analysis with whatever threshold we
  // like.  This would remove the thresholding requirement, and also permit
  // additional types of analysis for general regression tasks, such as
  // measuring the prediction error directly.
  //
  // The UKM reporter will support this.
  double smoothness_threshold = 0.1;

  // If set, then we'll record a confusion matrix (hackily, see
  // |smoothness_threshold|, above, for what that means) to UMA for all
  // predictions.  Add this task's name to histograms.xml, in the histogram
  // suffixes for "Media.Learning.BinaryThreshold.Aggregate".  The threshold is
  // chosen by |smoothness_threshold|.
  //
  // This option is ignored if feature subset selection is in use.
  bool uma_hacky_aggregate_confusion_matrix = false;

  // If set, then we'll record a histogram of many confusion matrices, split out
  // by the total training data weight that was used to construct the model.  Be
  // sure to add this task's name to histograms.xml, in the histogram suffixes
  // for "Media.Learning.BinaryThreshold.ByTrainingWeight".  The threshold is
  // chosen by |smoothness_threshold|.
  //
  // This option is ignored if feature subset selection is in use.
  bool uma_hacky_by_training_weight_confusion_matrix = false;

  // If set, then we'll record a histogram of many confusion matrices, split out
  // by the (single) selected feature subset.  This does nothing if we're not
  // using feature subsets, or if the subset size isn't one.  Be sure to add
  // this task's name to histograms.xml, in the histogram suffixes for
  // "Media.Learning.BinaryThreshold.ByFeature" too.
  bool uma_hacky_by_feature_subset_confusion_matrix = false;

  // Maximum training weight for UMA reporting.  We'll report results offset
  // into different confusion matrices in the same histogram, evenly spaced
  // from 0 to |max_reporting_weight|, with one additional bucket for everything
  // larger than that.  The number of buckets is |num_reporting_weight_buckets|.
  // The default value of 0 is special; it means that we should split up the
  // buckets such that the last bucket means "entirely full training set", while
  // the remainder are evenly spaced.  This is the same as setting it to
  // |max_data_set_size - 1|.  Of course, |max_data_set_size| is a number of
  // examples, not a weight, so this only makes any sense at all if all of the
  // examples have the default weight of 1.
  double max_reporting_weight = 0.;

  // Number of buckets that we'll use to split out the confusion matrix by
  // training weight.  The last one is reserved for "all", while the others are
  // split evenly from 0 to |max_reporting_weight|, inclusive.  One can select
  // up to 15 buckets.  We use 11 by default, so it breaks up the default weight
  // into buckets of size 10.
  //
  // In other words, the defaults will make these buckets:
  // [0-9] [10-19] ... [90-99] [100 and up].  This makes sense if the training
  // set maximum size is the default of 100, and each example has a weight of 1.
  int num_reporting_weight_buckets = 11;

  // If set, then we'll record results to UKM.  Note that this may require an
  // additional privacy review for your learning task!  Also note that it is
  // currently exclusive with the |uma_hacky_*| confusion matrix options above
  // for no technical reason whatsoever.
  bool report_via_ukm = false;

  // When reporting via UKM, we will scale observed / predicted values.  These
  // are the minimum and maximum target / observed values that will be
  // representable.  The UKM record will scale / translate this range into
  // 0-100 integer, inclusive.  This is intended for regression targets.
  // Classification will do something else.
  double ukm_min_input_value = 0.0;
  double ukm_max_input_value = 1.0;
};

}  // namespace learning
}  // namespace media

#endif  // MEDIA_LEARNING_COMMON_LEARNING_TASK_H_