1
    2
    3
    4
    5
    6
    7
    8
    9
   10
   11
   12
   13
   14
   15
   16
   17
   18
   19
   20
   21
   22
   23
   24
   25
   26
   27
   28
   29
   30
   31
   32
   33
   34
   35
   36
   37
   38
   39
   40
   41
   42
   43
   44
   45
   46
   47
   48
   49
   50
   51
   52
   53
   54
   55
   56
   57
   58
   59
   60
   61
   62
   63
   64
   65
   66
   67
   68
   69
   70
   71
   72
   73
   74
   75
   76
   77
   78
   79
   80
   81
   82
   83
   84
   85
   86
   87
   88
   89
   90
   91
   92
   93
   94
   95
   96
   97
   98
   99
  100
  101
  102
  103
  104
  105
  106
  107
  108
  109
  110
  111
  112
  113
  114
  115
  116
  117
  118
  119
  120
  121
  122
  123
  124
  125
  126
  127
  128
  129
  130
  131
  132
  133
  134
  135
  136
  137
  138
  139
  140
  141
  142
  143
  144
  145
  146
  147
  148
  149
  150
  151
  152
  153
  154
  155
  156
  157
  158
  159
  160
  161
  162
  163
  164
  165
  166
  167
  168
  169
  170
  171
  172
  173
  174
  175
  176
  177
  178
  179
  180
  181
  182
  183
  184
  185
  186
  187
  188
  189
  190
  191
  192
  193
  194
  195
  196
  197
  198
  199
  200
  201
  202
  203
  204
  205
  206
  207
  208
  209
  210
  211
  212
  213
  214
  215
  216
  217
  218
  219
  220
  221

media / learning / common / learning_task.h [blame]

// Copyright 2018 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef MEDIA_LEARNING_COMMON_LEARNING_TASK_H_
#define MEDIA_LEARNING_COMMON_LEARNING_TASK_H_

#include <initializer_list>
#include <optional>
#include <string>
#include <vector>

#include "base/component_export.h"
#include "media/learning/common/value.h"

namespace media {
namespace learning {

// Description of a learning task.  This includes both the description of the
// inputs (features) and output (target value), plus a choice of the model and
// parameters for learning.
// TODO(liberato): Consider separating the task from the choice of model.
// TODO(liberato): should this be in impl?  Probably not if we want to allow
// registering tasks.
struct COMPONENT_EXPORT(LEARNING_COMMON) LearningTask {
  // Numeric ID for this task for UKM reporting.
  using Id = uint64_t;

  // Not all models support all feature / target descriptions.  For example,
  // NaiveBayes requires kUnordered features.  Similarly, LogLinear wouldn't
  // support kUnordered features or targets.  kRandomForest might support more
  // combinations of orderings and types.
  enum class Model {
    kExtraTrees,
    kLookupTable,

    // For the fuzzer.
    kMaxValue = kLookupTable
  };

  enum class Ordering {
    // Values are not ordered; nearby values might have wildly different
    // meanings.  For example, two ints that are computed by taking the hash
    // of a string are unordered; it's categorical data.  Values of type DOUBLE
    // should almost certainly not be kUnordered; discretize them in some way
    // if you really want to make discrete, unordered buckets out of them.
    kUnordered,

    // Values may be interpreted as being in numeric order.  For example, two
    // ints that represent the number of elapsed milliseconds are numerically
    // ordered in a meaningful way.
    kNumeric,

    // For the fuzzer.
    kMaxValue = kNumeric
  };

  enum class PrivacyMode {
    // Value represents private information, such as a URL that was visited by
    // the user.
    kPrivate,

    // Value does not represent private information, such as video width.
    kPublic,

    // For the fuzzer.
    kMaxValue = kPublic
  };

  // Description of how a Value should be interpreted.
  struct ValueDescription {
    // Name of this value, such as "source_url" or "width".
    std::string name;

    // Is this value nominal or not?
    Ordering ordering = Ordering::kUnordered;

    // Should this value be treated as being private?
    PrivacyMode privacy_mode = PrivacyMode::kPublic;
  };

  LearningTask();
  LearningTask(const std::string& name,
               Model model,
               std::initializer_list<ValueDescription> feature_init_list,
               ValueDescription target_description);
  LearningTask(const LearningTask&);
  ~LearningTask();

  // Return a stable, unique numeric ID for this task.  This requires a stable,
  // unique |name| for the task.  This is used to identify this task in UKM.
  Id GetId() const;

  // Returns a reference to an empty learning task.
  static const LearningTask& Empty();

  // Unique name for this task.
  std::string name;

  Model model = Model::kExtraTrees;

  std::vector<ValueDescription> feature_descriptions;

  // Note that kUnordered targets indicate classification, while kNumeric
  // targets indicate regression.
  ValueDescription target_description;

  // TODO(liberato): add training parameters, like smoothing constants.  It's
  // okay if some of these are model-specific.
  // TODO(liberato): switch to base::DictionaryValue?

  // Maximum data set size until we start replacing examples.
  size_t max_data_set_size = 100u;

  // Fraction of examples that must be new before the task controller will train
  // a new model.  Note that this is a fraction of the number of examples that
  // we currently have, which might be less than |max_data_set_size|.
  double min_new_data_fraction = 0.1;

  // If provided, then we'll randomly select a |*feature_subset_size|-sized set
  // of features to train the model with, to allow for feature importance
  // measurement.  Note that UMA reporting only supports subsets of size one, or
  // the whole set.
  std::optional<int> feature_subset_size;

  // RandomForest parameters

  // Number of trees in the random forest.
  size_t rf_number_of_trees = 100;

  // Should ExtraTrees apply one-hot conversion automatically?  RandomTree has
  // been modified to support nominals directly, though it isn't exactly the
  // same as one-hot conversion.  It is, however, much faster.
  bool use_one_hot_conversion = false;

  // Reporting parameters

  // This is a hack for the initial media capabilities investigation. It
  // represents the threshold that we'll use to decide if a prediction would be
  // T / F.  We should not do this -- instead we should report the distribution
  // average for the prediction and the observation via UKM.
  //
  // In particular, if the percentage of dropped frames is greater than this,
  // then report "false" (not smooth), else we report true.
  //
  // A better, non-hacky approach would be to report the predictions and
  // observations directly, and do offline analysis with whatever threshold we
  // like.  This would remove the thresholding requirement, and also permit
  // additional types of analysis for general regression tasks, such as
  // measuring the prediction error directly.
  //
  // The UKM reporter will support this.
  double smoothness_threshold = 0.1;

  // If set, then we'll record a confusion matrix (hackily, see
  // |smoothness_threshold|, above, for what that means) to UMA for all
  // predictions.  Add this task's name to histograms.xml, in the histogram
  // suffixes for "Media.Learning.BinaryThreshold.Aggregate".  The threshold is
  // chosen by |smoothness_threshold|.
  //
  // This option is ignored if feature subset selection is in use.
  bool uma_hacky_aggregate_confusion_matrix = false;

  // If set, then we'll record a histogram of many confusion matrices, split out
  // by the total training data weight that was used to construct the model.  Be
  // sure to add this task's name to histograms.xml, in the histogram suffixes
  // for "Media.Learning.BinaryThreshold.ByTrainingWeight".  The threshold is
  // chosen by |smoothness_threshold|.
  //
  // This option is ignored if feature subset selection is in use.
  bool uma_hacky_by_training_weight_confusion_matrix = false;

  // If set, then we'll record a histogram of many confusion matrices, split out
  // by the (single) selected feature subset.  This does nothing if we're not
  // using feature subsets, or if the subset size isn't one.  Be sure to add
  // this task's name to histograms.xml, in the histogram suffixes for
  // "Media.Learning.BinaryThreshold.ByFeature" too.
  bool uma_hacky_by_feature_subset_confusion_matrix = false;

  // Maximum training weight for UMA reporting.  We'll report results offset
  // into different confusion matrices in the same histogram, evenly spaced
  // from 0 to |max_reporting_weight|, with one additional bucket for everything
  // larger than that.  The number of buckets is |num_reporting_weight_buckets|.
  // The default value of 0 is special; it means that we should split up the
  // buckets such that the last bucket means "entirely full training set", while
  // the remainder are evenly spaced.  This is the same as setting it to
  // |max_data_set_size - 1|.  Of course, |max_data_set_size| is a number of
  // examples, not a weight, so this only makes any sense at all if all of the
  // examples have the default weight of 1.
  double max_reporting_weight = 0.;

  // Number of buckets that we'll use to split out the confusion matrix by
  // training weight.  The last one is reserved for "all", while the others are
  // split evenly from 0 to |max_reporting_weight|, inclusive.  One can select
  // up to 15 buckets.  We use 11 by default, so it breaks up the default weight
  // into buckets of size 10.
  //
  // In other words, the defaults will make these buckets:
  // [0-9] [10-19] ... [90-99] [100 and up].  This makes sense if the training
  // set maximum size is the default of 100, and each example has a weight of 1.
  int num_reporting_weight_buckets = 11;

  // If set, then we'll record results to UKM.  Note that this may require an
  // additional privacy review for your learning task!  Also note that it is
  // currently exclusive with the |uma_hacky_*| confusion matrix options above
  // for no technical reason whatsoever.
  bool report_via_ukm = false;

  // When reporting via UKM, we will scale observed / predicted values.  These
  // are the minimum and maximum target / observed values that will be
  // representable.  The UKM record will scale / translate this range into
  // 0-100 integer, inclusive.  This is intended for regression targets.
  // Classification will do something else.
  double ukm_min_input_value = 0.0;
  double ukm_max_input_value = 1.0;
};

}  // namespace learning
}  // namespace media

#endif  // MEDIA_LEARNING_COMMON_LEARNING_TASK_H_