1
    2
    3
    4
    5
    6
    7
    8
    9
   10
   11
   12
   13
   14
   15
   16
   17
   18
   19
   20
   21
   22
   23
   24
   25
   26
   27
   28
   29
   30
   31
   32
   33
   34
   35
   36
   37
   38
   39
   40
   41
   42
   43
   44
   45
   46
   47
   48
   49
   50
   51
   52
   53
   54
   55
   56
   57
   58
   59
   60
   61
   62
   63
   64
   65
   66
   67
   68
   69
   70
   71
   72
   73
   74
   75
   76
   77
   78
   79
   80
   81
   82
   83
   84
   85
   86
   87
   88
   89
   90
   91
   92
   93
   94
   95
   96
   97
   98
   99
  100
  101
  102
  103
  104
  105
  106
  107
  108
  109
  110
  111
  112
  113
  114
  115
  116
  117
  118
  119
  120
  121
  122
  123
  124
  125
  126
  127
  128
  129
  130
  131
  132
  133
  134
  135
  136
  137
  138
  139
  140
  141
  142
  143
  144
  145
  146
  147
  148
  149
  150
  151
  152
  153
  154
  155
  156
  157
  158
  159
  160
  161
  162
  163
  164
  165
  166
  167
  168
  169
  170
  171
  172
  173
  174
  175
  176
  177
  178
  179
  180
  181
  182
  183
  184
  185
  186
  187
  188
  189
  190
  191
  192
  193
  194
  195
  196
  197
  198
  199
  200
  201
  202
  203
  204
  205
  206
  207
  208
  209
  210
  211
  212
  213
  214
  215
  216
  217
  218
  219
  220
  221
  222
  223
  224
  225
  226
  227
  228
  229

media / learning / impl / learning_task_controller_impl.cc [blame]

// Copyright 2018 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "media/learning/impl/learning_task_controller_impl.h"

#include <memory>
#include <utility>
#include <vector>

#include "base/check_op.h"
#include "base/functional/bind.h"
#include "base/notreached.h"
#include "media/learning/impl/distribution_reporter.h"
#include "media/learning/impl/extra_trees_trainer.h"
#include "media/learning/impl/lookup_table_trainer.h"

namespace media {
namespace learning {

// Builds a controller for |task|.  |reporter| may be null, in which case no
// prediction results are reported.  |feature_provider| is handed to the helper
// that assembles complete examples before they reach AddFinishedExample().
LearningTaskControllerImpl::LearningTaskControllerImpl(
    const LearningTask& task,
    std::unique_ptr<DistributionReporter> reporter,
    SequenceBoundFeatureProvider feature_provider)
    : task_(task),
      training_data_(std::make_unique<TrainingData>()),
      reporter_(std::move(reporter)),
      expected_feature_count_(task_.feature_descriptions.size()) {
  // Finished examples come back to us through a weak pointer, so a late
  // callback from the helper cannot touch a destroyed controller.
  auto on_example_finished =
      base::BindRepeating(&LearningTaskControllerImpl::AddFinishedExample,
                          weak_ptr_factory_.GetWeakPtr());

  // The helper is deliberately given the unmodified |task|: it always works
  // with the full feature set, even when we train on a subset.
  helper_ = std::make_unique<LearningTaskControllerHelper>(
      task, std::move(on_example_finished), std::move(feature_provider));

  // TODO(liberato): Make this compositional.  FeatureSubsetTaskController?
  if (task_.feature_subset_size) {
    DoFeatureSubsetSelection();
  }

  // Instantiate the training algorithm that matches the requested model type.
  switch (task_.model) {
    case LearningTask::Model::kExtraTrees: {
      trainer_ = std::make_unique<ExtraTreesTrainer>();
      break;
    }
    case LearningTask::Model::kLookupTable: {
      trainer_ = std::make_unique<LookupTableTrainer>();
      break;
    }
  }
}

// Nothing needs explicit teardown here; the members release what they own
// when they are destroyed.
LearningTaskControllerImpl::~LearningTaskControllerImpl() = default;

// Starts observation |id| with the given |features|.  Silently ignored when
// there is no trainer.  |default_target| must be empty; see below.
void LearningTaskControllerImpl::BeginObservation(
    base::UnguessableToken id,
    const FeatureVector& features,
    const std::optional<TargetValue>& default_target,
    const std::optional<ukm::SourceId>& source_id) {
  // TODO(liberato): Should we enforce that the right number of features are
  // present here?  Right now, we allow it to be shorter, so that features from
  // a FeatureProvider may be omitted.  Of course, they have to be at the end in
  // that case.  If we start enforcing it here, make sure that LearningHelper
  // starts adding the placeholder features.
  if (trainer_) {
    // Default targets are not supported at this level: this is the base
    // learner and can't easily provide them.  The (weak) controllers handed
    // out by LearningSessionImpl deal with defaults themselves, so one should
    // never arrive here.
    DCHECK(!default_target.has_value());

    helper_->BeginObservation(id, features, source_id);
  }
}

// Forwards completion of observation |id| to the helper.  Does nothing when
// there is no trainer.
void LearningTaskControllerImpl::CompleteObservation(
    base::UnguessableToken id,
    const ObservationCompletion& completion) {
  if (trainer_) {
    helper_->CompleteObservation(id, completion);
  }
}

// Forwards cancellation of observation |id| to the helper.  Does nothing when
// there is no trainer.
void LearningTaskControllerImpl::CancelObservation(base::UnguessableToken id) {
  if (trainer_) {
    helper_->CancelObservation(id);
  }
}

// Not expected to be called.  This controller does not handle default targets
// (BeginObservation() DCHECKs that none is supplied), so there is nothing to
// update for observation |id|.
void LearningTaskControllerImpl::UpdateDefaultTarget(
    base::UnguessableToken id,
    const std::optional<TargetValue>& default_target) {
  NOTREACHED();
}

// Returns this controller's copy of the task.  Note that when feature subset
// selection is enabled, DoFeatureSubsetSelection() has already trimmed
// |feature_descriptions| in this copy down to the chosen subset.
const LearningTask& LearningTaskControllerImpl::GetLearningTask() {
  return task_;
}

// Runs |callback| immediately with the current model's prediction for
// |features|, or with std::nullopt if no model has been trained yet.
void LearningTaskControllerImpl::PredictDistribution(
    const FeatureVector& features,
    PredictionCB callback) {
  if (!model_) {
    // No trained model is available, so there is no prediction to give.
    std::move(callback).Run(std::nullopt);
    return;
  }

  std::move(callback).Run(model_->PredictDistribution(features));
}

// Called (via |helper_|) once an example has all of its features and its
// target value.  Stores the example, reports how the current model would have
// predicted it, and starts training a new model once enough untrained
// examples have accumulated.
void LearningTaskControllerImpl::AddFinishedExample(LabelledExample example,
                                                    ukm::SourceId source_id) {
  // Without a trainer there is nothing useful to do with the example.
  if (!trainer_) {
    return;
  }

  // The example must carry the original number of features.  We can't compare
  // against |task_.feature_descriptions.size()|, because that has already been
  // reduced to the subset size.
  if (example.features.size() != expected_feature_count_) {
    return;
  }

  // With the complete feature vector in hand, keep only the selected subset.
  // If no subset was requested, every feature is kept.
  if (task_.feature_subset_size) {
    FeatureVector subset;
    for (const auto& index : feature_indices_) {
      subset.push_back(example.features[index]);
    }
    example.features = std::move(subset);
  }

  // From here on the example's features line up with the task's descriptions.
  DCHECK_EQ(example.features.size(), task_.feature_descriptions.size());

  const bool data_set_is_full =
      training_data_->size() >= task_.max_data_set_size;
  if (!data_set_is_full) {
    training_data_->push_back(example);
  } else {
    // Overwrite a randomly chosen example.  Evicting the oldest one would
    // impose an ad-hoc recency constraint, which is a separate question.
    const auto victim = rng()->Generate(training_data_->size());
    (*training_data_)[victim] = example;
  }
  // Added or replaced, this is one more example no model has been trained on.
  ++num_untrained_examples_;

  // If there is a model (and somebody to tell), record what it would have
  // predicted for |example| alongside what was actually observed.
  if (model_ && reporter_) {
    DistributionReporter::PredictionInfo info;
    info.observed = example.target_value;
    info.source_id = source_id;
    info.total_training_weight = last_training_weight_;
    info.total_training_examples = last_training_size_;

    TargetHistogram predicted = model_->PredictDistribution(example.features);
    reporter_->GetPredictionCallback(info).Run(predicted);
  }

  // Only one model may be in training at a time.
  if (training_is_in_progress_) {
    return;
  }

  // Retrain only when the share of untrained examples is large enough.  This
  // holds up even when new examples replace old ones instead of growing the
  // set.
  const double untrained_fraction =
      static_cast<double>(num_untrained_examples_) / training_data_->size();
  if (untrained_fraction < task_.min_new_data_fraction) {
    return;
  }

  num_untrained_examples_ = 0;

  // Remember what this training run was based on, for metrics.
  last_training_weight_ = training_data_->total_weight();
  last_training_size_ = training_data_->size();

  TrainedModelCB on_trained =
      base::BindOnce(&LearningTaskControllerImpl::OnModelTrained,
                     weak_ptr_factory_.GetWeakPtr(),
                     training_data_->total_weight(), training_data_->size());
  training_is_in_progress_ = true;

  // Train() copies the training data, so adding further examples to our own
  // copy before it finishes is fine.
  // TODO(liberato): Post to a background task runner, and bind |model_cb| to
  // the current one.  Be careful about ownership if we invalidate |trainer_|
  // on this thread.  Be sure to post destruction to that sequence.
  trainer_->Train(task_, *training_data_, std::move(on_trained));
}

// Completion callback for a training run.  Installs |model| as the current
// model and records the weight and size of the data set it was trained on.
void LearningTaskControllerImpl::OnModelTrained(double training_weight,
                                                int training_size,
                                                std::unique_ptr<Model> model) {
  // This must be the completion of the run started by AddFinishedExample().
  DCHECK(training_is_in_progress_);

  model_ = std::move(model);

  // Keep these around for metrics reporting.
  last_training_weight_ = training_weight;
  last_training_size_ = training_size;

  // A new training run may be started from now on.
  training_is_in_progress_ = false;
}

// Test hook: replaces the training algorithm chosen by the constructor with
// |trainer|.  Note that a null |trainer| turns BeginObservation(),
// CompleteObservation(), CancelObservation() and AddFinishedExample() into
// no-ops, since each of them bails out early when |trainer_| is unset.
void LearningTaskControllerImpl::SetTrainerForTesting(
    std::unique_ptr<TrainingAlgorithm> trainer) {
  trainer_ = std::move(trainer);
}

void LearningTaskControllerImpl::DoFeatureSubsetSelection() {
  // Choose a random feature, and trim the descriptions to match.
  std::vector<size_t> features;
  for (size_t i = 0; i < task_.feature_descriptions.size(); i++)
    features.push_back(i);

  for (int i = 0; i < *task_.feature_subset_size; i++) {
    // Pick an element from |i| to the end of the list, inclusive.
    // TODO(liberato): For tests, this will happen before any rng is provided
    // by the test; we'll use an actual rng.
    int r = rng()->Generate(features.size() - i) + i;
    // Swap them.
    std::swap(features[i], features[r]);
  }

  // Construct the feature subset from the first few elements.  Also adjust the
  // task's descriptions to match.  We do this in two steps so that the
  // descriptions are added via iterating over |feature_indices_|, so that the
  // enumeration order is the same as when we adjust the feature values of
  // incoming examples.  In both cases, we iterate over |feature_indicies_|,
  // which might (will) re-order them with respect to |features|.
  for (int i = 0; i < *task_.feature_subset_size; i++)
    feature_indices_.insert(features[i]);

  std::vector<LearningTask::ValueDescription> adjusted_descriptions;
  for (auto& iter : feature_indices_)
    adjusted_descriptions.push_back(task_.feature_descriptions[iter]);

  task_.feature_descriptions = adjusted_descriptions;

  if (reporter_)
    reporter_->SetFeatureSubset(feature_indices_);
}

}  // namespace learning
}  // namespace media