1
    2
    3
    4
    5
    6
    7
    8
    9
   10
   11
   12
   13
   14
   15
   16
   17
   18
   19
   20
   21
   22
   23
   24
   25
   26
   27
   28
   29
   30
   31
   32
   33
   34
   35
   36
   37
   38
   39
   40
   41
   42
   43
   44
   45
   46
   47
   48
   49
   50
   51
   52
   53
   54
   55
   56
   57
   58
   59
   60
   61
   62
   63
   64
   65
   66
   67
   68
   69
   70
   71
   72
   73
   74
   75
   76
   77
   78
   79
   80
   81
   82
   83
   84
   85
   86
   87
   88
   89
   90
   91
   92
   93
   94
   95
   96
   97
   98
   99
  100
  101
  102
  103
  104
  105
  106
  107
  108
  109
  110
  111
  112
  113
  114
  115
  116
  117
  118
  119
  120
  121
  122
  123
  124
  125
  126
  127
  128
  129
  130
  131
  132
  133
  134
  135
  136
  137
  138
  139
  140
  141
  142
  143
  144
  145
  146
  147
  148

ash / public / cpp / tab_cluster / correlation_clusterer.h [blame]

// Copyright 2021 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef ASH_PUBLIC_CPP_TAB_CLUSTER_CORRELATION_CLUSTERER_H_
#define ASH_PUBLIC_CPP_TAB_CLUSTER_CORRELATION_CLUSTERER_H_

#include <map>
#include <optional>
#include <set>
#include <vector>

#include "ash/public/cpp/ash_public_export.h"
#include "ash/public/cpp/tab_cluster/undirected_graph.h"

namespace ash {

class UndirectedGraph;
class EdgeSum;

// Adapted from
// https://github.com/google-research/google-research/blob/HEAD/parallel_clustering/clustering/config.proto#L66
// Consider a graph with vertex set V, edge set E, non-negative vertex weights
// k_u, edge weights w_uv, and a "resolution" parameter which must be
// non-negative. We define "rescaled" edge weights w'_uv for all u, v, in V as:
//             { 0                                if u == v
// w'_{uv} =   {  w_uv  -  resolution k_u k_v     if {u, v} in E,
//             {  -resolution k_u k_v             otherwise
// The correlation clustering objective is to maximize
//   sum_{u, v in the same cluster} w'_uv,
// which is equivalent (up to sign and an additive constant) to the
// "maximizing agreements" and "minimizing disagreements" formulations of
// correlation clustering that are used in the approximation algorithms
// literature. Assuming the total edge weight in the graph is M, modularity
// partitioning can be expressed in this form by:
//  * setting resolution = 1/(2*M),
//  * setting the weight of each node to the total weight of its incident edges.
// Note that the final correlation clustering objective is monotonic in, but not
// equal to modularity. In particular, if the correlation clustering objective
// is C, we have:
// modularity = (C - resolution * sum_v (deg_v)^2 / (4 * M)) / M.
//
// To optimize this objective we use local search. We start with each vertex in
// its own cluster. We consider moves of the following form: move all vertices
// in a "move set" S of vertices to either one of the existing clusters or to a
// newly created cluster. We currently consider the following options for S:
//  - One move set per current cluster with all the vertices currently in it.
//    With these move sets we're effectively considering merging two clusters.
// For each move set considered we move that move set to the cluster that
// improves the objective the most if an improving move exists.
// TODO(crbug/1231303): //ash/public mostly contains interfaces, shared
// constants, types and utility classes, which are used to share code between
// //ash and //chrome. It's not common to put all implementation inside
// //ash/public. Find a better place to move this class to.
class ASH_PUBLIC_EXPORT CorrelationClusterer {
 public:
  struct CorrelationClustererConfig {
    double resolution = 1.0;
  };
  CorrelationClusterer();
  CorrelationClusterer(const CorrelationClusterer&) = delete;
  CorrelationClusterer& operator=(const CorrelationClusterer&) = delete;
  ~CorrelationClusterer();

  // Returns a list of clusters for a given undirected graph.
  std::vector<std::vector<int>> Cluster(
      const UndirectedGraph& undirected_graph);

 private:
  // The main clustering function.
  // initial_clustering must include every node in the range
  // [0, MutableGraph().NumNodes()) exactly once. If it doesn't this function
  // will log an error message and return clusters of singleton.
  void RefineClusters(std::vector<std::vector<int>>* clusters_ptr);
  // Fills `clustering_` with a map of node (index) to cluster id (value)
  // from the given set of `clusters`, returns false with non-empty
  // error string when a problem occurred.
  bool SetClustering(const std::vector<std::vector<int>>& clusters,
                     std::string* error);
  // Moves node from its current cluster to a new cluster.
  void MoveNodeToCluster(const int node, const int new_cluster);
  void MoveNodesToCluster(const std::set<int>& nodes,
                          std::optional<int> new_cluster);
  // Returns a pair of:
  //  * The best cluster to move `moving_nodes` to according to the correlation
  //    clustering objective function. Null optional means create a new cluster.
  //  * The change in objective function achieved by that move. May be positive
  //    or negative.
  // The runtime is linear in the number of edges incident to `moving_nodes`.
  std::pair<std::optional<int>, double> BestMove(
      const std::set<int>& moving_nodes);
  // Computes the best move given certain pre-computed sums of edge weights of
  // the following classes of vertices in relation to a fixed set of
  // `moving_nodes` that may change clusters:
  //  * Class 0: Neither node is moving.
  //  * Class 1: Exactly one node is moving.
  //  * Class 2: Both nodes are moving.
  // where "moving" means in moving_nodes.
  //
  // Change in objective if we move all `moving nodes` to cluster i:
  //   class_2_currently_separate + class_1_together_after[i] -
  //   class_1_currently_together
  // where
  //   class_2_currently_separate = Weight of edges in class 2 where the
  //   endpoints are in different clusters currently
  //   class_1_together_after[i] = Weight of edges in class 1 where the
  //   non-moving node is in cluster i
  //   class_1_currently_together = Weight of edges in class 1 where the
  //   endpoints are in the same cluster currently
  //
  // Two complications:
  //   * We need to avoid double-counting pairs in class 2
  //   * We need to account for missing edges, which have weight
  //     -resolution. To do so we subtract the number of edges we see in each
  //     category from the max possible number of edges (i.e. the number of
  //     edges we'd have if the graph was complete).
  std::pair<std::optional<int>, double> BestMoveFromStats(
      double moving_nodes_weight,
      std::map<int, double>& cluster_moving_weights,
      const EdgeSum& class_2_currently_separate,
      const EdgeSum& class_1_currently_together,
      const std::map<int, EdgeSum>& class_1_together_after);
  // Increments `next_cluster_id_`.
  int NewClusterId();
  // Gets the cluster of a given `node` from `clustering_`.
  int ClusterForNode(int node) const;
  // Gets cluster weight of a `cluster_id` from `cluster_weights_` map.
  double GetClusterWeight(int cluster_id) const;
  // Resets data members before performing clustering.
  void Reset();

  UndirectedGraph graph_;
  CorrelationClustererConfig config_;
  // Maps node to its cluster.
  std::vector<int> clustering_;
  // Number of nodes in each cluster.
  // Currently this is used only to detect when a cluster is empty so its
  // entry can be removed from cluster_sizes_ and cluster_weights_.
  std::map<int, int32_t> cluster_sizes_;
  // Sum of the weights of the nodes in each cluster.
  std::map<int, double> cluster_weights_;
  int next_cluster_id_ = 0;
  int num_nodes_ = 0;
};

}  // namespace ash

#endif  // ASH_PUBLIC_CPP_TAB_CLUSTER_CORRELATION_CLUSTERER_H_