1
    2
    3
    4
    5
    6
    7
    8
    9
   10
   11
   12
   13
   14
   15
   16
   17
   18
   19
   20
   21
   22
   23
   24
   25
   26
   27
   28
   29
   30
   31
   32
   33
   34
   35
   36
   37
   38
   39
   40
   41
   42
   43
   44
   45
   46
   47
   48
   49
   50
   51
   52
   53
   54
   55
   56
   57
   58
   59
   60
   61
   62
   63
   64
   65
   66
   67
   68
   69
   70
   71
   72
   73
   74
   75
   76
   77
   78
   79
   80
   81
   82
   83
   84
   85
   86
   87
   88
   89
   90
   91
   92
   93
   94
   95
   96
   97
   98
   99
  100
  101
  102
  103
  104
  105
  106
  107
  108
  109
  110
  111
  112
  113
  114
  115
  116
  117
  118
  119
  120
  121
  122
  123
  124
  125
  126
  127
  128
  129
  130
  131
  132
  133
  134
  135
  136
  137
  138
  139
  140
  141
  142
  143
  144
  145
  146
  147
  148
  149
  150
  151
  152
  153
  154
  155
  156
  157
  158
  159
  160
  161
  162
  163
  164
  165
  166
  167
  168
  169
  170
  171
  172
  173
  174
  175
  176
  177
  178
  179
  180
  181
  182
  183
  184
  185
  186
  187
  188
  189
  190
  191
  192
  193
  194
  195
  196
  197
  198
  199
  200
  201
  202
  203
  204
  205
  206
  207
  208
  209
  210
  211
  212
  213
  214
  215
  216
  217
  218
  219
  220
  221
  222
  223
  224
  225
  226
  227
  228
  229
  230
  231
  232
  233
  234
  235
  236
  237
  238
  239
  240
  241
  242
  243
  244
  245
  246
  247
  248
  249
  250
  251
  252
  253
  254
  255
  256
  257
  258
  259
  260
  261
  262
  263
  264
  265
  266
  267
  268
  269
  270
  271
  272
  273
  274
  275
  276
  277
  278
  279
  280
  281
  282
  283
  284
  285
  286
  287
  288
  289
  290
  291
  292
  293
  294
  295
  296
  297
  298
  299
  300
  301
  302
  303
  304
  305
  306
  307
  308
  309
  310
  311
  312
  313
  314
  315
  316
  317
  318
  319
  320
  321
  322
  323
  324
  325
  326
  327
  328
  329
  330
  331
  332
  333
  334
  335
  336
  337
  338
  339
  340
  341
  342
  343
  344
  345
  346
  347
  348
  349
  350
  351
  352
  353
  354
  355
  356
  357
  358
  359
  360
  361
  362
  363
  364
  365
  366
  367
  368
  369
  370
  371
  372
  373
  374
  375
  376
  377
  378
  379
  380
  381
  382
  383
  384
  385
  386
  387
  388
  389
  390
  391
  392
  393
  394
  395
  396
  397
  398
  399
  400
  401
  402
  403
  404
  405
  406
  407
  408
  409
  410
  411
  412
  413
  414
  415
  416
  417
  418
  419
  420
  421
  422
  423
  424
  425
  426
  427
  428
  429
  430
  431
  432
  433
  434
  435
  436
  437
  438
  439
  440
  441
  442
  443
  444
  445
  446
  447
  448
  449
  450
  451
  452
  453
  454
  455
  456
  457
  458
  459
  460
  461
  462
  463
  464
  465
  466
  467
  468
  469
  470
  471
  472
  473
  474
  475
  476
  477
  478
  479
  480
  481
  482
  483
  484
  485
  486
  487
  488
  489
  490

tools / add_header.py [blame]

#!/usr/bin/env python3
# Copyright 2021 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Helper for adding or removing an include to/from source file(s).

clang-format already provides header sorting functionality; however, the
functionality is limited to sorting headers within a block of headers surrounded
by blank lines (these are a heuristic to avoid clang breaking ordering for
headers sensitive to inclusion order, e.g. <windows.h>).

As a result, inserting a new header is a bit more complex than simply inserting
the new header at the top and running clang-format.

This script implements additional logic to:
- classify different blocks of headers by type (C system, C++ system, user)
- find the appropriate insertion point for the new header
- creating a new header block if necessary

As a bonus, it does *also* sort the includes, though any sorting disagreements
with clang-format should be resolved in favor of clang-format.

It also supports removing a header with option `--remove`.

Usage:
tools/add_header.py --header '<utility>' foo/bar.cc foo/baz.cc foo/baz.h
tools/add_header.py --header '<vector>' --remove foo/bar.cc foo/baz.cc foo/baz.h
"""

import argparse
import difflib
import os.path
import re
import sys

# The specific values of these constants are also used as a sort key for
# ordering different header types in the correct relative order.
_HEADER_TYPE_C_SYSTEM = 0
_HEADER_TYPE_CXX_SYSTEM = 1
_HEADER_TYPE_USER = 2
_HEADER_TYPE_INVALID = -1


def ClassifyHeader(decorated_name):
  if IsCSystemHeader(decorated_name):
    return _HEADER_TYPE_C_SYSTEM
  elif IsCXXSystemHeader(decorated_name):
    return _HEADER_TYPE_CXX_SYSTEM
  elif IsUserHeader(decorated_name):
    return _HEADER_TYPE_USER
  else:
    return _HEADER_TYPE_INVALID


def UndecoratedName(decorated_name):
  """Returns the undecorated version of decorated_name by removing "" or <>."""
  assert IsSystemHeader(decorated_name) or IsUserHeader(decorated_name)
  return decorated_name[1:-1]


def IsSystemHeader(decorated_name):
  """Returns true if decorated_name looks like a system header."""
  return decorated_name[0] == '<' and decorated_name[-1] == '>'


def IsCSystemHeader(decorated_name):
  """Returns true if decoraed_name looks like a C system header."""
  return IsSystemHeader(decorated_name) and UndecoratedName(
      decorated_name).endswith('.h')


def IsCXXSystemHeader(decorated_name):
  """Returns true if decoraed_name looks like a C++ system header."""
  return IsSystemHeader(
      decorated_name) and not UndecoratedName(decorated_name).endswith('.h')


def IsUserHeader(decorated_name):
  """Returns true if decoraed_name looks like a user header."""
  return decorated_name[0] == '"' and decorated_name[-1] == '"'


_EMPTY_LINE_RE = re.compile(r'\s*$')
_COMMENT_RE = re.compile(r'\s*//(.*)$')
_INCLUDE_RE = re.compile(
    r'\s*#(import|include)\s+([<"].+?[">])\s*?(?://(.*))?$')


def FindIncludes(lines):
  """Finds the block of #includes, assuming Google+Chrome C++ style source.

  Note that this doesn't simply return a slice of the input lines, because
  having the actual indices simplifies things when generatingn the updated
  source text.

  Args:
    lines: The source text split into lines.

  Returns:
    A tuple of begin, end indices that can be used to slice the input lines to
        contain the includes to process. Returns -1, -1 if no such block of
        input lines could be found.
  """
  begin = end = -1
  for idx, line in enumerate(lines):
    # Skip over any initial comments (e.g. the copyright boilerplate) or empty
    # lines.
    # TODO(dcheng): This means that any preamble comment associated with the
    # first header will be dropped. So far, this hasn't broken anything, but
    # maybe this needs to be more clever.
    # TODO(dcheng): #define and #undef should probably also be allowed.
    if _EMPTY_LINE_RE.match(line) or _COMMENT_RE.match(line):
      continue
    m = _INCLUDE_RE.match(line)
    if not m:
      if begin < 0:
        # No match, but no #includes have been seen yet. Keep scanning for the
        # first #include.
        continue
      # Give up, it's something weird that probably requires manual
      # intervention.
      break

    if begin < 0:
      begin = idx
    end = idx + 1
  return begin, end


class Include(object):
  """Represents an #include/#import and any interesting metadata for it.

  Attributes:
    decorated_name: The name of the header file, decorated with <> for system
      headers or "" for user headers.

    directive: 'include' or 'import'
      TODO(dcheng): In the future, this may need to support C++ modules.

    preamble: Any comment lines that precede this include line, e.g.:

        // This is a preamble comment
        // for a header file.
        #include <windows.h>

      would have a preamble of

        ['// This is a preamble comment', '// for a header file.'].

    inline_comment: Any comment that comes after the #include on the same line,
      e.g.

        #include <windows.h>  // For CreateWindowExW()

      would be parsed with an inline comment of ' For CreateWindowExW'.

    header_type: The header type corresponding to decorated_name as determined
      by ClassifyHeader().

    is_primary_header: True if this is the primary related header of a C++
      implementation file. Any primary header will be sorted to the top in its
      own separate block.
  """

  def __init__(self, decorated_name, directive, preamble, inline_comment):
    self.decorated_name = decorated_name
    assert directive == 'include' or directive == 'import'
    self.directive = directive
    self.preamble = preamble
    self.inline_comment = inline_comment
    self.header_type = ClassifyHeader(decorated_name)
    assert self.header_type != _HEADER_TYPE_INVALID
    self.is_primary_header = False

  def __repr__(self):
    return str((self.decorated_name, self.directive, self.preamble,
                self.inline_comment, self.header_type, self.is_primary_header))

  def ShouldInsertNewline(self, previous_include):
    # Per the Google C++ style guide, different blocks of headers should be
    # separated by an empty line.
    return (self.is_primary_header != previous_include.is_primary_header
            or self.header_type != previous_include.header_type)

  def ToSource(self):
    """Generates a C++ source representation of this include."""
    source = []
    source.extend(self.preamble)
    include_line = '#%s %s' % (self.directive, self.decorated_name)
    if self.inline_comment:
      include_line = include_line + '  //' + self.inline_comment
    source.append(include_line)
    return [line.rstrip() for line in source]


def ParseIncludes(lines):
  """Parses lines into a list of Include objects. Returns None on failure.

  Args:
    lines: A list of strings representing C++ source text.

  Returns:
    A list of Include objects representing the parsed input lines, or None if
    the input lines could not be parsed.
  """
  includes = []
  preamble = []
  for line in lines:
    if _EMPTY_LINE_RE.match(line):
      if preamble:
        # preamble contents are flushed when an #include directive is matched.
        # If preamble is non-empty, that means there is a preamble separated
        # from its #include directive by at least one newline. Just give up,
        # since the sorter has no idea how to preserve structure in this case.
        return None
      continue
    m = _INCLUDE_RE.match(line)
    if not m:
      preamble.append(line)
      continue
    includes.append(Include(m.group(2), m.group(1), preamble, m.group(3)))
    preamble = []
  # In theory, the caller should never pass a list of lines with a dangling
  # preamble. But there's a test case that exercises this, and just in case it
  # actually happens, fail a bit more gracefully.
  if preamble:
    return None
  return includes


def _DecomposePath(filename):
  """Decomposes a filename into a list of directories and the basename.

  Args:
    filename: A filename!

  Returns:
    A tuple of a list of directories and a string basename.
  """
  dirs = []
  dirname, basename = os.path.split(filename)
  while dirname:
    dirname, last = os.path.split(dirname)
    dirs.append(last)
  dirs.reverse()
  # Remove the extension from the basename.
  basename = os.path.splitext(basename)[0]
  return dirs, basename


_PLATFORM_SUFFIX = (
    r'(?:_(?:android|aura|chromeos|fuchsia|ios|linux|mac|ozone|posix|win|x11))?'
)
_TEST_SUFFIX = r'(?:_(?:browser|interactive_ui|perf|ui|unit)?test)?'


def MarkPrimaryInclude(includes, filename):
  """Finds the primary header in includes and marks it as such.

  Per the style guide, if moo.cc's main purpose is to implement or test the
  functionality in moo.h, moo.h should be ordered first in the includes.

  Args:
    includes: A list of Include objects.
    filename: The filename to use as the basis for finding the primary header.
  """
  # Header files never have a primary include.
  if filename.endswith('.h'):
    return

  # First pass. Looking for exact match primary header.
  exact_match_primary_header = f'{os.path.splitext(filename)[0]}.h'
  for include in includes:
    if IsUserHeader(include.decorated_name) and UndecoratedName(
        include.decorated_name) == exact_match_primary_header:
      include.is_primary_header = True
      return

  basis = _DecomposePath(filename)

  # Second pass. The list of includes is searched in reverse order of length.
  # Even though matching is fuzzy, moo_posix.h should take precedence over moo.h
  # when considering moo_posix.cc.
  includes.sort(key=lambda i: -len(i.decorated_name))
  for include in includes:
    if include.header_type != _HEADER_TYPE_USER:
      continue
    to_test = _DecomposePath(UndecoratedName(include.decorated_name))

    # If the basename to test is longer than the basis, just skip it and
    # continue. moo.c should never match against moo_posix.h.
    if len(to_test[1]) > len(basis[1]):
      continue

    # The basename in the two paths being compared need to fuzzily match.
    # This allows for situations where moo_posix.cc implements the interfaces
    # defined in moo.h.
    escaped_basename = re.escape(to_test[1])
    if not (re.match(escaped_basename + _PLATFORM_SUFFIX + _TEST_SUFFIX + '$',
                     basis[1]) or
            re.match(escaped_basename + _TEST_SUFFIX + _PLATFORM_SUFFIX + '$',
                     basis[1])):
      continue

    # The topmost directory name must match, and the rest of the directory path
    # should be 'substantially similar'.
    s = difflib.SequenceMatcher(None, to_test[0], basis[0])
    first_matched = False
    total_matched = 0
    for match in s.get_matching_blocks():
      if total_matched == 0 and match.a == 0 and match.b == 0:
        first_matched = True
      total_matched += match.size

    if not first_matched:
      continue

    # 'Substantially similar' is defined to be:
    # - no more than two differences
    # - at least one match besides the topmost directory
    total_differences = abs(total_matched -
                            len(to_test[0])) + abs(total_matched -
                                                   len(basis[0]))
    # Note: total_differences != 0 is mainly intended to allow more succinct
    # tests (otherwise tests with just a basename would always trip the
    # total_matched < 2 check).
    if total_differences != 0 and (total_differences > 2 or total_matched < 2):
      continue

    include.is_primary_header = True
    return


def SerializeIncludes(includes):
  """Turns includes back into the corresponding C++ source text.

  Args:
    includes: a list of Include objects.

  Returns:
    A list of strings representing C++ source text.
  """
  source = []

  # LINT.IfChange(winheader)
  # Headers that are sorted above others to prevent inclusion order issues.
  # NOTE: The order of these headers is the sort key and will be the order in
  # the output file. It should be set to match whatever clang-format will do.
  special_headers = [
      # Listed first because it must be before initguid.h in the block below.
      '<objbase.h>',

      # Alphabetized block that don't matter relative to each other, but need to
      # be included before any instance of the listed other header. These other
      # listed headers are non-exhaustive examples.
      '<atlbase.h>',      # atlapp.h
      '<initguid.h>',     # emi.h
      '<mmdeviceapi.h>',  # functiondiscoverykeys_devpkey.h
      '<ocidl.h>',        # commdlg.h
      '<ole2.h>',         # intshcut.h
      '<shobjidl.h>',     # propkey.h
      '<tchar.h>',        # tpcshrd.h
      '<unknwn.h>',       # intshcut.h
      '<windows.h>',      # hidclass.h, memoryapi.h, ncrypt.h, shellapi.h,
                          # versionhelpers.h, winbase.h, etc.
      '<winsock2.h>',     # ws2tcpip.h
      '<winternl.h>',     # ntsecapi.h; also needs `#define _NTDEF_`
      '<ws2tcpip.h>',     # iphlpapi.h
  ]

  # LINT.ThenChange(/.clang-format:winheader)

  # Ensure that headers are sorted as follows:
  #
  # 1. The primary header, if any, appears first.
  # 2. All headers of the same type (e.g. C system, C++ system headers, et
  #    cetera) are grouped contiguously.
  # 3. Any special sorting rules needed within each group for satisfying
  #    platform header idiosyncrasies. In practice, this only applies to C
  #    system headers.
  # 4. The remaining headers without special sorting rules are sorted
  #    lexicographically.
  #
  # The for loop below that outputs the actual source text depends on #2 above
  # to insert newlines between different groups of headers.
  def SortKey(include):
    def SpecialSortKey(include):
      lower_name = include.decorated_name.lower()
      for i in range(len(special_headers)):
        if special_headers[i] == lower_name:
          return i
      return len(special_headers)

    return (not include.is_primary_header, include.header_type,
            SpecialSortKey(include), include.decorated_name)

  includes.sort(key=SortKey)

  # Assume there's always at least one include.
  previous_include = None
  for include in includes:
    if previous_include and include.ShouldInsertNewline(previous_include):
      source.append('')
    source.extend(include.ToSource())
    previous_include = include
  return source


def AddHeaderToSource(filename, source, decorated_name, remove=False):
  """Adds or removes the specified header into/from the source text, if needed.

  Args:
    filename: The name of the source file.
    source: A string containing the contents of the source file.
    decorated_name: The decorated name of the header to add or remove.
    remove: If true, remove instead of adding.

  Returns:
    None if no changes are needed or the modified source text otherwise.
  """
  lines = source.splitlines()
  begin, end = FindIncludes(lines)

  # No #includes in this file. Just give up.
  # TODO(dcheng): Be more clever and insert it after the file-level comment or
  # include guard as appropriate.
  if begin < 0:
    print(f'Skipping {filename}: unable to find includes!')
    return None

  includes = ParseIncludes(lines[begin:end])
  if not includes:
    print(f'Skipping {filename}: unable to parse includes!')
    return None

  if remove:
    for i in includes:
      if decorated_name == i.decorated_name:
        includes.remove(i)
        break
    else:
      print(f'Skipping {filename}: unable to find {decorated_name}')
      return None
  else:
    if decorated_name in [i.decorated_name for i in includes]:
      # Nothing to do.
      print(f'Skipping {filename}: no changes required!')
      return None
    else:
      includes.append(Include(decorated_name, 'include', [], None))

  MarkPrimaryInclude(includes, filename)

  lines[begin:end] = SerializeIncludes(includes)
  lines.append('')  # To avoid eating the newline at the end of the file.
  return '\n'.join(lines)


def main():
  parser = argparse.ArgumentParser(
      description='Mass add (or remove) a new header into a bunch of files.')
  parser.add_argument(
      '--header',
      help='The decorated filename of the header to insert (e.g. "a" or <a>)',
      required=True)
  parser.add_argument('--remove',
                      help='Remove the header file instead of adding it',
                      action='store_true')
  parser.add_argument('files', nargs='+')
  args = parser.parse_args()
  if ClassifyHeader(args.header) == _HEADER_TYPE_INVALID:
    print('--header argument must be a decorated filename, e.g.')
    print('  --header "<utility>"')
    print('or')
    print('  --header \'"moo.h"\'')
    return 1
  operation = 'Removing' if args.remove else 'Inserting'
  print(f'{operation} #include {args.header}...')
  for filename in args.files:
    with open(filename, 'r') as f:
      new_source = AddHeaderToSource(os.path.normpath(filename), f.read(),
                                     args.header, args.remove)
    if not new_source:
      continue
    with open(filename, 'w', newline='\n') as f:
      f.write(new_source)


if __name__ == '__main__':
  sys.exit(main())