1
    2
    3
    4
    5
    6
    7
    8
    9
   10
   11
   12
   13
   14
   15
   16
   17
   18
   19
   20
   21
   22
   23
   24
   25
   26
   27
   28
   29
   30
   31
   32
   33
   34
   35
   36
   37
   38
   39
   40
   41
   42
   43
   44
   45
   46
   47
   48
   49
   50
   51
   52
   53
   54
   55
   56
   57
   58
   59
   60
   61
   62
   63
   64
   65
   66
   67
   68
   69
   70
   71
   72
   73
   74
   75
   76
   77
   78
   79
   80
   81
   82
   83
   84
   85
   86
   87
   88
   89
   90
   91
   92
   93
   94
   95
   96
   97
   98
   99
  100
  101
  102
  103
  104
  105
  106
  107
  108
  109
  110
  111
  112
  113
  114
  115
  116
  117
  118
  119
  120
  121
  122
  123
  124
  125
  126
  127
  128
  129
  130
  131
  132
  133
  134
  135
  136
  137
  138
  139
  140
  141
  142
  143
  144
  145
  146
  147
  148
  149
  150
  151
  152
  153
  154
  155
  156
  157
  158
  159
  160
  161
  162
  163
  164
  165
  166
  167
  168
  169
  170
  171
  172
  173
  174
  175
  176
  177
  178
  179
  180
  181
  182
  183
  184
  185
  186
  187
  188
  189
  190
  191
  192
  193
  194
  195
  196
  197
  198
  199
  200
  201
  202
  203
  204
  205
  206
  207
  208
  209
  210
  211
  212
  213
  214
  215
  216
  217
  218
  219
  220
  221
  222
  223
  224
  225
  226
  227
  228
  229
  230
  231
  232
  233
  234
  235
  236
  237
  238
  239
  240
  241
  242
  243
  244
  245
  246
  247
  248
  249
  250
  251
  252
  253
  254
  255
  256
  257
  258
  259
  260
  261
  262
  263
  264
  265
  266
  267
  268
  269
  270
  271
  272
  273
  274
  275
  276
  277
  278
  279
  280
  281
  282
  283
  284
  285
  286

build / toolchain / win / ml.py [blame]

#!/usr/bin/env python3
# Copyright 2018 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Wraps ml.exe or ml64.exe and postprocesses the output to be deterministic.
Sets timestamp in .obj file to 0, hence incompatible with link.exe /incremental.

Use by prefixing the ml(64).exe invocation with this script:
    python ml.py ml.exe [args...]"""

import array
import collections
import struct
import subprocess
import sys


class Struct(object):
  """Pairs a little-endian struct format with a namedtuple record type.

  Unpacking yields an instance of the namedtuple, so fields can be read by
  name; packing accepts any sequence with one value per field.
  """

  def __init__(self, name, *args):
    """Builds the format string and the record type.

    Args:
      name: Name given to the generated namedtuple type.
      *args: Alternating struct format codes and field names, i.e.
        (code0, field0, code1, field1, ...).
    """
    format_codes = args[0::2]
    field_names = args[1::2]
    # '<' selects little-endian byte order with standard sizes and no padding.
    self.fmt = '<' + ''.join(format_codes)
    self.type = collections.namedtuple(name, field_names)

  def pack_into(self, buffer, offset, data):
    """Writes the fields of `data` into writable `buffer` at `offset`."""
    return struct.pack_into(self.fmt, buffer, offset, *data)

  def unpack_from(self, buffer, offset=0):
    """Reads one record from `buffer` at `offset` and returns it as a tuple."""
    values = struct.unpack_from(self.fmt, buffer, offset)
    return self.type._make(values)

  def size(self):
    """Returns the number of bytes one packed record occupies."""
    return struct.calcsize(self.fmt)


def Subtract(nt, **kwargs):
  """Returns a copy of namedtuple `nt` with the named fields decreased.

  Each keyword names a field of `nt` and gives the amount to subtract from it,
  e.g. Subtract(nt, f=2) yields a new tuple whose `f` is nt.f - 2. Fields that
  are not mentioned are carried over unchanged.
  """
  updated = {}
  for field, amount in kwargs.items():
    updated[field] = getattr(nt, field) - amount
  return nt._replace(**updated)


def MakeDeterministic(objdata):
  """Returns `objdata` with its timestamp zeroed and .debug$S removed.

  Args:
    objdata: Raw contents of a COFF .obj file, as read in binary mode.

  Returns:
    The rewritten object file contents as a byte string.

  Raises:
    AssertionError: If the input does not match one of the layout assumptions
      described below (these checks disappear when running with -O).
  """
  # Takes data produced by ml(64).exe (without any special flags) and
  # 1. Sets the timestamp to 0
  # 2. Strips the .debug$S section (which contains an unwanted absolute path)

  # This makes several assumptions about ml's output:
  # - Section data is in the same order as the corresponding section headers:
  #   section headers preceding the .debug$S section header have their data
  #   preceding the .debug$S section data; likewise for section headers
  #   following the .debug$S section.
  # - The .debug$S section contains only the absolute path to the obj file and
  #   nothing else, in particular there's only a single entry in the symbol
  #   table referring to the .debug$S section.
  # - There are no COFF line number entries.
  # - There's no IMAGE_SYM_CLASS_CLR_TOKEN symbol.
  # These seem to hold in practice; if they stop holding this script needs to
  # become smarter.

  objdata = array.array('b', objdata)  # Writable, e.g. via struct.pack_into.

  # Read coff header.
  COFFHEADER = Struct('COFFHEADER', 'H', 'Machine', 'H', 'NumberOfSections',
                      'I', 'TimeDateStamp', 'I', 'PointerToSymbolTable', 'I',
                      'NumberOfSymbols', 'H', 'SizeOfOptionalHeader', 'H',
                      'Characteristics')
  coff_header = COFFHEADER.unpack_from(objdata)
  assert coff_header.SizeOfOptionalHeader == 0  # Only set for binaries.

  # Read section headers following coff header.
  SECTIONHEADER = Struct('SECTIONHEADER', '8s', 'Name', 'I', 'VirtualSize', 'I',
                         'VirtualAddress', 'I', 'SizeOfRawData', 'I',
                         'PointerToRawData', 'I', 'PointerToRelocations', 'I',
                         'PointerToLineNumbers', 'H', 'NumberOfRelocations',
                         'H', 'NumberOfLineNumbers', 'I', 'Characteristics')
  section_headers = []
  debug_section_index = -1  # 0-based index of .debug$S; -1 until found.
  for i in range(0, coff_header.NumberOfSections):
    section_header = SECTIONHEADER.unpack_from(objdata,
                                               offset=COFFHEADER.size() +
                                               i * SECTIONHEADER.size())
    assert not section_header[0].startswith(b'/')  # Support short names only.
    section_headers.append(section_header)

    if section_header.Name == b'.debug$S':
      assert debug_section_index == -1
      debug_section_index = i
  assert debug_section_index != -1

  # Offset of the first byte after the section header table.
  data_start = COFFHEADER.size() + len(section_headers) * SECTIONHEADER.size()

  # Verify the .debug$S section looks like we expect.
  assert section_headers[debug_section_index].Name == b'.debug$S'
  assert section_headers[debug_section_index].VirtualSize == 0
  assert section_headers[debug_section_index].VirtualAddress == 0
  debug_size = section_headers[debug_section_index].SizeOfRawData
  debug_offset = section_headers[debug_section_index].PointerToRawData
  assert section_headers[debug_section_index].PointerToRelocations == 0
  assert section_headers[debug_section_index].PointerToLineNumbers == 0
  assert section_headers[debug_section_index].NumberOfRelocations == 0
  assert section_headers[debug_section_index].NumberOfLineNumbers == 0

  # Make sure sections in front of .debug$S have their data preceding it.
  for header in section_headers[:debug_section_index]:
    assert header.PointerToRawData < debug_offset
    assert header.PointerToRelocations < debug_offset
    assert header.PointerToLineNumbers < debug_offset

  # Make sure sections after .debug$S have their data following it.
  for header in section_headers[debug_section_index + 1:]:
    # Their raw data must come after the .debug$S data, and they must have no
    # relocation or line number records:
    assert header.PointerToRawData > debug_offset
    assert header.PointerToRelocations == 0
    assert header.PointerToLineNumbers == 0

  # Make sure the first non-empty section's data starts right after the section
  # headers.
  for section_header in section_headers:
    if section_header.PointerToRawData == 0:
      assert section_header.PointerToRelocations == 0
      assert section_header.PointerToLineNumbers == 0
      continue
    assert section_header.PointerToRawData == data_start
    break

  # Make sure the symbol table (and hence, string table) appear after the last
  # section:
  assert (
      coff_header.PointerToSymbolTable >=
      section_headers[-1].PointerToRawData + section_headers[-1].SizeOfRawData)

  # The symbol table contains a symbol for the no-longer-present .debug$S
  # section. If we leave it there, lld-link will complain:
  #
  #    lld-link: error: .debug$S should not refer to non-existent section 5
  #
  # so we need to remove that symbol table entry as well. This shifts symbol
  # entries around and we need to update symbol table indices in:
  # - relocations
  # - line number records (never present)
  # - one aux symbol entry (IMAGE_SYM_CLASS_CLR_TOKEN; not present in ml output)
  SYM = Struct(
      'SYM',
      '8s',
      'Name',
      'I',
      'Value',
      'h',
      'SectionNumber',  # Note: Signed!
      'H',
      'Type',
      'B',
      'StorageClass',
      'B',
      'NumberOfAuxSymbols')
  i = 0
  debug_sym = -1  # Symbol table index of the .debug$S symbol; -1 until found.
  while i < coff_header.NumberOfSymbols:
    sym_offset = coff_header.PointerToSymbolTable + i * SYM.size()
    sym = SYM.unpack_from(objdata, sym_offset)

    # 107 is IMAGE_SYM_CLASS_CLR_TOKEN, which has aux entry "CLR Token
    # Definition", which contains a symbol index. Check it's never present.
    assert sym.StorageClass != 107

    # Note: sym.SectionNumber is 1-based, debug_section_index is 0-based.
    if sym.SectionNumber - 1 == debug_section_index:
      assert debug_sym == -1, 'more than one .debug$S symbol found'
      debug_sym = i
      # Make sure the .debug$S symbol looks like we expect.
      # In particular, it should have exactly one aux symbol.
      assert sym.Name == b'.debug$S'
      assert sym.Value == 0
      assert sym.Type == 0
      assert sym.StorageClass == 3
      assert sym.NumberOfAuxSymbols == 1
    elif sym.SectionNumber > debug_section_index:
      # Symbol lives in a section after .debug$S; that section moves up by one.
      sym = Subtract(sym, SectionNumber=1)
      SYM.pack_into(objdata, sym_offset, sym)
    # Aux records occupy symbol table slots too, so step over them.
    i += 1 + sym.NumberOfAuxSymbols
  assert debug_sym != -1, '.debug$S symbol not found'

  # Note: Usually the .debug$S section is the last, but for files saying
  # `includelib foo.lib`, like safe_terminate_process.asm in 32-bit builds,
  # this isn't true: .drectve is after .debug$S.

  # Update symbol table indices in relocations.
  # There are a few processor types that have one or two relocation types
  # where SymbolTableIndex has a different meaning, but not for x86.
  # Only sections before .debug$S are visited; the checks above guarantee that
  # later sections have PointerToRelocations == 0.
  REL = Struct('REL', 'I', 'VirtualAddress', 'I', 'SymbolTableIndex', 'H',
               'Type')
  for header in section_headers[0:debug_section_index]:
    for j in range(0, header.NumberOfRelocations):
      rel_offset = header.PointerToRelocations + j * REL.size()
      rel = REL.unpack_from(objdata, rel_offset)
      assert rel.SymbolTableIndex != debug_sym
      if rel.SymbolTableIndex > debug_sym:
        # Two slots disappear: the .debug$S symbol and its single aux record.
        rel = Subtract(rel, SymbolTableIndex=2)
        REL.pack_into(objdata, rel_offset, rel)

  # Update symbol table indices in line numbers -- just check they don't exist.
  for header in section_headers:
    assert header.NumberOfLineNumbers == 0

  # Now that all indices are updated, remove the symbol table entry referring to
  # .debug$S and its aux entry.
  # This is done first because the symbol table sits after all section data,
  # so the offsets used by the deletions below are still valid afterwards.
  del objdata[coff_header.PointerToSymbolTable +
              debug_sym * SYM.size():coff_header.PointerToSymbolTable +
              (debug_sym + 2) * SYM.size()]

  # Now we know that it's safe to write out the input data, with just the
  # timestamp overwritten to 0, the .debug$S section header cut out (and the
  # offsets in all other section headers decremented by the size of that
  # one section header, plus the size of the .debug$S data for sections that
  # follow it), and the .debug$S section's data cut out. The symbol
  # table offset needs to be reduced by one section header and the size of
  # the missing section.
  # (The COFF spec only requires on-disk sections to be aligned in image files,
  # for obj files it's not required. If that wasn't the case, deleting slices
  # of data would not generally be safe.)

  # Update section offsets and remove .debug$S section data.
  for i in range(0, debug_section_index):
    header = section_headers[i]
    if header.SizeOfRawData:
      header = Subtract(header, PointerToRawData=SECTIONHEADER.size())
    if header.NumberOfRelocations:
      header = Subtract(header, PointerToRelocations=SECTIONHEADER.size())
    if header.NumberOfLineNumbers:
      header = Subtract(header, PointerToLineNumbers=SECTIONHEADER.size())
    SECTIONHEADER.pack_into(objdata,
                            COFFHEADER.size() + i * SECTIONHEADER.size(),
                            header)
  for i in range(debug_section_index + 1, len(section_headers)):
    header = section_headers[i]
    # These sections follow both the removed header and the removed data.
    shift = SECTIONHEADER.size() + debug_size
    if header.SizeOfRawData:
      header = Subtract(header, PointerToRawData=shift)
    if header.NumberOfRelocations:
      header = Subtract(header, PointerToRelocations=shift)
    if header.NumberOfLineNumbers:
      header = Subtract(header, PointerToLineNumbers=shift)
    SECTIONHEADER.pack_into(objdata,
                            COFFHEADER.size() + i * SECTIONHEADER.size(),
                            header)

  del objdata[debug_offset:debug_offset + debug_size]

  # Finally, remove .debug$S section header and update coff header.
  coff_header = coff_header._replace(TimeDateStamp=0)
  coff_header = Subtract(coff_header,
                         NumberOfSections=1,
                         PointerToSymbolTable=SECTIONHEADER.size() + debug_size,
                         NumberOfSymbols=2)
  COFFHEADER.pack_into(objdata, 0, coff_header)

  del objdata[COFFHEADER.size() +
              debug_section_index * SECTIONHEADER.size():COFFHEADER.size() +
              (debug_section_index + 1) * SECTIONHEADER.size()]

  # All done!
  # array.array spells its "serialize to a byte string" method differently in
  # Python 2 and Python 3.
  if sys.version_info.major == 2:
    return objdata.tostring()
  else:
    return objdata.tobytes()


def main():
  """Runs the wrapped assembler command, then rewrites its .obj output.

  The command line after the script name is executed as-is. If it fails, its
  exit code is returned and nothing else happens. Otherwise the object file
  named by the `/Fo` argument is read, passed through MakeDeterministic() and
  written back in place.
  """
  command = sys.argv[1:]
  exit_code = subprocess.call(command)
  if exit_code != 0:
    return exit_code

  # The output path is glued onto the /Fo switch; if the switch is given more
  # than once, the last occurrence is the one used.
  prefix = '/Fo'
  objfile = None
  for arg in command:
    if arg.startswith(prefix):
      objfile = arg[len(prefix):]
  assert objfile, 'failed to find ml output'

  with open(objfile, 'rb') as obj_in:
    original = obj_in.read()
  rewritten = MakeDeterministic(original)
  with open(objfile, 'wb') as obj_out:
    obj_out.write(rewritten)

# Run only when executed as a script; main()'s return value (the assembler's
# non-zero exit code, or None on success) becomes the process exit status.
if __name__ == '__main__':
  sys.exit(main())