1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
media / ffmpeg / scripts / find_patches.py [blame]
#!/usr/bin/env python3
#
# Copyright 2018 The Chromium Authors.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
#
"""Usage: find_patches.py [origin_branch] [> patch_file]
This will find all changes in |origin_branch| that are not part of upstream,
and print a report. It tries to include deleted lines, though these are
heuristic at best. If |origin_branch| is omitted, it will default to HEAD.
Changes in the working directory are ignored.
Output will be written to stdout, so you probably want to redirect it.
For example, to generate the patches file for origin/merge-m68:
find_patches.py origin/merge-m68 > patches.68
"""
import collections
import os
import re
import sys
import subprocess
from robo_lib import shell
# Directory scanned for patches, relative to the current working directory.
# The report later prints file paths relative to os.getcwd() as well.
# TODO(liberato): Should we find the root of the ffmpeg tree?
PATH = "."
def log(msg):
    """Print a status message, wrapped in brackets, to stderr.

    stderr is used so that progress output does not mix with the patches
    report, which is written to stdout (usually redirected to a file).
    """
    print("[" + str(msg) + "]", file=sys.stderr)
class PatchInfo:
    """Bookkeeping for a single patch (one file's diff) while scanning.

    Tracks whether a diff inserted any lines and which lines it deleted,
    so that we can decide whether "deleted lines only" processing should be
    applied later to find the commit that deleted them. Deleted lines are
    located by an approximate text search, so we only want to keep ones
    that are unlikely to be mere changes ("delete old, add new").
    """

    def __init__(self):
        # True once any inserted ("+") line has been recorded.
        self._has_insertions = False
        # Every deleted ("-") line recorded, without the leading "-".
        self._removed_lines = set()
        # True when this patch is origin deleting a whole upstream file.
        self._whole_file_deleted = False

    def record_inserted_line(self, line):
        """Note that |line| (text after the "+") was inserted by the patch."""
        self._has_insertions = True

    def record_deleted_line(self, line):
        """Note that |line| (text after the "-") was deleted by the patch."""
        self._removed_lines.add(line)

    def interesting_deleted_lines(self):
        """Return the (possibly empty) set of deleted lines worth tracking.

        A patch that both deletes and inserts is probably a change, so its
        deleted lines are treated as noise and dropped. This is approximate:
        one diff block can combine deletions from patch A with insertions
        from patch B, in which case A is missed — but in practice requiring
        "deletes only" removes noise from the later text search.

        Whole-file deletions in origin also yield no interesting lines;
        the entire file is handled specially elsewhere instead of running
        'git blame' on individual deleted lines.
        """
        if self._has_insertions or self._whole_file_deleted:
            return set()
        return self._removed_lines

    def set_is_delete_of_file_in_origin(self):
        """Record that this patch deletes an entire upstream file."""
        self._whole_file_deleted = True

    def is_delete_of_file_in_origin(self):
        """Return True iff this patch is a whole-file deletion in origin."""
        return self._whole_file_deleted
def main(argv):
    """Entry point: write the patches report for argv[1] to stdout.

    argv[1], when present, names the origin branch to analyze (for example
    "origin/merge-m68"), independent of the working-tree state; otherwise
    HEAD is used.

    Raises:
        Exception: if fetching from the upstream remote fails.
    """
    # Origin branch that contains the patches we want to find.
    origin_branch = argv[1] if len(argv) > 1 else "HEAD"
    # Refresh upstream first; otherwise many upstream commits may not be
    # reachable from it and the merge-base computation would be wrong.
    if subprocess.call(["git", "fetch", "upstream"]) != 0:
        raise Exception("Could not fetch from upstream")
    write_patches_file(origin_branch, sys.stdout)
def write_patches_file(origin_branch, output_file):
    """Write the patches file for |origin_branch| to |output_file|.

    Computes the merge-base between |origin_branch| and upstream/master,
    attributes every chromium-local change in that range to the sha1 that
    introduced it (via blame for additions, and heuristic text searches for
    deletions), and prints a human-readable report sorted by commit date.

    Args:
        origin_branch: git ref whose local patches are reported on.
        output_file: writable text file object (e.g. sys.stdout).

    Raises:
        Exception: if the upstream merge-base cannot be determined.
    """
    # Get the latest upstream commit that's reachable from the origin branch.
    # We'll use that to compare against.
    upstream = shell.output_or_error(
        ["git", "merge-base", "upstream/master", origin_branch])  # nocheck
    if not upstream:
        raise Exception("Could not find upstream commit")

    # "Everything reachable from |origin_branch| but not |upstream|". In other
    # words, all and only chromium changes. Note that there are non-chromium
    # authors here, since it will include cherry-picks to origin.
    revision_range = "%s..%s" % (upstream, origin_branch)
    log("Origin is %s" % origin_branch)
    log("Upstream is %s" % upstream)

    # Find diffs between the versions, excluding all files that are only on
    # origin ("--diff-filter=a" drops added files). We explicitly exclude
    # .gitignore, since it exists in both places. Ask for no context, since
    # we ignore it anyway.
    diff = shell.output_or_error([
        "git", "diff", "--diff-filter=a", "-U0", revision_range, PATH,
        ":!.gitignore"
    ])

    # Set of chromium patch sha1s we've seen.
    sha1s = set()
    # Map of sha1 to the set of files that it affects.
    sha1_to_files = collections.defaultdict(set)
    # Mapping of filename to the set of lines that were deleted.
    files_to_deleted_lines = {}
    # Files that were deleted in origin but exist in upstream.
    files_deleted_in_origin = set()
    patch_info = PatchInfo()
    filename = None
    last_minus_file = None

    # Process each diff line. Include a dummy "+++" to flush out the last diff.
    log("Scanning diffs between origin and upstream")
    for line in diff.splitlines() + [
            "+++ just to handle deleted lines properly"
    ]:
        if line.startswith("+++"):
            # If the previous patch was delete-only, then we need to search
            # for it differently, since we don't get blame entries for deleted
            # lines; remember its deleted lines under the old filename.
            # Whole-file deletions have no interesting deleted lines and are
            # handled even more differently, below.
            deleted_lines = patch_info.interesting_deleted_lines()
            if deleted_lines:
                files_to_deleted_lines[filename] = deleted_lines
            # Start of a new diff. We don't know yet what it inserts/deletes.
            patch_info = PatchInfo()
            # "+++ /dev/null" means chromium deleted a file that upstream
            # still has. Note that it lacks the "a/" / "b/" prefix we would
            # otherwise strip, so line[6:] would grab "ev/null"; handle it
            # specially and keep the name captured from the "---" line.
            if "/dev/null" in line:
                files_deleted_in_origin.add(last_minus_file)
                patch_info.set_is_delete_of_file_in_origin()
                log("File was deleted in origin: %s" % last_minus_file)
            else:
                filename = line[6:]
                log("Checking diffs in %s" % filename)
        elif line.startswith(
                "@@") and not patch_info.is_delete_of_file_in_origin():
            # @@ -linespec +linespec @@
            # linespec is either "line_number,number_of_lines" or just
            # "line_number" (meaning one line). Extract the "+linespec",
            # which is what was added by |origin|. Skipped entirely for
            # whole-file deletes, which are looked up later instead.
            added_linespec = re.sub(r"^.*\+(.*) @@.*", r"\1", line)
            # Figure out the lines to blame: "starting_line,+number".
            if "," in added_linespec:
                added_parts = added_linespec.split(",")
                if added_parts[1] == "0":
                    # Zero added lines: this hunk only deletes.
                    continue
                blame_range = "%s,+%s" % (added_parts[0], added_parts[1])
            else:
                # One-line change.
                blame_range = "%s,+1" % added_linespec
            blame = shell.output_or_error([
                "git", "blame", "-l",
                "-L %s" % blame_range, revision_range, "--", filename
            ])
            # Collect sha1 lines, and map each sha1 to the files it changes.
            for blame_line in blame.splitlines():
                sha1 = blame_line.split(" ", 1)[0]
                if sha1:
                    sha1s.add(sha1)
                    sha1_to_files[sha1].add(filename)
        elif line.startswith("---"):
            # Record the filename in case origin deleted this file; the
            # matching "+++" would then be "/dev/null" and carry no name.
            # Matching "---" here also keeps the "-" case below from firing.
            last_minus_file = line[6:]
        elif line.startswith("-"):
            # This diff deletes lines.
            patch_info.record_deleted_line(line[1:])
        elif line.startswith("+"):
            # This diff inserts lines.
            patch_info.record_inserted_line(line[1:])

    # For all files that have deleted lines, look for the sha1 that deleted
    # them. Heuristic only: we search for "commits that contain some text".
    for filename, deleted_lines in files_to_deleted_lines.items():
        for deleted_line in deleted_lines:
            # git log freaks out searching for a line starting with '#';
            # strip leading '#' BEFORE the length check so we never hand git
            # an empty or near-empty -S pattern (which would match almost
            # every commit in the range).
            while deleted_line.startswith('#'):
                deleted_line = deleted_line[1:]
            # Make sure the remaining text is long enough to give context.
            if len(deleted_line) < 4:
                continue
            log("Checking for deleted lines in %s" % filename)
            # NOTE(review): an earlier comment claimed "--first-parent" was
            # specified here, but the command does not pass it; limiting to
            # |revision_range| plus |origin_branch| is what narrows the
            # search to origin-side commits. Confirm before adding the flag.
            sha1 = shell.output_or_error([
                "git", "log", "-1", revision_range, "--format=%H", "-S",
                deleted_line, origin_branch, "--", filename
            ])
            # Add the sha1 to the sets.
            sha1s.add(sha1)
            sha1_to_files[sha1].add(filename)

    # Find which commit deleted each whole file in origin.
    for filename in files_deleted_in_origin:
        log("Finding commit that deleted %s" % filename)
        sha1 = shell.output_or_error(
            ["git", "log", "-1", origin_branch, "--format=%H", "--", filename])
        sha1s.add(sha1)
        sha1_to_files[sha1].add(filename)

    # Look up dates from sha1 hashes. We want to output them in a canonical
    # order so that we can diff easier; date order is more convenient than
    # sha1 order.
    log("Looking up sha1 dates to sort them")
    sha1_to_date = {}
    for sha1 in sha1s:
        date = shell.output_or_error(
            ["git", "log", "-1", "--format=%at",
             "%s" % sha1])
        sha1_to_date[sha1] = date

    # Print the patches file.
    log("Writing patch file")
    print(
        "---------------------------------------------------------------------",
        file=output_file)
    print("-- Chromium Patches. Autogenerated by " +
          os.path.basename(__file__) + ", do not edit --",
          file=output_file)
    print(
        "---------------------------------------------------------------------",
        file=output_file)
    print("\n", file=output_file)
    wd = os.getcwd()
    for sha1, date in sorted(sha1_to_date.items(), key=lambda kv: kv[1]):
        print(
            "------------------------------------------------------------------",
            file=output_file)
        loglines = shell.output_or_error(["git", "log", "-1", "%s" % sha1])
        for logline in loglines.splitlines():
            print(logline.rstrip(), file=output_file)
        print("\nAffects:", file=output_file)
        # TODO(liberato): maybe add the lines that were affected.
        for affected_file in sorted(sha1_to_files[sha1]):
            print(" " + os.path.relpath(affected_file, wd), file=output_file)
        print(file=output_file)

    log("Done")
# Script entry point: only run when executed directly, not when imported.
if __name__ == "__main__":
    main(sys.argv)