#!/usr/bin/env python3
"""Generate the C preprocessor header dependency graph from a Clang compilation database."""
import argparse
import collections
import concurrent.futures
import functools
import json
import logging
import os
import re
import shlex
import subprocess
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Set, TextIO

LOG_LEVELS = {
    "CRITICAL": logging.CRITICAL,
    "ERROR": logging.ERROR,
    "WARNING": logging.WARNING,
    "INFO": logging.INFO,
    "DEBUG": logging.DEBUG,
}
DEFAULT_LEVEL = "INFO"

# Keys: "directory", "file", "arguments"
CompilationDatabaseEntry = Dict[str, str]
CompilationDatabase = Iterable[CompilationDatabaseEntry]
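# A typical entry, per Clang's JSON Compilation Database format (the paths here are
# illustrative):
#   {
#       "directory": "/path/to/build",
#       "command": "/usr/bin/gcc -c -o foo.o /path/to/src/foo.c",
#       "file": "/path/to/src/foo.c"
#   }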

# Keys: "linenumber" -> str, "filename" -> str, "flags" -> Tuple[int, ...]
Linemarker = Dict[str, str]
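# For example (values illustrative):
#   {"linenumber": "1", "filename": "/usr/include/stdio.h", "flags": (1, 3)}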


@dataclass
class IncludeGraphNode:
    """A node in an include dependency graph.

    Represents a file (either a compiled source file or an included header) and several
    attributes thereof.
    """

    # Absolute path
    filename: str
    # Whether it's a compiled source file or an included header file
    is_source_file: bool = True
    # Whether this is a system header
    is_system_header: bool = False
    # Useful for trimming the _massive_ system header graph down to something useful for a
    # developer; that is, for ignoring system headers included by other system headers.
    is_first_level_system_header: bool = False
    # compilation_failed: bool
    num_in_edges: int = 0

    def __hash__(self):
        """Determine node uniqueness only by its filename."""
        return hash(self.filename)

    def __eq__(self, other):
        # Keep equality consistent with __hash__; otherwise the same file could appear in
        # the graph under multiple keys whose attributes happened to differ.
        return self.filename == other.filename

    def __lt__(self, other):
        return self.filename < other.filename

    def __repr__(self):
        return self.filename


# source -> targets
IncludeGraph = Dict[IncludeGraphNode, Set[IncludeGraphNode]]
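# For example (nodes shown via their __repr__, i.e. their filenames):
#   {main.c: {foo.h, stdio.h}, foo.h: {bar.h}, stdio.h: set(), bar.h: set()}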


def parse_args():
    parser = argparse.ArgumentParser(
        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument(
        "compilation_database",
        metavar="compilation-database",
        type=str,
        help="The path to the compilation database.",
    )
    parser.add_argument("--jobs", "-j", default=None, type=int, help="Number of parallel jobs")
    parser.add_argument(
        "--full-system",
        action="store_true",
        default=False,
        help="Output the _full_ system header dependency graph, not just the first level",
    )
    parser.add_argument(
        "--output",
        "-o",
        type=argparse.FileType("w"),
        default=sys.stdout,
        help="The file to save the output to. Defaults to stdout.",
    )
    parser.add_argument(
        "--log-level",
        "-l",
        type=str,
        default=DEFAULT_LEVEL,
        choices=LOG_LEVELS.keys(),
        help=f"Set the logging output level. Defaults to {DEFAULT_LEVEL}.",
    )
    return parser.parse_args()
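

# Example invocation (paths illustrative):
#   ./includegraph.py build/compile_commands.json --jobs 4 -o deps.tgf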


def load_compilation_database(compilation_database: Path) -> CompilationDatabase:
    """Load the compilation database from the given path."""
    try:
        with compilation_database.open() as f:
            database = json.load(f)
    except (OSError, json.JSONDecodeError) as e:
        logging.critical(
            "Failed to load compilation database from %s", compilation_database, exc_info=e
        )
        sys.exit(1)
    if not isinstance(database, list):
        logging.critical(
            "Expected compilation database to be an array of objects. Got: %s", database
        )
        sys.exit(1)
    return database


def normalize_command_to_arguments(
    source_entry: CompilationDatabaseEntry,
) -> Optional[CompilationDatabaseEntry]:
    """Normalize and validate the given database entry.

    See: https://clang.llvm.org/docs/JSONCompilationDatabase.html
    """
    if "directory" not in source_entry:
        logging.error("Missing required 'directory' key in %s", source_entry)
        return None
    if "file" not in source_entry:
        logging.error("Missing required 'file' key in %s", source_entry)
        return None
    if "command" in source_entry:
        command = source_entry["command"]
        del source_entry["command"]
        arguments = shlex.split(command)
        source_entry["arguments"] = arguments
    if "arguments" not in source_entry:
        logging.error("Missing required 'arguments' key in %s", source_entry)
        return None
    return source_entry
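

# For example, an entry with "command": "gcc -c foo.c" is rewritten in place to have
# "arguments": ["gcc", "-c", "foo.c"] instead.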


def strip_output_argument(arguments: List[str]) -> List[str]:
    """Strip any "-o" flags (and their arguments) from the compiler command."""
    # Strip the "-o value", "-o=value", and fused "-ovalue" forms.
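    # e.g. ["gcc", "-c", "foo.c", "-o", "foo.o"] -> ["gcc", "-c", "foo.c"]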
    stripped_args = []
    arguments = iter(arguments)
    for arg in arguments:
        if arg.startswith("-o"):
            if arg == "-o":
                # Skip the next argument (the -o flag's value)
                next(arguments, None)
            # Skip this argument
            continue
        stripped_args.append(arg)
    return stripped_args


def invoke_compiler(source_entry: CompilationDatabaseEntry) -> subprocess.Popen:
    """Run the command specified by the compilation database entry."""
    directory = source_entry["directory"]
    arguments = source_entry["arguments"]
    logging.debug("Invoking compiler with: %s", arguments)
    # TODO: discard stderr
    return subprocess.Popen(arguments, cwd=directory, stdout=subprocess.PIPE)


# See: https://gcc.gnu.org/onlinedocs/cpp/Preprocessor-Output.html
LINEMARKER_PATTERN = re.compile(rb'^#\s+(?P<linenumber>\d+)\s+"(?P<filename>.*)"\s*(?P<flags>.*$)?')
LINEMARKER_FLAG_FILE_START = 1
LINEMARKER_FLAG_FILE_END = 2
LINEMARKER_FLAG_SYSTEM_HEADER = 3
LINEMARKER_FLAG_EXTERN_C = 4
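# For example, the preprocessor output line
#   # 1 "/usr/include/stdio.h" 1 3
# marks line 1 of stdio.h, with flag 1 (entering the included file) and flag 3 (system
# header).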


def parse_linemarker_from_match(match: re.Match) -> Linemarker:
    """Turn a linemarker regex match into a 'nice' data structure."""
    parsed = {}
    raw = match.groupdict()
    parsed["linenumber"] = raw["linenumber"].decode("utf-8")
    parsed["filename"] = raw["filename"].decode("utf-8")
    flags = raw["flags"].decode("utf-8").split()
    parsed["flags"] = tuple(int(f) for f in flags)
    return parsed


def parse_linemarkers_from_preprocessor_output(proc: subprocess.Popen) -> Iterable[Linemarker]:
    """Parse the preprocessor linemarkers from the compiler stdout output."""
    for line in proc.stdout:
        match = LINEMARKER_PATTERN.match(line)
        if match is not None:
            linemarker = parse_linemarker_from_match(match)
            yield linemarker
    proc.wait()
    if proc.returncode != 0:
        logging.error("Failed on args: %s", proc.args)
        # sys.exit(1)


def preprocess_source_file(source_entry: CompilationDatabaseEntry) -> Iterable[Linemarker]:
    """Invoke the preprocessor and parse its stdout to build the include graph.

    Assumes "-o" has been removed from the arguments and "-E" has been added. Munges through the
    compiler's stdout to find and parse the preprocessor linemarkers.
    """
    proc = invoke_compiler(source_entry)
    linemarkers = parse_linemarkers_from_preprocessor_output(proc)
    return linemarkers


def get_tu_linemarkers(source_entry: CompilationDatabaseEntry) -> Iterable[Linemarker]:
    """Get the preprocessor linemarkers from the given translation unit database entry."""
    # Normalize "command" -> "arguments"
    source_entry = normalize_command_to_arguments(source_entry)
    if source_entry is None:
        # The entry was invalid; there is nothing to preprocess.
        return []
    # Strip out -o so that we can parse the stdout output for the linemarkers
    source_entry["arguments"] = strip_output_argument(source_entry["arguments"])
    # Instrument with -E to stop after preprocessing
    source_entry["arguments"] += ["-E"]
    # Parse the compiler output
    linemarkers = preprocess_source_file(source_entry)
    return linemarkers


def get_project_linemarkers(database: CompilationDatabase) -> Iterable[Optional[Linemarker]]:
    """Get the linemarkers from the given compilation database."""
    for entry in database:
        entry_linemarkers = get_tu_linemarkers(entry)
        # Mark the start of a new translation unit with a sentinel value
        yield None
        yield from entry_linemarkers


def build_header_dependency_graph(
    linemarkers: Iterable[Optional[Linemarker]], full_system: bool
) -> IncludeGraph:
    """Build a dependency graph from a stream of preprocessor linemarkers."""
    graph = collections.defaultdict(set)
    stack: List[IncludeGraphNode] = []
    for linemarker in linemarkers:
        if linemarker is None:
            stack.clear()
            continue
        filename = linemarker["filename"]
        flags = linemarker["flags"]
        current_node = IncludeGraphNode(filename=filename)
        # Only the node that opens a new translation unit is a compiled source file.
        current_node.is_source_file = not stack
        current_node.is_system_header = LINEMARKER_FLAG_SYSTEM_HEADER in flags
        current_node.is_first_level_system_header = current_node.is_system_header
        if current_node.is_system_header and stack and stack[-1].is_system_header:
            current_node.is_first_level_system_header = False
        # The start of a new translation unit
        if not stack:
            stack.append(current_node)
        if current_node not in graph:
            graph[current_node] = set()
        # Ignore the linemarkers without flags. They either seem to be <built-in>, <command-line>,
        # or a duplicate of the start of the translation unit.
        if not flags:
            continue
        if LINEMARKER_FLAG_FILE_START in flags:
            source = stack[-1]
            target = current_node
            stack.append(current_node)
            if (
                full_system
                or not current_node.is_system_header
                or current_node.is_first_level_system_header
            ):
                logging.debug("Adding: %s -> %s", source, target)
                graph[source].add(target)
            # Ensure that every node is added to the graph as a proper source node, not
            # just as a target.
            if target not in graph:
                graph[target] = set()
        if LINEMARKER_FLAG_FILE_END in flags:
            _ = stack.pop()
    return graph


def build_graph_for_tu(
    entry: CompilationDatabaseEntry, idx: int, total: int, full_system: bool
) -> IncludeGraph:
    """Build the include graph for a single translation unit."""
    logging.info("(%d/%d) Processing dependencies for '%s'...", idx, total, entry["file"])
    linemarkers = get_tu_linemarkers(entry)
    graph = build_header_dependency_graph(linemarkers, full_system)
    logging.debug("(%d/%d) Processed dependencies for '%s'.", idx, total, entry["file"])
    return graph


def build_graphs_in_parallel(
    database: CompilationDatabase, full_system: bool, jobs: int
) -> Iterable[IncludeGraph]:
    """Build the per-TU include graphs concurrently, yielding them as they complete."""
    with concurrent.futures.ThreadPoolExecutor(max_workers=jobs) as executor:
        total = len(database)
        futures = {
            executor.submit(build_graph_for_tu, entry, idx + 1, total, full_system): entry
            for idx, entry in enumerate(database)
        }
        for future in concurrent.futures.as_completed(futures):
            entry = futures[future]
            try:
                subgraph = future.result()
            except BaseException as e:
                logging.error(
                    "Failed to generate dependency graph for '%s'", entry["file"], exc_info=e
                )
            else:
                # Yield outside the try block so that exceptions thrown into the generator
                # (e.g., GeneratorExit) aren't swallowed by the broad except above.
                yield subgraph


def merge_two_graphs(lhs: IncludeGraph, rhs: IncludeGraph) -> IncludeGraph:
    """Merge rhs into lhs, unioning the edge sets of any shared nodes."""
    # NOTE: lhs is a collections.defaultdict(set), so missing keys are created on access.
    result = lhs
    for key, value in rhs.items():
        result[key] = result[key].union(value)
    return result
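

# For example, merging {a: {b}} with {a: {c}, d: set()} yields {a: {b, c}, d: set()}.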


def merge_graphs(subgraphs: Iterable[IncludeGraph]) -> IncludeGraph:
    """Merge the per-TU subgraphs into a single project-wide include graph."""
    # The initializer keeps reduce() from raising TypeError on an empty database.
    return functools.reduce(merge_two_graphs, subgraphs, collections.defaultdict(set))


def output_dep_graph_tgf(graph: IncludeGraph, output: TextIO):
    """Output the include graph in TGF format."""
    node: IncludeGraphNode
    for node in sorted(graph.keys()):
        # The node label holds the attributes; attribute values must not contain commas,
        # since commas separate the attributes.
        attributes = (
            f"is_source_file={node.is_source_file}, "
            f"is_system_header={node.is_system_header}, "
            f"is_first_level_system_header={node.is_first_level_system_header}"
        )
        print(f'"{node.filename}"\t"{attributes}"', file=output)
    print("#", file=output)
    source: IncludeGraphNode
    targets: Set[IncludeGraphNode]
    for source, targets in graph.items():
        for target in sorted(targets):
            print(f'"{source.filename}"\t"{target.filename}"', file=output)


def main(args):
    database_path = Path(args.compilation_database)
    database = load_compilation_database(database_path)
    logging.debug("Successfully loaded compilation database from '%s'", database_path)
    jobs = args.jobs or os.cpu_count()
    subgraphs = build_graphs_in_parallel(database, args.full_system, jobs)
    graph = merge_graphs(subgraphs)
    output_dep_graph_tgf(graph, args.output)


if __name__ == "__main__":
    args = parse_args()
    logging.basicConfig(
        format="%(asctime)s - %(module)s - %(levelname)s - %(message)s",
        level=LOG_LEVELS.get(args.log_level),
        stream=sys.stderr,
    )
    main(args)