scripts/perf.py - third_party/github/littlefs-project/littlefs - Git at Google

 #!/usr/bin/env python3
 #
 # Script to aggregate and report Linux perf results.
 #
 # Example:
 # ./scripts/perf.py -R -obench.perf ./runners/bench_runner
 # ./scripts/perf.py bench.perf -j -Flfs.c -Flfs_util.c -Scycles
 #
 # Copyright (c) 2022, The littlefs authors.
 # SPDX-License-Identifier: BSD-3-Clause
 #

 import bisect
 import collections as co
 import csv
 import errno
 import fcntl
 import functools as ft
 import itertools as it
 import math as m
 import multiprocessing as mp
 import os
 import re
 import shlex
 import shutil
 import subprocess as sp
 import sys
 import tempfile
 import zipfile

 # TODO support non-zip perf results?


 # command used to invoke perf, kept as a list since it is concatenated
 # with further arguments and may itself carry extra args
 PERF_PATH = ['perf']
 # comma-separated events handed to `perf record -e`, these are the event
 # names collect_decompressed knows how to map onto result fields
 PERF_EVENTS = 'cycles,branch-misses,branches,cache-misses,cache-references'
 # default sampling frequency handed to `perf record -F`
 PERF_FREQ = 100
 # command used to invoke objdump, a list like PERF_PATH
 # NOTE(review): presumably the default for objdump_path, confirm at the
 # argument parser
 OBJDUMP_PATH = ['objdump']
 # default (t0, t1) pair used by annotate when no threshold is given, t0 is
 # compared against a line's cost divided by the max cost
 THRESHOLD = (0.5, 0.85)


 # integer fields
 #
 # A one-field namedtuple wrapping either a Python int or +-infinity
 # (m.inf), along with the formatting, diff, and ratio helpers used when
 # rendering result tables.
 #
 # Note diff_diff and ratio are also called unbound with None standing in
 # for a missing result, which is why they test `self`/`other` first.
 class Int(co.namedtuple('Int', 'x')):
     __slots__ = ()
     def __new__(cls, x=0):
         # already an Int? these are immutable so just reuse it
         if isinstance(x, Int):
             return x
         if isinstance(x, str):
             try:
                 # base 0 lets int accept 0x/0o/0b prefixes
                 x = int(x, 0)
             except ValueError:
                 # also accept +-∞ and +-inf
                 if re.match(r'^\s*\+?\s*(?:∞|inf)\s*$', x):
                     x = m.inf
                 elif re.match(r'^\s*-\s*(?:∞|inf)\s*$', x):
                     x = -m.inf
                 else:
                     raise
         assert isinstance(x, int) or m.isinf(x), x
         return super().__new__(cls, x)

     def __str__(self):
         if self.x == m.inf:
             return '∞'
         elif self.x == -m.inf:
             return '-∞'
         else:
             return str(self.x)

     def __int__(self):
         # infinities have no integer representation
         assert not m.isinf(self.x)
         return self.x

     def __float__(self):
         return float(self.x)

     # placeholder cell for a missing value
     none = '%7s' % '-'
     # render as a right-aligned 7-character table cell
     def table(self):
         return '%7s' % (self,)

     diff_none = '%7s' % '-'
     diff_table = table

     # render the signed difference new-old as a 7-character table cell,
     # either side may be None, which counts as 0
     def diff_diff(self, other):
         new = self.x if self else 0
         old = other.x if other else 0
         diff = new - old
         # ∞-∞ is nan, which %d can't format, but both sides are the same
         # infinity here so report no change
         #
         # note nan is the only value that doesn't equal itself
         if diff != diff:
             diff = 0
         if diff == +m.inf:
             return '%7s' % '+∞'
         elif diff == -m.inf:
             return '%7s' % '-∞'
         else:
             return '%+7d' % diff

     # relative change (new-old)/old as a float, with special cases for
     # infinities and for a zero/missing old value
     def ratio(self, other):
         new = self.x if self else 0
         old = other.x if other else 0
         if m.isinf(new) and m.isinf(old):
             return 0.0
         elif m.isinf(new):
             return +m.inf
         elif m.isinf(old):
             return -m.inf
         elif not old and not new:
             return 0.0
         elif not old:
             # anything appearing from nothing counts as +100%
             return 1.0
         else:
             return (new-old) / old

     def __add__(self, other):
         return self.__class__(self.x + other.x)

     def __sub__(self, other):
         return self.__class__(self.x - other.x)

     def __mul__(self, other):
         return self.__class__(self.x * other.x)

 # perf results
 #
 # One measured (file, function, line) entry, the event counters are stored
 # as Ints and children holds nested PerfResults.
 #
 # Note _fields intentionally replaces namedtuple's own _fields attribute
 # with just the numeric fields, other code in this file reads
 # Result._by/_fields/_sort/_types to decide what to group by and show.
 class PerfResult(co.namedtuple('PerfResult', [
         'file', 'function', 'line',
         'cycles', 'bmisses', 'branches', 'cmisses', 'caches',
         'children'])):
     _by = ['file', 'function', 'line']
     _fields = ['cycles', 'bmisses', 'branches', 'cmisses', 'caches']
     _sort = ['cycles', 'bmisses', 'cmisses', 'branches', 'caches']
     _types = {
         'cycles': Int,
         'bmisses': Int, 'branches': Int,
         'cmisses': Int, 'caches': Int}

     __slots__ = ()
     def __new__(cls, file='', function='', line=0,
             cycles=0, bmisses=0, branches=0, cmisses=0, caches=0,
             children=None):
         # give every result its own children list, a literal [] default
         # would be one list shared by all default-constructed results
         if children is None:
             children = []
         # line goes through Int so strings are accepted, but is stored
         # as a plain int
         return super().__new__(cls, file, function, int(Int(line)),
             Int(cycles), Int(bmisses), Int(branches), Int(cmisses), Int(caches),
             children)

     # merge two results, keeping the left side's file/function/line and
     # summing counters and concatenating children
     def __add__(self, other):
         return PerfResult(self.file, self.function, self.line,
             self.cycles + other.cycles,
             self.bmisses + other.bmisses,
             self.branches + other.branches,
             self.cmisses + other.cmisses,
             self.caches + other.caches,
             self.children + other.children)


 # open a file, treating the path '-' as stdin/stdout
 #
 # For '-', any read mode ('r', 'rb', 'rt', ...) gets a duplicate of stdin
 # and anything else gets a duplicate of stdout, the descriptor is dup'd
 # so closing the returned file leaves the real stdin/stdout open.
 def openio(path, mode='r', buffering=-1):
     # allow '-' for stdin/stdout
     if path == '-':
         # check for 'r' anywhere in the mode, comparing against exactly
         # 'r' would send binary/text reads to stdout
         if 'r' in mode:
             return os.fdopen(os.dup(sys.stdin.fileno()), mode, buffering)
         else:
             return os.fdopen(os.dup(sys.stdout.fileno()), mode, buffering)
     else:
         return open(path, mode, buffering)

 # run perf as a subprocess, storing measurements into a zip file
 #
 # command is the argv list to profile, output is the path of the zip file
 # to append to. Returns perf's exit code, or errno.EOWNERDEAD if we were
 # interrupted from the keyboard.
 def record(command, *,
         output=None,
         perf_freq=PERF_FREQ,
         perf_period=None,
         perf_events=PERF_EVENTS,
         perf_path=PERF_PATH,
         **args):
     # perf gets a temporary file to write into, its pipe-mode only works
     # with stdout, which belongs to the command being measured
     with tempfile.NamedTemporaryFile('rb') as scratch:
         # build up the perf invocation, an explicit period wins over
         # the sampling frequency
         perf_cmd = perf_path + ['record']
         if perf_freq is not None and perf_period is None:
             perf_cmd.append('-F%s' % perf_freq)
         if perf_period is not None:
             perf_cmd.append('-c%s' % perf_period)
         perf_cmd.extend([
             '-B',
             '-g',
             '--all-user',
             '-e%s' % perf_events,
             '-o%s' % scratch.name])

         # run the command under perf
         try:
             if args.get('verbose'):
                 print(' '.join(map(shlex.quote, perf_cmd + command)))
             status = sp.call(perf_cmd + command, close_fds=False)
         except KeyboardInterrupt:
             status = errno.EOWNERDEAD

         # take an exclusive lock on the output so concurrent recorders
         # don't interleave their writes, closing the file drops the lock
         fd = os.open(output, os.O_RDWR | os.O_CREAT)
         fcntl.flock(fd, fcntl.LOCK_EX)

         # append our measurements as a new member named after our pid
         with os.fdopen(fd, 'r+b') as locked, \
                 zipfile.ZipFile(locked, 'a',
                     compression=zipfile.ZIP_DEFLATED,
                     compresslevel=1) as archive, \
                 archive.open('perf.%d' % os.getpid(), 'w') as member:
             shutil.copyfileobj(scratch, member)

     # hand the exit status back to the caller
     return status


 # try to only process each dso once
 #
 # note this only caches with the non-keyword arguments
 def multiprocessing_cache(f):
     # two cache layers keyed on the positional arguments only: a plain
     # per-process dict, and a manager-backed dict guarded by a lock
     manager = mp.Manager()
     shared = manager.dict()
     guard = mp.Lock()
     private = {}

     def multiprocessing_cache(*args, **kwargs):
         # fast path, this process has already seen these arguments
         try:
             return private[args]
         except KeyError:
             pass

         # slow path, the lock is held across the call to f so the same
         # arguments are not computed twice
         with guard:
             if args in shared:
                 value = shared[args]
             else:
                 value = f(*args, **kwargs)
                 shared[args] = value
             private[args] = value
             return value

     return multiprocessing_cache

 # Collect symbol and dwarf line info for an object file by parsing
 # `objdump -t` and `objdump --dwarf=rawline`.
 #
 # Returns a tuple (syms, sym_at, lines, line_at):
 # - syms    => dict of symbol name -> set of (addr, size)
 # - sym_at  => list of (addr, name, size) sorted by addr, one per addr
 # - lines   => list of (file, line, addr) sorted, one per (file, line)
 # - line_at => list of (addr, file, line) sorted by addr, one per addr
 #
 # objdump failures are not fatal, they just result in less info.
 @multiprocessing_cache
 def collect_syms_and_lines(obj_path, *,
         objdump_path=None,
         **args):
     # fall back to the default objdump, None can't be concatenated with
     # the argument lists below
     if objdump_path is None:
         objdump_path = OBJDUMP_PATH

     symbol_pattern = re.compile(
         r'^(?P<addr>[0-9a-fA-F]+)'
             r'\s+.*'
             r'\s+(?P<size>[0-9a-fA-F]+)'
             r'\s+(?P<name>[^\s]+)\s*$')
     line_pattern = re.compile(
         r'^\s+(?:'
             # matches dir/file table
             r'(?P<no>[0-9]+)'
                 r'(?:\s+(?P<dir>[0-9]+))?'
                 r'\s+.*'
                 r'\s+(?P<path>[^\s]+)'
             # matches line opcodes
             r'|' r'\[[^\]]*\]\s+'
                 r'(?:'
                     r'(?P<op_special>Special)'
                     r'|' r'(?P<op_copy>Copy)'
                     r'|' r'(?P<op_end>End of Sequence)'
                     r'|' r'File .*?to (?:entry )?(?P<op_file>\d+)'
                     r'|' r'Line .*?to (?P<op_line>[0-9]+)'
                     r'|' r'(?:Address|PC) .*?to (?P<op_addr>[0x0-9a-fA-F]+)'
                     r'|' r'.' r')*'
             r')$', re.IGNORECASE)

     # figure out symbol addresses and file+line ranges
     syms = {}
     sym_at = []
     cmd = objdump_path + ['-t', obj_path]
     if args.get('verbose'):
         print(' '.join(shlex.quote(c) for c in cmd))
     proc = sp.Popen(cmd,
         stdout=sp.PIPE,
         stderr=sp.PIPE if not args.get('verbose') else None,
         universal_newlines=True,
         errors='replace',
         close_fds=False)
     for line in proc.stdout:
         # note m here is a regex match, not the math module
         m = symbol_pattern.match(line)
         if m:
             name = m.group('name')
             addr = int(m.group('addr'), 16)
             size = int(m.group('size'), 16)
             # ignore zero-sized symbols
             if not size:
                 continue
             # note multiple symbols can share a name
             if name not in syms:
                 syms[name] = set()
             syms[name].add((addr, size))
             sym_at.append((addr, name, size))
     proc.wait()
     if proc.returncode != 0:
         if not args.get('verbose'):
             for line in proc.stderr:
                 sys.stdout.write(line)
         # assume no debug-info on failure
         pass

     # sort and keep largest/first when duplicates
     sym_at.sort(key=lambda x: (x[0], -x[2], x[1]))
     sym_at_ = []
     for addr, name, size in sym_at:
         if len(sym_at_) == 0 or sym_at_[-1][0] != addr:
             sym_at_.append((addr, name, size))
     sym_at = sym_at_

     # state machine for dwarf line numbers, note that objdump's
     # decodedline seems to have issues with multiple dir/file
     # tables, which is why we need this
     lines = []
     line_at = []
     dirs = {}
     files = {}
     op_file = 1
     op_line = 1
     op_addr = 0
     cmd = objdump_path + ['--dwarf=rawline', obj_path]
     if args.get('verbose'):
         print(' '.join(shlex.quote(c) for c in cmd))
     proc = sp.Popen(cmd,
         stdout=sp.PIPE,
         stderr=sp.PIPE if not args.get('verbose') else None,
         universal_newlines=True,
         errors='replace',
         close_fds=False)
     for line in proc.stdout:
         m = line_pattern.match(line)
         if m:
             if m.group('no') and not m.group('dir'):
                 # found a directory entry
                 dirs[int(m.group('no'))] = m.group('path')
             elif m.group('no'):
                 # found a file entry
                 dir_no = int(m.group('dir'))
                 if dir_no in dirs:
                     files[int(m.group('no'))] = os.path.join(
                         dirs[dir_no],
                         m.group('path'))
                 else:
                     files[int(m.group('no'))] = m.group('path')
             else:
                 # found a state machine update
                 if m.group('op_file'):
                     op_file = int(m.group('op_file'), 0)
                 if m.group('op_line'):
                     op_line = int(m.group('op_line'), 0)
                 if m.group('op_addr'):
                     op_addr = int(m.group('op_addr'), 0)

                 # these opcodes emit a row with the current state
                 if (m.group('op_special')
                         or m.group('op_copy')
                         or m.group('op_end')):
                     file = os.path.abspath(files.get(op_file, '?'))
                     lines.append((file, op_line, op_addr))
                     line_at.append((op_addr, file, op_line))

                 # end of sequence resets the state machine
                 if m.group('op_end'):
                     op_file = 1
                     op_line = 1
                     op_addr = 0
     proc.wait()
     if proc.returncode != 0:
         if not args.get('verbose'):
             for line in proc.stderr:
                 sys.stdout.write(line)
         # assume no debug-info on failure
         pass

     # sort and keep first when duplicates
     #
     # note we compare against the last entry we kept (lines_), not the
     # last entry of the input
     lines.sort()
     lines_ = []
     for file, line, addr in lines:
         if (len(lines_) == 0
                 or lines_[-1][0] != file
                 or lines_[-1][1] != line):
             lines_.append((file, line, addr))
     lines = lines_

     # sort and keep first when duplicates
     line_at.sort()
     line_at_ = []
     for addr, file, line in line_at:
         if len(line_at_) == 0 or line_at_[-1][0] != addr:
             line_at_.append((addr, file, line))
     line_at = line_at_

     return syms, sym_at, lines, line_at


 # Parse one raw perf data file with `perf script` and turn its samples
 # into a list of PerfResult trees.
 #
 # - perf_path  => perf command as a list, may contain extra args
 # - sources    => if given, only keep frames whose file is one of these
 # - everything => keep internal/library frames and files outside the cwd
 # - propagate  => if non-zero, only keep this many frames per sample
 # - depth      => how many levels of nested children to record
 #
 # Remaining keyword args are forwarded to collect_syms_and_lines. Exits
 # the process with -1 if perf script fails.
 def collect_decompressed(path, *,
         perf_path=PERF_PATH,
         sources=None,
         everything=False,
         propagate=0,
         depth=1,
         **args):
     sample_pattern = re.compile(
         r'(?P<comm>\w+)'
         r'\s+(?P<pid>\w+)'
         r'\s+(?P<time>[\w.]+):'
         r'\s*(?P<period>\w+)'
         r'\s+(?P<event>[^:]+):')
     frame_pattern = re.compile(
         r'\s+(?P<addr>\w+)'
         r'\s+(?P<sym>[^\s\+]+)(?:\+(?P<off>\w+))?'
         r'\s+\((?P<dso>[^\)]+)\)')
     # perf event name -> PerfResult field
     events = {
         'cycles':           'cycles',
         'branch-misses':    'bmisses',
         'branches':         'branches',
         'cache-misses':     'cmisses',
         'cache-references': 'caches'}

     # note perf_path may contain extra args
     cmd = perf_path + [
         'script',
         '-i%s' % path]
     if args.get('verbose'):
         print(' '.join(shlex.quote(c) for c in cmd))
     proc = sp.Popen(cmd,
         stdout=sp.PIPE,
         stderr=sp.PIPE if not args.get('verbose') else None,
         universal_newlines=True,
         errors='replace',
         close_fds=False)

     # state of the sample currently being parsed
     last_filtered = False
     last_event = ''
     last_period = 0
     last_stack = []
     # per-dso error score for each candidate ASLR delta
     deltas = co.defaultdict(lambda: {})
     # per-dso runtime (addr, size) of symbols seen so far
     syms_ = co.defaultdict(lambda: {})
     # (dso, addr) -> (file, line), or None if the address is filtered out
     at_cache = {}
     # name -> (event -> period sum, nested results), name being the
     # (file, sym, line) tuple of a frame
     results = {}

     def commit():
         # tail-recursively propagate measurements
         #
         # every frame in the stack gets a top-level entry, with the frames
         # at lower stack indices nested underneath up to depth levels
         for i in range(len(last_stack)):
             results_ = results
             for j in reversed(range(i+1)):
                 if i+1-j > depth:
                     break

                 # propagate
                 name = last_stack[j]
                 if name not in results_:
                     results_[name] = (co.defaultdict(lambda: 0), {})
                 results_[name][0][last_event] += last_period

                 # recurse
                 results_ = results_[name][1]

     for line in proc.stdout:
         # we need to process a lot of data, so wait to use regex as late
         # as possible
         if not line.startswith('\t'):
             # a non-frame line ends the previous sample
             if last_filtered:
                 commit()
             last_filtered = False

             if line:
                 # note m here is a regex match, not the math module
                 m = sample_pattern.match(line)
                 if m and m.group('event') in events:
                     last_filtered = True
                     last_event = m.group('event')
                     last_period = int(m.group('period'), 0)
                     last_stack = []

         elif last_filtered:
             m = frame_pattern.match(line)
             if m:
                 # filter out internal/kernel functions
                 if not everything and (
                         m.group('sym').startswith('__')
                         or m.group('sym').startswith('0')
                         or m.group('sym').startswith('-')
                         or m.group('sym').startswith('[')
                         or m.group('dso').startswith('/usr/lib')):
                     continue

                 dso = m.group('dso')
                 sym = m.group('sym')
                 off = int(m.group('off'), 0) if m.group('off') else 0
                 addr_ = int(m.group('addr'), 16)

                 # get the syms/lines for the dso, this is cached
                 syms, sym_at, lines, line_at = collect_syms_and_lines(
                     dso,
                     **args)

                 # ASLR is tricky, we have symbols+offsets, but static symbols
                 # means we may have multiple options for each symbol.
                 #
                 # To try to solve this, we use previous seen symbols to build
                 # confidence for the correct ASLR delta. This means we may
                 # guess incorrectly for early symbols, but this will only affect
                 # a few samples.
                 if sym in syms:
                     sym_addr_ = addr_ - off

                     # track possible deltas?
                     for sym_addr, size in syms[sym]:
                         delta = sym_addr - sym_addr_
                         if delta not in deltas[dso]:
                             deltas[dso][delta] = sum(
                                 abs(a_+delta - a)
                                 for s, (a_, _) in syms_[dso].items()
                                 for a, _ in syms[s])
                     # NOTE(review): sym_addr and size here are whatever
                     # the last iteration of the loop above left behind
                     for delta in deltas[dso].keys():
                         deltas[dso][delta] += abs(sym_addr_+delta - sym_addr)
                     syms_[dso][sym] = sym_addr_, size

                     # guess the best delta
                     delta, _ = min(deltas[dso].items(),
                         key=lambda x: (x[1], x[0]))
                     addr = addr_ + delta

                     # cached?
                     if (dso,addr) in at_cache:
                         cached = at_cache[(dso,addr)]
                         if cached is None:
                             # cache says to skip
                             continue
                         file, line = cached
                     else:
                         # find file+line
                         i = bisect.bisect(line_at, addr, key=lambda x: x[0])
                         if i > 0:
                             _, file, line = line_at[i-1]
                         else:
                             file, line = re.sub(
                                 r'(\.o)?$', '.c', dso, count=1), 0

                         # ignore filtered sources
                         if sources is not None:
                             if not any(
                                     os.path.abspath(file) == os.path.abspath(s)
                                     for s in sources):
                                 at_cache[(dso,addr)] = None
                                 continue
                         else:
                             # default to only cwd
                             if not everything and not os.path.commonpath([
                                     os.getcwd(),
                                     os.path.abspath(file)]) == os.getcwd():
                                 at_cache[(dso,addr)] = None
                                 continue

                         # simplify path
                         if os.path.commonpath([
                                 os.getcwd(),
                                 os.path.abspath(file)]) == os.getcwd():
                             file = os.path.relpath(file)
                         else:
                             file = os.path.abspath(file)

                         at_cache[(dso,addr)] = file, line
                 else:
                     # unknown symbol, guess a source name from the dso
                     file, line = re.sub(r'(\.o)?$', '.c', dso, count=1), 0

                 last_stack.append((file, sym, line))

                 # stop propagating?
                 if propagate and len(last_stack) >= propagate:
                     commit()
                     last_filtered = False
     # don't forget the final sample
     if last_filtered:
         commit()

     proc.wait()
     if proc.returncode != 0:
         if not args.get('verbose'):
             for line in proc.stderr:
                 sys.stdout.write(line)
         sys.exit(-1)

     # rearrange results into result type
     def to_results(results):
         results_ = []
         for name, (r, children) in results.items():
             results_.append(PerfResult(*name,
                 **{events[k]: v for k, v in r.items()},
                 children=to_results(children)))
         return results_

     return to_results(results)

 # collect results from member i of the zip file at path, any extra
 # keyword args are forwarded to collect_decompressed
 def collect_job(path, i, **args):
     # perf needs a real file to read, so unpack the member into a
     # temporary file that lives for the duration of the parse
     with zipfile.ZipFile(path) as archive, \
             archive.open(i) as packed, \
             tempfile.NamedTemporaryFile('wb') as unpacked:
         shutil.copyfileobj(packed, unpacked)
         # make sure everything is on disk before perf opens it by name
         unpacked.flush()

         return collect_decompressed(unpacked.name, **args)

 # call a function from a single (f, args, kwargs) tuple, this lets one
 # picklable argument carry a whole call through a pool's map functions
 def starapply(args):
     func, positional, keywords = args
     return func(*positional, **keywords)

 # collect results from every record in the given .perf files
 #
 # jobs=None runs in this process, jobs=0 uses one worker per available
 # cpu, any other value is the worker count. Extra keyword args are
 # forwarded to collect_job. Returns one flat list of results.
 def collect(perf_paths, *,
         jobs=None,
         **args):
     # automatic job detection?
     if jobs == 0:
         jobs = len(os.sched_getaffinity(0))

     # each .perf file is actually a zip file containing perf files from
     # multiple runs, so enumerate every (path, member) pair up front
     records = []
     for path in perf_paths:
         with zipfile.ZipFile(path) as archive:
             for info in archive.infolist():
                 records.append((path, info))

     results = []

     # no parallelism requested? just process records in order
     if jobs is None:
         for path, info in records:
             results.extend(collect_job(path, info, **args))
         return results

     # we're dealing with a lot of data but also surprisingly
     # parallelizable, results arrive in whatever order workers finish
     with mp.Pool(jobs) as pool:
         tasks = ((collect_job, (path, info), args)
             for path, info in records)
         for partial in pool.imap_unordered(starapply, tasks):
             results.extend(partial)

     return results


 # merge results that share the same values for the fields in by
 #
 # - by      => fields to group on, defaults to Result._by
 # - defines => optional (field, allowed values) pairs, results whose
 #              field is not in the allowed values are dropped
 #
 # Children are folded recursively with the same by/defines. Prints an
 # error and exits if by/defines name a field Result doesn't have.
 def fold(Result, results, *,
         by=None,
         defines=None,
         **_):
     if by is None:
         by = Result._by

     # every requested field must be a grouping key or a measured field
     requested = list(by or [])
     requested.extend(k for k, _ in defines or [])
     for k in requested:
         if not (k in Result._by or k in Result._fields):
             print("error: could not find field %r?" % k)
             sys.exit(-1)

     # filter by matching defines
     if defines is not None:
         results = [r for r in results
             if all(getattr(r, k) in vs for k, vs in defines)]

     # group conflicting results, dicts keep first-seen order
     groups = {}
     for r in results:
         key = tuple(getattr(r, k) for k in by)
         groups.setdefault(key, []).append(r)

     folded = []
     for rs in groups.values():
         # merge conflicts left to right with the result's own +
         merged = rs[0]
         for r in rs[1:]:
             merged = merged + r

         # fold recursively
         folded.append(merged._replace(children=fold(
             Result, merged.children,
             by=by,
             defines=defines)))

     return folded

 # print results as a table, optionally diffed against diff_results
 #
 # - by      => fields used to name each row, defaults to Result._by
 # - fields  => numeric fields to show, defaults to Result._fields
 # - sort    => list of (field, reverse) pairs to sort rows by
 # - summary => only print the header and TOTAL rows
 # - all     => when diffing, also show rows that did not change
 # - percent => when diffing, show new values with percentage changes
 # - depth   => how many levels of children to print under each row
 def table(Result, results, diff_results=None, *,
         by=None,
         fields=None,
         sort=None,
         summary=False,
         all=False,
         percent=False,
         depth=1,
         **_):
     # the all parameter shadows the builtin, so stash the flag and put
     # the builtin back
     #
     # note we go through the builtins module, __builtins__ is only a
     # module in __main__, in imported modules it is a plain dict
     import builtins
     all_, all = all, builtins.all

     if by is None:
         by = Result._by
     if fields is None:
         fields = Result._fields
     types = Result._types

     # fold again
     results = fold(Result, results, by=by)
     if diff_results is not None:
         diff_results = fold(Result, diff_results, by=by)

     # organize by name
     table = {
         ','.join(str(getattr(r, k) or '') for k in by): r
         for r in results}
     diff_table = {
         ','.join(str(getattr(r, k) or '') for k in by): r
         for r in diff_results or []}
     names = list(table.keys() | diff_table.keys())

     # sort again, now with diff info, note that python's sort is stable
     names.sort()
     if diff_results is not None:
         names.sort(key=lambda n: tuple(
             types[k].ratio(
                 getattr(table.get(n), k, None),
                 getattr(diff_table.get(n), k, None))
             for k in fields),
             reverse=True)
     if sort:
         # apply sort keys last-to-first so the first key dominates, an
         # empty key means sort by Result._sort
         for k, reverse in reversed(sort):
             names.sort(
                 key=lambda n: tuple(
                     (getattr(table[n], k),)
                     if getattr(table.get(n), k, None) is not None else ()
                     for k in ([k] if k else [
                         k for k in Result._sort if k in fields])),
                 reverse=reverse ^ (not k or k in Result._fields))


     # build up our lines
     lines = []

     # header
     header = []
     header.append('%s%s' % (
         ','.join(by),
         ' (%d added, %d removed)' % (
             sum(1 for n in table if n not in diff_table),
             sum(1 for n in diff_table if n not in table))
             if diff_results is not None and not percent else '')
         if not summary else '')
     if diff_results is None:
         for k in fields:
             header.append(k)
     elif percent:
         for k in fields:
             header.append(k)
     else:
         # old, new, and diff columns
         for k in fields:
             header.append('o'+k)
         for k in fields:
             header.append('n'+k)
         for k in fields:
             header.append('d'+k)
     header.append('')
     lines.append(header)

     # build one row: the name, one cell per column, and a trailing
     # ratio annotation
     def table_entry(name, r, diff_r=None, ratios=()):
         entry = []
         entry.append(name)
         if diff_results is None:
             for k in fields:
                 entry.append(getattr(r, k).table()
                     if getattr(r, k, None) is not None
                     else types[k].none)
         elif percent:
             for k in fields:
                 entry.append(getattr(r, k).diff_table()
                     if getattr(r, k, None) is not None
                     else types[k].diff_none)
         else:
             for k in fields:
                 entry.append(getattr(diff_r, k).diff_table()
                     if getattr(diff_r, k, None) is not None
                     else types[k].diff_none)
             for k in fields:
                 entry.append(getattr(r, k).diff_table()
                     if getattr(r, k, None) is not None
                     else types[k].diff_none)
             for k in fields:
                 entry.append(types[k].diff_diff(
                         getattr(r, k, None),
                         getattr(diff_r, k, None)))
         if diff_results is None:
             entry.append('')
         elif percent:
             entry.append(' (%s)' % ', '.join(
                 '+∞%' if t == +m.inf
                 else '-∞%' if t == -m.inf
                 else '%+.1f%%' % (100*t)
                 for t in ratios))
         else:
             # only mention the ratios that actually changed
             entry.append(' (%s)' % ', '.join(
                     '+∞%' if t == +m.inf
                     else '-∞%' if t == -m.inf
                     else '%+.1f%%' % (100*t)
                     for t in ratios
                     if t)
                 if any(ratios) else '')
         return entry

     # entries
     if not summary:
         for name in names:
             r = table.get(name)
             if diff_results is None:
                 diff_r = None
                 ratios = None
             else:
                 diff_r = diff_table.get(name)
                 ratios = [
                     types[k].ratio(
                         getattr(r, k, None),
                         getattr(diff_r, k, None))
                     for k in fields]
                 # hide unchanged rows unless asked for everything
                 if not all_ and not any(ratios):
                     continue
             lines.append(table_entry(name, r, diff_r, ratios))

     # total
     r = next(iter(fold(Result, results, by=[])), None)
     if diff_results is None:
         diff_r = None
         ratios = None
     else:
         diff_r = next(iter(fold(Result, diff_results, by=[])), None)
         ratios = [
             types[k].ratio(
                 getattr(r, k, None),
                 getattr(diff_r, k, None))
             for k in fields]
     lines.append(table_entry('TOTAL', r, diff_r, ratios))

     # find the best widths, note that column 0 contains the names and column -1
     # the ratios, so those are handled a bit differently
     widths = [
         ((max(it.chain([w], (len(l[i]) for l in lines)))+1+4-1)//4)*4-1
         for w, i in zip(
             it.chain([23], it.repeat(7)),
             range(len(lines[0])-1))]

     # adjust the name width based on the expected call depth, though
     # note this doesn't really work with unbounded recursion
     if not summary and not m.isinf(depth):
         widths[0] += 4*(depth-1)

     # print the tree recursively
     print('%-*s  %s%s' % (
         widths[0], lines[0][0],
         ' '.join('%*s' % (w, x)
             for w, x in zip(widths[1:], lines[0][1:-1])),
         lines[0][-1]))

     if not summary:
         # prefixes is (branch, last branch, continuation, last
         # continuation), the first two are printed before names, the
         # last two are extended for the next level down
         def recurse(results_, depth_, prefixes=('', '', '', '')):
             # rebuild our tables at each layer
             table_ = {
                 ','.join(str(getattr(r, k) or '') for k in by): r
                 for r in results_}
             names_ = list(table_.keys())

             # sort again at each layer, keep in mind the numbers are
             # changing as we descend
             names_.sort()
             if sort:
                 for k, reverse in reversed(sort):
                     names_.sort(
                         key=lambda n: tuple(
                             (getattr(table_[n], k),)
                             if getattr(table_.get(n), k, None) is not None
                             else ()
                             for k in ([k] if k else [
                                 k for k in Result._sort if k in fields])),
                         reverse=reverse ^ (not k or k in Result._fields))

             for i, name in enumerate(names_):
                 r = table_[name]
                 is_last = (i == len(names_)-1)

                 print('%s%-*s  %s' % (
                     prefixes[0+is_last],
                     widths[0] - (
                         len(prefixes[0+is_last])
                         if not m.isinf(depth) else 0),
                     name,
                     ' '.join('%*s' % (w, x)
                         for w, x in zip(
                             widths[1:],
                             table_entry(name, r)[1:]))))

                 # recurse?
                 if depth_ > 1:
                     recurse(
                         r.children,
                         depth_-1,
                         (prefixes[2+is_last] + "|-> ",
                          prefixes[2+is_last] + "'-> ",
                          prefixes[2+is_last] + "|   ",
                          prefixes[2+is_last] + "    "))

         # we have enough going on with diffing to make the top layer
         # a special case
         for name, line in zip(names, lines[1:-1]):
             print('%-*s  %s%s' % (
                 widths[0], line[0],
                 ' '.join('%*s' % (w, x)
                     for w, x in zip(widths[1:], line[1:-1])),
                 line[-1]))

             if name in table and depth > 1:
                 recurse(
                     table[name].children,
                     depth-1,
                     ("|-> ",
                      "'-> ",
                      "|   ",
                      "    "))

     # the TOTAL row always comes last
     print('%-*s  %s%s' % (
         widths[0], lines[-1][0],
         ' '.join('%*s' % (w, x)
             for w, x in zip(widths[1:], lines[-1][1:-1])),
         lines[-1][-1]))


 def annotate(Result, results, *,
         annotate=None,
         threshold=None,
         branches=False,
         caches=False,
         **args):
     """Print source files annotated with per-line perf results.

     Every file referenced by results is opened and printed line by line,
     lines with samples get a trailing `// ...` comment. Unless annotate
     is set, only spans around lines that reach the lower threshold are
     printed, each span padded by args['context'] lines, with a
     `@@ file:line: function @@` header after any skipped lines.

     Result is the result type, it is only handed to fold().
     results is a sequence of results with file, line, function and
         sample-count attributes. It is iterated more than once.
     annotate shows whole files instead of only the hot spans.
     threshold is a sequence of one or two fractions of the largest
         sample count, defaulting to THRESHOLD. Lines at or above the
         lower fraction are shown, and if args['color'] is set lines at
         or above the higher fraction are printed in bold red, the other
         shown lines in magenta.
     branches/caches select bmisses/cmisses as the thresholded field
         instead of cycles, and change what the trailing comment shows.
     args must contain 'context', 'width' and 'color'.
     """
     # figure out the threshold
     if threshold is None:
         t0, t1 = THRESHOLD
     elif len(threshold) == 1:
         t0, t1 = threshold[0], threshold[0]
     else:
         t0, t1 = threshold
     t0, t1 = min(t0, t1), max(t0, t1)

     # figure out which field we're thresholding on
     if branches:
         tk = 'bmisses'
     elif caches:
         tk = 'cmisses'
     else:
         tk = 'cycles'

     # find max samples, the extra 1 keeps this nonzero so the divisions
     # below are always safe
     max_ = max(it.chain((float(getattr(r, tk)) for r in results), [1]))

     # files to annotate, in order of first appearance
     paths = list(co.OrderedDict.fromkeys(r.file for r in results))

     # flatten to line info, this doesn't depend on the file being
     # annotated so we only need to do it once
     results = fold(Result, results, by=['file', 'line'])

     for path in paths:
         table = {r.line: r for r in results if r.file == path}

         # calculate spans to show, hot lines that are within context
         # lines of the previous span are merged into it
         spans = []
         if not annotate:
             last = None
             func = None
             for lineno, r in sorted(table.items()):
                 if float(getattr(r, tk)) / max_ >= t0:
                     if (last is not None
                             and lineno - last.stop <= args['context']):
                         last = range(
                             last.start,
                             lineno+1+args['context'])
                     else:
                         if last is not None:
                             spans.append((last, func))
                         last = range(
                             lineno-args['context'],
                             lineno+1+args['context'])
                         func = r.function
             if last is not None:
                 spans.append((last, func))

         with open(path) as f:
             skipped = False
             for i, line in enumerate(f):
                 lineno = i+1

                 # find the span this line belongs to, skipping lines
                 # not in any span
                 span = None
                 if not annotate:
                     span = next(
                         ((s, fn) for s, fn in spans if lineno in s),
                         None)
                     if span is None:
                         skipped = True
                         continue

                 # coming back from skipped lines? print a header naming
                 # the function that opened this span, not just the first
                 # span in the file
                 if skipped:
                     skipped = False
                     print('%s@@ %s:%d: %s @@%s' % (
                         '\x1b[36m' if args['color'] else '',
                         path,
                         lineno,
                         span[1],
                         '\x1b[m' if args['color'] else ''))

                 # build line
                 if line.endswith('\n'):
                     line = line[:-1]

                 r = table.get(lineno)
                 if r is not None:
                     # only annotate lines that actually have samples
                     if branches:
                         sampled = (float(r.bmisses) > 0
                             or float(r.branches) > 0)
                         note = '%s bmisses, %s branches' % (
                             r.bmisses, r.branches)
                     elif caches:
                         sampled = (float(r.cmisses) > 0
                             or float(r.caches) > 0)
                         note = '%s cmisses, %s caches' % (
                             r.cmisses, r.caches)
                     else:
                         sampled = float(r.cycles) > 0
                         note = '%s cycles' % r.cycles

                     if sampled:
                         line = '%-*s // %s' % (args['width'], line, note)

                         if args['color']:
                             ratio = float(getattr(r, tk)) / max_
                             if ratio >= t1:
                                 line = '\x1b[1;31m%s\x1b[m' % line
                             elif ratio >= t0:
                                 line = '\x1b[35m%s\x1b[m' % line

                 print(line)


 def _read_perf_csv(path):
     """Read PerfResults back from the CSV file at path.

     Only the PerfResult._by columns and the perf_-prefixed
     PerfResult._fields columns are used, empty cells are treated as
     missing. Rows without any perf_ value are skipped, as are rows that
     PerfResult rejects with a TypeError.
     """
     results = []
     with openio(path) as f:
         reader = csv.DictReader(f, restval='')
         for r in reader:
             # no perf results in this row?
             if not any('perf_'+k in r and r['perf_'+k].strip()
                     for k in PerfResult._fields):
                 continue
             try:
                 results.append(PerfResult(
                     **{k: r[k] for k in PerfResult._by
                         if k in r and r[k].strip()},
                     **{k: r['perf_'+k] for k in PerfResult._fields
                         if 'perf_'+k in r and r['perf_'+k].strip()}))
             except TypeError:
                 # best effort, ignore rows we can't turn into results
                 pass
     return results


 def report(perf_paths, *,
         by=None,
         fields=None,
         defines=None,
         sort=None,
         branches=False,
         caches=False,
         **args):
     """Collect perf results and report them as a table, annotated
     sources, and/or a CSV file.

     perf_paths are the inputs handed to collect(), unless args['use']
         names a CSV file to load results from instead.
     by and fields choose the columns to group by and to show, defines
         is passed to fold() to filter results.
     sort is a list of (field, reverse) pairs, a falsy field sorts by
         PerfResult._sort.
     branches/caches switch the default fields from cycles to
         bmisses/branches or cmisses/caches.
     args holds the remaining options. The ones read here are 'color',
         'depth', 'use', 'output', 'diff', 'quiet', 'annotate' and
         'threshold', everything is also forwarded to collect(),
         annotate() and table().
     """
     # sys is otherwise only imported by the script entry point, import it
     # here so report also works when this file is used as a module
     import sys

     # figure out what color should be
     if args.get('color') == 'auto':
         args['color'] = sys.stdout.isatty()
     elif args.get('color') == 'always':
         args['color'] = True
     else:
         args['color'] = False

     # depth of 0 == m.inf
     if args.get('depth') == 0:
         args['depth'] = m.inf

     # find results
     if not args.get('use', None):
         results = collect(perf_paths, **args)
     else:
         results = _read_perf_csv(args['use'])

     # fold
     results = fold(PerfResult, results, by=by, defines=defines)

     # sort, note that python's sort is stable
     results.sort()
     if sort:
         for k, reverse in reversed(sort):
             results.sort(
                 key=lambda r: tuple(
                     (getattr(r, k),) if getattr(r, k) is not None else ()
                     for k in ([k] if k else PerfResult._sort)),
                 reverse=reverse ^ (not k or k in PerfResult._fields))

     # write results to CSV
     if args.get('output'):
         by_ = by if by is not None else PerfResult._by
         fields_ = fields if fields is not None else PerfResult._fields
         with openio(args['output'], 'w') as f:
             writer = csv.DictWriter(f,
                 by_ + ['perf_'+k for k in fields_])
             writer.writeheader()
             for r in results:
                 writer.writerow(
                     {k: getattr(r, k) for k in by_}
                     | {'perf_'+k: getattr(r, k) for k in fields_})

     # find previous results?
     diff_results = None
     if args.get('diff'):
         try:
             diff_results = _read_perf_csv(args['diff'])
         except FileNotFoundError:
             # no previous results is not an error, diff against nothing
             diff_results = []

         # fold
         diff_results = fold(PerfResult, diff_results, by=by, defines=defines)

     # print table
     if not args.get('quiet'):
         if args.get('annotate') or args.get('threshold'):
             # annotate sources
             annotate(PerfResult, results,
                 branches=branches,
                 caches=caches,
                 **args)
         else:
             # figure out which fields to show by default
             if fields is not None:
                 fields_ = fields
             elif branches:
                 fields_ = ['bmisses', 'branches']
             elif caches:
                 fields_ = ['cmisses', 'caches']
             else:
                 fields_ = ['cycles']

             # print table
             table(PerfResult, results,
                 diff_results,
                 by=by if by is not None else ['function'],
                 fields=fields_,
                 sort=sort,
                 **args)


 def main(**args):
     """Run record() if the 'record' option is set, otherwise report().

     All options are forwarded as keyword arguments and the chosen
     function's return value is returned.
     """
     # pick the mode first, then forward everything to it
     run = record if args.get('record') else report
     return run(**args)


 # script entry point, parses command line options and hands them to main
 if __name__ == "__main__":
     import argparse
     import sys

     # bit of a hack, but parse_intermixed_args and REMAINDER are
     # incompatible, so we need to figure out what we want before running
     # argparse
     if '-R' in sys.argv or '--record' in sys.argv:
         nargs = argparse.REMAINDER
     else:
         nargs = '*'

     # register an 'ignore' conflict handler so the record options below
     # can redeclare flags that already exist (-o/--output, --perf-path)
     argparse.ArgumentParser._handle_conflict_ignore = lambda *_: None
     argparse._ArgumentGroup._handle_conflict_ignore = lambda *_: None
     parser = argparse.ArgumentParser(
         description="Aggregate and report Linux perf results.",
         allow_abbrev=False,
         conflict_handler='ignore')
     parser.add_argument(
         'perf_paths',
         nargs=nargs,
         help="Input *.perf files.")
     parser.add_argument(
         '-v', '--verbose',
         action='store_true',
         help="Output commands that run behind the scenes.")
     parser.add_argument(
         '-q', '--quiet',
         action='store_true',
         help="Don't show anything, useful with -o.")
     parser.add_argument(
         '-o', '--output',
         help="Specify CSV file to store results.")
     parser.add_argument(
         '-u', '--use',
         help="Don't parse anything, use this CSV file.")
     parser.add_argument(
         '-d', '--diff',
         help="Specify CSV file to diff against.")
     parser.add_argument(
         '-a', '--all',
         action='store_true',
         help="Show all, not just the ones that changed.")
     parser.add_argument(
         '-p', '--percent',
         action='store_true',
         help="Only show percentage change, not a full diff.")
     parser.add_argument(
         '-b', '--by',
         action='append',
         choices=PerfResult._by,
         help="Group by this field.")
     parser.add_argument(
         '-f', '--field',
         dest='fields',
         action='append',
         choices=PerfResult._fields,
         help="Show this field.")
     parser.add_argument(
         '-D', '--define',
         dest='defines',
         action='append',
         # key=a,b,c => (key, {a, b, c})
         type=lambda x: (lambda k,v: (k, set(v.split(','))))(*x.split('=', 1)),
         help="Only include results where this field is this value.")
     # -s and -S share a single sort list of (field, reverse) pairs so the
     # order they were given in is preserved
     class AppendSort(argparse.Action):
         def __call__(self, parser, namespace, value, option):
             if namespace.sort is None:
                 namespace.sort = []
             # option is the flag as written on the command line, so make
             # sure to accept the long spelling of -S as well
             namespace.sort.append(
                 (value, option in ('-S', '--reverse-sort')))
     parser.add_argument(
         '-s', '--sort',
         nargs='?',
         action=AppendSort,
         help="Sort by this field.")
     parser.add_argument(
         '-S', '--reverse-sort',
         nargs='?',
         action=AppendSort,
         help="Sort by this field, but backwards.")
     parser.add_argument(
         '-Y', '--summary',
         action='store_true',
         help="Only show the total.")
     parser.add_argument(
         '-F', '--source',
         dest='sources',
         action='append',
         help="Only consider definitions in this file. Defaults to anything "
             "in the current directory.")
     parser.add_argument(
         '--everything',
         action='store_true',
         help="Include builtin and libc specific symbols.")
     parser.add_argument(
         '--branches',
         action='store_true',
         help="Show branches and branch misses.")
     parser.add_argument(
         '--caches',
         action='store_true',
         help="Show cache accesses and cache misses.")
     parser.add_argument(
         '-P', '--propagate',
         type=lambda x: int(x, 0),
         help="Depth to propagate samples up the call-stack. 0 propagates up "
             "to the entry point, 1 does no propagation. Defaults to 0.")
     parser.add_argument(
         '-Z', '--depth',
         nargs='?',
         type=lambda x: int(x, 0),
         const=0,
         help="Depth of function calls to show. 0 shows all calls but may not "
             "terminate!")
     parser.add_argument(
         '-A', '--annotate',
         action='store_true',
         help="Show source files annotated with perf info.")
     parser.add_argument(
         '-T', '--threshold',
         nargs='?',
         type=lambda x: tuple(float(x) for x in x.split(',')),
         const=THRESHOLD,
         help="Show lines with samples above this threshold as a percent of "
             "all lines. Defaults to %s." % ','.join(str(t) for t in THRESHOLD))
     parser.add_argument(
         '-c', '--context',
         type=lambda x: int(x, 0),
         default=3,
         help="Show n additional lines of context. Defaults to 3.")
     parser.add_argument(
         '-W', '--width',
         type=lambda x: int(x, 0),
         default=80,
         help="Assume source is styled with this many columns. Defaults to 80.")
     parser.add_argument(
         '--color',
         choices=['never', 'always', 'auto'],
         default='auto',
         help="When to use terminal colors. Defaults to 'auto'.")
     parser.add_argument(
         '-j', '--jobs',
         nargs='?',
         type=lambda x: int(x, 0),
         const=0,
         help="Number of processes to use. 0 spawns one process per core.")
     parser.add_argument(
         '--perf-path',
         type=lambda x: x.split(),
         help="Path to the perf executable, may include flags. "
             "Defaults to %r." % PERF_PATH)
     parser.add_argument(
         '--objdump-path',
         type=lambda x: x.split(),
         default=OBJDUMP_PATH,
         help="Path to the objdump executable, may include flags. "
             "Defaults to %r." % OBJDUMP_PATH)

     # record flags
     record_parser = parser.add_argument_group('record options')
     record_parser.add_argument(
         'command',
         nargs=nargs,
         help="Command to run.")
     record_parser.add_argument(
         '-R', '--record',
         action='store_true',
         help="Run a command and aggregate perf measurements.")
     record_parser.add_argument(
         '-o', '--output',
         help="Output file. Uses flock to synchronize. This is stored as a "
             "zip-file of multiple perf results.")
     record_parser.add_argument(
         '--perf-freq',
         help="perf sampling frequency. This is passed directly to perf. "
             "Defaults to %r." % PERF_FREQ)
     record_parser.add_argument(
         '--perf-period',
         help="perf sampling period. This is passed directly to perf.")
     record_parser.add_argument(
         '--perf-events',
         help="perf events to record. This is passed directly to perf. "
             "Defaults to %r." % PERF_EVENTS)
     record_parser.add_argument(
         '--perf-path',
         type=lambda x: x.split(),
         help="Path to the perf executable, may include flags. "
             "Defaults to %r." % PERF_PATH)

     # avoid intermixed/REMAINDER conflict, see above
     if nargs == argparse.REMAINDER:
         args = parser.parse_args()
     else:
         args = parser.parse_intermixed_args()

     # perf_paths/command overlap, so need to do some munging here
     args.command = args.perf_paths
     if args.record:
         if not args.command:
             print('error: no command specified?')
             sys.exit(-1)
         if not args.output:
             print('error: no output file specified?')
             sys.exit(-1)

     # only pass on options that were actually set so the defaults in
     # record/report apply
     sys.exit(main(**{k: v
         for k, v in vars(args).items()
         if v is not None}))