#!/usr/bin/python3
"""Combine vect data obtained from benchmarking into a single CSV.

Usage: <script> <results_dir> <out_csv_file>

The results dir has roughly the following structure:
    results/
        node/
            save..temps/
                compressed statistics files

To find compressed stats files, we glob recursively for
*/save.*/*.tar.xz and, after extracting, add the benchmark name to each
entry.  For example, if the stats file for 400.perlbench contained:
    175 vect "Vectorized loops" "f1" 2
    175 vect "Vectorized loops" "f2" 3
then in results.csv the output would be:
    400.perlbench, [.] f1, 2
    400.perlbench, [.] f2, 3
"[.] " is prepended to each symbol name to make the output compatible
with perf.
"""

import csv
import glob
import os
import subprocess
import sys
import tempfile

import pandas as pd


def gcc_parse_dump_file(stats_dump_file):
    """Parse a GCC -fdump-statistics dump file.

    Only lines containing "Vectorized loops" are relevant; they look like
        175 vect "Vectorized loops" "f1" 2

    Returns a dict mapping symbol name -> number of vectorized loops.
    """
    symbol_to_num_vect_loops = {}
    with open(stats_dump_file, "r") as fp:
        for line in fp:
            line = line.rstrip()
            if "Vectorized loops" not in line:
                continue
            components = line.split(" ")
            # Strip the quotes around the symbol name.
            symbol = components[-2][1:-1]
            symbol_to_num_vect_loops[symbol] = int(components[-1])
    return symbol_to_num_vect_loops


def llvm_parse_dump_file(dump_file):
    """Read a CSV dump file and return it cast to a dict.

    We patch LLVM to ensure the CSV file already has the correct
    two-column (symbol, num_vect_loops) format.
    """
    df = pd.read_csv(dump_file)
    return dict(df.values)


# Dispatch table: compiler name -> dump-file parser.  An explicit dict is
# safer than looking the parser up in globals() via a formatted string.
_PARSERS = {"gcc": gcc_parse_dump_file, "llvm": llvm_parse_dump_file}


def _exe_name_for(benchmark_name):
    """Map a benchmark name like "400.perlbench" to its executable stem.

    The executable is normally "<stem>_base.default"; sphinx3 and
    xalancbmk are exceptions whose executables are named differently.
    """
    exe_name = benchmark_name.split(".")[1]
    if exe_name == "sphinx3":
        exe_name = "sphinx_livepretend"
    elif exe_name == "xalancbmk":
        exe_name = "Xalan"
    return exe_name


def _collect_benchmark_stats(bmk_path):
    """Extract one benchmark archive and accumulate its vect statistics.

    Returns a dict symbol -> total number of vectorized loops across all
    statistics files in the archive.  num_vect_loops for symbols having
    the same name across the benchmark are accumulated.
    FIXME: This is a kludge to deal with static symbols in different
    TU's but having the same name.
    """
    # TemporaryDirectory cleans up after itself (mkdtemp leaked one dir
    # per archive); absolute-path globbing avoids os.chdir entirely, so
    # an exception can no longer leave the process in the temp dir.
    with tempfile.TemporaryDirectory() as tmp_dir:
        # check=True: a failed extraction must not silently produce an
        # empty benchmark row.
        subprocess.run(["tar", "xf", bmk_path, "-C", tmp_dir], check=True)
        compiler = "gcc"
        stats_files = glob.glob(
            os.path.join(tmp_dir, "**", "*.statistics"), recursive=True)
        if not stats_files:
            stats_files = glob.glob(
                os.path.join(tmp_dir, "**", "*.vect.csv"), recursive=True)
            compiler = "llvm"
        sym_to_vect = {}
        for stats_file in stats_files:
            dump_info = _PARSERS[compiler](stats_file)
            for symbol, num_vect_loops in dump_info.items():
                sym_to_vect[symbol] = sym_to_vect.get(symbol, 0) + num_vect_loops
    return sym_to_vect


def main():
    """Glob results_dir for benchmark archives and write the combined CSV."""
    if len(sys.argv) != 3:
        # Not an assert: asserts are stripped under "python -O".
        sys.exit("usage: {0} <results_dir> <out_csv_file>".format(sys.argv[0]))
    # If the passed argument ends with "/", remove it.
    results_dir = sys.argv[1].rstrip("/")
    out_csv_file = sys.argv[2]

    # newline="" is the documented way to open a file for csv.writer;
    # the with-block guarantees the output file is flushed and closed.
    with open(out_csv_file, "w", newline="") as outf:
        csvwriter = csv.writer(outf)
        csvwriter.writerow(("benchmark", "symbol", "num_vect_loops"))
        benchmark_files = glob.glob(
            "{0}/**/save.*/*.tar.xz".format(results_dir), recursive=True)
        for bmk_path in benchmark_files:
            # Strip ".tar.xz" to recover the benchmark name.
            benchmark_name = os.path.basename(bmk_path)[:-len(".tar.xz")]
            sym_to_vect = _collect_benchmark_stats(bmk_path)
            # Per-benchmark total row, keyed by the executable name
            # (<stem>_base.default) so it lines up with perf reports.
            total_num_vect_loops = sum(sym_to_vect.values())
            csvwriter.writerow((benchmark_name,
                                _exe_name_for(benchmark_name) + "_base.default",
                                total_num_vect_loops))
            for symbol, num_vect_loops in sym_to_vect.items():
                # "[.] " makes the symbol column match perf's output.
                csvwriter.writerow(
                    (benchmark_name, "[.] " + symbol, num_vect_loops))


if __name__ == "__main__":
    main()