#!/usr/bin/python3
"""Combine vect data obtained from benchmarking into a single CSV.

Usage: <script> <results_dir> <out_csv_file>

The results dir has roughly the following structure:
    results/
        node/
            save..temps/
                compressed statistics files

To find compressed stats files, we glob recursively for
*/save.*/*.tar.xz and, after extracting, add the benchmark name to each
entry.  For example, if the stats file for 400.perlbench contained:
    175 vect "Vectorized loops" "f1" 2
    175 vect "Vectorized loops" "f2" 3
then in results.csv the output would be:
    400.perlbench, [.] f1, 2
    400.perlbench, [.] f2, 3
"[.] " is prepended to each symbol name to make the output compatible
with perf.
"""

import csv
import glob
import os
import subprocess
import sys
import tempfile

import pandas as pd


def gcc_parse_dump_file(stats_dump_file):
    """Parse a GCC -fdump-statistics dump file.

    Only lines containing "Vectorized loops" are relevant; they look like
        175 vect "Vectorized loops" "f1" 2

    Returns a dict mapping symbol name -> number of vectorized loops.
    """
    symbol_to_num_vect_loops = {}
    with open(stats_dump_file, "r") as fp:
        for line in fp:
            line = line.rstrip()
            if "Vectorized loops" not in line:
                continue
            components = line.split(" ")
            # Strip the quotes around the symbol name.
            symbol = components[-2][1:-1]
            symbol_to_num_vect_loops[symbol] = int(components[-1])
    return symbol_to_num_vect_loops


def llvm_parse_dump_file(dump_file):
    """Read a CSV dump file and return it cast to a dict.

    We patch LLVM to ensure the CSV file already has the correct
    two-column (symbol, num_vect_loops) format.
    """
    df = pd.read_csv(dump_file)
    return dict(df.values)


# Dispatch table: compiler name -> dump-file parser.  An explicit dict is
# safer than looking the parser up in globals() via a formatted string.
_PARSERS = {"gcc": gcc_parse_dump_file, "llvm": llvm_parse_dump_file}


def _exe_name_for(benchmark_name):
    """Map a benchmark name like "400.perlbench" to its executable stem.

    The executable is normally "<stem>_base.default"; sphinx3 and
    xalancbmk are exceptions whose executables are named differently.
    """
    exe_name = benchmark_name.split(".")[1]
    if exe_name == "sphinx3":
        exe_name = "sphinx_livepretend"
    elif exe_name == "xalancbmk":
        exe_name = "Xalan"
    return exe_name


def _collect_benchmark_stats(bmk_path):
    """Extract one benchmark archive and accumulate its vect statistics.

    Returns a dict symbol -> total number of vectorized loops across all
    statistics files in the archive.  num_vect_loops for symbols having
    the same name across the benchmark are accumulated.
    FIXME: This is a kludge to deal with static symbols in different
    TU's but having the same name.
    """
    # TemporaryDirectory cleans up after itself (mkdtemp leaked one dir
    # per archive); absolute-path globbing avoids os.chdir entirely, so
    # an exception can no longer leave the process in the temp dir.
    with tempfile.TemporaryDirectory() as tmp_dir:
        # check=True: a failed extraction must not silently produce an
        # empty benchmark row.
        subprocess.run(["tar", "xf", bmk_path, "-C", tmp_dir], check=True)
        compiler = "gcc"
        stats_files = glob.glob(
            os.path.join(tmp_dir, "**", "*.statistics"), recursive=True)
        if not stats_files:
            stats_files = glob.glob(
                os.path.join(tmp_dir, "**", "*.vect.csv"), recursive=True)
            compiler = "llvm"
        sym_to_vect = {}
        for stats_file in stats_files:
            dump_info = _PARSERS[compiler](stats_file)
            for symbol, num_vect_loops in dump_info.items():
                sym_to_vect[symbol] = sym_to_vect.get(symbol, 0) + num_vect_loops
    return sym_to_vect


def main():
    """Glob results_dir for benchmark archives and write the combined CSV."""
    if len(sys.argv) != 3:
        # Not an assert: asserts are stripped under "python -O".
        sys.exit("usage: {0} <results_dir> <out_csv_file>".format(sys.argv[0]))
    # If the passed argument ends with "/", remove it.
    results_dir = sys.argv[1].rstrip("/")
    out_csv_file = sys.argv[2]

    # newline="" is the documented way to open a file for csv.writer;
    # the with-block guarantees the output file is flushed and closed.
    with open(out_csv_file, "w", newline="") as outf:
        csvwriter = csv.writer(outf)
        csvwriter.writerow(("benchmark", "symbol", "num_vect_loops"))
        benchmark_files = glob.glob(
            "{0}/**/save.*/*.tar.xz".format(results_dir), recursive=True)
        for bmk_path in benchmark_files:
            # Strip ".tar.xz" to recover the benchmark name.
            benchmark_name = os.path.basename(bmk_path)[:-len(".tar.xz")]
            sym_to_vect = _collect_benchmark_stats(bmk_path)
            # Per-benchmark total row, keyed by the executable name
            # (<stem>_base.default) so it lines up with perf reports.
            total_num_vect_loops = sum(sym_to_vect.values())
            csvwriter.writerow((benchmark_name,
                                _exe_name_for(benchmark_name) + "_base.default",
                                total_num_vect_loops))
            for symbol, num_vect_loops in sym_to_vect.items():
                # "[.] " makes the symbol column match perf's output.
                csvwriter.writerow(
                    (benchmark_name, "[.] " + symbol, num_vect_loops))


if __name__ == "__main__":
    main()