diff options
author | Bernard Ogden <bernie.ogden@linaro.org> | 2015-04-09 16:13:07 +0000 |
---|---|---|
committer | Bernard Ogden <bernie.ogden@linaro.org> | 2015-04-09 16:13:07 +0000 |
commit | 95d55a8a16857e096c4ba442bc9b72ad10ef7067 (patch) | |
tree | cb3d4a83916a0fcd76a663c79994109e71dbe3d5 | |
parent | d36862901dfbdd4e89b0aa1ad8b22d5635cbcf3a (diff) |
Profile the business part of the glibc malloc microbenchmark
-rw-r--r-- | bench-malloc-thread.gdb | 103 | ||||
-rwxr-xr-x | bench-malloc-threadgun.sh | 58 | ||||
-rwxr-xr-x | bench-malloc-threadpistol.sh | 38 |
3 files changed, 199 insertions, 0 deletions
diff --git a/bench-malloc-thread.gdb b/bench-malloc-thread.gdb
new file mode 100644
index 0000000..6e25b88
--- /dev/null
+++ b/bench-malloc-thread.gdb
@@ -0,0 +1,103 @@
+#This script is quite sensitive to OS differences. On some targets it doesn't work at all.
+#I don't really trust the process handling.
+
+python
+import errno
+import os
+import sys
+import subprocess
+import signal
+import locale
+import time
+locale.setlocale(locale.LC_ALL, 'en_GB.UTF8')
+
+if not os.environ.get('PERF_COUNTERS'):
+    os.environ['PERF_COUNTERS']='task-clock context-switches cpu-migrations page-faults cycles instructions branch-misses'
+counter_list=os.environ['PERF_COUNTERS'].split()
+counter_results=dict.fromkeys(counter_list, 0)
+
+end
+
+set follow-fork-mode child
+
+#doesn't seem to work properly
+set print inferior-events off
+
+break *(do_benchmark+0x1d8)
+commands
+  silent
+  python
+
+try:
+    #This is more complex than comma-separating a single -e, but gives clearer errors when one event is unsupported
+    args = ['/usr/bin/perf', 'stat', '-p', str(gdb.selected_inferior().pid)]
+    for x in counter_list:
+        args.extend(['-e', x])
+
+    perfproc = subprocess.Popen(args, stderr=subprocess.PIPE)
+    time.sleep(1) #Yuck. Need to give perf a moment to start up, though.
+except OSError as e:
+    if e.errno == errno.ENOENT:
+        sys.stderr.write("*** /usr/bin/perf does not exist\n")
+    else:
+        sys.stderr.write("*** Error %d while executing %s\n" % (e.errno, ' '.join(args)))
+    raise
+except:
+    sys.stderr.write("*** Error while executing %s\n" % ' '.join(args))
+    raise
+
+  end
+  c
+end
+
+break *(do_benchmark+0x1e4)
+commands
+  silent
+  python
+
+try:
+    perfproc.send_signal(signal.SIGINT) #Can this kill random processes? One would hope that Popen is sensible in this respect, but I don't know that it is. We don't expect perf to have exited in the normal case, though.
+    stdoutdata, stderrdata = perfproc.communicate()
+    lines = stderrdata.decode('utf-8').splitlines()
+    #sys.stderr.write('\n'.join(lines) + '\n')
+    for line in lines[3:-3]:
+        words = line.split()
+        counter_results[words[1]] += locale.atof(words[0])
+except OSError as e: #On newer Pythons this would be ProcessLookupError - but that's a subclass of OSError, so this should be close enough
+    if e.errno == errno.ESRCH:
+        sys.stderr.write("*** Failed to kill perf process %d\n" % perfproc.pid)
+        stdoutdata, stderrdata = perfproc.communicate()
+        if stdoutdata is not None:
+            sys.stdout.write(stdoutdata.decode('utf-8') + '\n')
+        if stderrdata is not None:
+            sys.stderr.write(stderrdata.decode('utf-8') + '\n')
+        if perfproc.returncode is None:
+            sys.stderr.write("*** perf process still appears to be running\n")
+        else:
+            sys.stderr.write("*** perf process exited early with code %d\n" % perfproc.returncode)
+    raise
+
+  end
+  c
+end
+
+#Work around set print inferior-events off not working properly
+break exit
+commands
+  silent
+  #This works in principle, but gdb seems buggy
+  #python gdb.execute('detach inferior %s' % str(gdb.selected_inferior().num))
+
+  #This works in practice for this case
+  #detach inferior 2
+  python
+
+for counter in counter_list:
+    sys.stderr.write(counter + ' ' + locale.str(counter_results[counter]) + '\n')
+
+  end
+  c
+end
+
+run
+q
diff --git a/bench-malloc-threadgun.sh b/bench-malloc-threadgun.sh
new file mode 100755
index 0000000..5ba4d04
--- /dev/null
+++ b/bench-malloc-threadgun.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+#Runs each bench-malloc-thread binary under gdb/perf for every core mask and counter set
+set -o pipefail
+set -u
+
+#Typically one would adjust these things at the top
+GDBSCRIPT=bench-malloc-thread.gdb
+BINARIES='/home/bernie.ogden/build/toolchain-upstream_sysroot/builds/x86_64-unknown-linux-gnu/aarch64-linux-gnu/glibc.git/benchtests/bench-malloc-thread /home/bernie.ogden/build/toolchain-bernie_sysroot/builds/x86_64-unknown-linux-gnu/aarch64-linux-gnu/glibc.git/benchtests/bench-malloc-thread /home/bernie.ogden/build/toolchain-unatomic_sysroot/builds/x86_64-unknown-linux-gnu/aarch64-linux-gnu/glibc.git/benchtests/bench-malloc-thread'
+BINARY_ARGS='1'
+OFFLOAD_MASK=0x1 #Core to run all non-profiled tasks on
+declare -A CORE_COUNTER
+#All perf events in A53 TRM, associated with taskset mask for an A53 on Juno
+CORE_COUNTER[0x02]=A53_COUNTERS[@]
+A53_COUNTERS=(\'\' \
+  \'r00 r01 r02 r03 r04 r05\' \
+  \'r06 r07 r08 r09 r0A r0B\' \
+  \'r0C r0D r0E r0F r10 r11\' \
+  \'r12 r13 r14 r15 r16 r17\' \
+  \'r18 r19 r1A r1D r1E r60\' \
+  \'r61 r7A r86 r87 rC0 rC1\' \
+  \'rC2 rC3 rC4 rC5 rC6 rC7\' \
+  \'rC8 rC9 rCA rCB rCC rD0\' \
+  \'rD1 rD2 rE0 rE1 rE2 rE3\' \
+  \'rE4 rE5 rE6 rE7 rE8\')
+#All perf events in A57 TRM, associated with taskset mask for an A57 on Juno
+CORE_COUNTER[0x10]=A57_COUNTERS[@]
+A57_COUNTERS=(\'\' \
+  \'r00 r01 r02 r03 r04 r05\' \
+  \'r08 r09 r0A r0B r10 r11\' \
+  \'r12 r13 r14 r15 r16 r17\' \
+  \'r18 r19 r1A r1B r1C r1D\' \
+  \'r1E r40 r41 r42 r43 r46\' \
+  \'r47 r48 r4C r4D r50 r51\' \
+  \'r52 r53 r56 r57 r58 r60\' \
+  \'r61 r62 r63 r64 r65 r66\' \
+  \'r67 r68 r69 r6A r6C r6D\' \
+  \'r6E r70 r71 r72 r73 r74\' \
+  \'r75 r76 r77 r78 r79 r7A\' \
+  \'r7C r7D r7E r81 r82 r83\' \
+  \'r84 r86 r87 r88 r8A r8B\' \
+  \'r8C r8D r8E r8F r90 r91\')
+function doit
+{
+  for i in {1..30}; do #Number of times to run each condition. 30 in some vague hope of statistical validity.
+    echo : $binary $binary_args $i | tee -a ~/output.$$.${cpu}
+    taskset $cpu gdb -q -x ${GDBSCRIPT} --args $binary $binary_args 2>&1 >/dev/null | tee -a ~/output.$$.${cpu}
+  done
+}
+
+for cpu in "${!CORE_COUNTER[@]}"; do #This is the outer loop so that we stay on a single CPU for as long as possible
+  for x in `ps -e | awk '{print $1}' | sed 1d`; do sudo taskset -p ${OFFLOAD_MASK} $x; done #Shunt as much as we can off the CPU of interest (i.e. onto a CPU that isn't in any mask that we are using)
+  rm -f ${HOME}/output.$$.${cpu} #Delete output from old run, should it exist
+  for binary in $BINARIES; do
+    for binary_args in $BINARY_ARGS; do
+      eval for PERF_COUNTERS in ${!CORE_COUNTER[${cpu}]}\; do export PERF_COUNTERS\; doit\; done
+    done
+  done
+done
diff --git a/bench-malloc-threadpistol.sh b/bench-malloc-threadpistol.sh
new file mode 100755
index 0000000..d7dd38d
--- /dev/null
+++ b/bench-malloc-threadpistol.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+#Runs each bench-malloc-thread binary under gdb/perf for every core mask and counter set
+set -o pipefail
+set -u
+
+#Typically one would adjust these things at the top
+GDBSCRIPT=bench-malloc-thread.gdb
+BINARIES='/home/bernie.ogden/build/toolchain-upstream_sysroot/builds/x86_64-unknown-linux-gnu/aarch64-linux-gnu/glibc.git/benchtests/bench-malloc-thread /home/bernie.ogden/build/toolchain-bernie_sysroot/builds/x86_64-unknown-linux-gnu/aarch64-linux-gnu/glibc.git/benchtests/bench-malloc-thread /home/bernie.ogden/build/toolchain-unatomic_sysroot/builds/x86_64-unknown-linux-gnu/aarch64-linux-gnu/glibc.git/benchtests/bench-malloc-thread'
+BINARY_ARGS='1'
+OFFLOAD_MASK=0x1 #Core to run all non-profiled tasks on
+declare -A CORE_COUNTER
+#All perf events in A53 TRM, associated with taskset mask for an A53 on Juno
+CORE_COUNTER[0x02]=A53_COUNTERS[@]
+A53_COUNTERS=(\'\' \
+  \'r00 r01 r02 r03 r04 r05\' \
+  \'rE4 rE5 rE6 rE7 rE8\')
+#All perf events in A57 TRM, associated with taskset mask for an A57 on Juno
+CORE_COUNTER[0x10]=A57_COUNTERS[@]
+A57_COUNTERS=(\'\' \
+  \'r00 r01 r02 r03 r04 r05\' \
+  \'r8C r8D r8E r8F r90 r91\')
+function doit
+{
+  for i in {1..1}; do #Number of times to run each condition. Just 1 here; bench-malloc-threadgun.sh uses 30 in some vague hope of statistical validity.
+    echo : $binary $binary_args $i | tee -a ~/output.$$.${cpu}
+    taskset $cpu gdb -q -x ${GDBSCRIPT} --args $binary $binary_args 2>&1 >/dev/null | tee -a ~/output.$$.${cpu}
+  done
+}
+
+for cpu in "${!CORE_COUNTER[@]}"; do #This is the outer loop so that we stay on a single CPU for as long as possible
+  for x in `ps -e | awk '{print $1}' | sed 1d`; do sudo taskset -p ${OFFLOAD_MASK} $x; done #Shunt as much as we can off the CPU of interest (i.e. onto a CPU that isn't in any mask that we are using)
+  rm -f ${HOME}/output.$$.${cpu} #Delete output from old run, should it exist
+  for binary in $BINARIES; do
+    for binary_args in $BINARY_ARGS; do
+      eval for PERF_COUNTERS in ${!CORE_COUNTER[${cpu}]}\; do export PERF_COUNTERS\; doit\; done
+    done
+  done
+done