diff options
author | Michael Kruse <llvm@meinersbur.de> | 2018-09-26 15:22:39 +0000 |
---|---|---|
committer | Michael Kruse <llvm@meinersbur.de> | 2018-09-26 15:22:39 +0000 |
commit | a0b013b9bf51745bd7b0c5fc2e851ef3610ab50a (patch) | |
tree | 5bd99169062e193705320e6de9c65f40246a925c /polly | |
parent | a2735c72a60b8cc6920574991051531c0874f58b (diff) |
[doc] Fix HowToManuallyUseTheIndividualPiecesOfPolly
Also remove compiled binaries.
Diffstat (limited to 'polly')
39 files changed, 2571 insertions, 1979 deletions
diff --git a/polly/docs/HowToManuallyUseTheIndividualPiecesOfPolly.rst b/polly/docs/HowToManuallyUseTheIndividualPiecesOfPolly.rst index 1822923c288..958dc96fa9e 100644 --- a/polly/docs/HowToManuallyUseTheIndividualPiecesOfPolly.rst +++ b/polly/docs/HowToManuallyUseTheIndividualPiecesOfPolly.rst @@ -21,7 +21,7 @@ performance improvement can be expected by an optimal automatic optimizer. .. code-block:: console - clang -S -emit-llvm matmul.c -o matmul.s + clang -S -emit-llvm matmul.c -Xclang -disable-O0-optnone -o matmul.ll 2. **Prepare the LLVM-IR for Polly** @@ -34,7 +34,7 @@ performance improvement can be expected by an optimal automatic optimizer. .. code-block:: console - opt -S -polly-canonicalize matmul.s > matmul.preopt.ll + opt -S -polly-canonicalize matmul.ll -o matmul.preopt.ll 3. **Show the SCoPs detected by Polly (optional)** -------------------------------------------------- @@ -45,7 +45,7 @@ performance improvement can be expected by an optimal automatic optimizer. .. code-block:: console - $ opt -polly-ast -analyze -q matmul.preopt.ll -polly-process-unprofitable + $ opt -basicaa -polly-ast -analyze matmul.preopt.ll -polly-process-unprofitable -polly-use-llvm-names .. code-block:: guess @@ -84,8 +84,8 @@ performance improvement can be expected by an optimal automatic optimizer. .. code-block:: console - $ opt -view-scops -disable-output matmul.preopt.ll - $ opt -view-scops-only -disable-output matmul.preopt.ll + $ opt -polly-use-llvm-names -basicaa -view-scops -disable-output matmul.preopt.ll + $ opt -polly-use-llvm-names -basicaa -view-scops-only -disable-output matmul.preopt.ll The output for the different functions: @@ -104,7 +104,7 @@ performance improvement can be expected by an optimal automatic optimizer. .. code-block:: console - $ opt -polly-scops -analyze matmul.preopt.ll -polly-process-unprofitable + $ opt -polly-use-llvm-names -basicaa -polly-scops -analyze matmul.preopt.ll -polly-process-unprofitable .. code-block:: guess @@ -194,7 +194,7 @@ performance improvement can be expected by an optimal automatic optimizer. .. code-block:: console - $ opt -polly-dependences -analyze matmul.preopt.ll -polly-process-unprofitable + $ opt -basicaa -polly-use-llvm-names -polly-dependences -analyze matmul.preopt.ll -polly-process-unprofitable .. code-block:: guess @@ -226,7 +226,7 @@ performance improvement can be expected by an optimal automatic optimizer. .. code-block:: console - $ opt -polly-export-jscop matmul.preopt.ll -polly-process-unprofitable + $ opt -basicaa -polly-use-llvm-names -polly-export-jscop matmul.preopt.ll -polly-process-unprofitable .. code-block:: guess @@ -254,7 +254,7 @@ performance improvement can be expected by an optimal automatic optimizer. .. code-block:: console - $ opt matmul.preopt.ll -polly-import-jscop -polly-ast -analyze -polly-process-unprofitable + $ opt -basicaa -polly-use-llvm-names matmul.preopt.ll -polly-import-jscop -polly-ast -analyze -polly-process-unprofitable .. code-block:: c @@ -282,7 +282,7 @@ performance improvement can be expected by an optimal automatic optimizer. .. code-block:: console - $ opt matmul.preopt.ll -polly-import-jscop -polly-import-jscop-postfix=interchanged -polly-ast -analyze -polly-process-unprofitable + $ opt -basicaa -polly-use-llvm-names matmul.preopt.ll -polly-import-jscop -polly-import-jscop-postfix=interchanged -polly-ast -analyze -polly-process-unprofitable .. code-block:: c @@ -311,7 +311,7 @@ performance improvement can be expected by an optimal automatic optimizer. .. code-block:: console - $ opt matmul.preopt.ll -polly-import-jscop -polly-import-jscop-postfix=interchanged+tiled -polly-ast -analyze -polly-process-unprofitable + $ opt -basicaa -polly-use-llvm-names matmul.preopt.ll -polly-import-jscop -polly-import-jscop-postfix=interchanged+tiled -polly-ast -analyze -polly-process-unprofitable .. code-block:: c @@ -346,7 +346,7 @@ performance improvement can be expected by an optimal automatic optimizer. .. code-block:: console - $ opt matmul.preopt.ll -polly-import-jscop -polly-import-jscop-postfix=interchanged+tiled -polly-ast -analyze -polly-process-unprofitable + $ opt -basicaa -polly-use-llvm-names matmul.preopt.ll -polly-import-jscop -polly-import-jscop-postfix=interchanged+tiled -polly-ast -analyze -polly-process-unprofitable .. code-block:: c @@ -383,11 +383,11 @@ performance improvement can be expected by an optimal automatic optimizer. .. code-block:: console - $ opt matmul.preopt.ll | opt -O3 > matmul.normalopt.ll + $ opt -S matmul.preopt.ll | opt -S -O3 -o matmul.normalopt.ll .. code-block:: console - $ opt matmul.preopt.ll -polly-import-jscop -polly-import-jscop-postfix=interchanged -polly-codegen -polly-process-unprofitable | opt -O3 > matmul.polly.interchanged.ll + $ opt -S matmul.preopt.ll -basicaa -polly-use-llvm-names -polly-import-jscop -polly-import-jscop-postfix=interchanged -polly-codegen -polly-process-unprofitable | opt -S -O3 -o matmul.polly.interchanged.ll .. code-block:: guess @@ -397,7 +397,7 @@ performance improvement can be expected by an optimal automatic optimizer. .. code-block:: console - $ opt matmul.preopt.ll -polly-import-jscop -polly-import-jscop-postfix=interchanged+tiled -polly-codegen -polly-process-unprofitable | opt -O3 > matmul.polly.interchanged+tiled.ll + $ opt -S matmul.preopt.ll -basicaa -polly-use-llvm-names -polly-import-jscop -polly-import-jscop-postfix=interchanged+tiled -polly-codegen -polly-process-unprofitable | opt -S -O3 -o matmul.polly.interchanged+tiled.ll .. code-block:: guess @@ -407,7 +407,7 @@ performance improvement can be expected by an optimal automatic optimizer. .. code-block:: console - $ opt matmul.preopt.ll -polly-import-jscop -polly-import-jscop-postfix=interchanged+tiled+vector -polly-codegen -polly-vectorizer=polly -polly-process-unprofitable | opt -O3 > matmul.polly.interchanged+tiled+vector.ll + $ opt -S matmul.preopt.ll -basicaa -polly-use-llvm-names -polly-import-jscop -polly-import-jscop-postfix=interchanged+tiled+vector -polly-codegen -polly-vectorizer=polly -polly-process-unprofitable | opt -S -O3 -o matmul.polly.interchanged+tiled+vector.ll .. code-block:: guess @@ -417,7 +417,7 @@ performance improvement can be expected by an optimal automatic optimizer. .. code-block:: console - $ opt matmul.preopt.ll -polly-import-jscop -polly-import-jscop-postfix=interchanged+tiled+vector -polly-codegen -polly-vectorizer=polly -polly-parallel -polly-process-unprofitable | opt -O3 > matmul.polly.interchanged+tiled+openmp.ll + $ opt -S matmul.preopt.ll -basicaa -polly-use-llvm-names -polly-import-jscop -polly-import-jscop-postfix=interchanged+tiled+vector -polly-codegen -polly-vectorizer=polly -polly-parallel -polly-process-unprofitable | opt -S -O3 -o matmul.polly.interchanged+tiled+openmp.ll .. code-block:: guess @@ -431,11 +431,16 @@ performance improvement can be expected by an optimal automatic optimizer. .. code-block:: console - $ llc matmul.normalopt.ll -o matmul.normalopt.s && gcc matmul.normalopt.s -o matmul.normalopt.exe - $ llc matmul.polly.interchanged.ll -o matmul.polly.interchanged.s && gcc matmul.polly.interchanged.s -o matmul.polly.interchanged.exe - $ llc matmul.polly.interchanged+tiled.ll -o matmul.polly.interchanged+tiled.s && gcc matmul.polly.interchanged+tiled.s -o matmul.polly.interchanged+tiled.exe - $ llc matmul.polly.interchanged+tiled+vector.ll -o matmul.polly.interchanged+tiled+vector.s && gcc matmul.polly.interchanged+tiled+vector.s -o matmul.polly.interchanged+tiled+vector.exe - $ llc matmul.polly.interchanged+tiled+vector+openmp.ll -o matmul.polly.interchanged+tiled+vector+openmp.s && gcc -fopenmp matmul.polly.interchanged+tiled+vector+openmp.s -o matmul.polly.interchanged+tiled+vector+openmp.exe + $ llc matmul.normalopt.ll -o matmul.normalopt.s -relocation-model=pic + $ gcc matmul.normalopt.s -o matmul.normalopt.exe + $ llc matmul.polly.interchanged.ll -o matmul.polly.interchanged.s -relocation-model=pic + $ gcc matmul.polly.interchanged.s -o matmul.polly.interchanged.exe + $ llc matmul.polly.interchanged+tiled.ll -o matmul.polly.interchanged+tiled.s -relocation-model=pic + $ gcc matmul.polly.interchanged+tiled.s -o matmul.polly.interchanged+tiled.exe + $ llc matmul.polly.interchanged+tiled+vector.ll -o matmul.polly.interchanged+tiled+vector.s -relocation-model=pic + $ gcc matmul.polly.interchanged+tiled+vector.s -o matmul.polly.interchanged+tiled+vector.exe + $ llc matmul.polly.interchanged+tiled+vector+openmp.ll -o matmul.polly.interchanged+tiled+vector+openmp.s -relocation-model=pic + $ gcc matmul.polly.interchanged+tiled+vector+openmp.s -lgomp -o matmul.polly.interchanged+tiled+vector+openmp.exe 11. **Compare the runtime of the executables** ---------------------------------------------- diff --git a/polly/docs/experiments/matmul/init_array___%for.cond1.preheader---%for.end19.jscop b/polly/docs/experiments/matmul/init_array___%for.cond1.preheader---%for.end19.jscop index 7f1db3e9e4b..2cc32b1cc01 100644 --- a/polly/docs/experiments/matmul/init_array___%for.cond1.preheader---%for.end19.jscop +++ b/polly/docs/experiments/matmul/init_array___%for.cond1.preheader---%for.end19.jscop @@ -1,33 +1,39 @@ { - "arrays" : [ + "arrays": [ { - "name" : "MemRef_A", - "sizes" : [ "1536" ], - "type" : "float" + "name": "MemRef_A", + "sizes": [ + "*", + "1536" + ], + "type": "float" }, { - "name" : "MemRef_B", - "sizes" : [ "1536" ], - "type" : "float" + "name": "MemRef_B", + "sizes": [ + "*", + "1536" + ], + "type": "float" } ], - "context" : "{ : }", - "name" : "%for.cond1.preheader---%for.end19", - "statements" : [ + "context": "{ : }", + "name": "%for.cond1.preheader---%for.end19", + "statements": [ { - "accesses" : [ + "accesses": [ { - "kind" : "write", - "relation" : "{ Stmt_for_body3[i0, i1] -> MemRef_A[i0, i1] }" + "kind": "write", + "relation": "{ Stmt_for_body3[i0, i1] -> MemRef_A[i0, i1] }" }, { - "kind" : "write", - "relation" : "{ Stmt_for_body3[i0, i1] -> MemRef_B[i0, i1] }" + "kind": "write", + "relation": "{ Stmt_for_body3[i0, i1] -> MemRef_B[i0, i1] }" } ], - "domain" : "{ Stmt_for_body3[i0, i1] : 0 <= i0 <= 1535 and 0 <= i1 <= 1535 }", - "name" : "Stmt_for_body3", - "schedule" : "{ Stmt_for_body3[i0, i1] -> [i0, i1] }" + "domain": "{ Stmt_for_body3[i0, i1] : 0 <= i0 <= 1535 and 0 <= i1 <= 1535 }", + "name": "Stmt_for_body3", + "schedule": "{ Stmt_for_body3[i0, i1] -> [i0, i1] }" } ] -} +}
\ No newline at end of file diff --git a/polly/docs/experiments/matmul/init_array___%for.cond1.preheader---%for.end19.jscop.interchanged b/polly/docs/experiments/matmul/init_array___%for.cond1.preheader---%for.end19.jscop.interchanged new file mode 100644 index 00000000000..2cc32b1cc01 --- /dev/null +++ b/polly/docs/experiments/matmul/init_array___%for.cond1.preheader---%for.end19.jscop.interchanged @@ -0,0 +1,39 @@ +{ + "arrays": [ + { + "name": "MemRef_A", + "sizes": [ + "*", + "1536" + ], + "type": "float" + }, + { + "name": "MemRef_B", + "sizes": [ + "*", + "1536" + ], + "type": "float" + } + ], + "context": "{ : }", + "name": "%for.cond1.preheader---%for.end19", + "statements": [ + { + "accesses": [ + { + "kind": "write", + "relation": "{ Stmt_for_body3[i0, i1] -> MemRef_A[i0, i1] }" + }, + { + "kind": "write", + "relation": "{ Stmt_for_body3[i0, i1] -> MemRef_B[i0, i1] }" + } + ], + "domain": "{ Stmt_for_body3[i0, i1] : 0 <= i0 <= 1535 and 0 <= i1 <= 1535 }", + "name": "Stmt_for_body3", + "schedule": "{ Stmt_for_body3[i0, i1] -> [i0, i1] }" + } + ] +}
\ No newline at end of file diff --git a/polly/docs/experiments/matmul/init_array___%for.cond1.preheader---%for.end19.jscop.interchanged+tiled b/polly/docs/experiments/matmul/init_array___%for.cond1.preheader---%for.end19.jscop.interchanged+tiled new file mode 100644 index 00000000000..2cc32b1cc01 --- /dev/null +++ b/polly/docs/experiments/matmul/init_array___%for.cond1.preheader---%for.end19.jscop.interchanged+tiled @@ -0,0 +1,39 @@ +{ + "arrays": [ + { + "name": "MemRef_A", + "sizes": [ + "*", + "1536" + ], + "type": "float" + }, + { + "name": "MemRef_B", + "sizes": [ + "*", + "1536" + ], + "type": "float" + } + ], + "context": "{ : }", + "name": "%for.cond1.preheader---%for.end19", + "statements": [ + { + "accesses": [ + { + "kind": "write", + "relation": "{ Stmt_for_body3[i0, i1] -> MemRef_A[i0, i1] }" + }, + { + "kind": "write", + "relation": "{ Stmt_for_body3[i0, i1] -> MemRef_B[i0, i1] }" + } + ], + "domain": "{ Stmt_for_body3[i0, i1] : 0 <= i0 <= 1535 and 0 <= i1 <= 1535 }", + "name": "Stmt_for_body3", + "schedule": "{ Stmt_for_body3[i0, i1] -> [i0, i1] }" + } + ] +}
\ No newline at end of file diff --git a/polly/docs/experiments/matmul/init_array___%for.cond1.preheader---%for.end19.jscop.interchanged+tiled+vector b/polly/docs/experiments/matmul/init_array___%for.cond1.preheader---%for.end19.jscop.interchanged+tiled+vector new file mode 100644 index 00000000000..2cc32b1cc01 --- /dev/null +++ b/polly/docs/experiments/matmul/init_array___%for.cond1.preheader---%for.end19.jscop.interchanged+tiled+vector @@ -0,0 +1,39 @@ +{ + "arrays": [ + { + "name": "MemRef_A", + "sizes": [ + "*", + "1536" + ], + "type": "float" + }, + { + "name": "MemRef_B", + "sizes": [ + "*", + "1536" + ], + "type": "float" + } + ], + "context": "{ : }", + "name": "%for.cond1.preheader---%for.end19", + "statements": [ + { + "accesses": [ + { + "kind": "write", + "relation": "{ Stmt_for_body3[i0, i1] -> MemRef_A[i0, i1] }" + }, + { + "kind": "write", + "relation": "{ Stmt_for_body3[i0, i1] -> MemRef_B[i0, i1] }" + } + ], + "domain": "{ Stmt_for_body3[i0, i1] : 0 <= i0 <= 1535 and 0 <= i1 <= 1535 }", + "name": "Stmt_for_body3", + "schedule": "{ Stmt_for_body3[i0, i1] -> [i0, i1] }" + } + ] +}
\ No newline at end of file diff --git a/polly/docs/experiments/matmul/main___%for.cond1.preheader---%for.end30.jscop b/polly/docs/experiments/matmul/main___%for.cond1.preheader---%for.end30.jscop index ac1a908db1f..2e4b597fb7c 100644 --- a/polly/docs/experiments/matmul/main___%for.cond1.preheader---%for.end30.jscop +++ b/polly/docs/experiments/matmul/main___%for.cond1.preheader---%for.end30.jscop @@ -1,57 +1,66 @@ { - "arrays" : [ + "arrays": [ { - "name" : "MemRef_C", - "sizes" : [ "1536" ], - "type" : "float" + "name": "MemRef_C", + "sizes": [ + "*", + "1536" + ], + "type": "float" }, { - "name" : "MemRef_A", - "sizes" : [ "1536" ], - "type" : "float" + "name": "MemRef_A", + "sizes": [ + "*", + "1536" + ], + "type": "float" }, { - "name" : "MemRef_B", - "sizes" : [ "1536" ], - "type" : "float" + "name": "MemRef_B", + "sizes": [ + "*", + "1536" + ], + "type": "float" } ], - "context" : "{ : }", - "name" : "%for.cond1.preheader---%for.end30", - "statements" : [ + "context": "{ : }", + "name": "%for.cond1.preheader---%for.end30", + "statements": [ { - "accesses" : [ + "accesses": [ { - "kind" : "write", - "relation" : "{ Stmt_for_body3[i0, i1] -> MemRef_C[i0, i1] }" + "kind": "write", + "relation": "{ Stmt_for_body3[i0, i1] -> MemRef_C[i0, i1] }" } ], - "domain" : "{ Stmt_for_body3[i0, i1] : 0 <= i0 <= 1535 and 0 <= i1 <= 1535 }", - "name" : "Stmt_for_body3", - "schedule" : "{ Stmt_for_body3[i0, i1] -> [i0, i1, 0, 0] }" + "domain": "{ Stmt_for_body3[i0, i1] : 0 <= i0 <= 1535 and 0 <= i1 <= 1535 }", + "name": "Stmt_for_body3", + "schedule": "{ Stmt_for_body3[i0, i1] -> [i0, i1, 0, 0] }" }, { - "accesses" : [ + "accesses": [ { - "kind" : "read", - "relation" : "{ Stmt_for_body8[i0, i1, i2] -> MemRef_C[i0, i1] }" + "kind": "read", + "relation": "{ Stmt_for_body8[i0, i1, i2] -> MemRef_C[i0, i1] }" }, { - "kind" : "read", - "relation" : "{ Stmt_for_body8[i0, i1, i2] -> MemRef_A[i0, i2] }" + "kind": "read", + "relation": "{ Stmt_for_body8[i0, i1, i2] -> MemRef_A[i0, i2] }" }, { - "kind" : "read", - "relation" : "{ Stmt_for_body8[i0, i1, i2] -> MemRef_B[i2, i1] }" + "kind": "read", + "relation": "{ Stmt_for_body8[i0, i1, i2] -> MemRef_B[i2, i1] }" }, { - "kind" : "write", - "relation" : "{ Stmt_for_body8[i0, i1, i2] -> MemRef_C[i0, i1] }" + "kind": "write", + "relation": "{ Stmt_for_body8[i0, i1, i2] -> MemRef_C[i0, i1] }" } ], - "domain" : "{ Stmt_for_body8[i0, i1, i2] : 0 <= i0 <= 1535 and 0 <= i1 <= 1535 and 0 <= i2 <= 1535 }", - "name" : "Stmt_for_body8", - "schedule" : "{ Stmt_for_body8[i0, i1, i2] -> [i0, i1, 1, i2] }" + "domain": "{ Stmt_for_body8[i0, i1, i2] : 0 <= i0 <= 1535 and 0 <= i1 <= 1535 and 0 <= i2 <= 1535 }", + "name": "Stmt_for_body8", + "schedule": "{ Stmt_for_body8[i0, i1, i2] -> [i0, i1, 1, i2] }" } ] -} +}
\ No newline at end of file diff --git a/polly/docs/experiments/matmul/main___%for.cond1.preheader---%for.end30.jscop.interchanged b/polly/docs/experiments/matmul/main___%for.cond1.preheader---%for.end30.jscop.interchanged index 7e3d212b4ef..fc45fa1dc47 100644 --- a/polly/docs/experiments/matmul/main___%for.cond1.preheader---%for.end30.jscop.interchanged +++ b/polly/docs/experiments/matmul/main___%for.cond1.preheader---%for.end30.jscop.interchanged @@ -1,57 +1,66 @@ { - "arrays" : [ + "arrays": [ { - "name" : "MemRef_C", - "sizes" : [ "1536" ], - "type" : "float" + "name": "MemRef_C", + "sizes": [ + "*", + "1536" + ], + "type": "float" }, { - "name" : "MemRef_A", - "sizes" : [ "1536" ], - "type" : "float" + "name": "MemRef_A", + "sizes": [ + "*", + "1536" + ], + "type": "float" }, { - "name" : "MemRef_B", - "sizes" : [ "1536" ], - "type" : "float" + "name": "MemRef_B", + "sizes": [ + "*", + "1536" + ], + "type": "float" } ], - "context" : "{ : }", - "name" : "%for.cond1.preheader---%for.end30", - "statements" : [ + "context": "{ : }", + "name": "%for.cond1.preheader---%for.end30", + "statements": [ { - "accesses" : [ + "accesses": [ { - "kind" : "write", - "relation" : "{ Stmt_for_body3[i0, i1] -> MemRef_C[i0, i1] }" + "kind": "write", + "relation": "{ Stmt_for_body3[i0, i1] -> MemRef_C[i0, i1] }" } ], - "domain" : "{ Stmt_for_body3[i0, i1] : 0 <= i0 <= 1535 and 0 <= i1 <= 1535 }", - "name" : "Stmt_for_body3", - "schedule" : "{ Stmt_for_body3[i0, i1] -> [0, i0, i1, 0] }" + "domain": "{ Stmt_for_body3[i0, i1] : 0 <= i0 <= 1535 and 0 <= i1 <= 1535 }", + "name": "Stmt_for_body3", + "schedule": "{ Stmt_for_body3[i0, i1] -> [0, i0, i1, 0] }" }, { - "accesses" : [ + "accesses": [ { - "kind" : "read", - "relation" : "{ Stmt_for_body8[i0, i1, i2] -> MemRef_C[i0, i1] }" + "kind": "read", + "relation": "{ Stmt_for_body8[i0, i1, i2] -> MemRef_C[i0, i1] }" }, { - "kind" : "read", - "relation" : "{ Stmt_for_body8[i0, i1, i2] -> MemRef_A[i0, i2] }" + "kind": "read", + "relation": "{ Stmt_for_body8[i0, i1, i2] -> MemRef_A[i0, i2] }" }, { - "kind" : "read", - "relation" : "{ Stmt_for_body8[i0, i1, i2] -> MemRef_B[i2, i1] }" + "kind": "read", + "relation": "{ Stmt_for_body8[i0, i1, i2] -> MemRef_B[i2, i1] }" }, { - "kind" : "write", - "relation" : "{ Stmt_for_body8[i0, i1, i2] -> MemRef_C[i0, i1] }" + "kind": "write", + "relation": "{ Stmt_for_body8[i0, i1, i2] -> MemRef_C[i0, i1] }" } ], - "domain" : "{ Stmt_for_body8[i0, i1, i2] : 0 <= i0 <= 1535 and 0 <= i1 <= 1535 and 0 <= i2 <= 1535 }", - "name" : "Stmt_for_body8", - "schedule" : "{ Stmt_for_body8[i0, i1, i2] -> [1, i0, i2, i1] }" + "domain": "{ Stmt_for_body8[i0, i1, i2] : 0 <= i0 <= 1535 and 0 <= i1 <= 1535 and 0 <= i2 <= 1535 }", + "name": "Stmt_for_body8", + "schedule": "{ Stmt_for_body8[i0, i1, i2] -> [1, i0, i2, i1] }" } ] -} +}
\ No newline at end of file diff --git a/polly/docs/experiments/matmul/main___%for.cond1.preheader---%for.end30.jscop.interchanged+tiled b/polly/docs/experiments/matmul/main___%for.cond1.preheader---%for.end30.jscop.interchanged+tiled index b7c33b9896c..32617959177 100644 --- a/polly/docs/experiments/matmul/main___%for.cond1.preheader---%for.end30.jscop.interchanged+tiled +++ b/polly/docs/experiments/matmul/main___%for.cond1.preheader---%for.end30.jscop.interchanged+tiled @@ -1,57 +1,66 @@ { - "arrays" : [ + "arrays": [ { - "name" : "MemRef_C", - "sizes" : [ "1536" ], - "type" : "float" + "name": "MemRef_C", + "sizes": [ + "*", + "1536" + ], + "type": "float" }, { - "name" : "MemRef_A", - "sizes" : [ "1536" ], - "type" : "float" + "name": "MemRef_A", + "sizes": [ + "*", + "1536" + ], + "type": "float" }, { - "name" : "MemRef_B", - "sizes" : [ "1536" ], - "type" : "float" + "name": "MemRef_B", + "sizes": [ + "*", + "1536" + ], + "type": "float" } ], - "context" : "{ : }", - "name" : "%for.cond1.preheader---%for.end30", - "statements" : [ + "context": "{ : }", + "name": "%for.cond1.preheader---%for.end30", + "statements": [ { - "accesses" : [ + "accesses": [ { - "kind" : "write", - "relation" : "{ Stmt_for_body3[i0, i1] -> MemRef_C[i0, i1] }" + "kind": "write", + "relation": "{ Stmt_for_body3[i0, i1] -> MemRef_C[i0, i1] }" } ], - "domain" : "{ Stmt_for_body3[i0, i1] : 0 <= i0 <= 1535 and 0 <= i1 <= 1535 }", - "name" : "Stmt_for_body3", - "schedule" : "{ Stmt_for_body3[i0, i1] -> [0, i0, i1, 0, 0, 0, 0 ] }" + "domain": "{ Stmt_for_body3[i0, i1] : 0 <= i0 <= 1535 and 0 <= i1 <= 1535 }", + "name": "Stmt_for_body3", + "schedule": "{ Stmt_for_body3[i0, i1] -> [0, i0, i1, 0, 0, 0, 0 ] }" }, { - "accesses" : [ + "accesses": [ { - "kind" : "read", - "relation" : "{ Stmt_for_body8[i0, i1, i2] -> MemRef_C[i0, i1] }" + "kind": "read", + "relation": "{ Stmt_for_body8[i0, i1, i2] -> MemRef_C[i0, i1] }" }, { - "kind" : "read", - "relation" : "{ Stmt_for_body8[i0, i1, i2] -> MemRef_A[i0, i2] }" + "kind": "read", + "relation": "{ Stmt_for_body8[i0, i1, i2] -> MemRef_A[i0, i2] }" }, { - "kind" : "read", - "relation" : "{ Stmt_for_body8[i0, i1, i2] -> MemRef_B[i2, i1] }" + "kind": "read", + "relation": "{ Stmt_for_body8[i0, i1, i2] -> MemRef_B[i2, i1] }" }, { - "kind" : "write", - "relation" : "{ Stmt_for_body8[i0, i1, i2] -> MemRef_C[i0, i1] }" + "kind": "write", + "relation": "{ Stmt_for_body8[i0, i1, i2] -> MemRef_C[i0, i1] }" } ], - "domain" : "{ Stmt_for_body8[i0, i1, i2] : 0 <= i0 <= 1535 and 0 <= i1 <= 1535 and 0 <= i2 <= 1535 }", - "name" : "Stmt_for_body8", - "schedule" : "{ Stmt_for_body8[i0, i1, i2] -> [1, o0, o1, o2, i0, i2, i1]: o0 <= i0 < o0 + 64 and o1 <= i1 < o1 + 64 and o2 <= i2 < o2 + 64 and o0 % 64 = 0 and o1 % 64 = 0 and o2 % 64 = 0 }" + "domain": "{ Stmt_for_body8[i0, i1, i2] : 0 <= i0 <= 1535 and 0 <= i1 <= 1535 and 0 <= i2 <= 1535 }", + "name": "Stmt_for_body8", + "schedule": "{ Stmt_for_body8[i0, i1, i2] -> [1, o0, o1, o2, i0, i2, i1]: o0 <= i0 < o0 + 64 and o1 <= i1 < o1 + 64 and o2 <= i2 < o2 + 64 and o0 % 64 = 0 and o1 % 64 = 0 and o2 % 64 = 0 }" } ] -} +}
\ No newline at end of file diff --git a/polly/docs/experiments/matmul/main___%for.cond1.preheader---%for.end30.jscop.interchanged+tiled+vector b/polly/docs/experiments/matmul/main___%for.cond1.preheader---%for.end30.jscop.interchanged+tiled+vector index 0f588aa8de8..d7a872a4e35 100644 --- a/polly/docs/experiments/matmul/main___%for.cond1.preheader---%for.end30.jscop.interchanged+tiled+vector +++ b/polly/docs/experiments/matmul/main___%for.cond1.preheader---%for.end30.jscop.interchanged+tiled+vector @@ -1,57 +1,66 @@ { - "arrays" : [ + "arrays": [ { - "name" : "MemRef_C", - "sizes" : [ "1536" ], - "type" : "float" + "name": "MemRef_C", + "sizes": [ + "*", + "1536" + ], + "type": "float" }, { - "name" : "MemRef_A", - "sizes" : [ "1536" ], - "type" : "float" + "name": "MemRef_A", + "sizes": [ + "*", + "1536" + ], + "type": "float" }, { - "name" : "MemRef_B", - "sizes" : [ "1536" ], - "type" : "float" + "name": "MemRef_B", + "sizes": [ + "*", + "1536" + ], + "type": "float" } ], - "context" : "{ : }", - "name" : "%for.cond1.preheader---%for.end30", - "statements" : [ + "context": "{ : }", + "name": "%for.cond1.preheader---%for.end30", + "statements": [ { - "accesses" : [ + "accesses": [ { - "kind" : "write", - "relation" : "{ Stmt_for_body3[i0, i1] -> MemRef_C[i0, i1] }" + "kind": "write", + "relation": "{ Stmt_for_body3[i0, i1] -> MemRef_C[i0, i1] }" } ], - "domain" : "{ Stmt_for_body3[i0, i1] : 0 <= i0 <= 1535 and 0 <= i1 <= 1535 }", - "name" : "Stmt_for_body3", - "schedule" : "{ Stmt_for_body3[i0, i1] -> [0, i0, i1, 0, 0, 0, 0, 0 ] }" + "domain": "{ Stmt_for_body3[i0, i1] : 0 <= i0 <= 1535 and 0 <= i1 <= 1535 }", + "name": "Stmt_for_body3", + "schedule": "{ Stmt_for_body3[i0, i1] -> [0, i0, i1, 0, 0, 0, 0, 0 ] }" }, { - "accesses" : [ + "accesses": [ { - "kind" : "read", - "relation" : "{ Stmt_for_body8[i0, i1, i2] -> MemRef_C[i0, i1] }" + "kind": "read", + "relation": "{ Stmt_for_body8[i0, i1, i2] -> MemRef_C[i0, i1] }" }, { - "kind" : "read", - "relation" : "{ Stmt_for_body8[i0, i1, i2] -> MemRef_A[i0, i2] }" + "kind": "read", + "relation": "{ Stmt_for_body8[i0, i1, i2] -> MemRef_A[i0, i2] }" }, { - "kind" : "read", - "relation" : "{ Stmt_for_body8[i0, i1, i2] -> MemRef_B[i2, i1] }" + "kind": "read", + "relation": "{ Stmt_for_body8[i0, i1, i2] -> MemRef_B[i2, i1] }" }, { - "kind" : "write", - "relation" : "{ Stmt_for_body8[i0, i1, i2] -> MemRef_C[i0, i1] }" + "kind": "write", + "relation": "{ Stmt_for_body8[i0, i1, i2] -> MemRef_C[i0, i1] }" } ], - "domain" : "{ Stmt_for_body8[i0, i1, i2] : 0 <= i0 <= 1535 and 0 <= i1 <= 1535 and 0 <= i2 <= 1535 }", - "name" : "Stmt_for_body8", - "schedule" : "{ Stmt_for_body8[i0, i1, i2] -> [1, o0, o1, o2, i0, i2, oo1, i1]: o0 <= i0 < o0 + 64 and o1 <= oo1 < o1 + 64 and o2 <= i2 < o2 + 64 and oo1 <= i1 < oo1 + 4 and o0 % 64 = 0 and o1 % 64 = 0 and o2 % 64 = 0 and oo1 % 4 = 0 }" + "domain": "{ Stmt_for_body8[i0, i1, i2] : 0 <= i0 <= 1535 and 0 <= i1 <= 1535 and 0 <= i2 <= 1535 }", + "name": "Stmt_for_body8", + "schedule": "{ Stmt_for_body8[i0, i1, i2] -> [1, o0, o1, o2, i0, i2, oo1, i1]: o0 <= i0 < o0 + 64 and o1 <= oo1 < o1 + 64 and o2 <= i2 < o2 + 64 and oo1 <= i1 < oo1 + 4 and o0 % 64 = 0 and o1 % 64 = 0 and o2 % 64 = 0 and oo1 % 4 = 0 }" } ] -} +}
\ No newline at end of file diff --git a/polly/docs/experiments/matmul/matmul.s b/polly/docs/experiments/matmul/matmul.ll index 17147be2447..f8918bd2ca6 100644 --- a/polly/docs/experiments/matmul/matmul.s +++ b/polly/docs/experiments/matmul/matmul.ll @@ -6,15 +6,15 @@ target triple = "x86_64-unknown-linux-gnu" %struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] } %struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 } -@A = common global [1536 x [1536 x float]] zeroinitializer, align 16 -@B = common global [1536 x [1536 x float]] zeroinitializer, align 16 -@stdout = external global %struct._IO_FILE*, align 8 +@A = common dso_local global [1536 x [1536 x float]] zeroinitializer, align 16 +@B = common dso_local global [1536 x [1536 x float]] zeroinitializer, align 16 +@stdout = external dso_local global %struct._IO_FILE*, align 8 @.str = private unnamed_addr constant [5 x i8] c"%lf \00", align 1 -@C = common global [1536 x [1536 x float]] zeroinitializer, align 16 +@C = common dso_local global [1536 x [1536 x float]] zeroinitializer, align 16 @.str.1 = private unnamed_addr constant [2 x i8] c"\0A\00", align 1 -; Function Attrs: nounwind uwtable -define void @init_array() #0 { +; Function Attrs: noinline nounwind uwtable +define dso_local void @init_array() #0 { entry: %i = alloca i32, align 4 %j = alloca i32, align 4 @@ -44,12 +44,12 @@ for.body3: ; preds = %for.cond1 %conv = sitofp i32 %add to double %div = fdiv double %conv, 2.000000e+00 %conv4 = fptrunc double %div to float - %4 = load i32, i32* %j, align 4 + %4 = load i32, i32* %i, align 4 %idxprom = sext i32 %4 to i64 - %5 = load i32, i32* %i, align 4 + %arrayidx = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @A, i64 0, i64 %idxprom + %5 = load i32, i32* %j, align 4 %idxprom5 = sext i32 %5 to i64 - %arrayidx = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @A, i64 0, i64 %idxprom5 - %arrayidx6 = getelementptr inbounds [1536 x float], [1536 x float]* %arrayidx, i64 0, i64 %idxprom + %arrayidx6 = getelementptr inbounds [1536 x float], [1536 x float]* %arrayidx, i64 0, i64 %idxprom5 store float %conv4, float* %arrayidx6, align 4 %6 = load i32, i32* %i, align 4 %7 = load i32, i32* %j, align 4 @@ -59,12 +59,12 @@ for.body3: ; preds = %for.cond1 %conv10 = sitofp i32 %add9 to double %div11 = fdiv double %conv10, 2.000000e+00 %conv12 = fptrunc double %div11 to float - %8 = load i32, i32* %j, align 4 + %8 = load i32, i32* %i, align 4 %idxprom13 = sext i32 %8 to i64 - %9 = load i32, i32* %i, align 4 - %idxprom14 = sext i32 %9 to i64 - %arrayidx15 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @B, i64 0, i64 %idxprom14 - %arrayidx16 = getelementptr inbounds [1536 x float], [1536 x float]* %arrayidx15, i64 0, i64 %idxprom13 + %arrayidx14 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @B, i64 0, i64 %idxprom13 + %9 = load i32, i32* %j, align 4 + %idxprom15 = sext i32 %9 to i64 + %arrayidx16 = getelementptr inbounds [1536 x float], [1536 x float]* %arrayidx14, i64 0, i64 %idxprom15 store float %conv12, float* %arrayidx16, align 4 br label %for.inc @@ -87,8 +87,8 @@ for.end19: ; preds = %for.cond ret void } -; Function Attrs: nounwind uwtable -define void @print_array() #0 { +; Function Attrs: noinline nounwind uwtable +define dso_local void @print_array() #0 { entry: %i = alloca i32, align 4 %j = alloca i32, align 4 @@ -111,12 +111,12 @@ for.cond1: ; preds = %for.inc, %for.body for.body3: ; preds = %for.cond1 %2 = load %struct._IO_FILE*, %struct._IO_FILE** @stdout, align 8 - %3 = load i32, i32* %j, align 4 + %3 = load i32, i32* %i, align 4 %idxprom = sext i32 %3 to i64 - %4 = load i32, i32* %i, align 4 + %arrayidx = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @C, i64 0, i64 %idxprom + %4 = load i32, i32* %j, align 4 %idxprom4 = sext i32 %4 to i64 - %arrayidx = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @C, i64 0, i64 %idxprom4 - %arrayidx5 = getelementptr inbounds [1536 x float], [1536 x float]* %arrayidx, i64 0, i64 %idxprom + %arrayidx5 = getelementptr inbounds [1536 x float], [1536 x float]* %arrayidx, i64 0, i64 %idxprom4 %5 = load float, float* %arrayidx5, align 4 %conv = fpext float %5 to double %call = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %2, i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i32 0, i32 0), double %conv) @@ -154,10 +154,10 @@ for.end12: ; preds = %for.cond ret void } -declare i32 @fprintf(%struct._IO_FILE*, i8*, ...) #1 +declare dso_local i32 @fprintf(%struct._IO_FILE*, i8*, ...) #1 -; Function Attrs: nounwind uwtable -define i32 @main() #0 { +; Function Attrs: noinline nounwind uwtable +define dso_local i32 @main() #0 { entry: %retval = alloca i32, align 4 %i = alloca i32, align 4 @@ -185,12 +185,12 @@ for.cond1: ; preds = %for.inc25, %for.bod br i1 %cmp2, label %for.body3, label %for.end27 for.body3: ; preds = %for.cond1 - %2 = load i32, i32* %j, align 4 + %2 = load i32, i32* %i, align 4 %idxprom = sext i32 %2 to i64 - %3 = load i32, i32* %i, align 4 + %arrayidx = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @C, i64 0, i64 %idxprom + %3 = load i32, i32* %j, align 4 %idxprom4 = sext i32 %3 to i64 - %arrayidx = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @C, i64 0, i64 %idxprom4 - %arrayidx5 = getelementptr inbounds [1536 x float], [1536 x float]* %arrayidx, i64 0, i64 %idxprom + %arrayidx5 = getelementptr inbounds [1536 x float], [1536 x float]* %arrayidx, i64 0, i64 %idxprom4 store float 0.000000e+00, float* %arrayidx5, align 4 store i32 0, i32* %k, align 4 br label %for.cond6 @@ -201,35 +201,35 @@ for.cond6: ; preds = %for.inc, %for.body3 br i1 %cmp7, label %for.body8, label %for.end for.body8: ; preds = %for.cond6 - %5 = load i32, i32* %j, align 4 + %5 = load i32, i32* %i, align 4 %idxprom9 = sext i32 %5 to i64 - %6 = load i32, i32* %i, align 4 - %idxprom10 = sext i32 %6 to i64 - %arrayidx11 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @C, i64 0, i64 %idxprom10 - %arrayidx12 = getelementptr inbounds [1536 x float], [1536 x float]* %arrayidx11, i64 0, i64 %idxprom9 + %arrayidx10 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @C, i64 0, i64 %idxprom9 + %6 = load i32, i32* %j, align 4 + %idxprom11 = sext i32 %6 to i64 + %arrayidx12 = getelementptr inbounds [1536 x float], [1536 x float]* %arrayidx10, i64 0, i64 %idxprom11 %7 = load float, float* %arrayidx12, align 4 - %8 = load i32, i32* %k, align 4 + %8 = load i32, i32* %i, align 4 %idxprom13 = sext i32 %8 to i64 - %9 = load i32, i32* %i, align 4 - %idxprom14 = sext i32 %9 to i64 - %arrayidx15 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @A, i64 0, i64 %idxprom14 - %arrayidx16 = getelementptr inbounds [1536 x float], [1536 x float]* %arrayidx15, i64 0, i64 %idxprom13 + %arrayidx14 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @A, i64 0, i64 %idxprom13 + %9 = load i32, i32* %k, align 4 + %idxprom15 = sext i32 %9 to i64 + %arrayidx16 = getelementptr inbounds [1536 x float], [1536 x float]* %arrayidx14, i64 0, i64 %idxprom15 %10 = load float, float* %arrayidx16, align 4 - %11 = load i32, i32* %j, align 4 + %11 = load i32, i32* %k, align 4 %idxprom17 = sext i32 %11 to i64 - %12 = load i32, i32* %k, align 4 - %idxprom18 = sext i32 %12 to i64 - %arrayidx19 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @B, i64 0, i64 %idxprom18 - %arrayidx20 = getelementptr inbounds [1536 x float], [1536 x float]* %arrayidx19, i64 0, i64 %idxprom17 + %arrayidx18 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @B, i64 0, i64 %idxprom17 + %12 = load i32, i32* %j, align 4 + %idxprom19 = sext i32 %12 to i64 + %arrayidx20 = getelementptr inbounds [1536 x float], [1536 x float]* %arrayidx18, i64 0, i64 %idxprom19 %13 = load float, float* %arrayidx20, align 4 %mul = fmul float %10, %13 %add = fadd float %7, %mul - %14 = load i32, i32* %j, align 4 + %14 = load i32, i32* %i, align 4 %idxprom21 = sext i32 %14 to i64 - %15 = load i32, i32* %i, align 4 - %idxprom22 = sext i32 %15 to i64 - %arrayidx23 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @C, i64 0, i64 %idxprom22 - %arrayidx24 = getelementptr inbounds [1536 x float], [1536 x float]* %arrayidx23, i64 0, i64 %idxprom21 + %arrayidx22 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @C, i64 0, i64 %idxprom21 + %15 = load i32, i32* %j, align 4 + %idxprom23 = sext i32 %15 to i64 + %arrayidx24 = getelementptr inbounds [1536 x float], [1536 x float]* %arrayidx22, i64 0, i64 %idxprom23 store float %add, float* %arrayidx24, align 4 br label %for.inc @@ -261,9 +261,11 @@ for.end30: ; preds = %for.cond ret i32 0 } -attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -!llvm.ident = !{!0} +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} -!0 = !{!"clang version 4.0.0 (http://llvm.org/git/clang.git 081569d9a29c7bc827b2d41f8e62891bbc895e2f) (http://llvm.org/git/llvm.git e117e506536626352e8e47f6c72cd6e2a276622c)"} +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{!"clang version 8.0.0 (trunk 342834) (llvm/trunk 342856)"} diff --git a/polly/docs/experiments/matmul/matmul.normalopt.exe b/polly/docs/experiments/matmul/matmul.normalopt.exe Binary files differdeleted file mode 100755 index cdb9e67af45..00000000000 --- a/polly/docs/experiments/matmul/matmul.normalopt.exe +++ /dev/null diff --git a/polly/docs/experiments/matmul/matmul.normalopt.ll b/polly/docs/experiments/matmul/matmul.normalopt.ll Binary files differindex ba792c29f70..8d8a4aa287a 100644 --- a/polly/docs/experiments/matmul/matmul.normalopt.ll +++ b/polly/docs/experiments/matmul/matmul.normalopt.ll diff --git a/polly/docs/experiments/matmul/matmul.normalopt.s b/polly/docs/experiments/matmul/matmul.normalopt.s index 079af702a14..ec4c7b256b9 100644 --- a/polly/docs/experiments/matmul/matmul.normalopt.s +++ b/polly/docs/experiments/matmul/matmul.normalopt.s @@ -1,263 +1,235 @@ - .file "matmul.normalopt.ll" + .text + .file "matmul.c" .section .rodata.cst8,"aM",@progbits,8 - .align 8 + .p2align 3 # -- Begin function init_array .LCPI0_0: .quad 4602678819172646912 # double 0.5 .text .globl init_array - .align 16, 0x90 + .p2align 4, 0x90 .type init_array,@function init_array: # @init_array .cfi_startproc -# BB#0: # %entry +# %bb.0: # %entry pushq %rbp -.Ltmp2: .cfi_def_cfa_offset 16 -.Ltmp3: .cfi_offset %rbp, -16 movq %rsp, %rbp -.Ltmp4: .cfi_def_cfa_register %rbp + leaq B(%rip), %rax + leaq A(%rip), %rcx xorl %r8d, %r8d - vmovsd .LCPI0_0(%rip), %xmm0 - .align 16, 0x90 + movsd .LCPI0_0(%rip), %xmm0 # xmm0 = mem[0],zero + xorl %r9d, %r9d + .p2align 4, 0x90 .LBB0_1: # %for.cond1.preheader # =>This Loop Header: Depth=1 # Child Loop BB0_2 Depth 2 - xorl %ecx, %ecx - .align 16, 0x90 + movl $1, %edi + xorl %edx, %edx + .p2align 4, 0x90 .LBB0_2: # %for.body3 # Parent Loop BB0_1 Depth=1 # => This Inner Loop Header: Depth=2 - movl %ecx, %edx - imull %r8d, %edx movl %edx, %esi - sarl $31, %esi - shrl $22, %esi - addl %edx, %esi - andl $-1024, %esi # imm = 0xFFFFFFFFFFFFFC00 - negl %esi - movq %r8, %rax - shlq $11, %rax - leal 1(%rdx,%rsi), %edi - leaq (%rax,%rax,2), %rsi - leaq 1(%rcx), %rdx - cmpq $1536, %rdx # imm = 0x600 - vcvtsi2sdl %edi, %xmm0, %xmm1 - vmulsd %xmm0, %xmm1, %xmm1 - vcvtsd2ss %xmm1, %xmm1, %xmm1 - vmovss %xmm1, A(%rsi,%rcx,4) - vmovss %xmm1, B(%rsi,%rcx,4) - movq %rdx, %rcx + andl $1022, %esi # imm = 0x3FE + orl $1, %esi + xorps %xmm1, %xmm1 + cvtsi2sdl %esi, %xmm1 + mulsd %xmm0, %xmm1 + cvtsd2ss %xmm1, %xmm1 + movss %xmm1, -4(%rcx,%rdi,4) + movss %xmm1, -4(%rax,%rdi,4) + leal (%r9,%rdx), %esi + andl $1023, %esi # imm = 0x3FF + addl $1, %esi + xorps %xmm1, %xmm1 + cvtsi2sdl %esi, %xmm1 + mulsd %xmm0, %xmm1 + cvtsd2ss %xmm1, %xmm1 + movss %xmm1, (%rcx,%rdi,4) + movss %xmm1, (%rax,%rdi,4) + addq $2, %rdi + addl %r8d, %edx + cmpq $1537, %rdi # imm = 0x601 jne .LBB0_2 -# BB#3: # %for.inc17 +# %bb.3: # %for.inc17 # in Loop: Header=BB0_1 Depth=1 - incq %r8 - cmpq $1536, %r8 # imm = 0x600 + addq $1, %r9 + addq $6144, %rax # imm = 0x1800 + addq $6144, %rcx # imm = 0x1800 + addl $2, %r8d + cmpq $1536, %r9 # imm = 0x600 jne .LBB0_1 -# BB#4: # %for.end19 +# %bb.4: # %for.end19 popq %rbp - ret -.Ltmp5: - .size init_array, .Ltmp5-init_array + .cfi_def_cfa %rsp, 8 + retq +.Lfunc_end0: + .size init_array, .Lfunc_end0-init_array .cfi_endproc - - .globl print_array - .align 16, 0x90 + # -- End function + .globl print_array # -- Begin function print_array + .p2align 4, 0x90 .type print_array,@function print_array: # @print_array .cfi_startproc -# BB#0: # %entry +# %bb.0: # %entry pushq %rbp -.Ltmp9: .cfi_def_cfa_offset 16 -.Ltmp10: .cfi_offset %rbp, -16 movq %rsp, %rbp -.Ltmp11: .cfi_def_cfa_register %rbp pushq %r15 pushq %r14 + pushq %r13 pushq %r12 pushq %rbx -.Ltmp12: - .cfi_offset %rbx, -48 -.Ltmp13: - .cfi_offset %r12, -40 -.Ltmp14: + pushq %rax + .cfi_offset %rbx, -56 + .cfi_offset %r12, -48 + .cfi_offset %r13, -40 .cfi_offset %r14, -32 -.Ltmp15: .cfi_offset %r15, -24 - xorl %r14d, %r14d - movl $C, %r15d - .align 16, 0x90 + leaq C(%rip), %r13 + xorl %eax, %eax + movl $3435973837, %r12d # imm = 0xCCCCCCCD + leaq .L.str(%rip), %r14 + .p2align 4, 0x90 .LBB1_1: # %for.cond1.preheader # =>This Loop Header: Depth=1 # Child Loop BB1_2 Depth 2 - movq stdout(%rip), %rax - movq %r15, %r12 + movq %rax, -48(%rbp) # 8-byte Spill + movq stdout(%rip), %rsi xorl %ebx, %ebx - .align 16, 0x90 + .p2align 4, 0x90 .LBB1_2: # %for.body3 # Parent Loop BB1_1 Depth=1 # => This Inner Loop Header: Depth=2 - vmovss (%r12), %xmm0 - vcvtss2sd %xmm0, %xmm0, %xmm0 - movq %rax, %rdi - movl $.L.str, %esi + movl %ebx, %eax + imulq %r12, %rax + shrq $38, %rax + leal (%rax,%rax,4), %r15d + shll $4, %r15d + addl $79, %r15d + movss (%r13,%rbx,4), %xmm0 # xmm0 = mem[0],zero,zero,zero + cvtss2sd %xmm0, %xmm0 movb $1, %al + movq %rsi, %rdi + movq %r14, %rsi callq fprintf - movslq %ebx, %rax - imulq $1717986919, %rax, %rcx # imm = 0x66666667 - movq %rcx, %rdx - shrq $63, %rdx - sarq $37, %rcx - addl %edx, %ecx - imull $80, %ecx, %ecx - subl %ecx, %eax - cmpl $79, %eax + cmpl %ebx, %r15d jne .LBB1_4 -# BB#3: # %if.then +# %bb.3: # %if.then # in Loop: Header=BB1_2 Depth=2 movq stdout(%rip), %rsi movl $10, %edi - callq fputc + callq fputc@PLT .LBB1_4: # %for.inc # in Loop: Header=BB1_2 Depth=2 - addq $4, %r12 - incq %rbx - movq stdout(%rip), %rax + addq $1, %rbx + movq stdout(%rip), %rsi cmpq $1536, %rbx # imm = 0x600 jne .LBB1_2 -# BB#5: # %for.end +# %bb.5: # %for.end # in Loop: Header=BB1_1 Depth=1 movl $10, %edi - movq %rax, %rsi - callq fputc - addq $6144, %r15 # imm = 0x1800 - incq %r14 - cmpq $1536, %r14 # imm = 0x600 + callq fputc@PLT + movq -48(%rbp), %rax # 8-byte Reload + addq $1, %rax + addq $6144, %r13 # imm = 0x1800 + cmpq $1536, %rax # imm = 0x600 jne .LBB1_1 -# BB#6: # %for.end12 +# %bb.6: # %for.end12 + addq $8, %rsp popq %rbx popq %r12 + popq %r13 popq %r14 popq %r15 popq %rbp - ret -.Ltmp16: - .size print_array, .Ltmp16-print_array + .cfi_def_cfa %rsp, 8 + retq +.Lfunc_end1: + .size print_array, .Lfunc_end1-print_array .cfi_endproc - - .section .rodata.cst8,"aM",@progbits,8 - .align 8 -.LCPI2_0: - .quad 4602678819172646912 # double 0.5 - .text - .globl main - .align 16, 0x90 + # -- End function + .globl main # -- Begin function main + .p2align 4, 0x90 .type main,@function main: # @main .cfi_startproc -# BB#0: # %entry +# %bb.0: # %entry pushq %rbp -.Ltmp19: .cfi_def_cfa_offset 16 -.Ltmp20: .cfi_offset %rbp, -16 movq %rsp, %rbp -.Ltmp21: .cfi_def_cfa_register %rbp - xorl %r8d, %r8d - vmovsd .LCPI2_0(%rip), %xmm0 - .align 16, 0x90 -.LBB2_1: # %for.cond1.preheader.i + callq init_array + leaq A(%rip), %rax + xorl %r10d, %r10d + leaq B(%rip), %r8 + leaq C(%rip), %r9 + .p2align 4, 0x90 +.LBB2_1: # %for.cond1.preheader # =>This Loop Header: Depth=1 # Child Loop BB2_2 Depth 2 - xorl %ecx, %ecx - .align 16, 0x90 -.LBB2_2: # %for.body3.i + # Child Loop BB2_3 Depth 3 + movq %r8, %rsi + xorl %edx, %edx + .p2align 4, 0x90 +.LBB2_2: # %for.body3 # Parent Loop BB2_1 Depth=1 - # => This Inner Loop Header: Depth=2 - movl %ecx, %edx - imull %r8d, %edx - movl %edx, %esi - sarl $31, %esi - shrl $22, %esi - addl %edx, %esi - andl $-1024, %esi # imm = 0xFFFFFFFFFFFFFC00 - negl %esi - movq %r8, %rax - shlq $11, %rax - leal 1(%rdx,%rsi), %edi - leaq (%rax,%rax,2), %rsi - leaq 1(%rcx), %rdx + # => This Loop Header: Depth=2 + # Child Loop BB2_3 Depth 3 + leaq (%r10,%r10,2), %rcx + shlq $11, %rcx + addq %r9, %rcx + leaq (%rcx,%rdx,4), %r11 + movl $0, (%rcx,%rdx,4) + xorps %xmm0, %xmm0 + movl $2, %ecx + movq %rsi, %rdi + .p2align 4, 0x90 +.LBB2_3: # %for.body8 + # Parent Loop BB2_1 Depth=1 + # Parent Loop BB2_2 Depth=2 + # => This Inner Loop Header: Depth=3 + movss -8(%rax,%rcx,4), %xmm1 # xmm1 = mem[0],zero,zero,zero + mulss (%rdi), %xmm1 + movss -4(%rax,%rcx,4), %xmm2 # xmm2 = mem[0],zero,zero,zero + addss %xmm0, %xmm1 + mulss 6144(%rdi), %xmm2 + addss %xmm1, %xmm2 + movss (%rax,%rcx,4), %xmm0 # xmm0 = mem[0],zero,zero,zero + mulss 12288(%rdi), %xmm0 + addss %xmm2, %xmm0 + addq $3, %rcx + addq $18432, %rdi # imm = 0x4800 + cmpq $1538, %rcx # imm = 0x602 + jne .LBB2_3 +# %bb.4: # %for.inc25 + # in Loop: Header=BB2_2 Depth=2 + movss %xmm0, (%r11) + addq $1, %rdx + addq $4, %rsi cmpq $1536, %rdx # imm = 0x600 - vcvtsi2sdl %edi, %xmm0, %xmm1 - vmulsd %xmm0, %xmm1, %xmm1 - vcvtsd2ss %xmm1, %xmm1, %xmm1 - vmovss %xmm1, A(%rsi,%rcx,4) - vmovss %xmm1, B(%rsi,%rcx,4) - movq %rdx, %rcx jne .LBB2_2 -# BB#3: # %for.inc17.i +# %bb.5: # %for.inc28 # in Loop: Header=BB2_1 Depth=1 - incq %r8 - cmpq $1536, %r8 # imm = 0x600 - jne .LBB2_1 -# BB#4: - xorl %r8d, %r8d - movl $A, %r9d - .align 16, 0x90 -.LBB2_5: # %for.cond1.preheader - # =>This Loop Header: Depth=1 - # Child Loop BB2_6 Depth 2 - # Child Loop BB2_7 Depth 3 - leaq (%r8,%r8,2), %rdx - shlq $11, %rdx - leaq C(%rdx), %rsi - xorl %edi, %edi - .align 16, 0x90 -.LBB2_6: # %for.body3 - # Parent Loop BB2_5 Depth=1 - # => This Loop Header: Depth=2 - # Child Loop BB2_7 Depth 3 - movl $0, (%rsi) - vxorps %xmm0, %xmm0, %xmm0 - movq $-9437184, %rax # imm = 0xFFFFFFFFFF700000 - movq %r9, %rcx - .align 16, 0x90 -.LBB2_7: # %for.body8 - # Parent Loop BB2_5 Depth=1 - # Parent Loop BB2_6 Depth=2 - # => This Inner Loop Header: Depth=3 - vmovss (%rcx), %xmm1 - vmulss B+9437184(%rax,%rdi,4), %xmm1, %xmm1 - vaddss %xmm1, %xmm0, %xmm0 - addq $4, %rcx + addq $1, %r10 addq $6144, %rax # imm = 0x1800 - jne .LBB2_7 -# BB#8: # %for.inc25 - # in Loop: Header=BB2_6 Depth=2 - vmovss %xmm0, (%rsi) - leaq C+4(%rdx,%rdi,4), %rsi - incq %rdi - cmpq $1536, %rdi # imm = 0x600 - jne .LBB2_6 -# BB#9: # %for.inc28 - # in Loop: Header=BB2_5 Depth=1 - addq $6144, %r9 # imm = 0x1800 - incq %r8 - cmpq $1536, %r8 # imm = 0x600 - jne .LBB2_5 -# BB#10: # %for.end30 + cmpq $1536, %r10 # imm = 0x600 + jne .LBB2_1 +# %bb.6: # %for.end30 xorl %eax, %eax popq %rbp - ret -.Ltmp22: - .size main, .Ltmp22-main + .cfi_def_cfa %rsp, 8 + retq +.Lfunc_end2: + .size main, .Lfunc_end2-main .cfi_endproc - + # -- End function .type A,@object # @A .comm A,9437184,16 .type B,@object # @B @@ -265,10 +237,11 @@ main: # @main .type .L.str,@object # @.str .section .rodata.str1.1,"aMS",@progbits,1 .L.str: - .asciz "%lf " + .asciz "%lf " .size .L.str, 5 .type C,@object # @C .comm C,9437184,16 + .ident "clang version 8.0.0 (trunk 342834) (llvm/trunk 342856)" .section ".note.GNU-stack","",@progbits diff --git a/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector+openmp.exe b/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector+openmp.exe Binary files differdeleted file mode 100755 index feb24366d73..00000000000 --- a/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector+openmp.exe +++ /dev/null diff --git a/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector+openmp.ll b/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector+openmp.ll Binary files differindex 593794ef380..169f9405bc7 100644 --- a/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector+openmp.ll +++ b/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector+openmp.ll diff --git a/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector+openmp.s b/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector+openmp.s index ca87de11704..4e89fd6ea4d 100644 --- a/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector+openmp.s +++ b/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector+openmp.s @@ -1,743 +1,852 @@ - .file "matmul.polly.interchanged+tiled+vector+openmp.ll" - .section .rodata.cst8,"aM",@progbits,8 - .align 8 -.LCPI0_0: - .quad 4602678819172646912 # double 0.5 .text - .globl init_array - .align 16, 0x90 + .file "matmul.c" + .globl init_array # -- Begin function init_array + .p2align 4, 0x90 .type init_array,@function init_array: # @init_array .cfi_startproc -# BB#0: # %entry +# %bb.0: # %entry pushq %rbp -.Ltmp3: .cfi_def_cfa_offset 16 -.Ltmp4: .cfi_offset %rbp, -16 movq %rsp, %rbp -.Ltmp5: .cfi_def_cfa_register %rbp - pushq %r15 - pushq %r14 pushq %rbx - subq $24, %rsp -.Ltmp6: - .cfi_offset %rbx, -40 -.Ltmp7: - .cfi_offset %r14, -32 -.Ltmp8: - .cfi_offset %r15, -24 - leaq -32(%rbp), %rsi - movl $init_array.omp_subfn, %edi + pushq %rax + .cfi_offset %rbx, -24 + leaq init_array_polly_subfn(%rip), %rdi + leaq -16(%rbp), %rbx xorl %edx, %edx xorl %ecx, %ecx movl $1536, %r8d # imm = 0x600 movl $1, %r9d - callq GOMP_parallel_loop_runtime_start - leaq -40(%rbp), %rdi - leaq -48(%rbp), %rsi - callq GOMP_loop_runtime_next - testb %al, %al - je .LBB0_4 -# BB#1: - leaq -40(%rbp), %r14 - leaq -48(%rbp), %r15 - vmovsd .LCPI0_0(%rip), %xmm1 - .align 16, 0x90 -.LBB0_2: # %omp.loadIVBounds.i - # =>This Loop Header: Depth=1 - # Child Loop BB0_8 Depth 2 - # Child Loop BB0_5 Depth 3 - movq -48(%rbp), %r8 - leaq -1(%r8), %rcx - movq -40(%rbp), %rax - cmpq %rcx, %rax - jg .LBB0_3 -# BB#7: # %polly.loop_preheader4.preheader.i - # in Loop: Header=BB0_2 Depth=1 - addq $-2, %r8 - .align 16, 0x90 -.LBB0_8: # %polly.loop_preheader4.i - # Parent Loop BB0_2 Depth=1 - # => This Loop Header: Depth=2 - # Child Loop BB0_5 Depth 3 - xorl %edx, %edx - .align 16, 0x90 -.LBB0_5: # %polly.loop_header3.i - # Parent Loop BB0_2 Depth=1 - # Parent Loop BB0_8 Depth=2 - # => This Inner Loop Header: Depth=3 - movl %edx, %esi - imull %eax, %esi - movl %esi, %edi - sarl $31, %edi - shrl $22, %edi - addl %esi, %edi - andl $-1024, %edi # imm = 0xFFFFFFFFFFFFFC00 - negl %edi - movq %rax, %rcx - shlq $11, %rcx - leal 1(%rsi,%rdi), %ebx - leaq (%rcx,%rcx,2), %rdi - leaq 1(%rdx), %rsi - cmpq $1536, %rsi # imm = 0x600 - vcvtsi2sdl %ebx, %xmm0, %xmm0 - vmulsd %xmm1, %xmm0, %xmm0 - vcvtsd2ss %xmm0, %xmm0, %xmm0 - vmovss %xmm0, A(%rdi,%rdx,4) - vmovss %xmm0, B(%rdi,%rdx,4) - movq %rsi, %rdx - jne .LBB0_5 -# BB#6: # %polly.loop_exit5.i - # in Loop: Header=BB0_8 Depth=2 - cmpq %r8, %rax - leaq 1(%rax), %rax - jle .LBB0_8 -.LBB0_3: # %omp.checkNext.backedge.i - # in Loop: Header=BB0_2 Depth=1 - movq %r14, %rdi - movq %r15, %rsi - callq GOMP_loop_runtime_next - vmovsd .LCPI0_0(%rip), %xmm1 - testb %al, %al - jne .LBB0_2 -.LBB0_4: # %init_array.omp_subfn.exit - callq GOMP_loop_end_nowait - callq GOMP_parallel_end - addq $24, %rsp + movq %rbx, %rsi + callq GOMP_parallel_loop_runtime_start@PLT + movq %rbx, %rdi + callq init_array_polly_subfn + callq GOMP_parallel_end@PLT + addq $8, %rsp popq %rbx - popq %r14 - popq %r15 popq %rbp - ret -.Ltmp9: - .size init_array, .Ltmp9-init_array + .cfi_def_cfa %rsp, 8 + retq +.Lfunc_end0: + .size init_array, .Lfunc_end0-init_array .cfi_endproc - - .globl print_array - .align 16, 0x90 + # -- End function + .globl print_array # -- Begin function print_array + .p2align 4, 0x90 .type print_array,@function print_array: # @print_array .cfi_startproc -# BB#0: # %entry +# %bb.0: # %entry pushq %rbp -.Ltmp13: .cfi_def_cfa_offset 16 -.Ltmp14: .cfi_offset %rbp, -16 movq %rsp, %rbp -.Ltmp15: .cfi_def_cfa_register %rbp pushq %r15 pushq %r14 + pushq %r13 pushq %r12 pushq %rbx -.Ltmp16: - .cfi_offset %rbx, -48 -.Ltmp17: - .cfi_offset %r12, -40 -.Ltmp18: + pushq %rax + .cfi_offset %rbx, -56 + .cfi_offset %r12, -48 + .cfi_offset %r13, -40 .cfi_offset %r14, -32 -.Ltmp19: .cfi_offset %r15, -24 - xorl %r14d, %r14d - movl $C, %r15d - .align 16, 0x90 + leaq C(%rip), %r13 + xorl %eax, %eax + movl $3435973837, %r12d # imm = 0xCCCCCCCD + leaq .L.str(%rip), %r14 + .p2align 4, 0x90 .LBB1_1: # %for.cond1.preheader # =>This Loop Header: Depth=1 # Child Loop BB1_2 Depth 2 - movq stdout(%rip), %rax - movq %r15, %r12 + movq %rax, -48(%rbp) # 8-byte Spill + movq stdout(%rip), %rsi xorl %ebx, %ebx - .align 16, 0x90 + .p2align 4, 0x90 .LBB1_2: # %for.body3 # Parent Loop BB1_1 Depth=1 # => This Inner Loop Header: Depth=2 - vmovss (%r12), %xmm0 - vcvtss2sd %xmm0, %xmm0, %xmm0 - movq %rax, %rdi - movl $.L.str, %esi + movl %ebx, %eax + imulq %r12, %rax + shrq $38, %rax + leal (%rax,%rax,4), %r15d + shll $4, %r15d + addl $79, %r15d + movss (%r13,%rbx,4), %xmm0 # xmm0 = mem[0],zero,zero,zero + cvtss2sd %xmm0, %xmm0 movb $1, %al + movq %rsi, %rdi + movq %r14, %rsi callq fprintf - movslq %ebx, %rax - imulq $1717986919, %rax, %rcx # imm = 0x66666667 - movq %rcx, %rdx - shrq $63, %rdx - sarq $37, %rcx - addl %edx, %ecx - imull $80, %ecx, %ecx - subl %ecx, %eax - cmpl $79, %eax + cmpl %ebx, %r15d jne .LBB1_4 -# BB#3: # %if.then +# %bb.3: # %if.then # in Loop: Header=BB1_2 Depth=2 movq stdout(%rip), %rsi movl $10, %edi - callq fputc + callq fputc@PLT .LBB1_4: # %for.inc # in Loop: Header=BB1_2 Depth=2 - addq $4, %r12 - incq %rbx - movq stdout(%rip), %rax + addq $1, %rbx + movq stdout(%rip), %rsi cmpq $1536, %rbx # imm = 0x600 jne .LBB1_2 -# BB#5: # %for.end +# %bb.5: # %for.end # in Loop: Header=BB1_1 Depth=1 movl $10, %edi - movq %rax, %rsi - callq fputc - addq $6144, %r15 # imm = 0x1800 - incq %r14 - cmpq $1536, %r14 # imm = 0x600 + callq fputc@PLT + movq -48(%rbp), %rax # 8-byte Reload + addq $1, %rax + addq $6144, %r13 # imm = 0x1800 + cmpq $1536, %rax # imm = 0x600 jne .LBB1_1 -# BB#6: # %for.end12 +# %bb.6: # %for.end12 + addq $8, %rsp popq %rbx popq %r12 + popq %r13 popq %r14 popq %r15 popq %rbp - ret -.Ltmp20: - .size print_array, .Ltmp20-print_array + .cfi_def_cfa %rsp, 8 + retq +.Lfunc_end1: + .size print_array, .Lfunc_end1-print_array .cfi_endproc - - .globl main - .align 16, 0x90 + # -- End function + .globl main # -- Begin function main + .p2align 4, 0x90 .type main,@function main: # @main .cfi_startproc -# BB#0: # %entry +# %bb.0: # %entry pushq %rbp -.Ltmp24: .cfi_def_cfa_offset 16 -.Ltmp25: .cfi_offset %rbp, -16 movq %rsp, %rbp -.Ltmp26: .cfi_def_cfa_register %rbp - pushq %r15 - pushq %r14 - pushq %r13 - pushq %r12 pushq %rbx - subq $24, %rsp -.Ltmp27: - .cfi_offset %rbx, -56 -.Ltmp28: - .cfi_offset %r12, -48 -.Ltmp29: - .cfi_offset %r13, -40 -.Ltmp30: - .cfi_offset %r14, -32 -.Ltmp31: - .cfi_offset %r15, -24 + pushq %rax + .cfi_offset %rbx, -24 callq init_array - leaq -48(%rbp), %rsi - movl $main.omp_subfn, %edi + leaq main_polly_subfn(%rip), %rdi + leaq -16(%rbp), %rbx xorl %edx, %edx xorl %ecx, %ecx movl $1536, %r8d # imm = 0x600 movl $1, %r9d - callq GOMP_parallel_loop_runtime_start - leaq -56(%rbp), %rdi - leaq -64(%rbp), %rsi - callq GOMP_loop_runtime_next - testb %al, %al - je .LBB2_4 -# BB#1: - leaq -56(%rbp), %r14 - leaq -64(%rbp), %r15 - .align 16, 0x90 -.LBB2_2: # %omp.loadIVBounds.i - # =>This Loop Header: Depth=1 - # Child Loop BB2_6 Depth 2 - movq -64(%rbp), %r12 - leaq -1(%r12), %rcx - movq -56(%rbp), %rax - cmpq %rcx, %rax - jg .LBB2_3 -# BB#5: # %polly.loop_preheader4.preheader.i - # in Loop: Header=BB2_2 Depth=1 - addq $-2, %r12 - leaq (%rax,%rax,2), %rcx - leaq -1(%rax), %r13 - shlq $11, %rcx - leaq C(%rcx), %rbx - .align 16, 0x90 -.LBB2_6: # %polly.loop_preheader4.i - # Parent Loop BB2_2 Depth=1 - # => This Inner Loop Header: Depth=2 - movq %rbx, %rdi - xorl %esi, %esi - movl $6144, %edx # imm = 0x1800 - callq memset - addq $6144, %rbx # imm = 0x1800 - incq %r13 - cmpq %r12, %r13 - jle .LBB2_6 -.LBB2_3: # %omp.checkNext.backedge.i - # in Loop: Header=BB2_2 Depth=1 - movq %r14, %rdi - movq %r15, %rsi - callq GOMP_loop_runtime_next - testb %al, %al - jne .LBB2_2 -.LBB2_4: # %main.omp_subfn.exit - callq GOMP_loop_end_nowait - callq GOMP_parallel_end - leaq -48(%rbp), %rbx - movl $main.omp_subfn1, %edi movq %rbx, %rsi + callq GOMP_parallel_loop_runtime_start@PLT + movq %rbx, %rdi + callq main_polly_subfn + callq GOMP_parallel_end@PLT + leaq main_polly_subfn_1(%rip), %rdi xorl %edx, %edx xorl %ecx, %ecx movl $1536, %r8d # imm = 0x600 movl $64, %r9d - callq GOMP_parallel_loop_runtime_start + movq %rbx, %rsi + callq GOMP_parallel_loop_runtime_start@PLT movq %rbx, %rdi - callq main.omp_subfn1 - callq GOMP_parallel_end + callq main_polly_subfn_1 + callq GOMP_parallel_end@PLT xorl %eax, %eax - addq $24, %rsp + addq $8, %rsp popq %rbx - popq %r12 - popq %r13 - popq %r14 - popq %r15 popq %rbp - ret -.Ltmp32: - .size main, .Ltmp32-main + .cfi_def_cfa %rsp, 8 + retq +.Lfunc_end2: + .size main, .Lfunc_end2-main .cfi_endproc - + # -- End function .section .rodata.cst8,"aM",@progbits,8 - .align 8 + .p2align 3 # -- Begin function init_array_polly_subfn .LCPI3_0: .quad 4602678819172646912 # double 0.5 .text - .align 16, 0x90 - .type init_array.omp_subfn,@function -init_array.omp_subfn: # @init_array.omp_subfn + .p2align 4, 0x90 + .type init_array_polly_subfn,@function +init_array_polly_subfn: # @init_array_polly_subfn .cfi_startproc -# BB#0: # %omp.setup - pushq %rbp -.Ltmp36: - .cfi_def_cfa_offset 16 -.Ltmp37: - .cfi_offset %rbp, -16 - movq %rsp, %rbp -.Ltmp38: - .cfi_def_cfa_register %rbp +# %bb.0: # %polly.par.setup pushq %r15 + .cfi_def_cfa_offset 16 pushq %r14 + .cfi_def_cfa_offset 24 + pushq %r13 + .cfi_def_cfa_offset 32 + pushq %r12 + .cfi_def_cfa_offset 40 pushq %rbx - subq $24, %rsp -.Ltmp39: - .cfi_offset %rbx, -40 -.Ltmp40: - .cfi_offset %r14, -32 -.Ltmp41: - .cfi_offset %r15, -24 - leaq -32(%rbp), %rdi - leaq -40(%rbp), %rsi - callq GOMP_loop_runtime_next + .cfi_def_cfa_offset 48 + subq $16, %rsp + .cfi_def_cfa_offset 64 + .cfi_offset %rbx, -48 + .cfi_offset %r12, -40 + .cfi_offset %r13, -32 + .cfi_offset %r14, -24 + .cfi_offset %r15, -16 + leaq 8(%rsp), %rdi + movq %rsp, %rsi + callq GOMP_loop_runtime_next@PLT testb %al, %al - je .LBB3_4 -# BB#1: - leaq -32(%rbp), %r14 - leaq -40(%rbp), %r15 - vmovsd .LCPI3_0(%rip), %xmm1 - .align 16, 0x90 -.LBB3_2: # %omp.loadIVBounds + je .LBB3_2 +# %bb.1: + leaq B(%rip), %r15 + leaq A(%rip), %r12 + movsd .LCPI3_0(%rip), %xmm1 # xmm1 = mem[0],zero + leaq 8(%rsp), %r14 + movq %rsp, %r13 + .p2align 4, 0x90 +.LBB3_4: # %polly.par.loadIVBounds # =>This Loop Header: Depth=1 - # Child Loop BB3_8 Depth 2 - # Child Loop BB3_5 Depth 3 - movq -40(%rbp), %r8 - leaq -1(%r8), %rcx - movq -32(%rbp), %rax - cmpq %rcx, %rax - jg .LBB3_3 -# BB#7: # %polly.loop_preheader4.preheader - # in Loop: Header=BB3_2 Depth=1 - addq $-2, %r8 - .align 16, 0x90 -.LBB3_8: # %polly.loop_preheader4 - # Parent Loop BB3_2 Depth=1 + # Child Loop BB3_5 Depth 2 + # Child Loop BB3_6 Depth 3 + movq 8(%rsp), %rax + movq (%rsp), %r8 + decq %r8 + movq %rax, %rdx + shlq $11, %rdx + leaq (%rdx,%rdx,2), %rdx + leaq (%r15,%rdx), %rsi + addq %r12, %rdx + .p2align 4, 0x90 +.LBB3_5: # %polly.loop_header + # Parent Loop BB3_4 Depth=1 # => This Loop Header: Depth=2 - # Child Loop BB3_5 Depth 3 - xorl %edx, %edx - .align 16, 0x90 -.LBB3_5: # %polly.loop_header3 - # Parent Loop BB3_2 Depth=1 - # Parent Loop BB3_8 Depth=2 + # Child Loop BB3_6 Depth 3 + movq $-6144, %rdi # imm = 0xE800 + xorl %ecx, %ecx + .p2align 4, 0x90 +.LBB3_6: # %polly.loop_header2 + # Parent Loop BB3_4 Depth=1 + # Parent Loop BB3_5 Depth=2 # => This Inner Loop Header: Depth=3 - movl %edx, %esi - imull %eax, %esi - movl %esi, %edi - sarl $31, %edi - shrl $22, %edi - addl %esi, %edi - andl $-1024, %edi # imm = 0xFFFFFFFFFFFFFC00 - negl %edi - movq %rax, %rcx - shlq $11, %rcx - leal 1(%rsi,%rdi), %ebx - leaq (%rcx,%rcx,2), %rdi - leaq 1(%rdx), %rsi - cmpq $1536, %rsi # imm = 0x600 - vcvtsi2sdl %ebx, %xmm0, %xmm0 - vmulsd %xmm1, %xmm0, %xmm0 - vcvtsd2ss %xmm0, %xmm0, %xmm0 - vmovss %xmm0, A(%rdi,%rdx,4) - vmovss %xmm0, B(%rdi,%rdx,4) - movq %rsi, %rdx - jne .LBB3_5 -# BB#6: # %polly.loop_exit5 - # in Loop: Header=BB3_8 Depth=2 + movl %ecx, %ebx + andl $1023, %ebx # imm = 0x3FF + incl %ebx + xorps %xmm0, %xmm0 + cvtsi2sdl %ebx, %xmm0 + mulsd %xmm1, %xmm0 + cvtsd2ss %xmm0, %xmm0 + movss %xmm0, 6144(%rdx,%rdi) + movss %xmm0, 6144(%rsi,%rdi) + addl %eax, %ecx + addq $4, %rdi + jne .LBB3_6 +# %bb.7: # %polly.loop_exit4 + # in Loop: Header=BB3_5 Depth=2 + addq $6144, %rsi # imm = 0x1800 + addq $6144, %rdx # imm = 0x1800 cmpq %r8, %rax leaq 1(%rax), %rax - jle .LBB3_8 -.LBB3_3: # %omp.checkNext.backedge - # in Loop: Header=BB3_2 Depth=1 + jl .LBB3_5 +# %bb.3: # %polly.par.checkNext.loopexit + # in Loop: Header=BB3_4 Depth=1 movq %r14, %rdi - movq %r15, %rsi - callq GOMP_loop_runtime_next - vmovsd .LCPI3_0(%rip), %xmm1 + movq %r13, %rsi + callq GOMP_loop_runtime_next@PLT + movsd .LCPI3_0(%rip), %xmm1 # xmm1 = mem[0],zero testb %al, %al - jne .LBB3_2 -.LBB3_4: # %omp.exit - callq GOMP_loop_end_nowait - addq $24, %rsp + jne .LBB3_4 +.LBB3_2: # %polly.par.exit + callq GOMP_loop_end_nowait@PLT + addq $16, %rsp + .cfi_def_cfa_offset 48 popq %rbx + .cfi_def_cfa_offset 40 + popq %r12 + .cfi_def_cfa_offset 32 + popq %r13 + .cfi_def_cfa_offset 24 popq %r14 + .cfi_def_cfa_offset 16 popq %r15 - popq %rbp - ret -.Ltmp42: - .size init_array.omp_subfn, .Ltmp42-init_array.omp_subfn + .cfi_def_cfa_offset 8 + retq +.Lfunc_end3: + .size init_array_polly_subfn, .Lfunc_end3-init_array_polly_subfn .cfi_endproc - - .align 16, 0x90 - .type main.omp_subfn,@function -main.omp_subfn: # @main.omp_subfn + # -- End function + .p2align 4, 0x90 # -- Begin function main_polly_subfn + .type main_polly_subfn,@function +main_polly_subfn: # @main_polly_subfn .cfi_startproc -# BB#0: # %omp.setup - pushq %rbp -.Ltmp46: - .cfi_def_cfa_offset 16 -.Ltmp47: - .cfi_offset %rbp, -16 - movq %rsp, %rbp -.Ltmp48: - .cfi_def_cfa_register %rbp +# %bb.0: # %polly.par.setup pushq %r15 + .cfi_def_cfa_offset 16 pushq %r14 - pushq %r13 - pushq %r12 + .cfi_def_cfa_offset 24 pushq %rbx - subq $24, %rsp -.Ltmp49: - .cfi_offset %rbx, -56 -.Ltmp50: - .cfi_offset %r12, -48 -.Ltmp51: - .cfi_offset %r13, -40 -.Ltmp52: - .cfi_offset %r14, -32 -.Ltmp53: - .cfi_offset %r15, -24 - leaq -48(%rbp), %rdi - leaq -56(%rbp), %rsi - callq GOMP_loop_runtime_next + .cfi_def_cfa_offset 32 + subq $16, %rsp + .cfi_def_cfa_offset 48 + .cfi_offset %rbx, -32 + .cfi_offset %r14, -24 + .cfi_offset %r15, -16 + leaq 8(%rsp), %rdi + movq %rsp, %rsi + callq GOMP_loop_runtime_next@PLT testb %al, %al - je .LBB4_4 -# BB#1: - leaq -48(%rbp), %r14 - leaq -56(%rbp), %r15 - .align 16, 0x90 -.LBB4_2: # %omp.loadIVBounds - # =>This Loop Header: Depth=1 - # Child Loop BB4_6 Depth 2 - movq -56(%rbp), %r12 - leaq -1(%r12), %rcx - movq -48(%rbp), %rax + je .LBB4_3 +# %bb.1: + leaq C(%rip), %r15 + leaq 8(%rsp), %r14 + movq %rsp, %rbx + .p2align 4, 0x90 +.LBB4_2: # %polly.par.loadIVBounds + # =>This Inner Loop Header: Depth=1 + movq 8(%rsp), %rax + movq (%rsp), %rcx + decq %rcx + leaq (%rax,%rax,2), %rdi + shlq $11, %rdi + addq %r15, %rdi cmpq %rcx, %rax - jg .LBB4_3 -# BB#5: # %polly.loop_preheader4.preheader - # in Loop: Header=BB4_2 Depth=1 - addq $-2, %r12 - leaq (%rax,%rax,2), %rcx - leaq -1(%rax), %r13 + cmovgeq %rax, %rcx + incq %rcx + subq %rax, %rcx shlq $11, %rcx - leaq C(%rcx), %rbx - .align 16, 0x90 -.LBB4_6: # %polly.loop_preheader4 - # Parent Loop BB4_2 Depth=1 - # => This Inner Loop Header: Depth=2 - movq %rbx, %rdi + leaq (%rcx,%rcx,2), %rdx xorl %esi, %esi - movl $6144, %edx # imm = 0x1800 - callq memset - addq $6144, %rbx # imm = 0x1800 - incq %r13 - cmpq %r12, %r13 - jle .LBB4_6 -.LBB4_3: # %omp.checkNext.backedge - # in Loop: Header=BB4_2 Depth=1 + callq memset@PLT movq %r14, %rdi - movq %r15, %rsi - callq GOMP_loop_runtime_next + movq %rbx, %rsi + callq GOMP_loop_runtime_next@PLT testb %al, %al jne .LBB4_2 -.LBB4_4: # %omp.exit - callq GOMP_loop_end_nowait - addq $24, %rsp +.LBB4_3: # %polly.par.exit + callq GOMP_loop_end_nowait@PLT + addq $16, %rsp + .cfi_def_cfa_offset 32 popq %rbx - popq %r12 - popq %r13 + .cfi_def_cfa_offset 24 popq %r14 + .cfi_def_cfa_offset 16 popq %r15 - popq %rbp - ret -.Ltmp54: - .size main.omp_subfn, .Ltmp54-main.omp_subfn + .cfi_def_cfa_offset 8 + retq +.Lfunc_end4: + .size main_polly_subfn, .Lfunc_end4-main_polly_subfn .cfi_endproc - - .align 16, 0x90 - .type main.omp_subfn1,@function -main.omp_subfn1: # @main.omp_subfn1 + # -- End function + .p2align 4, 0x90 # -- Begin function main_polly_subfn_1 + .type main_polly_subfn_1,@function +main_polly_subfn_1: # @main_polly_subfn_1 .cfi_startproc -# BB#0: # %omp.setup +# %bb.0: # %polly.par.setup pushq %rbp -.Ltmp58: .cfi_def_cfa_offset 16 -.Ltmp59: - .cfi_offset %rbp, -16 - movq %rsp, %rbp -.Ltmp60: - .cfi_def_cfa_register %rbp pushq %r15 + .cfi_def_cfa_offset 24 pushq %r14 + .cfi_def_cfa_offset 32 pushq %r13 + .cfi_def_cfa_offset 40 pushq %r12 + .cfi_def_cfa_offset 48 pushq %rbx - subq $72, %rsp -.Ltmp61: + .cfi_def_cfa_offset 56 + subq $296, %rsp # imm = 0x128 + .cfi_def_cfa_offset 352 .cfi_offset %rbx, -56 -.Ltmp62: .cfi_offset %r12, -48 -.Ltmp63: .cfi_offset %r13, -40 -.Ltmp64: .cfi_offset %r14, -32 -.Ltmp65: .cfi_offset %r15, -24 + .cfi_offset %rbp, -16 jmp .LBB5_1 - .align 16, 0x90 -.LBB5_2: # %omp.loadIVBounds - # in Loop: Header=BB5_1 Depth=1 - movq -56(%rbp), %rax - movq %rax, -112(%rbp) # 8-byte Spill - leaq -1(%rax), %rax - movq -48(%rbp), %rcx - cmpq %rax, %rcx - jg .LBB5_1 -# BB#3: # %polly.loop_preheader4.preheader + .p2align 4, 0x90 +.LBB5_2: # %polly.par.loadIVBounds # in Loop: Header=BB5_1 Depth=1 - leaq -1(%rcx), %rax - movq %rax, -88(%rbp) # 8-byte Spill - addq $-65, -112(%rbp) # 8-byte Folded Spill - movq %rcx, %rax - shlq $9, %rax - leaq (%rax,%rax,2), %rax - leaq C+16(,%rax,4), %rax - movq %rax, -104(%rbp) # 8-byte Spill - .align 16, 0x90 -.LBB5_7: # %polly.loop_preheader4 + movq 40(%rsp), %rdx + movq 32(%rsp), %rax + decq %rax + movq %rax, 136(%rsp) # 8-byte Spill + leaq (%rdx,%rdx,2), %rcx + shlq $11, %rcx + leaq A(%rip), %rax + addq %rax, %rcx + movq %rcx, 24(%rsp) # 8-byte Spill + .p2align 4, 0x90 +.LBB5_3: # %polly.loop_header # Parent Loop BB5_1 Depth=1 # => This Loop Header: Depth=2 - # Child Loop BB5_8 Depth 3 - # Child Loop BB5_9 Depth 4 - # Child Loop BB5_12 Depth 5 - # Child Loop BB5_17 Depth 6 - # Child Loop BB5_18 Depth 7 - # Child Loop BB5_14 Depth 5 - movq %rcx, -72(%rbp) # 8-byte Spill - leaq 62(%rcx), %rdi - xorl %edx, %edx - .align 16, 0x90 -.LBB5_8: # %polly.loop_preheader11 + # Child Loop BB5_4 Depth 3 + # Child Loop BB5_5 Depth 4 + # Child Loop BB5_6 Depth 5 + # Child Loop BB5_7 Depth 6 + leaq 63(%rdx), %rsi + leaq B+192(%rip), %r14 + xorl %ecx, %ecx + xorl %eax, %eax + movq %rdx, 168(%rsp) # 8-byte Spill + .p2align 4, 0x90 +.LBB5_4: # %polly.loop_header2 # Parent Loop BB5_1 Depth=1 - # Parent Loop BB5_7 Depth=2 + # Parent Loop BB5_3 Depth=2 # => This Loop Header: Depth=3 - # Child Loop BB5_9 Depth 4 - # Child Loop BB5_12 Depth 5 - # Child Loop BB5_17 Depth 6 - # Child Loop BB5_18 Depth 7 - # Child Loop BB5_14 Depth 5 - movq %rdx, -96(%rbp) # 8-byte Spill - leaq -4(%rdx), %rcx - movq %rdx, %rax - decq %rax - cmovsq %rcx, %rax - movq %rax, %r14 - sarq $63, %r14 - shrq $62, %r14 - addq %rax, %r14 - andq $-4, %r14 - movq %rdx, %rax - orq $63, %rax - leaq -4(%rax), %rdx - movq -104(%rbp), %rcx # 8-byte Reload - leaq (%rcx,%r14,4), %rcx - movq %rcx, -80(%rbp) # 8-byte Spill - leaq B+16(,%r14,4), %rbx - leaq 4(%r14), %rcx - movq %rcx, -64(%rbp) # 8-byte Spill - xorl %r11d, %r11d - .align 16, 0x90 -.LBB5_9: # %polly.loop_header10 + # Child Loop BB5_5 Depth 4 + # Child Loop BB5_6 Depth 5 + # Child Loop BB5_7 Depth 6 + movq %rax, 144(%rsp) # 8-byte Spill + movq %rcx, 152(%rsp) # 8-byte Spill + shlq $6, %rcx + leaq 16(%rcx), %rdi + leaq 32(%rcx), %rbp + leaq 48(%rcx), %r15 + movq 24(%rsp), %r9 # 8-byte Reload + movq %r14, 160(%rsp) # 8-byte Spill + xorl %eax, %eax + .p2align 4, 0x90 +.LBB5_5: # %polly.loop_header8 # Parent Loop BB5_1 Depth=1 - # Parent Loop BB5_7 Depth=2 - # Parent Loop BB5_8 Depth=3 + # Parent Loop BB5_3 Depth=2 + # Parent Loop BB5_4 Depth=3 # => This Loop Header: Depth=4 - # Child Loop BB5_12 Depth 5 - # Child Loop BB5_17 Depth 6 - # Child Loop BB5_18 Depth 7 - # Child Loop BB5_14 Depth 5 - movabsq $9223372036854775744, %rcx # imm = 0x7FFFFFFFFFFFFFC0 - cmpq %rcx, -72(%rbp) # 8-byte Folded Reload - jg .LBB5_15 -# BB#10: # %polly.loop_header17.preheader - # in Loop: Header=BB5_9 Depth=4 - movq %r11, %r15 - orq $63, %r15 - cmpq %r15, %r11 - movq -88(%rbp), %rcx # 8-byte Reload - jle .LBB5_11 - .align 16, 0x90 -.LBB5_14: # %polly.loop_exit28.us - # Parent Loop BB5_1 Depth=1 - # Parent Loop BB5_7 Depth=2 - # Parent Loop BB5_8 Depth=3 - # Parent Loop BB5_9 Depth=4 - # => This Inner Loop Header: Depth=5 - incq %rcx - cmpq %rdi, %rcx - jle .LBB5_14 - jmp .LBB5_15 - .align 16, 0x90 -.LBB5_11: # in Loop: Header=BB5_9 Depth=4 - decq %r15 - movq -80(%rbp), %r13 # 8-byte Reload - movq -72(%rbp), %rcx # 8-byte Reload - .align 16, 0x90 -.LBB5_12: # %polly.loop_header26.preheader + # Child Loop BB5_6 Depth 5 + # Child Loop BB5_7 Depth 6 + movq %rax, 176(%rsp) # 8-byte Spill + movq %r9, 184(%rsp) # 8-byte Spill + movq %rdx, %rax + .p2align 4, 0x90 +.LBB5_6: # %polly.loop_header14 # Parent Loop BB5_1 Depth=1 - # Parent Loop BB5_7 Depth=2 - # Parent Loop BB5_8 Depth=3 - # Parent Loop BB5_9 Depth=4 + # Parent Loop BB5_3 Depth=2 + # Parent Loop BB5_4 Depth=3 + # Parent Loop BB5_5 Depth=4 # => This Loop Header: Depth=5 - # Child Loop BB5_17 Depth 6 - # Child Loop BB5_18 Depth 7 - cmpq %rax, -64(%rbp) # 8-byte Folded Reload - movq %rbx, %r12 - movq %r11, %r8 - jg .LBB5_13 - .align 16, 0x90 -.LBB5_17: # %polly.loop_header35.preheader + # Child Loop BB5_7 Depth 6 + leaq (%rax,%rax,2), %rbx + shlq $11, %rbx + leaq C(%rip), %rdx + addq %rdx, %rbx + leaq (%rbx,%rcx,4), %r8 + leaq (%rbx,%rdi,4), %rdx + leaq (%rbx,%rbp,4), %r13 + leaq (%rbx,%r15,4), %r10 + movups (%rbx,%rcx,4), %xmm8 + movups 16(%rbx,%rcx,4), %xmm0 + movaps %xmm0, 96(%rsp) # 16-byte Spill + movups 32(%rbx,%rcx,4), %xmm6 + movups 48(%rbx,%rcx,4), %xmm1 + movups (%rbx,%rdi,4), %xmm15 + movups 16(%rbx,%rdi,4), %xmm0 + movaps %xmm0, (%rsp) # 16-byte Spill + movups 32(%rbx,%rdi,4), %xmm0 + movaps %xmm0, 48(%rsp) # 16-byte Spill + movups 48(%rbx,%rdi,4), %xmm0 + movaps %xmm0, 64(%rsp) # 16-byte Spill + movups (%rbx,%rbp,4), %xmm11 + movups 16(%rbx,%rbp,4), %xmm0 + movaps %xmm0, 112(%rsp) # 16-byte Spill + movups 32(%rbx,%rbp,4), %xmm12 + movups 48(%rbx,%rbp,4), %xmm0 + movaps %xmm0, 80(%rsp) # 16-byte Spill + movups (%rbx,%r15,4), %xmm9 + movups 16(%rbx,%r15,4), %xmm13 + movups 32(%rbx,%r15,4), %xmm2 + movups 48(%rbx,%r15,4), %xmm3 + movq $-256, %r12 + movq %r14, %r11 + .p2align 4, 0x90 +.LBB5_7: # %vector.ph # Parent Loop BB5_1 Depth=1 - # Parent Loop BB5_7 Depth=2 - # Parent Loop BB5_8 Depth=3 - # Parent Loop BB5_9 Depth=4 - # Parent Loop BB5_12 Depth=5 - # => This Loop Header: Depth=6 - # Child Loop BB5_18 Depth 7 - leaq (%rcx,%rcx,2), %rsi - shlq $11, %rsi - vbroadcastss A(%rsi,%r8,4), %xmm0 - movq %r13, %r9 - movq %r12, %r10 - movq %r14, %rsi -.LBB5_18: # %polly.loop_header35 - # Parent Loop BB5_1 Depth=1 - # Parent Loop BB5_7 Depth=2 - # Parent Loop BB5_8 Depth=3 - # Parent Loop BB5_9 Depth=4 - # Parent Loop BB5_12 Depth=5 - # Parent Loop BB5_17 Depth=6 - # => This Inner Loop Header: Depth=7 - vmulps (%r10), %xmm0, %xmm1 - vaddps (%r9), %xmm1, %xmm1 - vmovaps %xmm1, (%r9) - addq $16, %r9 - addq $16, %r10 - addq $4, %rsi - cmpq %rdx, %rsi - jle .LBB5_18 -# BB#16: # %polly.loop_exit37 - # in Loop: Header=BB5_17 Depth=6 - addq $6144, %r12 # imm = 0x1800 - cmpq %r15, %r8 - leaq 1(%r8), %r8 - jle .LBB5_17 - .align 16, 0x90 -.LBB5_13: # %polly.loop_exit28 - # in Loop: Header=BB5_12 Depth=5 - addq $6144, %r13 # imm = 0x1800 - cmpq %rdi, %rcx - leaq 1(%rcx), %rcx - jle .LBB5_12 - .align 16, 0x90 -.LBB5_15: # %polly.loop_exit19 - # in Loop: Header=BB5_9 Depth=4 - addq $393216, %rbx # imm = 0x60000 - cmpq $1472, %r11 # imm = 0x5C0 - leaq 64(%r11), %r11 - jl .LBB5_9 -# BB#5: # %polly.loop_exit12 - # in Loop: Header=BB5_8 Depth=3 - movq -96(%rbp), %rdx # 8-byte Reload - cmpq $1472, %rdx # imm = 0x5C0 - leaq 64(%rdx), %rdx - jl .LBB5_8 -# BB#6: # %polly.loop_exit5 - # in Loop: Header=BB5_7 Depth=2 - addq $64, -88(%rbp) # 8-byte Folded Spill - addq $393216, -104(%rbp) # 8-byte Folded Spill + # Parent Loop BB5_3 Depth=2 + # Parent Loop BB5_4 Depth=3 + # Parent Loop BB5_5 Depth=4 + # Parent Loop BB5_6 Depth=5 + # => This Inner Loop Header: Depth=6 + movaps %xmm12, 208(%rsp) # 16-byte Spill + movaps %xmm2, 224(%rsp) # 16-byte Spill + movaps %xmm3, 240(%rsp) # 16-byte Spill + movaps %xmm8, %xmm10 + movaps 96(%rsp), %xmm7 # 16-byte Reload + unpcklps %xmm7, %xmm10 # xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] + movaps %xmm1, %xmm4 + shufps $0, %xmm6, %xmm4 # xmm4 = xmm4[0,0],xmm6[0,0] + shufps $36, %xmm4, %xmm10 # xmm10 = xmm10[0,1],xmm4[2,0] + movaps %xmm7, %xmm5 + shufps $17, %xmm8, %xmm5 # xmm5 = xmm5[1,0],xmm8[1,0] + movaps %xmm6, %xmm4 + unpcklps %xmm1, %xmm4 # xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] + shufps $226, %xmm4, %xmm5 # xmm5 = xmm5[2,0],xmm4[2,3] + movaps %xmm8, %xmm12 + unpckhps %xmm7, %xmm12 # xmm12 = xmm12[2],xmm7[2],xmm12[3],xmm7[3] + movaps %xmm1, %xmm4 + shufps $34, %xmm6, %xmm4 # xmm4 = xmm4[2,0],xmm6[2,0] + shufps $36, %xmm4, %xmm12 # xmm12 = xmm12[0,1],xmm4[2,0] + shufps $51, %xmm8, %xmm7 # xmm7 = xmm7[3,0],xmm8[3,0] + unpckhps %xmm1, %xmm6 # xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] + shufps $226, %xmm6, %xmm7 # xmm7 = xmm7[2,0],xmm6[2,3] + movaps -160(%r11), %xmm0 + movaps -144(%r11), %xmm1 + movaps %xmm1, %xmm6 + shufps $0, %xmm0, %xmm6 # xmm6 = xmm6[0,0],xmm0[0,0] + movaps -192(%r11), %xmm3 + movaps -176(%r11), %xmm4 + movaps %xmm3, %xmm8 + unpcklps %xmm4, %xmm8 # xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] + shufps $36, %xmm6, %xmm8 # xmm8 = xmm8[0,1],xmm6[2,0] + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm2 # xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] + movaps %xmm4, %xmm6 + shufps $17, %xmm3, %xmm6 # xmm6 = xmm6[1,0],xmm3[1,0] + shufps $226, %xmm2, %xmm6 # xmm6 = xmm6[2,0],xmm2[2,3] + movaps %xmm1, %xmm2 + shufps $34, %xmm0, %xmm2 # xmm2 = xmm2[2,0],xmm0[2,0] + movaps %xmm3, %xmm14 + unpckhps %xmm4, %xmm14 # xmm14 = xmm14[2],xmm4[2],xmm14[3],xmm4[3] + shufps $36, %xmm2, %xmm14 # xmm14 = xmm14[0,1],xmm2[2,0] + unpckhps %xmm1, %xmm0 # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] + shufps $51, %xmm3, %xmm4 # xmm4 = xmm4[3,0],xmm3[3,0] + shufps $226, %xmm0, %xmm4 # xmm4 = xmm4[2,0],xmm0[2,3] + movss 256(%r9,%r12), %xmm0 # xmm0 = mem[0],zero,zero,zero + shufps $0, %xmm0, %xmm0 # xmm0 = xmm0[0,0,0,0] + mulps %xmm0, %xmm8 + addps %xmm10, %xmm8 + mulps %xmm0, %xmm6 + addps %xmm5, %xmm6 + mulps %xmm0, %xmm14 + addps %xmm12, %xmm14 + mulps %xmm0, %xmm4 + movaps %xmm0, %xmm5 + addps %xmm7, %xmm4 + movaps %xmm14, %xmm0 + unpckhps %xmm4, %xmm0 # xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] + movaps %xmm6, %xmm1 + shufps $51, %xmm8, %xmm1 # xmm1 = xmm1[3,0],xmm8[3,0] + shufps $226, %xmm0, %xmm1 # xmm1 = xmm1[2,0],xmm0[2,3] + movaps %xmm1, 272(%rsp) # 16-byte Spill + movaps %xmm4, %xmm0 + shufps $34, %xmm14, %xmm0 # xmm0 = xmm0[2,0],xmm14[2,0] + movaps %xmm8, %xmm1 + unpckhps %xmm6, %xmm1 # xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] + shufps $36, %xmm0, %xmm1 # xmm1 = xmm1[0,1],xmm0[2,0] + movaps %xmm1, 256(%rsp) # 16-byte Spill + movaps %xmm14, %xmm0 + unpcklps %xmm4, %xmm0 # xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] + movaps %xmm6, %xmm1 + shufps $17, %xmm8, %xmm1 # xmm1 = xmm1[1,0],xmm8[1,0] + shufps $226, %xmm0, %xmm1 # xmm1 = xmm1[2,0],xmm0[2,3] + movaps %xmm1, 96(%rsp) # 16-byte Spill + shufps $0, %xmm14, %xmm4 # xmm4 = xmm4[0,0],xmm14[0,0] + unpcklps %xmm6, %xmm8 # xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] + shufps $36, %xmm4, %xmm8 # xmm8 = xmm8[0,1],xmm4[2,0] + movaps %xmm15, %xmm14 + movaps (%rsp), %xmm4 # 16-byte Reload + unpcklps %xmm4, %xmm14 # xmm14 = xmm14[0],xmm4[0],xmm14[1],xmm4[1] + movaps 64(%rsp), %xmm1 # 16-byte Reload + movaps %xmm1, %xmm0 + movaps 48(%rsp), %xmm3 # 16-byte Reload + shufps $0, %xmm3, %xmm0 # xmm0 = xmm0[0,0],xmm3[0,0] + shufps $36, %xmm0, %xmm14 # xmm14 = xmm14[0,1],xmm0[2,0] + movaps %xmm4, %xmm12 + shufps $17, %xmm15, %xmm12 # xmm12 = xmm12[1,0],xmm15[1,0] + movaps %xmm3, %xmm2 + unpcklps %xmm1, %xmm2 # xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] + shufps $226, %xmm2, %xmm12 # xmm12 = xmm12[2,0],xmm2[2,3] + movaps %xmm15, %xmm7 + unpckhps %xmm4, %xmm7 # xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] + movaps %xmm1, %xmm2 + shufps $34, %xmm3, %xmm2 # xmm2 = xmm2[2,0],xmm3[2,0] + shufps $36, %xmm2, %xmm7 # xmm7 = xmm7[0,1],xmm2[2,0] + shufps $51, %xmm15, %xmm4 # xmm4 = xmm4[3,0],xmm15[3,0] + unpckhps %xmm1, %xmm3 # xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] + shufps $226, %xmm3, %xmm4 # xmm4 = xmm4[2,0],xmm3[2,3] + movaps %xmm4, (%rsp) # 16-byte Spill + movaps -96(%r11), %xmm2 + movaps -80(%r11), %xmm1 + movaps %xmm1, %xmm4 + shufps $0, %xmm2, %xmm4 # xmm4 = xmm4[0,0],xmm2[0,0] + movaps -112(%r11), %xmm10 + movaps -128(%r11), %xmm0 + movaps %xmm0, %xmm15 + unpcklps %xmm10, %xmm15 # xmm15 = xmm15[0],xmm10[0],xmm15[1],xmm10[1] + shufps $36, %xmm4, %xmm15 # xmm15 = xmm15[0,1],xmm4[2,0] + movaps %xmm2, %xmm4 + unpcklps %xmm1, %xmm4 # xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] + movaps %xmm10, %xmm6 + shufps $17, %xmm0, %xmm6 # xmm6 = xmm6[1,0],xmm0[1,0] + shufps $226, %xmm4, %xmm6 # xmm6 = xmm6[2,0],xmm4[2,3] + movaps %xmm1, %xmm3 + shufps $34, %xmm2, %xmm3 # xmm3 = xmm3[2,0],xmm2[2,0] + movaps %xmm0, %xmm4 + unpckhps %xmm10, %xmm4 # xmm4 = xmm4[2],xmm10[2],xmm4[3],xmm10[3] + shufps $36, %xmm3, %xmm4 # xmm4 = xmm4[0,1],xmm3[2,0] + unpckhps %xmm1, %xmm2 # xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] + shufps $51, %xmm0, %xmm10 # xmm10 = xmm10[3,0],xmm0[3,0] + shufps $226, %xmm2, %xmm10 # xmm10 = xmm10[2,0],xmm2[2,3] + movaps %xmm5, 192(%rsp) # 16-byte Spill + mulps %xmm5, %xmm15 + addps %xmm14, %xmm15 + mulps %xmm5, %xmm6 + addps %xmm12, %xmm6 + mulps %xmm5, %xmm4 + addps %xmm7, %xmm4 + mulps %xmm5, %xmm10 + addps (%rsp), %xmm10 # 16-byte Folded Reload + movaps %xmm4, %xmm0 + unpckhps %xmm10, %xmm0 # xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3] + movaps %xmm6, %xmm1 + shufps $51, %xmm15, %xmm1 # xmm1 = xmm1[3,0],xmm15[3,0] + shufps $226, %xmm0, %xmm1 # xmm1 = xmm1[2,0],xmm0[2,3] + movaps %xmm1, 64(%rsp) # 16-byte Spill + movaps %xmm10, %xmm0 + shufps $34, %xmm4, %xmm0 # xmm0 = xmm0[2,0],xmm4[2,0] + movaps %xmm15, %xmm1 + unpckhps %xmm6, %xmm1 # xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] + shufps $36, %xmm0, %xmm1 # xmm1 = xmm1[0,1],xmm0[2,0] + movaps %xmm1, 48(%rsp) # 16-byte Spill + movaps %xmm4, %xmm0 + unpcklps %xmm10, %xmm0 # xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] + movaps %xmm6, %xmm1 + shufps $17, %xmm15, %xmm1 # xmm1 = xmm1[1,0],xmm15[1,0] + shufps $226, %xmm0, %xmm1 # xmm1 = xmm1[2,0],xmm0[2,3] + movaps %xmm1, (%rsp) # 16-byte Spill + shufps $0, %xmm4, %xmm10 # xmm10 = xmm10[0,0],xmm4[0,0] + unpcklps %xmm6, %xmm15 # xmm15 = xmm15[0],xmm6[0],xmm15[1],xmm6[1] + shufps $36, %xmm10, %xmm15 # xmm15 = xmm15[0,1],xmm10[2,0] + movaps %xmm11, %xmm10 + movaps 112(%rsp), %xmm14 # 16-byte Reload + unpcklps %xmm14, %xmm10 # xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1] + movaps 80(%rsp), %xmm2 # 16-byte Reload + movaps %xmm2, %xmm0 + movaps 208(%rsp), %xmm3 # 16-byte Reload + shufps $0, %xmm3, %xmm0 # xmm0 = xmm0[0,0],xmm3[0,0] + shufps $36, %xmm0, %xmm10 # xmm10 = xmm10[0,1],xmm0[2,0] + movaps %xmm14, %xmm12 + shufps $17, %xmm11, %xmm12 # xmm12 = xmm12[1,0],xmm11[1,0] + movaps %xmm3, %xmm0 + unpcklps %xmm2, %xmm0 # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] + shufps $226, %xmm0, %xmm12 # xmm12 = xmm12[2,0],xmm0[2,3] + movaps %xmm11, %xmm0 + unpckhps %xmm14, %xmm0 # xmm0 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] + movaps %xmm2, %xmm1 + shufps $34, %xmm3, %xmm1 # xmm1 = xmm1[2,0],xmm3[2,0] + shufps $36, %xmm1, %xmm0 # xmm0 = xmm0[0,1],xmm1[2,0] + shufps $51, %xmm11, %xmm14 # xmm14 = xmm14[3,0],xmm11[3,0] + unpckhps %xmm2, %xmm3 # xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] + shufps $226, %xmm3, %xmm14 # xmm14 = xmm14[2,0],xmm3[2,3] + movaps -32(%r11), %xmm1 + movaps -16(%r11), %xmm2 + movaps %xmm2, %xmm3 + shufps $0, %xmm1, %xmm3 # xmm3 = xmm3[0,0],xmm1[0,0] + movaps -48(%r11), %xmm4 + movaps -64(%r11), %xmm5 + movaps %xmm5, %xmm11 + unpcklps %xmm4, %xmm11 # xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1] + shufps $36, %xmm3, %xmm11 # xmm11 = xmm11[0,1],xmm3[2,0] + movaps %xmm1, %xmm3 + unpcklps %xmm2, %xmm3 # xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] + movaps %xmm4, %xmm7 + shufps $17, %xmm5, %xmm7 # xmm7 = xmm7[1,0],xmm5[1,0] + shufps $226, %xmm3, %xmm7 # xmm7 = xmm7[2,0],xmm3[2,3] + movaps %xmm2, %xmm3 + shufps $34, %xmm1, %xmm3 # xmm3 = xmm3[2,0],xmm1[2,0] + movaps %xmm5, %xmm6 + unpckhps %xmm4, %xmm6 # xmm6 = xmm6[2],xmm4[2],xmm6[3],xmm4[3] + shufps $36, %xmm3, %xmm6 # xmm6 = xmm6[0,1],xmm3[2,0] + unpckhps %xmm2, %xmm1 # xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] + shufps $51, %xmm5, %xmm4 # xmm4 = xmm4[3,0],xmm5[3,0] + shufps $226, %xmm1, %xmm4 # xmm4 = xmm4[2,0],xmm1[2,3] + movaps 192(%rsp), %xmm1 # 16-byte Reload + mulps %xmm1, %xmm11 + addps %xmm10, %xmm11 + mulps %xmm1, %xmm7 + addps %xmm12, %xmm7 + mulps %xmm1, %xmm6 + addps %xmm0, %xmm6 + mulps %xmm1, %xmm4 + addps %xmm14, %xmm4 + movaps %xmm6, %xmm0 + unpckhps %xmm4, %xmm0 # xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] + movaps %xmm7, %xmm1 + shufps $51, %xmm11, %xmm1 # xmm1 = xmm1[3,0],xmm11[3,0] + shufps $226, %xmm0, %xmm1 # xmm1 = xmm1[2,0],xmm0[2,3] + movaps %xmm1, 80(%rsp) # 16-byte Spill + movaps %xmm4, %xmm0 + shufps $34, %xmm6, %xmm0 # xmm0 = xmm0[2,0],xmm6[2,0] + movaps %xmm11, %xmm12 + unpckhps %xmm7, %xmm12 # xmm12 = xmm12[2],xmm7[2],xmm12[3],xmm7[3] + shufps $36, %xmm0, %xmm12 # xmm12 = xmm12[0,1],xmm0[2,0] + movaps %xmm6, %xmm0 + unpcklps %xmm4, %xmm0 # xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] + movaps %xmm7, %xmm1 + shufps $17, %xmm11, %xmm1 # xmm1 = xmm1[1,0],xmm11[1,0] + shufps $226, %xmm0, %xmm1 # xmm1 = xmm1[2,0],xmm0[2,3] + movaps %xmm1, 112(%rsp) # 16-byte Spill + shufps $0, %xmm6, %xmm4 # xmm4 = xmm4[0,0],xmm6[0,0] + unpcklps %xmm7, %xmm11 # xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1] + shufps $36, %xmm4, %xmm11 # xmm11 = xmm11[0,1],xmm4[2,0] + movaps %xmm9, %xmm10 + unpcklps %xmm13, %xmm10 # xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1] + movaps 240(%rsp), %xmm2 # 16-byte Reload + movaps %xmm2, %xmm0 + movaps 224(%rsp), %xmm3 # 16-byte Reload + shufps $0, %xmm3, %xmm0 # xmm0 = xmm0[0,0],xmm3[0,0] + shufps $36, %xmm0, %xmm10 # xmm10 = xmm10[0,1],xmm0[2,0] + movaps %xmm13, %xmm14 + shufps $17, %xmm9, %xmm14 # xmm14 = xmm14[1,0],xmm9[1,0] + movaps %xmm3, %xmm0 + unpcklps %xmm2, %xmm0 # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] + shufps $226, %xmm0, %xmm14 # xmm14 = xmm14[2,0],xmm0[2,3] + movaps %xmm9, %xmm0 + unpckhps %xmm13, %xmm0 # xmm0 = xmm0[2],xmm13[2],xmm0[3],xmm13[3] + movaps %xmm2, %xmm1 + shufps $34, %xmm3, %xmm1 # xmm1 = xmm1[2,0],xmm3[2,0] + shufps $36, %xmm1, %xmm0 # xmm0 = xmm0[0,1],xmm1[2,0] + shufps $51, %xmm9, %xmm13 # xmm13 = xmm13[3,0],xmm9[3,0] + unpckhps %xmm2, %xmm3 # xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] + shufps $226, %xmm3, %xmm13 # xmm13 = xmm13[2,0],xmm3[2,3] + movaps 32(%r11), %xmm1 + movaps 48(%r11), %xmm2 + movaps %xmm2, %xmm3 + shufps $0, %xmm1, %xmm3 # xmm3 = xmm3[0,0],xmm1[0,0] + movaps 16(%r11), %xmm4 + movaps (%r11), %xmm5 + movaps %xmm5, %xmm9 + unpcklps %xmm4, %xmm9 # xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1] + shufps $36, %xmm3, %xmm9 # xmm9 = xmm9[0,1],xmm3[2,0] + movaps %xmm1, %xmm3 + unpcklps %xmm2, %xmm3 # xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] + movaps %xmm4, %xmm7 + shufps $17, %xmm5, %xmm7 # xmm7 = xmm7[1,0],xmm5[1,0] + shufps $226, %xmm3, %xmm7 # xmm7 = xmm7[2,0],xmm3[2,3] + movaps %xmm2, %xmm3 + shufps $34, %xmm1, %xmm3 # xmm3 = xmm3[2,0],xmm1[2,0] + movaps %xmm5, %xmm6 + unpckhps %xmm4, %xmm6 # xmm6 = xmm6[2],xmm4[2],xmm6[3],xmm4[3] + shufps $36, %xmm3, %xmm6 # xmm6 = xmm6[0,1],xmm3[2,0] + unpckhps %xmm2, %xmm1 # xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] + shufps $51, %xmm5, %xmm4 # xmm4 = xmm4[3,0],xmm5[3,0] + shufps $226, %xmm1, %xmm4 # xmm4 = xmm4[2,0],xmm1[2,3] + movaps 192(%rsp), %xmm1 # 16-byte Reload + mulps %xmm1, %xmm9 + addps %xmm10, %xmm9 + mulps %xmm1, %xmm7 + addps %xmm14, %xmm7 + mulps %xmm1, %xmm6 + addps %xmm0, %xmm6 + mulps %xmm1, %xmm4 + addps %xmm13, %xmm4 + movaps %xmm6, %xmm0 + unpckhps %xmm4, %xmm0 # xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] + movaps %xmm7, %xmm3 + shufps $51, %xmm9, %xmm3 # xmm3 = xmm3[3,0],xmm9[3,0] + shufps $226, %xmm0, %xmm3 # xmm3 = xmm3[2,0],xmm0[2,3] + movaps %xmm4, %xmm0 + shufps $34, %xmm6, %xmm0 # xmm0 = xmm0[2,0],xmm6[2,0] + movaps %xmm9, %xmm2 + unpckhps %xmm7, %xmm2 # xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] + shufps $36, %xmm0, %xmm2 # xmm2 = xmm2[0,1],xmm0[2,0] + movaps %xmm6, %xmm0 + unpcklps %xmm4, %xmm0 # xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] + movaps %xmm7, %xmm13 + shufps $17, %xmm9, %xmm13 # xmm13 = xmm13[1,0],xmm9[1,0] + shufps $226, %xmm0, %xmm13 # xmm13 = xmm13[2,0],xmm0[2,3] + shufps $0, %xmm6, %xmm4 # xmm4 = xmm4[0,0],xmm6[0,0] + movaps 256(%rsp), %xmm6 # 16-byte Reload + movaps 272(%rsp), %xmm1 # 16-byte Reload + unpcklps %xmm7, %xmm9 # xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] + shufps $36, %xmm4, %xmm9 # xmm9 = xmm9[0,1],xmm4[2,0] + addq $6144, %r11 # imm = 0x1800 + addq $4, %r12 + jne .LBB5_7 +# %bb.8: # %polly.loop_exit22 + # in Loop: Header=BB5_6 Depth=5 + movups %xmm8, (%r8) + movaps 96(%rsp), %xmm0 # 16-byte Reload + movups %xmm0, 16(%r8) + movups %xmm6, 32(%r8) + movups %xmm1, 48(%r8) + movaps 64(%rsp), %xmm0 # 16-byte Reload + movups %xmm0, 48(%rdx) + movaps 48(%rsp), %xmm0 # 16-byte Reload + movups %xmm0, 32(%rdx) + movaps (%rsp), %xmm0 # 16-byte Reload + movups %xmm0, 16(%rdx) + movups %xmm15, (%rdx) + movaps 80(%rsp), %xmm0 # 16-byte Reload + movups %xmm0, 48(%r13) + movaps 112(%rsp), %xmm0 # 16-byte Reload + movups %xmm0, 16(%r13) + movups %xmm11, (%r13) + movups %xmm12, 32(%r13) + movups %xmm3, 48(%r10) + movups %xmm13, 16(%r10) + movups %xmm9, (%r10) + movups %xmm2, 32(%r10) + addq $6144, %r9 # imm = 0x1800 + cmpq %rsi, %rax + leaq 1(%rax), %rax + jl .LBB5_6 +# %bb.9: # %polly.loop_exit16 + # in Loop: Header=BB5_5 Depth=4 + movq 176(%rsp), %rax # 8-byte Reload + addq $64, %rax + addq $393216, %r14 # imm = 0x60000 + movq 184(%rsp), %r9 # 8-byte Reload + addq $256, %r9 # imm = 0x100 + cmpq $1536, %rax # imm = 0x600 + movq 168(%rsp), %rdx # 8-byte Reload + jb .LBB5_5 +# %bb.10: # %polly.loop_exit10 + # in Loop: Header=BB5_4 Depth=3 + movq 144(%rsp), %rax # 8-byte Reload + addq $64, %rax + movq 152(%rsp), %rcx # 8-byte Reload + incq %rcx + movq 160(%rsp), %r14 # 8-byte Reload + addq $256, %r14 # imm = 0x100 + cmpq $1536, %rax # imm = 0x600 + jb .LBB5_4 +# %bb.11: # %polly.loop_exit4 + # in Loop: Header=BB5_3 Depth=2 + addq $64, %rdx + addq $393216, 24(%rsp) # 8-byte Folded Spill # imm = 0x60000 - movq -72(%rbp), %rcx # 8-byte Reload - cmpq -112(%rbp), %rcx # 8-byte Folded Reload - leaq 64(%rcx), %rcx - jle .LBB5_7 -.LBB5_1: # %omp.setup + cmpq 136(%rsp), %rdx # 8-byte Folded Reload + jle .LBB5_3 +.LBB5_1: # %polly.par.setup # =>This Loop Header: Depth=1 - # Child Loop BB5_7 Depth 2 - # Child Loop BB5_8 Depth 3 - # Child Loop BB5_9 Depth 4 - # Child Loop BB5_12 Depth 5 - # Child Loop BB5_17 Depth 6 - # Child Loop BB5_18 Depth 7 - # Child Loop BB5_14 Depth 5 - leaq -48(%rbp), %rdi - leaq -56(%rbp), %rsi - callq GOMP_loop_runtime_next + # Child Loop BB5_3 Depth 2 + # Child Loop BB5_4 Depth 3 + # Child Loop BB5_5 Depth 4 + # Child Loop BB5_6 Depth 5 + # Child Loop BB5_7 Depth 6 + leaq 40(%rsp), %rdi + leaq 32(%rsp), %rsi + callq GOMP_loop_runtime_next@PLT testb %al, %al jne .LBB5_2 -# BB#4: # %omp.exit - callq GOMP_loop_end_nowait - addq $72, %rsp +# %bb.12: # %polly.par.exit + callq GOMP_loop_end_nowait@PLT + addq $296, %rsp # imm = 0x128 + .cfi_def_cfa_offset 56 popq %rbx + .cfi_def_cfa_offset 48 popq %r12 + .cfi_def_cfa_offset 40 popq %r13 + .cfi_def_cfa_offset 32 popq %r14 + .cfi_def_cfa_offset 24 popq %r15 + .cfi_def_cfa_offset 16 popq %rbp - ret -.Ltmp66: - .size main.omp_subfn1, .Ltmp66-main.omp_subfn1 + .cfi_def_cfa_offset 8 + retq +.Lfunc_end5: + .size main_polly_subfn_1, .Lfunc_end5-main_polly_subfn_1 .cfi_endproc - + # -- End function .type A,@object # @A .comm A,9437184,16 .type B,@object # @B @@ -745,10 +854,11 @@ main.omp_subfn1: # @main.omp_subfn1 .type .L.str,@object # @.str .section .rodata.str1.1,"aMS",@progbits,1 .L.str: - .asciz "%lf " + .asciz "%lf " .size .L.str, 5 .type C,@object # @C .comm C,9437184,16 + .ident "clang version 8.0.0 (trunk 342834) (llvm/trunk 342856)" .section ".note.GNU-stack","",@progbits diff --git a/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector.exe b/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector.exe Binary files differdeleted file mode 100755 index 36b788ea9ac..00000000000 --- a/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector.exe +++ /dev/null diff --git a/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector.ll b/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector.ll Binary files differindex 9d1f9ad098f..9294896bcb3 100644 --- a/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector.ll +++ b/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector.ll diff --git a/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector.s b/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector.s index 485d230bc39..194fdb144c9 100644 --- a/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector.s +++ b/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector.s @@ -1,385 +1,645 @@ - .file "matmul.polly.interchanged+tiled+vector.ll" + .text + .file "matmul.c" .section .rodata.cst8,"aM",@progbits,8 - .align 8 + .p2align 3 # -- Begin function init_array .LCPI0_0: .quad 4602678819172646912 # double 0.5 .text .globl init_array - .align 16, 0x90 + .p2align 4, 0x90 .type init_array,@function init_array: # @init_array .cfi_startproc -# BB#0: # %entry +# %bb.0: # %entry pushq %rbp -.Ltmp2: .cfi_def_cfa_offset 16 -.Ltmp3: .cfi_offset %rbp, -16 movq %rsp, %rbp -.Ltmp4: .cfi_def_cfa_register %rbp + leaq B(%rip), %rax + leaq A(%rip), %rcx xorl %r8d, %r8d - vmovsd .LCPI0_0(%rip), %xmm0 - .align 16, 0x90 -.LBB0_1: # %polly.loop_preheader3 + movsd .LCPI0_0(%rip), %xmm0 # xmm0 = mem[0],zero + xorl %r9d, %r9d + .p2align 4, 0x90 +.LBB0_1: # %polly.loop_header # =>This Loop Header: Depth=1 # Child Loop BB0_2 Depth 2 - xorl %ecx, %ecx - .align 16, 0x90 -.LBB0_2: # %polly.loop_header2 + movl $1, %edi + xorl %edx, %edx + .p2align 4, 0x90 +.LBB0_2: # %polly.loop_header1 # Parent Loop BB0_1 Depth=1 # => This Inner Loop Header: Depth=2 - movl %ecx, %edx - imull %r8d, %edx movl %edx, %esi - sarl $31, %esi - shrl $22, %esi - addl %edx, %esi - andl $-1024, %esi # imm = 0xFFFFFFFFFFFFFC00 - negl %esi - movq %r8, %rax - shlq $11, %rax - leal 1(%rdx,%rsi), %edi - leaq (%rax,%rax,2), %rsi - leaq 1(%rcx), %rdx - cmpq $1536, %rdx # imm = 0x600 - vcvtsi2sdl %edi, %xmm0, %xmm1 - vmulsd %xmm0, %xmm1, %xmm1 - vcvtsd2ss %xmm1, %xmm1, %xmm1 - vmovss %xmm1, A(%rsi,%rcx,4) - vmovss %xmm1, B(%rsi,%rcx,4) - movq %rdx, %rcx + andl $1022, %esi # imm = 0x3FE + orl $1, %esi + xorps %xmm1, %xmm1 + cvtsi2sdl %esi, %xmm1 + mulsd %xmm0, %xmm1 + cvtsd2ss %xmm1, %xmm1 + movss %xmm1, -4(%rcx,%rdi,4) + movss %xmm1, -4(%rax,%rdi,4) + leal (%r9,%rdx), %esi + andl $1023, %esi # imm = 0x3FF + addl $1, %esi + xorps %xmm1, %xmm1 + cvtsi2sdl %esi, %xmm1 + mulsd %xmm0, %xmm1 + cvtsd2ss %xmm1, %xmm1 + movss %xmm1, (%rcx,%rdi,4) + movss %xmm1, (%rax,%rdi,4) + addq $2, %rdi + addl %r8d, %edx + cmpq $1537, %rdi # imm = 0x601 jne .LBB0_2 -# BB#3: # %polly.loop_exit4 +# %bb.3: # %polly.loop_exit3 # in Loop: Header=BB0_1 Depth=1 - incq %r8 - cmpq $1536, %r8 # imm = 0x600 + addq $1, %r9 + addq $6144, %rax # imm = 0x1800 + addq $6144, %rcx # imm = 0x1800 + addl $2, %r8d + cmpq $1536, %r9 # imm = 0x600 jne .LBB0_1 -# BB#4: # %polly.loop_exit +# %bb.4: # %polly.exiting popq %rbp - ret -.Ltmp5: - .size init_array, .Ltmp5-init_array + .cfi_def_cfa %rsp, 8 + retq +.Lfunc_end0: + .size init_array, .Lfunc_end0-init_array .cfi_endproc - - .globl print_array - .align 16, 0x90 + # -- End function + .globl print_array # -- Begin function print_array + .p2align 4, 0x90 .type print_array,@function print_array: # @print_array .cfi_startproc -# BB#0: # %entry +# %bb.0: # %entry pushq %rbp -.Ltmp9: .cfi_def_cfa_offset 16 -.Ltmp10: .cfi_offset %rbp, -16 movq %rsp, %rbp -.Ltmp11: .cfi_def_cfa_register %rbp pushq %r15 pushq %r14 + pushq %r13 pushq %r12 pushq %rbx -.Ltmp12: - .cfi_offset %rbx, -48 -.Ltmp13: - .cfi_offset %r12, -40 -.Ltmp14: + pushq %rax + .cfi_offset %rbx, -56 + .cfi_offset %r12, -48 + .cfi_offset %r13, -40 .cfi_offset %r14, -32 -.Ltmp15: .cfi_offset %r15, -24 - xorl %r14d, %r14d - movl $C, %r15d - .align 16, 0x90 + leaq C(%rip), %r13 + xorl %eax, %eax + movl $3435973837, %r12d # imm = 0xCCCCCCCD + leaq .L.str(%rip), %r14 + .p2align 4, 0x90 .LBB1_1: # %for.cond1.preheader # =>This Loop Header: Depth=1 # Child Loop BB1_2 Depth 2 - movq stdout(%rip), %rax - movq %r15, %r12 + movq %rax, -48(%rbp) # 8-byte Spill + movq stdout(%rip), %rsi xorl %ebx, %ebx - .align 16, 0x90 + .p2align 4, 0x90 .LBB1_2: # %for.body3 # Parent Loop BB1_1 Depth=1 # => This Inner Loop Header: Depth=2 - vmovss (%r12), %xmm0 - vcvtss2sd %xmm0, %xmm0, %xmm0 - movq %rax, %rdi - movl $.L.str, %esi + movl %ebx, %eax + imulq %r12, %rax + shrq $38, %rax + leal (%rax,%rax,4), %r15d + shll $4, %r15d + addl $79, %r15d + movss (%r13,%rbx,4), %xmm0 # xmm0 = mem[0],zero,zero,zero + cvtss2sd %xmm0, %xmm0 movb $1, %al + movq %rsi, %rdi + movq %r14, %rsi callq fprintf - movslq %ebx, %rax - imulq $1717986919, %rax, %rcx # imm = 0x66666667 - movq %rcx, %rdx - shrq $63, %rdx - sarq $37, %rcx - addl %edx, %ecx - imull $80, %ecx, %ecx - subl %ecx, %eax - cmpl $79, %eax + cmpl %ebx, %r15d jne .LBB1_4 -# BB#3: # %if.then +# %bb.3: # %if.then # in Loop: Header=BB1_2 Depth=2 movq stdout(%rip), %rsi movl $10, %edi - callq fputc + callq fputc@PLT .LBB1_4: # %for.inc # in Loop: Header=BB1_2 Depth=2 - addq $4, %r12 - incq %rbx - movq stdout(%rip), %rax + addq $1, %rbx + movq stdout(%rip), %rsi cmpq $1536, %rbx # imm = 0x600 jne .LBB1_2 -# BB#5: # %for.end +# %bb.5: # %for.end # in Loop: Header=BB1_1 Depth=1 movl $10, %edi - movq %rax, %rsi - callq fputc - addq $6144, %r15 # imm = 0x1800 - incq %r14 - cmpq $1536, %r14 # imm = 0x600 + callq fputc@PLT + movq -48(%rbp), %rax # 8-byte Reload + addq $1, %rax + addq $6144, %r13 # imm = 0x1800 + cmpq $1536, %rax # imm = 0x600 jne .LBB1_1 -# BB#6: # %for.end12 +# %bb.6: # %for.end12 + addq $8, %rsp popq %rbx popq %r12 + popq %r13 popq %r14 popq %r15 popq %rbp - ret -.Ltmp16: - .size print_array, .Ltmp16-print_array + .cfi_def_cfa %rsp, 8 + retq +.Lfunc_end1: + .size print_array, .Lfunc_end1-print_array .cfi_endproc - - .section .rodata.cst8,"aM",@progbits,8 - .align 8 -.LCPI2_0: - .quad 4602678819172646912 # double 0.5 - .text - .globl main - .align 16, 0x90 + # -- End function + .globl main # -- Begin function main + .p2align 4, 0x90 .type main,@function main: # @main .cfi_startproc -# BB#0: # %entry +# %bb.0: # %entry pushq %rbp -.Ltmp20: .cfi_def_cfa_offset 16 -.Ltmp21: .cfi_offset %rbp, -16 movq %rsp, %rbp -.Ltmp22: .cfi_def_cfa_register %rbp pushq %r15 pushq %r14 pushq %r13 pushq %r12 pushq %rbx - subq $56, %rsp -.Ltmp23: + subq $264, %rsp # imm = 0x108 .cfi_offset %rbx, -56 -.Ltmp24: .cfi_offset %r12, -48 -.Ltmp25: .cfi_offset %r13, -40 -.Ltmp26: .cfi_offset %r14, -32 -.Ltmp27: .cfi_offset %r15, -24 - xorl %ebx, %ebx - vmovsd .LCPI2_0(%rip), %xmm0 - .align 16, 0x90 -.LBB2_1: # %polly.loop_preheader3.i - # =>This Loop Header: Depth=1 - # Child Loop BB2_2 Depth 2 - xorl %ecx, %ecx - .align 16, 0x90 -.LBB2_2: # %polly.loop_header2.i - # Parent Loop BB2_1 Depth=1 - # => This Inner Loop Header: Depth=2 - movl %ecx, %edx - imull %ebx, %edx - movl %edx, %esi - sarl $31, %esi - shrl $22, %esi - addl %edx, %esi - andl $-1024, %esi # imm = 0xFFFFFFFFFFFFFC00 - negl %esi - movq %rbx, %rax - shlq $11, %rax - leal 1(%rdx,%rsi), %edi - leaq (%rax,%rax,2), %rsi - leaq 1(%rcx), %rdx - cmpq $1536, %rdx # imm = 0x600 - vcvtsi2sdl %edi, %xmm0, %xmm1 - vmulsd %xmm0, %xmm1, %xmm1 - vcvtsd2ss %xmm1, %xmm1, %xmm1 - vmovss %xmm1, A(%rsi,%rcx,4) - vmovss %xmm1, B(%rsi,%rcx,4) - movq %rdx, %rcx - jne .LBB2_2 -# BB#3: # %polly.loop_exit4.i - # in Loop: Header=BB2_1 Depth=1 - incq %rbx - cmpq $1536, %rbx # imm = 0x600 - jne .LBB2_1 -# BB#4: # %polly.loop_preheader3.preheader - movl $C, %edi + callq init_array + leaq C(%rip), %rdi + xorl %eax, %eax + movq %rax, -48(%rbp) # 8-byte Spill xorl %esi, %esi movl $9437184, %edx # imm = 0x900000 - callq memset - xorl %esi, %esi - movl $C+16, %eax - movq %rax, -88(%rbp) # 8-byte Spill - .align 16, 0x90 -.LBB2_5: # %polly.loop_preheader17 - # =>This Loop Header: Depth=1 - # Child Loop BB2_15 Depth 2 - # Child Loop BB2_8 Depth 3 - # Child Loop BB2_11 Depth 4 - # Child Loop BB2_17 Depth 5 - # Child Loop BB2_18 Depth 6 - movq %rsi, -56(%rbp) # 8-byte Spill - movq %rsi, %rax - orq $63, %rax + callq memset@PLT + movl $64, %eax + movq %rax, -80(%rbp) # 8-byte Spill + leaq A(%rip), %rax movq %rax, -72(%rbp) # 8-byte Spill - leaq -1(%rax), %rax - movq %rax, -48(%rbp) # 8-byte Spill - xorl %edx, %edx - .align 16, 0x90 -.LBB2_15: # %polly.loop_preheader24 - # Parent Loop BB2_5 Depth=1 + .p2align 4, 0x90 +.LBB2_1: # %polly.loop_header8 + # =>This Loop Header: Depth=1 + # Child Loop BB2_2 Depth 2 + # Child Loop BB2_3 Depth 3 + # Child Loop BB2_4 Depth 4 + # Child Loop BB2_5 Depth 5 + leaq B+192(%rip), %r9 + xorl %edi, %edi + xorl %eax, %eax + .p2align 4, 0x90 +.LBB2_2: # %polly.loop_header14 + # Parent Loop BB2_1 Depth=1 # => This Loop Header: Depth=2 - # Child Loop BB2_8 Depth 3 - # Child Loop BB2_11 Depth 4 - # Child Loop BB2_17 Depth 5 - # Child Loop BB2_18 Depth 6 - movq %rdx, -80(%rbp) # 8-byte Spill - leaq -4(%rdx), %rcx - movq %rdx, %rax - decq %rax - cmovsq %rcx, %rax - movq %rax, %r15 - sarq $63, %r15 - shrq $62, %r15 - addq %rax, %r15 - andq $-4, %r15 - movq %rdx, %r13 - orq $63, %r13 - leaq -4(%r13), %rdx - xorl %r10d, %r10d - movq -88(%rbp), %rax # 8-byte Reload - leaq (%rax,%r15,4), %rax - movq %rax, -64(%rbp) # 8-byte Spill - leaq B+16(,%r15,4), %rbx - leaq 4(%r15), %r12 - .align 16, 0x90 -.LBB2_8: # %polly.loop_header23 - # Parent Loop BB2_5 Depth=1 - # Parent Loop BB2_15 Depth=2 + # Child Loop BB2_3 Depth 3 + # Child Loop BB2_4 Depth 4 + # Child Loop BB2_5 Depth 5 + movq %rax, -168(%rbp) # 8-byte Spill + movq %rdi, -176(%rbp) # 8-byte Spill + shlq $6, %rdi + leaq 16(%rdi), %rdx + leaq 32(%rdi), %rsi + leaq 48(%rdi), %rcx + movq -72(%rbp), %r12 # 8-byte Reload + movq %r9, -184(%rbp) # 8-byte Spill + xorl %eax, %eax + .p2align 4, 0x90 +.LBB2_3: # %polly.loop_header20 + # Parent Loop BB2_1 Depth=1 + # Parent Loop BB2_2 Depth=2 # => This Loop Header: Depth=3 - # Child Loop BB2_11 Depth 4 - # Child Loop BB2_17 Depth 5 - # Child Loop BB2_18 Depth 6 - cmpq -72(%rbp), %rsi # 8-byte Folded Reload - jg .LBB2_13 -# BB#9: # %polly.loop_header30.preheader - # in Loop: Header=BB2_8 Depth=3 - movq %r10, %rax - orq $63, %rax - cmpq %rax, %r10 - jg .LBB2_13 -# BB#10: # in Loop: Header=BB2_8 Depth=3 - decq %rax - movq -64(%rbp), %r14 # 8-byte Reload - movq -56(%rbp), %r11 # 8-byte Reload - .align 16, 0x90 -.LBB2_11: # %polly.loop_header37.preheader - # Parent Loop BB2_5 Depth=1 - # Parent Loop BB2_15 Depth=2 - # Parent Loop BB2_8 Depth=3 + # Child Loop BB2_4 Depth 4 + # Child Loop BB2_5 Depth 5 + movq %rax, -192(%rbp) # 8-byte Spill + movq %r12, -200(%rbp) # 8-byte Spill + movq -48(%rbp), %r14 # 8-byte Reload + .p2align 4, 0x90 +.LBB2_4: # %polly.loop_header26 + # Parent Loop BB2_1 Depth=1 + # Parent Loop BB2_2 Depth=2 + # Parent Loop BB2_3 Depth=3 # => This Loop Header: Depth=4 - # Child Loop BB2_17 Depth 5 - # Child Loop BB2_18 Depth 6 - cmpq %r13, %r12 - movq %rbx, %r8 - movq %r10, %rsi - jg .LBB2_12 - .align 16, 0x90 -.LBB2_17: # %polly.loop_header46.preheader - # Parent Loop BB2_5 Depth=1 - # Parent Loop BB2_15 Depth=2 - # Parent Loop BB2_8 Depth=3 - # Parent Loop BB2_11 Depth=4 - # => This Loop Header: Depth=5 - # Child Loop BB2_18 Depth 6 - leaq (%r11,%r11,2), %rcx - shlq $11, %rcx - vbroadcastss A(%rcx,%rsi,4), %xmm0 - movq %r14, %rdi - movq %r8, %r9 - movq %r15, %rcx -.LBB2_18: # %polly.loop_header46 - # Parent Loop BB2_5 Depth=1 - # Parent Loop BB2_15 Depth=2 - # Parent Loop BB2_8 Depth=3 - # Parent Loop BB2_11 Depth=4 - # Parent Loop BB2_17 Depth=5 - # => This Inner Loop Header: Depth=6 - vmulps (%r9), %xmm0, %xmm1 - vaddps (%rdi), %xmm1, %xmm1 - vmovaps %xmm1, (%rdi) - addq $16, %rdi - addq $16, %r9 - addq $4, %rcx - cmpq %rdx, %rcx - jle .LBB2_18 -# BB#16: # %polly.loop_exit48 - # in Loop: Header=BB2_17 Depth=5 - addq $6144, %r8 # imm = 0x1800 - cmpq %rax, %rsi - leaq 1(%rsi), %rsi - jle .LBB2_17 - .align 16, 0x90 -.LBB2_12: # %polly.loop_exit39 - # in Loop: Header=BB2_11 Depth=4 - addq $6144, %r14 # imm = 0x1800 - cmpq -48(%rbp), %r11 # 8-byte Folded Reload - leaq 1(%r11), %r11 - jle .LBB2_11 - .align 16, 0x90 -.LBB2_13: # %polly.loop_exit32 - # in Loop: Header=BB2_8 Depth=3 - addq $393216, %rbx # imm = 0x60000 - cmpq $1472, %r10 # imm = 0x5C0 - leaq 64(%r10), %r10 - movq -56(%rbp), %rsi # 8-byte Reload - jl .LBB2_8 -# BB#14: # %polly.loop_exit25 - # in Loop: Header=BB2_15 Depth=2 - movq -80(%rbp), %rdx # 8-byte Reload - cmpq $1472, %rdx # imm = 0x5C0 - leaq 64(%rdx), %rdx - jl .LBB2_15 -# BB#6: # %polly.loop_exit18 - # in Loop: Header=BB2_5 Depth=1 - addq $393216, -88(%rbp) # 8-byte Folded Spill + # Child Loop BB2_5 Depth 5 + leaq (%r14,%r14,2), %rbx + shlq $11, %rbx + leaq C(%rip), %rax + addq %rax, %rbx + leaq (%rbx,%rdi,4), %r8 + leaq (%rbx,%rdx,4), %r15 + leaq (%rbx,%rsi,4), %r10 + leaq (%rbx,%rcx,4), %r11 + movups (%rbx,%rdi,4), %xmm8 + movups 16(%rbx,%rdi,4), %xmm0 + movaps %xmm0, -144(%rbp) # 16-byte Spill + movups 32(%rbx,%rdi,4), %xmm6 + movups 48(%rbx,%rdi,4), %xmm1 + movups (%rbx,%rdx,4), %xmm15 + movups 16(%rbx,%rdx,4), %xmm0 + movaps %xmm0, -64(%rbp) # 16-byte Spill + movups 32(%rbx,%rdx,4), %xmm0 + movaps %xmm0, -96(%rbp) # 16-byte Spill + movups 48(%rbx,%rdx,4), %xmm0 + movaps %xmm0, -112(%rbp) # 16-byte Spill + movups (%rbx,%rsi,4), %xmm11 + movups 16(%rbx,%rsi,4), %xmm0 + movaps %xmm0, -160(%rbp) # 16-byte Spill + movups 32(%rbx,%rsi,4), %xmm12 + movups 48(%rbx,%rsi,4), %xmm0 + movaps %xmm0, -128(%rbp) # 16-byte Spill + movups (%rbx,%rcx,4), %xmm9 + movups 16(%rbx,%rcx,4), %xmm13 + movups 32(%rbx,%rcx,4), %xmm2 + movups 48(%rbx,%rcx,4), %xmm3 + movq %r9, %rbx + movl $0, %r13d + .p2align 4, 0x90 +.LBB2_5: # %vector.ph + # Parent Loop BB2_1 Depth=1 + # Parent Loop BB2_2 Depth=2 + # Parent Loop BB2_3 Depth=3 + # Parent Loop BB2_4 Depth=4 + # => This Inner Loop Header: Depth=5 + movaps %xmm12, -240(%rbp) # 16-byte Spill + movaps %xmm2, -256(%rbp) # 16-byte Spill + movaps %xmm3, -272(%rbp) # 16-byte Spill + movaps %xmm8, %xmm10 + movaps -144(%rbp), %xmm7 # 16-byte Reload + unpcklps %xmm7, %xmm10 # xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] + movaps %xmm1, %xmm4 + shufps $0, %xmm6, %xmm4 # xmm4 = xmm4[0,0],xmm6[0,0] + shufps $36, %xmm4, %xmm10 # xmm10 = xmm10[0,1],xmm4[2,0] + movaps %xmm7, %xmm5 + shufps $17, %xmm8, %xmm5 # xmm5 = xmm5[1,0],xmm8[1,0] + movaps %xmm6, %xmm4 + unpcklps %xmm1, %xmm4 # xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] + shufps $226, %xmm4, %xmm5 # xmm5 = xmm5[2,0],xmm4[2,3] + movaps %xmm8, %xmm12 + unpckhps %xmm7, %xmm12 # xmm12 = xmm12[2],xmm7[2],xmm12[3],xmm7[3] + movaps %xmm1, %xmm4 + shufps $34, %xmm6, %xmm4 # xmm4 = xmm4[2,0],xmm6[2,0] + shufps $36, %xmm4, %xmm12 # xmm12 = xmm12[0,1],xmm4[2,0] + shufps $51, %xmm8, %xmm7 # xmm7 = xmm7[3,0],xmm8[3,0] + unpckhps %xmm1, %xmm6 # xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] + shufps $226, %xmm6, %xmm7 # xmm7 = xmm7[2,0],xmm6[2,3] + movaps -160(%rbx), %xmm0 + movaps -144(%rbx), %xmm1 + movaps %xmm1, %xmm6 + shufps $0, %xmm0, %xmm6 # xmm6 = xmm6[0,0],xmm0[0,0] + movaps -192(%rbx), %xmm3 + movaps -176(%rbx), %xmm4 + movaps %xmm3, %xmm8 + unpcklps %xmm4, %xmm8 # xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] + shufps $36, %xmm6, %xmm8 # xmm8 = xmm8[0,1],xmm6[2,0] + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm2 # xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] + movaps %xmm4, %xmm6 + shufps $17, %xmm3, %xmm6 # xmm6 = xmm6[1,0],xmm3[1,0] + shufps $226, %xmm2, %xmm6 # xmm6 = xmm6[2,0],xmm2[2,3] + movaps %xmm1, %xmm2 + shufps $34, %xmm0, %xmm2 # xmm2 = xmm2[2,0],xmm0[2,0] + movaps %xmm3, %xmm14 + unpckhps %xmm4, %xmm14 # xmm14 = xmm14[2],xmm4[2],xmm14[3],xmm4[3] + shufps $36, %xmm2, %xmm14 # xmm14 = xmm14[0,1],xmm2[2,0] + unpckhps %xmm1, %xmm0 # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] + shufps $51, %xmm3, %xmm4 # xmm4 = xmm4[3,0],xmm3[3,0] + shufps $226, %xmm0, %xmm4 # xmm4 = xmm4[2,0],xmm0[2,3] + movss (%r12,%r13,4), %xmm0 # xmm0 = mem[0],zero,zero,zero + shufps $0, %xmm0, %xmm0 # xmm0 = xmm0[0,0,0,0] + mulps %xmm0, %xmm8 + addps %xmm10, %xmm8 + mulps %xmm0, %xmm6 + addps %xmm5, %xmm6 + mulps %xmm0, %xmm14 + addps %xmm12, %xmm14 + mulps %xmm0, %xmm4 + movaps %xmm0, %xmm5 + addps %xmm7, %xmm4 + movaps %xmm14, %xmm0 + unpckhps %xmm4, %xmm0 # xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] + movaps %xmm6, %xmm1 + shufps $51, %xmm8, %xmm1 # xmm1 = xmm1[3,0],xmm8[3,0] + shufps $226, %xmm0, %xmm1 # xmm1 = xmm1[2,0],xmm0[2,3] + movaps %xmm1, -304(%rbp) # 16-byte Spill + movaps %xmm4, %xmm0 + shufps $34, %xmm14, %xmm0 # xmm0 = xmm0[2,0],xmm14[2,0] + movaps %xmm8, %xmm1 + unpckhps %xmm6, %xmm1 # xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] + shufps $36, %xmm0, %xmm1 # xmm1 = xmm1[0,1],xmm0[2,0] + movaps %xmm1, -288(%rbp) # 16-byte Spill + movaps %xmm14, %xmm0 + unpcklps %xmm4, %xmm0 # xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] + movaps %xmm6, %xmm1 + shufps $17, %xmm8, %xmm1 # xmm1 = xmm1[1,0],xmm8[1,0] + shufps $226, %xmm0, %xmm1 # xmm1 = xmm1[2,0],xmm0[2,3] + movaps %xmm1, -144(%rbp) # 16-byte Spill + shufps $0, %xmm14, %xmm4 # xmm4 = xmm4[0,0],xmm14[0,0] + unpcklps %xmm6, %xmm8 # xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] + shufps $36, %xmm4, %xmm8 # xmm8 = xmm8[0,1],xmm4[2,0] + movaps %xmm15, %xmm14 + movaps -64(%rbp), %xmm4 # 16-byte Reload + unpcklps %xmm4, %xmm14 # xmm14 = xmm14[0],xmm4[0],xmm14[1],xmm4[1] + movaps -112(%rbp), %xmm1 # 16-byte Reload + movaps %xmm1, %xmm0 + movaps -96(%rbp), %xmm3 # 16-byte Reload + shufps $0, %xmm3, %xmm0 # xmm0 = xmm0[0,0],xmm3[0,0] + shufps $36, %xmm0, %xmm14 # xmm14 = xmm14[0,1],xmm0[2,0] + movaps %xmm4, %xmm12 + shufps $17, %xmm15, %xmm12 # xmm12 = xmm12[1,0],xmm15[1,0] + movaps %xmm3, %xmm2 + unpcklps %xmm1, %xmm2 # xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] + shufps $226, %xmm2, %xmm12 # xmm12 = xmm12[2,0],xmm2[2,3] + movaps %xmm15, %xmm7 + unpckhps %xmm4, %xmm7 # xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] + movaps %xmm1, %xmm2 + shufps $34, %xmm3, %xmm2 # xmm2 = xmm2[2,0],xmm3[2,0] + shufps $36, %xmm2, %xmm7 # xmm7 = xmm7[0,1],xmm2[2,0] + shufps $51, %xmm15, %xmm4 # xmm4 = xmm4[3,0],xmm15[3,0] + unpckhps %xmm1, %xmm3 # xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] + shufps $226, %xmm3, %xmm4 # xmm4 = xmm4[2,0],xmm3[2,3] + movaps %xmm4, -64(%rbp) # 16-byte Spill + movaps -96(%rbx), %xmm2 + movaps -80(%rbx), %xmm1 + movaps %xmm1, %xmm4 + shufps $0, %xmm2, %xmm4 # xmm4 = xmm4[0,0],xmm2[0,0] + movaps -112(%rbx), %xmm10 + movaps -128(%rbx), %xmm0 + movaps %xmm0, %xmm15 + unpcklps %xmm10, %xmm15 # xmm15 = xmm15[0],xmm10[0],xmm15[1],xmm10[1] + shufps $36, %xmm4, %xmm15 # xmm15 = xmm15[0,1],xmm4[2,0] + movaps %xmm2, %xmm4 + unpcklps %xmm1, %xmm4 # xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] + movaps %xmm10, %xmm6 + shufps $17, %xmm0, %xmm6 # xmm6 = xmm6[1,0],xmm0[1,0] + shufps $226, %xmm4, %xmm6 # xmm6 = xmm6[2,0],xmm4[2,3] + movaps %xmm1, %xmm3 + shufps $34, %xmm2, %xmm3 # xmm3 = xmm3[2,0],xmm2[2,0] + movaps %xmm0, %xmm4 + unpckhps %xmm10, %xmm4 # xmm4 = xmm4[2],xmm10[2],xmm4[3],xmm10[3] + shufps $36, %xmm3, %xmm4 # xmm4 = xmm4[0,1],xmm3[2,0] + unpckhps %xmm1, %xmm2 # xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] + shufps $51, %xmm0, %xmm10 # xmm10 = xmm10[3,0],xmm0[3,0] + shufps $226, %xmm2, %xmm10 # xmm10 = xmm10[2,0],xmm2[2,3] + movaps %xmm5, -224(%rbp) # 16-byte Spill + mulps %xmm5, %xmm15 + addps %xmm14, %xmm15 + mulps %xmm5, %xmm6 + addps %xmm12, %xmm6 + mulps %xmm5, %xmm4 + addps %xmm7, %xmm4 + mulps %xmm5, %xmm10 + addps -64(%rbp), %xmm10 # 16-byte Folded Reload + movaps %xmm4, %xmm0 + unpckhps %xmm10, %xmm0 # xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3] + movaps %xmm6, %xmm1 + shufps $51, %xmm15, %xmm1 # xmm1 = xmm1[3,0],xmm15[3,0] + shufps $226, %xmm0, %xmm1 # xmm1 = xmm1[2,0],xmm0[2,3] + movaps %xmm1, -112(%rbp) # 16-byte Spill + movaps %xmm10, %xmm0 + shufps $34, %xmm4, %xmm0 # xmm0 = xmm0[2,0],xmm4[2,0] + movaps %xmm15, %xmm1 + unpckhps %xmm6, %xmm1 # xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] + shufps $36, %xmm0, %xmm1 # xmm1 = xmm1[0,1],xmm0[2,0] + movaps %xmm1, -96(%rbp) # 16-byte Spill + movaps %xmm4, %xmm0 + unpcklps %xmm10, %xmm0 # xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] + movaps %xmm6, %xmm1 + shufps $17, %xmm15, %xmm1 # xmm1 = xmm1[1,0],xmm15[1,0] + shufps $226, %xmm0, %xmm1 # xmm1 = xmm1[2,0],xmm0[2,3] + movaps %xmm1, -64(%rbp) # 16-byte Spill + shufps $0, %xmm4, %xmm10 # xmm10 = xmm10[0,0],xmm4[0,0] + unpcklps %xmm6, %xmm15 # xmm15 = xmm15[0],xmm6[0],xmm15[1],xmm6[1] + shufps $36, %xmm10, %xmm15 # xmm15 = xmm15[0,1],xmm10[2,0] + movaps %xmm11, %xmm10 + movaps -160(%rbp), %xmm14 # 16-byte Reload + unpcklps %xmm14, %xmm10 # xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1] + movaps -128(%rbp), %xmm2 # 16-byte Reload + movaps %xmm2, %xmm0 + movaps -240(%rbp), %xmm3 # 16-byte Reload + shufps $0, %xmm3, %xmm0 # xmm0 = xmm0[0,0],xmm3[0,0] + shufps $36, %xmm0, %xmm10 # xmm10 = xmm10[0,1],xmm0[2,0] + movaps %xmm14, %xmm12 + shufps $17, %xmm11, %xmm12 # xmm12 = xmm12[1,0],xmm11[1,0] + movaps %xmm3, %xmm0 + unpcklps %xmm2, %xmm0 # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] + shufps $226, %xmm0, %xmm12 # xmm12 = xmm12[2,0],xmm0[2,3] + movaps %xmm11, %xmm0 + unpckhps %xmm14, %xmm0 # xmm0 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] + movaps %xmm2, %xmm1 + shufps $34, %xmm3, %xmm1 # xmm1 = xmm1[2,0],xmm3[2,0] + shufps $36, %xmm1, %xmm0 # xmm0 = xmm0[0,1],xmm1[2,0] + shufps $51, %xmm11, %xmm14 # xmm14 = xmm14[3,0],xmm11[3,0] + unpckhps %xmm2, %xmm3 # xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] + shufps $226, %xmm3, %xmm14 # xmm14 = xmm14[2,0],xmm3[2,3] + movaps -32(%rbx), %xmm1 + movaps -16(%rbx), %xmm2 + movaps %xmm2, %xmm3 + shufps $0, %xmm1, %xmm3 # xmm3 = xmm3[0,0],xmm1[0,0] + movaps -48(%rbx), %xmm4 + movaps -64(%rbx), %xmm5 + movaps %xmm5, %xmm11 + unpcklps %xmm4, %xmm11 # xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1] + shufps $36, %xmm3, %xmm11 # xmm11 = xmm11[0,1],xmm3[2,0] + movaps %xmm1, %xmm3 + unpcklps %xmm2, %xmm3 # xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] + movaps %xmm4, %xmm7 + shufps $17, %xmm5, %xmm7 # xmm7 = xmm7[1,0],xmm5[1,0] + shufps $226, %xmm3, %xmm7 # xmm7 = xmm7[2,0],xmm3[2,3] + movaps %xmm2, %xmm3 + shufps $34, %xmm1, %xmm3 # xmm3 = xmm3[2,0],xmm1[2,0] + movaps %xmm5, %xmm6 + unpckhps %xmm4, %xmm6 # xmm6 = xmm6[2],xmm4[2],xmm6[3],xmm4[3] + shufps $36, %xmm3, %xmm6 # xmm6 = xmm6[0,1],xmm3[2,0] + unpckhps %xmm2, %xmm1 # xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] + shufps $51, %xmm5, %xmm4 # xmm4 = xmm4[3,0],xmm5[3,0] + shufps $226, %xmm1, %xmm4 # xmm4 = xmm4[2,0],xmm1[2,3] + movaps -224(%rbp), %xmm1 # 16-byte Reload + mulps %xmm1, %xmm11 + addps %xmm10, %xmm11 + mulps %xmm1, %xmm7 + addps %xmm12, %xmm7 + mulps %xmm1, %xmm6 + addps %xmm0, %xmm6 + mulps %xmm1, %xmm4 + addps %xmm14, %xmm4 + movaps %xmm6, %xmm0 + unpckhps %xmm4, %xmm0 # xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] + movaps %xmm7, %xmm1 + shufps $51, %xmm11, %xmm1 # xmm1 = xmm1[3,0],xmm11[3,0] + shufps $226, %xmm0, %xmm1 # xmm1 = xmm1[2,0],xmm0[2,3] + movaps %xmm1, -128(%rbp) # 16-byte Spill + movaps %xmm4, %xmm0 + shufps $34, %xmm6, %xmm0 # xmm0 = xmm0[2,0],xmm6[2,0] + movaps %xmm11, %xmm12 + unpckhps %xmm7, %xmm12 # xmm12 = xmm12[2],xmm7[2],xmm12[3],xmm7[3] + shufps $36, %xmm0, %xmm12 # xmm12 = xmm12[0,1],xmm0[2,0] + movaps %xmm6, %xmm0 + unpcklps %xmm4, %xmm0 # xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] + movaps %xmm7, %xmm1 + shufps $17, %xmm11, %xmm1 # xmm1 = xmm1[1,0],xmm11[1,0] + shufps $226, %xmm0, %xmm1 # xmm1 = xmm1[2,0],xmm0[2,3] + movaps %xmm1, -160(%rbp) # 16-byte Spill + shufps $0, %xmm6, %xmm4 # xmm4 = xmm4[0,0],xmm6[0,0] + unpcklps %xmm7, %xmm11 # xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1] + shufps $36, %xmm4, %xmm11 # xmm11 = xmm11[0,1],xmm4[2,0] + movaps %xmm9, %xmm10 + unpcklps %xmm13, %xmm10 # xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1] + movaps -272(%rbp), %xmm2 # 16-byte Reload + movaps %xmm2, %xmm0 + movaps -256(%rbp), %xmm3 # 16-byte Reload + shufps $0, %xmm3, %xmm0 # xmm0 = xmm0[0,0],xmm3[0,0] + shufps $36, %xmm0, %xmm10 # xmm10 = xmm10[0,1],xmm0[2,0] + movaps %xmm13, %xmm14 + shufps $17, %xmm9, %xmm14 # xmm14 = xmm14[1,0],xmm9[1,0] + movaps %xmm3, %xmm0 + unpcklps %xmm2, %xmm0 # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] + shufps $226, %xmm0, %xmm14 # xmm14 = xmm14[2,0],xmm0[2,3] + movaps %xmm9, %xmm0 + unpckhps %xmm13, %xmm0 # xmm0 = xmm0[2],xmm13[2],xmm0[3],xmm13[3] + movaps %xmm2, %xmm1 + shufps $34, %xmm3, %xmm1 # xmm1 = xmm1[2,0],xmm3[2,0] + shufps $36, %xmm1, %xmm0 # xmm0 = xmm0[0,1],xmm1[2,0] + shufps $51, %xmm9, %xmm13 # xmm13 = xmm13[3,0],xmm9[3,0] + unpckhps %xmm2, %xmm3 # xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] + shufps $226, %xmm3, %xmm13 # xmm13 = xmm13[2,0],xmm3[2,3] + movaps 32(%rbx), %xmm1 + movaps 48(%rbx), %xmm2 + movaps %xmm2, %xmm3 + shufps $0, %xmm1, %xmm3 # xmm3 = xmm3[0,0],xmm1[0,0] + movaps 16(%rbx), %xmm4 + movaps (%rbx), %xmm5 + movaps %xmm5, %xmm9 + unpcklps %xmm4, %xmm9 # xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1] + shufps $36, %xmm3, %xmm9 # xmm9 = xmm9[0,1],xmm3[2,0] + movaps %xmm1, %xmm3 + unpcklps %xmm2, %xmm3 # xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] + movaps %xmm4, %xmm7 + shufps $17, %xmm5, %xmm7 # xmm7 = xmm7[1,0],xmm5[1,0] + shufps $226, %xmm3, %xmm7 # xmm7 = xmm7[2,0],xmm3[2,3] + movaps %xmm2, %xmm3 + shufps $34, %xmm1, %xmm3 # xmm3 = xmm3[2,0],xmm1[2,0] + movaps %xmm5, %xmm6 + unpckhps %xmm4, %xmm6 # xmm6 = xmm6[2],xmm4[2],xmm6[3],xmm4[3] + shufps $36, %xmm3, %xmm6 # xmm6 = xmm6[0,1],xmm3[2,0] + unpckhps %xmm2, %xmm1 # xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] + shufps $51, %xmm5, %xmm4 # xmm4 = xmm4[3,0],xmm5[3,0] + shufps $226, %xmm1, %xmm4 # xmm4 = xmm4[2,0],xmm1[2,3] + movaps -224(%rbp), %xmm1 # 16-byte Reload + mulps %xmm1, %xmm9 + addps %xmm10, %xmm9 + mulps %xmm1, %xmm7 + addps %xmm14, %xmm7 + mulps %xmm1, %xmm6 + addps %xmm0, %xmm6 + mulps %xmm1, %xmm4 + addps %xmm13, %xmm4 + movaps %xmm6, %xmm0 + unpckhps %xmm4, %xmm0 # xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] + movaps %xmm7, %xmm3 + shufps $51, %xmm9, %xmm3 # xmm3 = xmm3[3,0],xmm9[3,0] + shufps $226, %xmm0, %xmm3 # xmm3 = xmm3[2,0],xmm0[2,3] + movaps %xmm4, %xmm0 + shufps $34, %xmm6, %xmm0 # xmm0 = xmm0[2,0],xmm6[2,0] + movaps %xmm9, %xmm2 + unpckhps %xmm7, %xmm2 # xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] + shufps $36, %xmm0, %xmm2 # xmm2 = xmm2[0,1],xmm0[2,0] + movaps %xmm6, %xmm0 + unpcklps %xmm4, %xmm0 # xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] + movaps %xmm7, %xmm13 + shufps $17, %xmm9, %xmm13 # xmm13 = xmm13[1,0],xmm9[1,0] + shufps $226, %xmm0, %xmm13 # xmm13 = xmm13[2,0],xmm0[2,3] + shufps $0, %xmm6, %xmm4 # xmm4 = xmm4[0,0],xmm6[0,0] + movaps -288(%rbp), %xmm6 # 16-byte Reload + movaps -304(%rbp), %xmm1 # 16-byte Reload + unpcklps %xmm7, %xmm9 # xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] + shufps $36, %xmm4, %xmm9 # xmm9 = xmm9[0,1],xmm4[2,0] + addq $1, %r13 + addq $6144, %rbx # imm = 0x1800 + cmpq $64, %r13 + jne .LBB2_5 +# %bb.6: # %polly.loop_exit34 + # in Loop: Header=BB2_4 Depth=4 + movups %xmm8, (%r8) + movaps -144(%rbp), %xmm0 # 16-byte Reload + movups %xmm0, 16(%r8) + movups %xmm6, 32(%r8) + movups %xmm1, 48(%r8) + movaps -112(%rbp), %xmm0 # 16-byte Reload + movups %xmm0, 48(%r15) + movaps -96(%rbp), %xmm0 # 16-byte Reload + movups %xmm0, 32(%r15) + movaps -64(%rbp), %xmm0 # 16-byte Reload + movups %xmm0, 16(%r15) + movups %xmm15, (%r15) + movaps -128(%rbp), %xmm0 # 16-byte Reload + movups %xmm0, 48(%r10) + movaps -160(%rbp), %xmm0 # 16-byte Reload + movups %xmm0, 16(%r10) + movups %xmm11, (%r10) + movups %xmm12, 32(%r10) + movups %xmm3, 48(%r11) + movups %xmm13, 16(%r11) + movups %xmm9, (%r11) + movups %xmm2, 32(%r11) + addq $1, %r14 + addq $6144, %r12 # imm = 0x1800 + cmpq -80(%rbp), %r14 # 8-byte Folded Reload + jne .LBB2_4 +# %bb.7: # %polly.loop_exit28 + # in Loop: Header=BB2_3 Depth=3 + movq -192(%rbp), %rax # 8-byte Reload + addq $64, %rax + addq $393216, %r9 # imm = 0x60000 + movq -200(%rbp), %r12 # 8-byte Reload + addq $256, %r12 # imm = 0x100 + cmpq $1536, %rax # imm = 0x600 + jb .LBB2_3 +# %bb.8: # %polly.loop_exit22 + # in Loop: Header=BB2_2 Depth=2 + movq -168(%rbp), %rax # 8-byte Reload + addq $64, %rax + movq -176(%rbp), %rdi # 8-byte Reload + addq $1, %rdi + movq -184(%rbp), %r9 # 8-byte Reload + addq $256, %r9 # imm = 0x100 + cmpq $1536, %rax # imm = 0x600 + jb .LBB2_2 +# %bb.9: # %polly.loop_exit16 + # in Loop: Header=BB2_1 Depth=1 + movq -48(%rbp), %rax # 8-byte Reload + movq %rax, %rcx + addq $64, %rcx + addq $64, -80(%rbp) # 8-byte Folded Spill + addq $393216, -72(%rbp) # 8-byte Folded Spill # imm = 0x60000 - cmpq $1472, %rsi # imm = 0x5C0 - leaq 64(%rsi), %rsi - jl .LBB2_5 -# BB#7: # %polly.loop_exit11 + movq %rcx, %rax + movq %rcx, -48(%rbp) # 8-byte Spill + cmpq $1536, %rcx # imm = 0x600 + jb .LBB2_1 +# %bb.10: # %polly.exiting xorl %eax, %eax - addq $56, %rsp + addq $264, %rsp # imm = 0x108 popq %rbx popq %r12 popq %r13 popq %r14 popq %r15 popq %rbp - ret -.Ltmp28: - .size main, .Ltmp28-main + .cfi_def_cfa %rsp, 8 + retq +.Lfunc_end2: + .size main, .Lfunc_end2-main .cfi_endproc - + # -- End function .type A,@object # @A .comm A,9437184,16 .type B,@object # @B @@ -387,10 +647,11 @@ main: # @main .type .L.str,@object # @.str .section .rodata.str1.1,"aMS",@progbits,1 .L.str: - .asciz "%lf " + .asciz "%lf " .size .L.str, 5 .type C,@object # @C .comm C,9437184,16 + .ident "clang version 8.0.0 (trunk 342834) (llvm/trunk 342856)" .section ".note.GNU-stack","",@progbits diff --git a/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled.exe b/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled.exe Binary files differdeleted file mode 100755 index fbd8b128fd8..00000000000 --- a/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled.exe +++ /dev/null diff --git a/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled.ll b/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled.ll Binary files differindex acdd95f3bc4..10ea4c8b55f 100644 --- a/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled.ll +++ b/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled.ll diff --git a/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled.s b/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled.s index f7ab7fdd59c..bf25833eec1 100644 --- a/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled.s +++ b/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled.s @@ -1,379 +1,495 @@ - .file "matmul.polly.interchanged+tiled.ll" + .text + .file "matmul.c" .section .rodata.cst8,"aM",@progbits,8 - .align 8 + .p2align 3 # -- Begin function init_array .LCPI0_0: .quad 4602678819172646912 # double 0.5 .text .globl init_array - .align 16, 0x90 + .p2align 4, 0x90 .type init_array,@function init_array: # @init_array .cfi_startproc -# BB#0: # %entry +# %bb.0: # %entry pushq %rbp -.Ltmp2: .cfi_def_cfa_offset 16 -.Ltmp3: .cfi_offset %rbp, -16 movq %rsp, %rbp -.Ltmp4: .cfi_def_cfa_register %rbp + leaq B(%rip), %rax + leaq A(%rip), %rcx xorl %r8d, %r8d - vmovsd .LCPI0_0(%rip), %xmm0 - .align 16, 0x90 -.LBB0_1: # %polly.loop_preheader3 + movsd .LCPI0_0(%rip), %xmm0 # xmm0 = mem[0],zero + xorl %r9d, %r9d + .p2align 4, 0x90 +.LBB0_1: # %polly.loop_header # =>This Loop Header: Depth=1 # Child Loop BB0_2 Depth 2 - xorl %ecx, %ecx - .align 16, 0x90 -.LBB0_2: # %polly.loop_header2 + movl $1, %edi + xorl %edx, %edx + .p2align 4, 0x90 +.LBB0_2: # %polly.loop_header1 # Parent Loop BB0_1 Depth=1 # => This Inner Loop Header: Depth=2 - movl %ecx, %edx - imull %r8d, %edx movl %edx, %esi - sarl $31, %esi - shrl $22, %esi - addl %edx, %esi - andl $-1024, %esi # imm = 0xFFFFFFFFFFFFFC00 - negl %esi - movq %r8, %rax - shlq $11, %rax - leal 1(%rdx,%rsi), %edi - leaq (%rax,%rax,2), %rsi - leaq 1(%rcx), %rdx - cmpq $1536, %rdx # imm = 0x600 - vcvtsi2sdl %edi, %xmm0, %xmm1 - vmulsd %xmm0, %xmm1, %xmm1 - vcvtsd2ss %xmm1, %xmm1, %xmm1 - vmovss %xmm1, A(%rsi,%rcx,4) - vmovss %xmm1, B(%rsi,%rcx,4) - movq %rdx, %rcx + andl $1022, %esi # imm = 0x3FE + orl $1, %esi + xorps %xmm1, %xmm1 + cvtsi2sdl %esi, %xmm1 + mulsd %xmm0, %xmm1 + cvtsd2ss %xmm1, %xmm1 + movss %xmm1, -4(%rcx,%rdi,4) + movss %xmm1, -4(%rax,%rdi,4) + leal (%r9,%rdx), %esi + andl $1023, %esi # imm = 0x3FF + addl $1, %esi + xorps %xmm1, %xmm1 + cvtsi2sdl %esi, %xmm1 + mulsd %xmm0, %xmm1 + cvtsd2ss %xmm1, %xmm1 + movss %xmm1, (%rcx,%rdi,4) + movss %xmm1, (%rax,%rdi,4) + addq $2, %rdi + addl %r8d, %edx + cmpq $1537, %rdi # imm = 0x601 jne .LBB0_2 -# BB#3: # %polly.loop_exit4 +# %bb.3: # %polly.loop_exit3 # in Loop: Header=BB0_1 Depth=1 - incq %r8 - cmpq $1536, %r8 # imm = 0x600 + addq $1, %r9 + addq $6144, %rax # imm = 0x1800 + addq $6144, %rcx # imm = 0x1800 + addl $2, %r8d + cmpq $1536, %r9 # imm = 0x600 jne .LBB0_1 -# BB#4: # %polly.loop_exit +# %bb.4: # %polly.exiting popq %rbp - ret -.Ltmp5: - .size init_array, .Ltmp5-init_array + .cfi_def_cfa %rsp, 8 + retq +.Lfunc_end0: + .size init_array, .Lfunc_end0-init_array .cfi_endproc - - .globl print_array - .align 16, 0x90 + # -- End function + .globl print_array # -- Begin function print_array + .p2align 4, 0x90 .type print_array,@function print_array: # @print_array .cfi_startproc -# BB#0: # %entry +# %bb.0: # %entry pushq %rbp -.Ltmp9: .cfi_def_cfa_offset 16 -.Ltmp10: .cfi_offset %rbp, -16 movq %rsp, %rbp -.Ltmp11: .cfi_def_cfa_register %rbp pushq %r15 pushq %r14 + pushq %r13 pushq %r12 pushq %rbx -.Ltmp12: - .cfi_offset %rbx, -48 -.Ltmp13: - .cfi_offset %r12, -40 -.Ltmp14: + pushq %rax + .cfi_offset %rbx, -56 + .cfi_offset %r12, -48 + .cfi_offset %r13, -40 .cfi_offset %r14, -32 -.Ltmp15: .cfi_offset %r15, -24 - xorl %r14d, %r14d - movl $C, %r15d - .align 16, 0x90 + leaq C(%rip), %r13 + xorl %eax, %eax + movl $3435973837, %r12d # imm = 0xCCCCCCCD + leaq .L.str(%rip), %r14 + .p2align 4, 0x90 .LBB1_1: # %for.cond1.preheader # =>This Loop Header: Depth=1 # Child Loop BB1_2 Depth 2 - movq stdout(%rip), %rax - movq %r15, %r12 + movq %rax, -48(%rbp) # 8-byte Spill + movq stdout(%rip), %rsi xorl %ebx, %ebx - .align 16, 0x90 + .p2align 4, 0x90 .LBB1_2: # %for.body3 # Parent Loop BB1_1 Depth=1 # => This Inner Loop Header: Depth=2 - vmovss (%r12), %xmm0 - vcvtss2sd %xmm0, %xmm0, %xmm0 - movq %rax, %rdi - movl $.L.str, %esi + movl %ebx, %eax + imulq %r12, %rax + shrq $38, %rax + leal (%rax,%rax,4), %r15d + shll $4, %r15d + addl $79, %r15d + movss (%r13,%rbx,4), %xmm0 # xmm0 = mem[0],zero,zero,zero + cvtss2sd %xmm0, %xmm0 movb $1, %al + movq %rsi, %rdi + movq %r14, %rsi callq fprintf - movslq %ebx, %rax - imulq $1717986919, %rax, %rcx # imm = 0x66666667 - movq %rcx, %rdx - shrq $63, %rdx - sarq $37, %rcx - addl %edx, %ecx - imull $80, %ecx, %ecx - subl %ecx, %eax - cmpl $79, %eax + cmpl %ebx, %r15d jne .LBB1_4 -# BB#3: # %if.then +# %bb.3: # %if.then # in Loop: Header=BB1_2 Depth=2 movq stdout(%rip), %rsi movl $10, %edi - callq fputc + callq fputc@PLT .LBB1_4: # %for.inc # in Loop: Header=BB1_2 Depth=2 - addq $4, %r12 - incq %rbx - movq stdout(%rip), %rax + addq $1, %rbx + movq stdout(%rip), %rsi cmpq $1536, %rbx # imm = 0x600 jne .LBB1_2 -# BB#5: # %for.end +# %bb.5: # %for.end # in Loop: Header=BB1_1 Depth=1 movl $10, %edi - movq %rax, %rsi - callq fputc - addq $6144, %r15 # imm = 0x1800 - incq %r14 - cmpq $1536, %r14 # imm = 0x600 + callq fputc@PLT + movq -48(%rbp), %rax # 8-byte Reload + addq $1, %rax + addq $6144, %r13 # imm = 0x1800 + cmpq $1536, %rax # imm = 0x600 jne .LBB1_1 -# BB#6: # %for.end12 +# %bb.6: # %for.end12 + addq $8, %rsp popq %rbx popq %r12 + popq %r13 popq %r14 popq %r15 popq %rbp - ret -.Ltmp16: - .size print_array, .Ltmp16-print_array + .cfi_def_cfa %rsp, 8 + retq +.Lfunc_end1: + .size print_array, .Lfunc_end1-print_array .cfi_endproc - - .section .rodata.cst8,"aM",@progbits,8 - .align 8 -.LCPI2_0: - .quad 4602678819172646912 # double 0.5 - .text - .globl main - .align 16, 0x90 + # -- End function + .globl main # -- Begin function main + .p2align 4, 0x90 .type main,@function main: # @main .cfi_startproc -# BB#0: # %entry +# %bb.0: # %entry pushq %rbp -.Ltmp20: .cfi_def_cfa_offset 16 -.Ltmp21: .cfi_offset %rbp, -16 movq %rsp, %rbp -.Ltmp22: .cfi_def_cfa_register %rbp pushq %r15 pushq %r14 pushq %r13 pushq %r12 pushq %rbx - subq $56, %rsp -.Ltmp23: + subq $344, %rsp # imm = 0x158 .cfi_offset %rbx, -56 -.Ltmp24: .cfi_offset %r12, -48 -.Ltmp25: .cfi_offset %r13, -40 -.Ltmp26: .cfi_offset %r14, -32 -.Ltmp27: .cfi_offset %r15, -24 - xorl %ebx, %ebx - vmovsd .LCPI2_0(%rip), %xmm0 - .align 16, 0x90 -.LBB2_1: # %polly.loop_preheader3.i - # =>This Loop Header: Depth=1 - # Child Loop BB2_2 Depth 2 - xorl %ecx, %ecx - .align 16, 0x90 -.LBB2_2: # %polly.loop_header2.i - # Parent Loop BB2_1 Depth=1 - # => This Inner Loop Header: Depth=2 - movl %ecx, %edx - imull %ebx, %edx - movl %edx, %esi - sarl $31, %esi - shrl $22, %esi - addl %edx, %esi - andl $-1024, %esi # imm = 0xFFFFFFFFFFFFFC00 - negl %esi - movq %rbx, %rax - shlq $11, %rax - leal 1(%rdx,%rsi), %edi - leaq (%rax,%rax,2), %rsi - leaq 1(%rcx), %rdx - cmpq $1536, %rdx # imm = 0x600 - vcvtsi2sdl %edi, %xmm0, %xmm1 - vmulsd %xmm0, %xmm1, %xmm1 - vcvtsd2ss %xmm1, %xmm1, %xmm1 - vmovss %xmm1, A(%rsi,%rcx,4) - vmovss %xmm1, B(%rsi,%rcx,4) - movq %rdx, %rcx - jne .LBB2_2 -# BB#3: # %polly.loop_exit4.i - # in Loop: Header=BB2_1 Depth=1 - incq %rbx - cmpq $1536, %rbx # imm = 0x600 - jne .LBB2_1 -# BB#4: # %polly.loop_preheader3.preheader - movl $C, %ebx - movl $C, %edi + callq init_array + leaq C(%rip), %rdi + xorl %eax, %eax + movq %rax, -48(%rbp) # 8-byte Spill xorl %esi, %esi movl $9437184, %edx # imm = 0x900000 - callq memset - xorl %eax, %eax - .align 16, 0x90 -.LBB2_5: # %polly.loop_preheader17 - # =>This Loop Header: Depth=1 - # Child Loop BB2_15 Depth 2 - # Child Loop BB2_8 Depth 3 - # Child Loop BB2_11 Depth 4 - # Child Loop BB2_17 Depth 5 - # Child Loop BB2_18 Depth 6 + callq memset@PLT + movl $64, %eax + movq %rax, -64(%rbp) # 8-byte Spill + leaq A(%rip), %rax movq %rax, -56(%rbp) # 8-byte Spill - movq %rbx, -88(%rbp) # 8-byte Spill - movq %rax, %rcx - orq $63, %rcx - movq %rcx, -72(%rbp) # 8-byte Spill - leaq -1(%rcx), %rcx - movq %rcx, -48(%rbp) # 8-byte Spill - movq $-1, %r15 - movl $B, %ecx - movq %rbx, -64(%rbp) # 8-byte Spill - xorl %r12d, %r12d - .align 16, 0x90 -.LBB2_15: # %polly.loop_preheader24 - # Parent Loop BB2_5 Depth=1 + .p2align 4, 0x90 +.LBB2_1: # %polly.loop_header8 + # =>This Loop Header: Depth=1 + # Child Loop BB2_2 Depth 2 + # Child Loop BB2_3 Depth 3 + # Child Loop BB2_4 Depth 4 + # Child Loop BB2_5 Depth 5 + leaq B+240(%rip), %rax + xorl %edi, %edi + .p2align 4, 0x90 +.LBB2_2: # %polly.loop_header14 + # Parent Loop BB2_1 Depth=1 # => This Loop Header: Depth=2 - # Child Loop BB2_8 Depth 3 - # Child Loop BB2_11 Depth 4 - # Child Loop BB2_17 Depth 5 - # Child Loop BB2_18 Depth 6 + # Child Loop BB2_3 Depth 3 + # Child Loop BB2_4 Depth 4 + # Child Loop BB2_5 Depth 5 + movq %rdi, %rcx + orq $4, %rcx movq %rcx, -80(%rbp) # 8-byte Spill - movq %r12, %r13 - orq $63, %r13 - leaq -1(%r13), %rbx - xorl %r9d, %r9d - movq %rcx, %rdx - .align 16, 0x90 -.LBB2_8: # %polly.loop_header23 - # Parent Loop BB2_5 Depth=1 - # Parent Loop BB2_15 Depth=2 + movq %rdi, %rcx + orq $8, %rcx + movq %rcx, -264(%rbp) # 8-byte Spill + movq %rdi, %rcx + orq $12, %rcx + movq %rcx, -256(%rbp) # 8-byte Spill + movq %rdi, %rcx + orq $16, %rcx + movq %rcx, -248(%rbp) # 8-byte Spill + movq %rdi, %rcx + orq $20, %rcx + movq %rcx, -240(%rbp) # 8-byte Spill + movq %rdi, %rcx + orq $24, %rcx + movq %rcx, -232(%rbp) # 8-byte Spill + movq %rdi, %rcx + orq $28, %rcx + movq %rcx, -224(%rbp) # 8-byte Spill + movq %rdi, %rcx + orq $32, %rcx + movq %rcx, -216(%rbp) # 8-byte Spill + movq %rdi, %rcx + orq $36, %rcx + movq %rcx, -208(%rbp) # 8-byte Spill + movq %rdi, %rcx + orq $40, %rcx + movq %rcx, -200(%rbp) # 8-byte Spill + movq %rdi, %rcx + orq $44, %rcx + movq %rcx, -192(%rbp) # 8-byte Spill + movq %rdi, %rcx + orq $48, %rcx + movq %rcx, -184(%rbp) # 8-byte Spill + movq %rdi, %rcx + orq $52, %rcx + movq %rcx, -176(%rbp) # 8-byte Spill + movq %rdi, %rcx + orq $56, %rcx + movq %rcx, -168(%rbp) # 8-byte Spill + movq %rdi, %rcx + orq $60, %rcx + movq %rcx, -160(%rbp) # 8-byte Spill + movq -56(%rbp), %rdx # 8-byte Reload + movq %rax, -136(%rbp) # 8-byte Spill + movq %rax, -72(%rbp) # 8-byte Spill + xorl %eax, %eax + movq %rdi, -272(%rbp) # 8-byte Spill + .p2align 4, 0x90 +.LBB2_3: # %polly.loop_header20 + # Parent Loop BB2_1 Depth=1 + # Parent Loop BB2_2 Depth=2 # => This Loop Header: Depth=3 - # Child Loop BB2_11 Depth 4 - # Child Loop BB2_17 Depth 5 - # Child Loop BB2_18 Depth 6 - cmpq -72(%rbp), %rax # 8-byte Folded Reload - jg .LBB2_13 -# BB#9: # %polly.loop_header30.preheader - # in Loop: Header=BB2_8 Depth=3 - movq %r9, %rax - orq $63, %rax - cmpq %rax, %r9 - jg .LBB2_13 -# BB#10: # in Loop: Header=BB2_8 Depth=3 - decq %rax - movq -64(%rbp), %r10 # 8-byte Reload - movq -56(%rbp), %r11 # 8-byte Reload - .align 16, 0x90 -.LBB2_11: # %polly.loop_header37.preheader - # Parent Loop BB2_5 Depth=1 - # Parent Loop BB2_15 Depth=2 - # Parent Loop BB2_8 Depth=3 + # Child Loop BB2_4 Depth 4 + # Child Loop BB2_5 Depth 5 + movq %rax, -144(%rbp) # 8-byte Spill + movq %rdx, -152(%rbp) # 8-byte Spill + movq -48(%rbp), %rax # 8-byte Reload + .p2align 4, 0x90 +.LBB2_4: # %polly.loop_header26 + # Parent Loop BB2_1 Depth=1 + # Parent Loop BB2_2 Depth=2 + # Parent Loop BB2_3 Depth=3 # => This Loop Header: Depth=4 - # Child Loop BB2_17 Depth 5 - # Child Loop BB2_18 Depth 6 - cmpq %r13, %r12 - movq %rdx, %r14 - movq %r9, %rcx - jg .LBB2_12 - .align 16, 0x90 -.LBB2_17: # %polly.loop_header46.preheader - # Parent Loop BB2_5 Depth=1 - # Parent Loop BB2_15 Depth=2 - # Parent Loop BB2_8 Depth=3 - # Parent Loop BB2_11 Depth=4 - # => This Loop Header: Depth=5 - # Child Loop BB2_18 Depth 6 - leaq (%r11,%r11,2), %rsi - shlq $11, %rsi - vmovss A(%rsi,%rcx,4), %xmm0 - movq %r10, %rdi - movq %r14, %r8 - movq %r15, %rsi -.LBB2_18: # %polly.loop_header46 - # Parent Loop BB2_5 Depth=1 - # Parent Loop BB2_15 Depth=2 - # Parent Loop BB2_8 Depth=3 - # Parent Loop BB2_11 Depth=4 - # Parent Loop BB2_17 Depth=5 - # => This Inner Loop Header: Depth=6 - vmulss (%r8), %xmm0, %xmm1 - vaddss (%rdi), %xmm1, %xmm1 - vmovss %xmm1, (%rdi) - addq $4, %rdi - addq $4, %r8 - incq %rsi - cmpq %rbx, %rsi - jle .LBB2_18 -# BB#16: # %polly.loop_exit48 - # in Loop: Header=BB2_17 Depth=5 - addq $6144, %r14 # imm = 0x1800 - cmpq %rax, %rcx - leaq 1(%rcx), %rcx - jle .LBB2_17 - .align 16, 0x90 -.LBB2_12: # %polly.loop_exit39 - # in Loop: Header=BB2_11 Depth=4 - addq $6144, %r10 # imm = 0x1800 - cmpq -48(%rbp), %r11 # 8-byte Folded Reload - leaq 1(%r11), %r11 - jle .LBB2_11 - .align 16, 0x90 -.LBB2_13: # %polly.loop_exit32 - # in Loop: Header=BB2_8 Depth=3 - addq $393216, %rdx # imm = 0x60000 - cmpq $1472, %r9 # imm = 0x5C0 - leaq 64(%r9), %r9 - movq -56(%rbp), %rax # 8-byte Reload - jl .LBB2_8 -# BB#14: # %polly.loop_exit25 - # in Loop: Header=BB2_15 Depth=2 - addq $256, -64(%rbp) # 8-byte Folded Spill - # imm = 0x100 + # Child Loop BB2_5 Depth 5 + movq %rax, -376(%rbp) # 8-byte Spill + leaq (%rax,%rax,2), %rax + shlq $11, %rax + leaq C(%rip), %rsi + addq %rsi, %rax + leaq (%rax,%rdi,4), %rcx + movq %rcx, -368(%rbp) # 8-byte Spill + movq -80(%rbp), %rcx # 8-byte Reload + leaq (%rax,%rcx,4), %rcx + movq %rcx, -360(%rbp) # 8-byte Spill + movq -264(%rbp), %rbx # 8-byte Reload + leaq (%rax,%rbx,4), %rcx + movq %rcx, -352(%rbp) # 8-byte Spill + movq -256(%rbp), %r8 # 8-byte Reload + movq %rdi, %rsi + leaq (%rax,%r8,4), %rdi + movq %rdi, -344(%rbp) # 8-byte Spill + movq -248(%rbp), %rdi # 8-byte Reload + leaq (%rax,%rdi,4), %rcx + movq %rcx, -336(%rbp) # 8-byte Spill + movq -240(%rbp), %r9 # 8-byte Reload + leaq (%rax,%r9,4), %rcx + movq %rcx, -328(%rbp) # 8-byte Spill + movq -232(%rbp), %r10 # 8-byte Reload + leaq (%rax,%r10,4), %rcx + movq %rcx, -320(%rbp) # 8-byte Spill + movq -224(%rbp), %r14 # 8-byte Reload + leaq (%rax,%r14,4), %rcx + movq %rcx, -312(%rbp) # 8-byte Spill + movq -216(%rbp), %r15 # 8-byte Reload + leaq (%rax,%r15,4), %rcx + movq %rcx, -304(%rbp) # 8-byte Spill + movq -208(%rbp), %r12 # 8-byte Reload + leaq (%rax,%r12,4), %rcx + movq %rcx, -296(%rbp) # 8-byte Spill + movq -200(%rbp), %r13 # 8-byte Reload + leaq (%rax,%r13,4), %rcx + movq %rcx, -288(%rbp) # 8-byte Spill + movq -192(%rbp), %r11 # 8-byte Reload + leaq (%rax,%r11,4), %rcx + movq %rcx, -280(%rbp) # 8-byte Spill + movaps (%rax,%rsi,4), %xmm15 movq -80(%rbp), %rcx # 8-byte Reload - addq $256, %rcx # imm = 0x100 - addq $64, %r15 - cmpq $1472, %r12 # imm = 0x5C0 - leaq 64(%r12), %r12 - jl .LBB2_15 -# BB#6: # %polly.loop_exit18 - # in Loop: Header=BB2_5 Depth=1 - movq -88(%rbp), %rbx # 8-byte Reload - addq $393216, %rbx # imm = 0x60000 - cmpq $1472, %rax # imm = 0x5C0 - leaq 64(%rax), %rax - jl .LBB2_5 -# BB#7: # %polly.loop_exit11 + movaps (%rax,%rcx,4), %xmm14 + movaps (%rax,%rbx,4), %xmm13 + movaps (%rax,%r8,4), %xmm12 + movaps (%rax,%rdi,4), %xmm11 + movaps (%rax,%r9,4), %xmm10 + movaps (%rax,%r10,4), %xmm9 + movaps (%rax,%r14,4), %xmm8 + movaps (%rax,%r15,4), %xmm7 + movaps (%rax,%r12,4), %xmm6 + movaps (%rax,%r13,4), %xmm5 + movaps (%rax,%r11,4), %xmm4 + movq -184(%rbp), %rcx # 8-byte Reload + movaps (%rax,%rcx,4), %xmm3 + movq -176(%rbp), %rsi # 8-byte Reload + movaps (%rax,%rsi,4), %xmm0 + movaps %xmm0, -96(%rbp) # 16-byte Spill + movq -168(%rbp), %rbx # 8-byte Reload + movaps (%rax,%rbx,4), %xmm0 + movaps %xmm0, -112(%rbp) # 16-byte Spill + movq -160(%rbp), %rdi # 8-byte Reload + movaps (%rax,%rdi,4), %xmm0 + movaps %xmm0, -128(%rbp) # 16-byte Spill + leaq (%rax,%rcx,4), %r8 + leaq (%rax,%rsi,4), %rcx + leaq (%rax,%rbx,4), %rsi + leaq (%rax,%rdi,4), %rax + movq -72(%rbp), %r9 # 8-byte Reload + movl $0, %r10d + .p2align 4, 0x90 +.LBB2_5: # %vector.ph + # Parent Loop BB2_1 Depth=1 + # Parent Loop BB2_2 Depth=2 + # Parent Loop BB2_3 Depth=3 + # Parent Loop BB2_4 Depth=4 + # => This Inner Loop Header: Depth=5 + movss (%rdx,%r10,4), %xmm0 # xmm0 = mem[0],zero,zero,zero + shufps $0, %xmm0, %xmm0 # xmm0 = xmm0[0,0,0,0] + movaps -240(%r9), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm15 + movaps -224(%r9), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm14 + movaps -208(%r9), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm13 + movaps -192(%r9), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm12 + movaps -176(%r9), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm11 + movaps -160(%r9), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm10 + movaps -144(%r9), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm9 + movaps -128(%r9), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movaps -112(%r9), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm7 + movaps -96(%r9), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm6 + movaps -80(%r9), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm5 + movaps -64(%r9), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm4 + movaps -48(%r9), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm3 + movaps -32(%r9), %xmm1 + mulps %xmm0, %xmm1 + movaps -96(%rbp), %xmm2 # 16-byte Reload + addps %xmm1, %xmm2 + movaps %xmm2, -96(%rbp) # 16-byte Spill + movaps -16(%r9), %xmm1 + mulps %xmm0, %xmm1 + movaps -112(%rbp), %xmm2 # 16-byte Reload + addps %xmm1, %xmm2 + movaps %xmm2, -112(%rbp) # 16-byte Spill + mulps (%r9), %xmm0 + movaps -128(%rbp), %xmm1 # 16-byte Reload + addps %xmm0, %xmm1 + movaps %xmm1, -128(%rbp) # 16-byte Spill + addq $1, %r10 + addq $6144, %r9 # imm = 0x1800 + cmpq $64, %r10 + jne .LBB2_5 +# %bb.6: # %polly.loop_exit34 + # in Loop: Header=BB2_4 Depth=4 + movq -368(%rbp), %rdi # 8-byte Reload + movaps %xmm15, (%rdi) + movq -360(%rbp), %rdi # 8-byte Reload + movaps %xmm14, (%rdi) + movq -352(%rbp), %rdi # 8-byte Reload + movaps %xmm13, (%rdi) + movq -344(%rbp), %rdi # 8-byte Reload + movaps %xmm12, (%rdi) + movq -336(%rbp), %rdi # 8-byte Reload + movaps %xmm11, (%rdi) + movq -328(%rbp), %rdi # 8-byte Reload + movaps %xmm10, (%rdi) + movq -320(%rbp), %rdi # 8-byte Reload + movaps %xmm9, (%rdi) + movq -312(%rbp), %rdi # 8-byte Reload + movaps %xmm8, (%rdi) + movq -304(%rbp), %rdi # 8-byte Reload + movaps %xmm7, (%rdi) + movq -296(%rbp), %rdi # 8-byte Reload + movaps %xmm6, (%rdi) + movq -288(%rbp), %rdi # 8-byte Reload + movaps %xmm5, (%rdi) + movq -280(%rbp), %rdi # 8-byte Reload + movaps %xmm4, (%rdi) + movaps %xmm3, (%r8) + movaps -96(%rbp), %xmm0 # 16-byte Reload + movaps %xmm0, (%rcx) + movaps -112(%rbp), %xmm0 # 16-byte Reload + movaps %xmm0, (%rsi) + movaps -128(%rbp), %xmm0 # 16-byte Reload + movaps %xmm0, (%rax) + movq -376(%rbp), %rax # 8-byte Reload + addq $1, %rax + addq $6144, %rdx # imm = 0x1800 + cmpq -64(%rbp), %rax # 8-byte Folded Reload + movq -272(%rbp), %rdi # 8-byte Reload + jne .LBB2_4 +# %bb.7: # %polly.loop_exit28 + # in Loop: Header=BB2_3 Depth=3 + movq -144(%rbp), %rax # 8-byte Reload + addq $64, %rax + addq $393216, -72(%rbp) # 8-byte Folded Spill + # imm = 0x60000 + movq -152(%rbp), %rdx # 8-byte Reload + addq $256, %rdx # imm = 0x100 + cmpq $1536, %rax # imm = 0x600 + jb .LBB2_3 +# %bb.8: # %polly.loop_exit22 + # in Loop: Header=BB2_2 Depth=2 + addq $64, %rdi + movq -136(%rbp), %rax # 8-byte Reload + addq $256, %rax # imm = 0x100 + cmpq $1536, %rdi # imm = 0x600 + jb .LBB2_2 +# %bb.9: # %polly.loop_exit16 + # in Loop: Header=BB2_1 Depth=1 + movq -48(%rbp), %rax # 8-byte Reload + movq %rax, %rcx + addq $64, %rcx + addq $64, -64(%rbp) # 8-byte Folded Spill + addq $393216, -56(%rbp) # 8-byte Folded Spill + # imm = 0x60000 + movq %rcx, %rax + movq %rcx, -48(%rbp) # 8-byte Spill + cmpq $1536, %rcx # imm = 0x600 + jb .LBB2_1 +# %bb.10: # %polly.exiting xorl %eax, %eax - addq $56, %rsp + addq $344, %rsp # imm = 0x158 popq %rbx popq %r12 popq %r13 popq %r14 popq %r15 popq %rbp - ret -.Ltmp28: - .size main, .Ltmp28-main + .cfi_def_cfa %rsp, 8 + retq +.Lfunc_end2: + .size main, .Lfunc_end2-main .cfi_endproc - + # -- End function .type A,@object # @A .comm A,9437184,16 .type B,@object # @B @@ -381,10 +497,11 @@ main: # @main .type .L.str,@object # @.str .section .rodata.str1.1,"aMS",@progbits,1 .L.str: - .asciz "%lf " + .asciz "%lf " .size .L.str, 5 .type C,@object # @C .comm C,9437184,16 + .ident "clang version 8.0.0 (trunk 342834) (llvm/trunk 342856)" .section ".note.GNU-stack","",@progbits diff --git a/polly/docs/experiments/matmul/matmul.polly.interchanged.exe b/polly/docs/experiments/matmul/matmul.polly.interchanged.exe Binary files differdeleted file mode 100755 index 240c95a7f79..00000000000 --- a/polly/docs/experiments/matmul/matmul.polly.interchanged.exe +++ /dev/null diff --git a/polly/docs/experiments/matmul/matmul.polly.interchanged.ll b/polly/docs/experiments/matmul/matmul.polly.interchanged.ll Binary files differindex 52fbccc7ed5..f56171df7a0 100644 --- a/polly/docs/experiments/matmul/matmul.polly.interchanged.ll +++ b/polly/docs/experiments/matmul/matmul.polly.interchanged.ll diff --git a/polly/docs/experiments/matmul/matmul.polly.interchanged.s b/polly/docs/experiments/matmul/matmul.polly.interchanged.s index a764da0b3f2..21770b0a917 100644 --- a/polly/docs/experiments/matmul/matmul.polly.interchanged.s +++ b/polly/docs/experiments/matmul/matmul.polly.interchanged.s @@ -1,275 +1,248 @@ - .file "matmul.polly.interchanged.ll" + .text + .file "matmul.c" .section .rodata.cst8,"aM",@progbits,8 - .align 8 + .p2align 3 # -- Begin function init_array .LCPI0_0: .quad 4602678819172646912 # double 0.5 .text .globl init_array - .align 16, 0x90 + .p2align 4, 0x90 .type init_array,@function init_array: # @init_array .cfi_startproc -# BB#0: # %entry +# %bb.0: # %entry pushq %rbp -.Ltmp2: .cfi_def_cfa_offset 16 -.Ltmp3: .cfi_offset %rbp, -16 movq %rsp, %rbp -.Ltmp4: .cfi_def_cfa_register %rbp + leaq B(%rip), %rax + leaq A(%rip), %rcx xorl %r8d, %r8d - vmovsd .LCPI0_0(%rip), %xmm0 - .align 16, 0x90 -.LBB0_1: # %polly.loop_preheader3 + movsd .LCPI0_0(%rip), %xmm0 # xmm0 = mem[0],zero + xorl %r9d, %r9d + .p2align 4, 0x90 +.LBB0_1: # %polly.loop_header # =>This Loop Header: Depth=1 # Child Loop BB0_2 Depth 2 - xorl %ecx, %ecx - .align 16, 0x90 -.LBB0_2: # %polly.loop_header2 + movl $1, %edi + xorl %edx, %edx + .p2align 4, 0x90 +.LBB0_2: # %polly.loop_header1 # Parent Loop BB0_1 Depth=1 # => This Inner Loop Header: Depth=2 - movl %ecx, %edx - imull %r8d, %edx movl %edx, %esi - sarl $31, %esi - shrl $22, %esi - addl %edx, %esi - andl $-1024, %esi # imm = 0xFFFFFFFFFFFFFC00 - negl %esi - movq %r8, %rax - shlq $11, %rax - leal 1(%rdx,%rsi), %edi - leaq (%rax,%rax,2), %rsi - leaq 1(%rcx), %rdx - cmpq $1536, %rdx # imm = 0x600 - vcvtsi2sdl %edi, %xmm0, %xmm1 - vmulsd %xmm0, %xmm1, %xmm1 - vcvtsd2ss %xmm1, %xmm1, %xmm1 - vmovss %xmm1, A(%rsi,%rcx,4) - vmovss %xmm1, B(%rsi,%rcx,4) - movq %rdx, %rcx + andl $1022, %esi # imm = 0x3FE + orl $1, %esi + xorps %xmm1, %xmm1 + cvtsi2sdl %esi, %xmm1 + mulsd %xmm0, %xmm1 + cvtsd2ss %xmm1, %xmm1 + movss %xmm1, -4(%rcx,%rdi,4) + movss %xmm1, -4(%rax,%rdi,4) + leal (%r9,%rdx), %esi + andl $1023, %esi # imm = 0x3FF + addl $1, %esi + xorps %xmm1, %xmm1 + cvtsi2sdl %esi, %xmm1 + mulsd %xmm0, %xmm1 + cvtsd2ss %xmm1, %xmm1 + movss %xmm1, (%rcx,%rdi,4) + movss %xmm1, (%rax,%rdi,4) + addq $2, %rdi + addl %r8d, %edx + cmpq $1537, %rdi # imm = 0x601 jne .LBB0_2 -# BB#3: # %polly.loop_exit4 +# %bb.3: # %polly.loop_exit3 # in Loop: Header=BB0_1 Depth=1 - incq %r8 - cmpq $1536, %r8 # imm = 0x600 + addq $1, %r9 + addq $6144, %rax # imm = 0x1800 + addq $6144, %rcx # imm = 0x1800 + addl $2, %r8d + cmpq $1536, %r9 # imm = 0x600 jne .LBB0_1 -# BB#4: # %polly.loop_exit +# %bb.4: # %polly.exiting popq %rbp - ret -.Ltmp5: - .size init_array, .Ltmp5-init_array + .cfi_def_cfa %rsp, 8 + retq +.Lfunc_end0: + .size init_array, .Lfunc_end0-init_array .cfi_endproc - - .globl print_array - .align 16, 0x90 + # -- End function + .globl print_array # -- Begin function print_array + .p2align 4, 0x90 .type print_array,@function print_array: # @print_array .cfi_startproc -# BB#0: # %entry +# %bb.0: # %entry pushq %rbp -.Ltmp9: .cfi_def_cfa_offset 16 -.Ltmp10: .cfi_offset %rbp, -16 movq %rsp, %rbp -.Ltmp11: .cfi_def_cfa_register %rbp pushq %r15 pushq %r14 + pushq %r13 pushq %r12 pushq %rbx -.Ltmp12: - .cfi_offset %rbx, -48 -.Ltmp13: - .cfi_offset %r12, -40 -.Ltmp14: + pushq %rax + .cfi_offset %rbx, -56 + .cfi_offset %r12, -48 + .cfi_offset %r13, -40 .cfi_offset %r14, -32 -.Ltmp15: .cfi_offset %r15, -24 - xorl %r14d, %r14d - movl $C, %r15d - .align 16, 0x90 + leaq C(%rip), %r13 + xorl %eax, %eax + movl $3435973837, %r12d # imm = 0xCCCCCCCD + leaq .L.str(%rip), %r14 + .p2align 4, 0x90 .LBB1_1: # %for.cond1.preheader # =>This Loop Header: Depth=1 # Child Loop BB1_2 Depth 2 - movq stdout(%rip), %rax - movq %r15, %r12 + movq %rax, -48(%rbp) # 8-byte Spill + movq stdout(%rip), %rsi xorl %ebx, %ebx - .align 16, 0x90 + .p2align 4, 0x90 .LBB1_2: # %for.body3 # Parent Loop BB1_1 Depth=1 # => This Inner Loop Header: Depth=2 - vmovss (%r12), %xmm0 - vcvtss2sd %xmm0, %xmm0, %xmm0 - movq %rax, %rdi - movl $.L.str, %esi + movl %ebx, %eax + imulq %r12, %rax + shrq $38, %rax + leal (%rax,%rax,4), %r15d + shll $4, %r15d + addl $79, %r15d + movss (%r13,%rbx,4), %xmm0 # xmm0 = mem[0],zero,zero,zero + cvtss2sd %xmm0, %xmm0 movb $1, %al + movq %rsi, %rdi + movq %r14, %rsi callq fprintf - movslq %ebx, %rax - imulq $1717986919, %rax, %rcx # imm = 0x66666667 - movq %rcx, %rdx - shrq $63, %rdx - sarq $37, %rcx - addl %edx, %ecx - imull $80, %ecx, %ecx - subl %ecx, %eax - cmpl $79, %eax + cmpl %ebx, %r15d jne .LBB1_4 -# BB#3: # %if.then +# %bb.3: # %if.then # in Loop: Header=BB1_2 Depth=2 movq stdout(%rip), %rsi movl $10, %edi - callq fputc + callq fputc@PLT .LBB1_4: # %for.inc # in Loop: Header=BB1_2 Depth=2 - addq $4, %r12 - incq %rbx - movq stdout(%rip), %rax + addq $1, %rbx + movq stdout(%rip), %rsi cmpq $1536, %rbx # imm = 0x600 jne .LBB1_2 -# BB#5: # %for.end +# %bb.5: # %for.end # in Loop: Header=BB1_1 Depth=1 movl $10, %edi - movq %rax, %rsi - callq fputc - addq $6144, %r15 # imm = 0x1800 - incq %r14 - cmpq $1536, %r14 # imm = 0x600 + callq fputc@PLT + movq -48(%rbp), %rax # 8-byte Reload + addq $1, %rax + addq $6144, %r13 # imm = 0x1800 + cmpq $1536, %rax # imm = 0x600 jne .LBB1_1 -# BB#6: # %for.end12 +# %bb.6: # %for.end12 + addq $8, %rsp popq %rbx popq %r12 + popq %r13 popq %r14 popq %r15 popq %rbp - ret -.Ltmp16: - .size print_array, .Ltmp16-print_array + .cfi_def_cfa %rsp, 8 + retq +.Lfunc_end1: + .size print_array, .Lfunc_end1-print_array .cfi_endproc - - .section .rodata.cst8,"aM",@progbits,8 - .align 8 -.LCPI2_0: - .quad 4602678819172646912 # double 0.5 - .text - .globl main - .align 16, 0x90 + # -- End function + .globl main # -- Begin function main + .p2align 4, 0x90 .type main,@function main: # @main .cfi_startproc -# BB#0: # %entry +# %bb.0: # %entry pushq %rbp -.Ltmp20: .cfi_def_cfa_offset 16 -.Ltmp21: .cfi_offset %rbp, -16 movq %rsp, %rbp -.Ltmp22: .cfi_def_cfa_register %rbp pushq %r14 pushq %rbx -.Ltmp23: .cfi_offset %rbx, -32 -.Ltmp24: .cfi_offset %r14, -24 - xorl %ebx, %ebx - vmovsd .LCPI2_0(%rip), %xmm0 - .align 16, 0x90 -.LBB2_1: # %polly.loop_preheader3.i - # =>This Loop Header: Depth=1 - # Child Loop BB2_2 Depth 2 - xorl %ecx, %ecx - .align 16, 0x90 -.LBB2_2: # %polly.loop_header2.i - # Parent Loop BB2_1 Depth=1 - # => This Inner Loop Header: Depth=2 - movl %ecx, %edx - imull %ebx, %edx - movl %edx, %esi - sarl $31, %esi - shrl $22, %esi - addl %edx, %esi - andl $-1024, %esi # imm = 0xFFFFFFFFFFFFFC00 - negl %esi - movq %rbx, %rax - shlq $11, %rax - leal 1(%rdx,%rsi), %edi - leaq (%rax,%rax,2), %rsi - leaq 1(%rcx), %rdx - cmpq $1536, %rdx # imm = 0x600 - vcvtsi2sdl %edi, %xmm0, %xmm1 - vmulsd %xmm0, %xmm1, %xmm1 - vcvtsd2ss %xmm1, %xmm1, %xmm1 - vmovss %xmm1, A(%rsi,%rcx,4) - vmovss %xmm1, B(%rsi,%rcx,4) - movq %rdx, %rcx - jne .LBB2_2 -# BB#3: # %polly.loop_exit4.i - # in Loop: Header=BB2_1 Depth=1 - incq %rbx - cmpq $1536, %rbx # imm = 0x600 - jne .LBB2_1 -# BB#4: # %polly.loop_preheader3.preheader - movl $C, %r14d - movl $C, %edi + callq init_array + leaq C(%rip), %rbx + xorl %r14d, %r14d xorl %esi, %esi movl $9437184, %edx # imm = 0x900000 - callq memset - xorl %eax, %eax - .align 16, 0x90 -.LBB2_5: # %polly.loop_preheader17 + movq %rbx, %rdi + callq memset@PLT + leaq B(%rip), %rax + leaq A(%rip), %rcx + .p2align 4, 0x90 +.LBB2_1: # %polly.loop_header8 # =>This Loop Header: Depth=1 - # Child Loop BB2_10 Depth 2 - # Child Loop BB2_8 Depth 3 - movl $B, %ebx - xorl %edx, %edx - .align 16, 0x90 -.LBB2_10: # %polly.loop_preheader24 - # Parent Loop BB2_5 Depth=1 + # Child Loop BB2_2 Depth 2 + # Child Loop BB2_3 Depth 3 + movq %rax, %rdx + xorl %esi, %esi + .p2align 4, 0x90 +.LBB2_2: # %polly.loop_header14 + # Parent Loop BB2_1 Depth=1 # => This Loop Header: Depth=2 - # Child Loop BB2_8 Depth 3 - leaq (%rax,%rax,2), %rcx - shlq $11, %rcx - vmovss A(%rcx,%rdx,4), %xmm0 - movl $1536, %esi # imm = 0x600 - movq %r14, %rdi - movq %rbx, %rcx - .align 16, 0x90 -.LBB2_8: # %polly.loop_header23 - # Parent Loop BB2_5 Depth=1 - # Parent Loop BB2_10 Depth=2 + # Child Loop BB2_3 Depth 3 + leaq (%r14,%r14,2), %rdi + shlq $11, %rdi + addq %rcx, %rdi + movss (%rdi,%rsi,4), %xmm0 # xmm0 = mem[0],zero,zero,zero + shufps $0, %xmm0, %xmm0 # xmm0 = xmm0[0,0,0,0] + movl $12, %edi + .p2align 4, 0x90 +.LBB2_3: # %vector.body + # Parent Loop BB2_1 Depth=1 + # Parent Loop BB2_2 Depth=2 # => This Inner Loop Header: Depth=3 - vmulss (%rcx), %xmm0, %xmm1 - vaddss (%rdi), %xmm1, %xmm1 - vmovss %xmm1, (%rdi) - addq $4, %rdi - addq $4, %rcx - decq %rsi - jne .LBB2_8 -# BB#9: # %polly.loop_exit25 - # in Loop: Header=BB2_10 Depth=2 + movaps -48(%rdx,%rdi,4), %xmm1 + mulps %xmm0, %xmm1 + movaps -32(%rdx,%rdi,4), %xmm2 + mulps %xmm0, %xmm2 + addps -48(%rbx,%rdi,4), %xmm1 + addps -32(%rbx,%rdi,4), %xmm2 + movaps %xmm1, -48(%rbx,%rdi,4) + movaps %xmm2, -32(%rbx,%rdi,4) + movaps -16(%rdx,%rdi,4), %xmm1 + mulps %xmm0, %xmm1 + movaps (%rdx,%rdi,4), %xmm2 + mulps %xmm0, %xmm2 + addps -16(%rbx,%rdi,4), %xmm1 + addps (%rbx,%rdi,4), %xmm2 + movaps %xmm1, -16(%rbx,%rdi,4) + movaps %xmm2, (%rbx,%rdi,4) + addq $16, %rdi + cmpq $1548, %rdi # imm = 0x60C + jne .LBB2_3 +# %bb.4: # %polly.loop_exit22 + # in Loop: Header=BB2_2 Depth=2 + addq $1, %rsi + addq $6144, %rdx # imm = 0x1800 + cmpq $1536, %rsi # imm = 0x600 + jne .LBB2_2 +# %bb.5: # %polly.loop_exit16 + # in Loop: Header=BB2_1 Depth=1 + addq $1, %r14 addq $6144, %rbx # imm = 0x1800 - incq %rdx - cmpq $1536, %rdx # imm = 0x600 - jne .LBB2_10 -# BB#6: # %polly.loop_exit18 - # in Loop: Header=BB2_5 Depth=1 - addq $6144, %r14 # imm = 0x1800 - incq %rax - cmpq $1536, %rax # imm = 0x600 - jne .LBB2_5 -# BB#7: # %polly.loop_exit11 + cmpq $1536, %r14 # imm = 0x600 + jne .LBB2_1 +# %bb.6: # %polly.exiting xorl %eax, %eax popq %rbx popq %r14 popq %rbp - ret -.Ltmp25: - .size main, .Ltmp25-main + .cfi_def_cfa %rsp, 8 + retq +.Lfunc_end2: + .size main, .Lfunc_end2-main .cfi_endproc - + # -- End function .type A,@object # @A .comm A,9437184,16 .type B,@object # @B @@ -277,10 +250,11 @@ main: # @main .type .L.str,@object # @.str .section .rodata.str1.1,"aMS",@progbits,1 .L.str: - .asciz "%lf " + .asciz "%lf " .size .L.str, 5 .type C,@object # @C .comm C,9437184,16 + .ident "clang version 8.0.0 (trunk 342834) (llvm/trunk 342856)" .section ".note.GNU-stack","",@progbits diff --git a/polly/docs/experiments/matmul/matmul.preopt.ll b/polly/docs/experiments/matmul/matmul.preopt.ll index db536642574..6fe4352b220 100644 --- a/polly/docs/experiments/matmul/matmul.preopt.ll +++ b/polly/docs/experiments/matmul/matmul.preopt.ll @@ -1,4 +1,4 @@ -; ModuleID = 'matmul.s' +; ModuleID = 'matmul.ll' source_filename = "matmul.c" target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" @@ -6,15 +6,15 @@ target triple = "x86_64-unknown-linux-gnu" %struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] } %struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 } -@A = common global [1536 x [1536 x float]] zeroinitializer, align 16 -@B = common global [1536 x [1536 x float]] zeroinitializer, align 16 -@stdout = external global %struct._IO_FILE*, align 8 +@A = common dso_local global [1536 x [1536 x float]] zeroinitializer, align 16 +@B = common dso_local global [1536 x [1536 x float]] zeroinitializer, align 16 +@stdout = external dso_local global %struct._IO_FILE*, align 8 @.str = private unnamed_addr constant [5 x i8] c"%lf \00", align 1 -@C = common global [1536 x [1536 x float]] zeroinitializer, align 16 +@C = common dso_local global [1536 x [1536 x float]] zeroinitializer, align 16 @.str.1 = private unnamed_addr constant [2 x i8] c"\0A\00", align 1 -; Function Attrs: nounwind uwtable -define void @init_array() #0 { +; Function Attrs: noinline nounwind uwtable +define dso_local void @init_array() #0 { entry: br label %entry.split @@ -22,44 +22,37 @@ entry.split: ; preds = %entry br label %for.cond1.preheader for.cond1.preheader: ; preds = %entry.split, %for.inc17 - %indvars.iv5 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next6, %for.inc17 ] + %indvars.iv4 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next5, %for.inc17 ] br label %for.body3 for.body3: ; preds = %for.cond1.preheader, %for.body3 %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ] - %0 = mul nuw nsw i64 %indvars.iv, %indvars.iv5 + %0 = mul nuw nsw i64 %indvars.iv, %indvars.iv4 %1 = trunc i64 %0 to i32 - %rem = srem i32 %1, 1024 - %add = add nsw i32 %rem, 1 + %rem = and i32 %1, 1023 + %add = add nuw nsw i32 %rem, 1 %conv = sitofp i32 %add to double %div = fmul double %conv, 5.000000e-01 %conv4 = fptrunc double %div to float - %arrayidx6 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @A, i64 0, i64 %indvars.iv5, i64 %indvars.iv + %arrayidx6 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @A, i64 0, i64 %indvars.iv4, i64 %indvars.iv store float %conv4, float* %arrayidx6, align 4 - %2 = mul nuw nsw i64 %indvars.iv, %indvars.iv5 - %3 = trunc i64 %2 to i32 - %rem8 = srem i32 %3, 1024 - %add9 = add nsw i32 %rem8, 1 - %conv10 = sitofp i32 %add9 to double - %div11 = fmul double %conv10, 5.000000e-01 - %conv12 = fptrunc double %div11 to float - %arrayidx16 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @B, i64 0, i64 %indvars.iv5, i64 %indvars.iv - store float %conv12, float* %arrayidx16, align 4 + %arrayidx16 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @B, i64 0, i64 %indvars.iv4, i64 %indvars.iv + store float %conv4, float* %arrayidx16, align 4 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 %exitcond = icmp ne i64 %indvars.iv.next, 1536 br i1 %exitcond, label %for.body3, label %for.inc17 for.inc17: ; preds = %for.body3 - %indvars.iv.next6 = add nuw nsw i64 %indvars.iv5, 1 - %exitcond7 = icmp ne i64 %indvars.iv.next6, 1536 - br i1 %exitcond7, label %for.cond1.preheader, label %for.end19 + %indvars.iv.next5 = add nuw nsw i64 %indvars.iv4, 1 + %exitcond6 = icmp ne i64 %indvars.iv.next5, 1536 + br i1 %exitcond6, label %for.cond1.preheader, label %for.end19 for.end19: ; preds = %for.inc17 ret void } -; Function Attrs: nounwind uwtable -define void @print_array() #0 { +; Function Attrs: noinline nounwind uwtable +define dso_local void @print_array() #0 { entry: br label %entry.split @@ -79,7 +72,7 @@ for.body3: ; preds = %for.cond1.preheader %conv = fpext float %2 to double %call = tail call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %1, i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i64 0, i64 0), double %conv) #2 %3 = trunc i64 %indvars.iv to i32 - %rem = srem i32 %3, 80 + %rem = urem i32 %3, 80 %cmp6 = icmp eq i32 %rem, 79 br i1 %cmp6, label %if.then, label %for.inc @@ -105,10 +98,10 @@ for.end12: ; preds = %for.end ret void } -declare i32 @fprintf(%struct._IO_FILE*, i8*, ...) #1 +declare dso_local i32 @fprintf(%struct._IO_FILE*, i8*, ...) #1 -; Function Attrs: nounwind uwtable -define i32 @main() #0 { +; Function Attrs: noinline nounwind uwtable +define dso_local i32 @main() #0 { entry: br label %entry.split @@ -128,16 +121,14 @@ for.body3: ; preds = %for.cond1.preheader for.body8: ; preds = %for.body3, %for.body8 %indvars.iv = phi i64 [ 0, %for.body3 ], [ %indvars.iv.next, %for.body8 ] - %arrayidx12 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @C, i64 0, i64 %indvars.iv7, i64 %indvars.iv4 - %0 = load float, float* %arrayidx12, align 4 + %0 = load float, float* %arrayidx5, align 4 %arrayidx16 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @A, i64 0, i64 %indvars.iv7, i64 %indvars.iv %1 = load float, float* %arrayidx16, align 4 %arrayidx20 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @B, i64 0, i64 %indvars.iv, i64 %indvars.iv4 %2 = load float, float* %arrayidx20, align 4 %mul = fmul float %1, %2 %add = fadd float %0, %mul - %arrayidx24 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @C, i64 0, i64 %indvars.iv7, i64 %indvars.iv4 - store float %add, float* %arrayidx24, align 4 + store float %add, float* %arrayidx5, align 4 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 %exitcond = icmp ne i64 %indvars.iv.next, 1536 br i1 %exitcond, label %for.body8, label %for.inc25 @@ -162,10 +153,12 @@ declare i64 @fwrite(i8* nocapture, i64, i64, %struct._IO_FILE* nocapture) #2 ; Function Attrs: nounwind declare i32 @fputc(i32, %struct._IO_FILE* nocapture) #2 -attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #2 = { nounwind } -!llvm.ident = !{!0} +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} -!0 = !{!"clang version 4.0.0 (http://llvm.org/git/clang.git 081569d9a29c7bc827b2d41f8e62891bbc895e2f) (http://llvm.org/git/llvm.git e117e506536626352e8e47f6c72cd6e2a276622c)"} +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{!"clang version 8.0.0 (trunk 342834) (llvm/trunk 342856)"} diff --git a/polly/docs/experiments/matmul/runall.sh b/polly/docs/experiments/matmul/runall.sh index 575b58f9824..011d66bf6b4 100755 --- a/polly/docs/experiments/matmul/runall.sh +++ b/polly/docs/experiments/matmul/runall.sh @@ -1,85 +1,83 @@ #!/bin/sh -a echo "--> 1. Create LLVM-IR from C" -clang -S -emit-llvm matmul.c -o matmul.s +clang -S -emit-llvm matmul.c -Xclang -disable-O0-optnone -o matmul.ll echo "--> 2. Prepare the LLVM-IR for Polly" -opt -S -polly-canonicalize matmul.s > matmul.preopt.ll +opt -S -polly-canonicalize matmul.ll -o matmul.preopt.ll echo "--> 3. Show the SCoPs detected by Polly" -opt -basicaa -polly-ast -analyze -q matmul.preopt.ll \ - -polly-process-unprofitable +opt -basicaa -polly-ast -analyze matmul.preopt.ll \ + -polly-process-unprofitable -polly-use-llvm-names echo "--> 4.1 Highlight the detected SCoPs in the CFGs of the program" # We only create .dot files, as directly -view-scops directly calls graphviz # which would require user interaction to continue the script. # opt -basicaa -view-scops -disable-output matmul.preopt.ll -opt -basicaa -dot-scops -disable-output matmul.preopt.ll +opt -basicaa -dot-scops -disable-output matmul.preopt.ll -polly-use-llvm-names echo "--> 4.2 Highlight the detected SCoPs in the CFGs of the program (print \ no instructions)" # We only create .dot files, as directly -view-scops-only directly calls # graphviz which would require user interaction to continue the script. # opt -basicaa -view-scops-only -disable-output matmul.preopt.ll -opt -basicaa -dot-scops-only -disable-output matmul.preopt.ll +opt -basicaa -dot-scops-only -disable-output matmul.preopt.ll -polly-use-llvm-names echo "--> 4.3 Create .png files from the .dot files" for i in `ls *.dot`; do dot -Tpng $i > $i.png; done echo "--> 5. View the polyhedral representation of the SCoPs" -opt -basicaa -polly-scops -analyze matmul.preopt.ll -polly-process-unprofitable +opt -basicaa -polly-scops -analyze matmul.preopt.ll \ + -polly-process-unprofitable -polly-use-llvm-names echo "--> 6. Show the dependences for the SCoPs" opt -basicaa -polly-dependences -analyze matmul.preopt.ll \ - -polly-process-unprofitable + -polly-process-unprofitable -polly-use-llvm-names echo "--> 7. Export jscop files" -opt -basicaa -polly-export-jscop matmul.preopt.ll -polly-process-unprofitable +opt -basicaa -polly-export-jscop matmul.preopt.ll \ + -polly-process-unprofitable -disable-output -polly-use-llvm-names echo "--> 8. Import the updated jscop files and print the new SCoPs. (optional)" opt -basicaa -polly-import-jscop -polly-ast -analyze matmul.preopt.ll \ - -polly-process-unprofitable + -polly-process-unprofitable -polly-use-llvm-names opt -basicaa -polly-import-jscop -polly-ast -analyze matmul.preopt.ll \ - -polly-import-jscop-postfix=interchanged -polly-process-unprofitable + -polly-import-jscop-postfix=interchanged -polly-process-unprofitable -polly-use-llvm-names opt -basicaa -polly-import-jscop -polly-ast -analyze matmul.preopt.ll \ - -polly-import-jscop-postfix=interchanged+tiled -polly-process-unprofitable + -polly-import-jscop-postfix=interchanged+tiled -polly-process-unprofitable -polly-use-llvm-names opt -basicaa -polly-import-jscop -polly-ast -analyze matmul.preopt.ll \ -polly-import-jscop-postfix=interchanged+tiled+vector \ - -polly-process-unprofitable + -polly-process-unprofitable -polly-use-llvm-names echo "--> 9. Codegenerate the SCoPs" -opt -basicaa -polly-import-jscop -polly-import-jscop-postfix=interchanged \ - -polly-codegen -polly-process-unprofitable\ - matmul.preopt.ll | opt -O3 > matmul.polly.interchanged.ll -opt -basicaa -polly-import-jscop \ +opt -S -basicaa -polly-import-jscop -polly-import-jscop-postfix=interchanged \ + -polly-codegen -polly-process-unprofitable -polly-use-llvm-names \ + matmul.preopt.ll | opt -O3 -S -o matmul.polly.interchanged.ll +opt -S -basicaa -polly-import-jscop \ -polly-import-jscop-postfix=interchanged+tiled -polly-codegen \ - matmul.preopt.ll -polly-process-unprofitable \ - | opt -O3 > matmul.polly.interchanged+tiled.ll -opt -basicaa -polly-import-jscop -polly-process-unprofitable\ + matmul.preopt.ll -polly-process-unprofitable -polly-use-llvm-names \ + | opt -O3 -S -o matmul.polly.interchanged+tiled.ll +opt -S -basicaa -polly-import-jscop -polly-process-unprofitable\ -polly-import-jscop-postfix=interchanged+tiled+vector -polly-codegen \ - matmul.preopt.ll -polly-vectorizer=polly\ - | opt -O3 > matmul.polly.interchanged+tiled+vector.ll -opt -basicaa -polly-import-jscop -polly-process-unprofitable\ + matmul.preopt.ll -polly-vectorizer=polly -polly-use-llvm-names \ + | opt -O3 -S -o matmul.polly.interchanged+tiled+vector.ll +opt -S -basicaa -polly-import-jscop -polly-process-unprofitable\ -polly-import-jscop-postfix=interchanged+tiled+vector -polly-codegen \ - matmul.preopt.ll -polly-vectorizer=polly -polly-parallel\ - | opt -O3 > matmul.polly.interchanged+tiled+vector+openmp.ll -opt matmul.preopt.ll | opt -O3 > matmul.normalopt.ll + matmul.preopt.ll -polly-vectorizer=polly -polly-parallel -polly-use-llvm-names \ + | opt -O3 -S -o matmul.polly.interchanged+tiled+vector+openmp.ll +opt -S matmul.preopt.ll | opt -O3 -S -o matmul.normalopt.ll echo "--> 10. Create the executables" -llc matmul.polly.interchanged.ll -o matmul.polly.interchanged.s && gcc matmul.polly.interchanged.s \ - -o matmul.polly.interchanged.exe -llc matmul.polly.interchanged+tiled.ll -o matmul.polly.interchanged+tiled.s && gcc matmul.polly.interchanged+tiled.s \ - -o matmul.polly.interchanged+tiled.exe -llc matmul.polly.interchanged+tiled+vector.ll \ - -o matmul.polly.interchanged+tiled+vector.s \ - && gcc matmul.polly.interchanged+tiled+vector.s \ - -o matmul.polly.interchanged+tiled+vector.exe -llc matmul.polly.interchanged+tiled+vector+openmp.ll \ - -o matmul.polly.interchanged+tiled+vector+openmp.s \ - && gcc -lgomp matmul.polly.interchanged+tiled+vector+openmp.s \ - -o matmul.polly.interchanged+tiled+vector+openmp.exe -llc matmul.normalopt.ll -o matmul.normalopt.s && gcc matmul.normalopt.s \ - -o matmul.normalopt.exe +llc matmul.polly.interchanged.ll -o matmul.polly.interchanged.s -relocation-model=pic +gcc matmul.polly.interchanged.s -o matmul.polly.interchanged.exe +llc matmul.polly.interchanged+tiled.ll -o matmul.polly.interchanged+tiled.s -relocation-model=pic +gcc matmul.polly.interchanged+tiled.s -o matmul.polly.interchanged+tiled.exe +llc matmul.polly.interchanged+tiled+vector.ll -o matmul.polly.interchanged+tiled+vector.s -relocation-model=pic +gcc matmul.polly.interchanged+tiled+vector.s -o matmul.polly.interchanged+tiled+vector.exe +llc matmul.polly.interchanged+tiled+vector+openmp.ll -o matmul.polly.interchanged+tiled+vector+openmp.s -relocation-model=pic +gcc matmul.polly.interchanged+tiled+vector+openmp.s -lgomp -o matmul.polly.interchanged+tiled+vector+openmp.exe +llc matmul.normalopt.ll -o matmul.normalopt.s -relocation-model=pic +gcc matmul.normalopt.s -lgomp -o matmul.normalopt.exe echo "--> 11. Compare the runtime of the executables" diff --git a/polly/docs/experiments/matmul/scops.init_array.dot b/polly/docs/experiments/matmul/scops.init_array.dot index 3b9d6c9c586..39e2d7e42a8 100644 --- a/polly/docs/experiments/matmul/scops.init_array.dot +++ b/polly/docs/experiments/matmul/scops.init_array.dot @@ -1,39 +1,39 @@ digraph "Scop Graph for 'init_array' function" { label="Scop Graph for 'init_array' function"; - Node0x5b5b5a0 [shape=record,label="{entry:\l br label %entry.split\l}"]; - Node0x5b5b5a0 -> Node0x5b5de30; - Node0x5b5de30 [shape=record,label="{entry.split: \l br label %for.cond1.preheader\l}"]; - Node0x5b5de30 -> Node0x5b5de50; - Node0x5b5de50 [shape=record,label="{for.cond1.preheader: \l %indvars.iv5 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next6, %for.inc17 ]\l br label %for.body3\l}"]; - Node0x5b5de50 -> Node0x5b5b570; - Node0x5b5b570 [shape=record,label="{for.body3: \l %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next,\l... %for.body3 ]\l %0 = mul nuw nsw i64 %indvars.iv, %indvars.iv5\l %1 = trunc i64 %0 to i32\l %rem = srem i32 %1, 1024\l %add = add nsw i32 %rem, 1\l %conv = sitofp i32 %add to double\l %div = fmul double %conv, 5.000000e-01\l %conv4 = fptrunc double %div to float\l %arrayidx6 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x\l... float]]* @A, i64 0, i64 %indvars.iv5, i64 %indvars.iv\l store float %conv4, float* %arrayidx6, align 4\l %2 = mul nuw nsw i64 %indvars.iv, %indvars.iv5\l %3 = trunc i64 %2 to i32\l %rem8 = srem i32 %3, 1024\l %add9 = add nsw i32 %rem8, 1\l %conv10 = sitofp i32 %add9 to double\l %div11 = fmul double %conv10, 5.000000e-01\l %conv12 = fptrunc double %div11 to float\l %arrayidx16 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536\l... x float]]* @B, i64 0, i64 %indvars.iv5, i64 %indvars.iv\l store float %conv12, float* %arrayidx16, align 4\l %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\l %exitcond = icmp ne i64 %indvars.iv.next, 1536\l br i1 %exitcond, label %for.body3, label %for.inc17\l}"]; - Node0x5b5b570 -> Node0x5b5b570[constraint=false]; - Node0x5b5b570 -> Node0x5b5df30; - Node0x5b5df30 [shape=record,label="{for.inc17: \l %indvars.iv.next6 = add nuw nsw i64 %indvars.iv5, 1\l %exitcond7 = icmp ne i64 %indvars.iv.next6, 1536\l br i1 %exitcond7, label %for.cond1.preheader, label %for.end19\l}"]; - Node0x5b5df30 -> Node0x5b5de50[constraint=false]; - Node0x5b5df30 -> Node0x5b5df90; - Node0x5b5df90 [shape=record,label="{for.end19: \l ret void\l}"]; + Node0x7fffc6c46ea0 [shape=record,label="{entry:\l br label %entry.split\l}"]; + Node0x7fffc6c46ea0 -> Node0x7fffc6c46f20; + Node0x7fffc6c46f20 [shape=record,label="{entry.split: \l br label %for.cond1.preheader\l}"]; + Node0x7fffc6c46f20 -> Node0x7fffc6c47000; + Node0x7fffc6c47000 [shape=record,label="{for.cond1.preheader: \l %indvars.iv4 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next5, %for.inc17 ]\l br label %for.body3\l}"]; + Node0x7fffc6c47000 -> Node0x7fffc6c47290; + Node0x7fffc6c47290 [shape=record,label="{for.body3: \l %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next,\l... %for.body3 ]\l %0 = mul nuw nsw i64 %indvars.iv, %indvars.iv4\l %1 = trunc i64 %0 to i32\l %rem = and i32 %1, 1023\l %add = add nuw nsw i32 %rem, 1\l %conv = sitofp i32 %add to double\l %div = fmul double %conv, 5.000000e-01\l %conv4 = fptrunc double %div to float\l %arrayidx6 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x\l... float]]* @A, i64 0, i64 %indvars.iv4, i64 %indvars.iv\l store float %conv4, float* %arrayidx6, align 4\l %arrayidx16 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536\l... x float]]* @B, i64 0, i64 %indvars.iv4, i64 %indvars.iv\l store float %conv4, float* %arrayidx16, align 4\l %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\l %exitcond = icmp ne i64 %indvars.iv.next, 1536\l br i1 %exitcond, label %for.body3, label %for.inc17\l}"]; + Node0x7fffc6c47290 -> Node0x7fffc6c47290[constraint=false]; + Node0x7fffc6c47290 -> Node0x7fffc6c47b10; + Node0x7fffc6c47b10 [shape=record,label="{for.inc17: \l %indvars.iv.next5 = add nuw nsw i64 %indvars.iv4, 1\l %exitcond6 = icmp ne i64 %indvars.iv.next5, 1536\l br i1 %exitcond6, label %for.cond1.preheader, label %for.end19\l}"]; + Node0x7fffc6c47b10 -> Node0x7fffc6c47000[constraint=false]; + Node0x7fffc6c47b10 -> Node0x7fffc6c48b10; + Node0x7fffc6c48b10 [shape=record,label="{for.end19: \l ret void\l}"]; colorscheme = "paired12" - subgraph cluster_0x5b4bdd0 { + subgraph cluster_0x7fffc6c32540 { label = ""; style = solid; color = 1 - subgraph cluster_0x5b4bf50 { + subgraph cluster_0x7fffc6c32f30 { label = "Region can not profitably be optimized!"; style = solid; color = 6 - subgraph cluster_0x5b4c0d0 { + subgraph cluster_0x7fffc6c32690 { label = ""; style = solid; color = 5 - Node0x5b5b570; + Node0x7fffc6c47290; } - Node0x5b5de50; - Node0x5b5df30; + Node0x7fffc6c47000; + Node0x7fffc6c47b10; } - Node0x5b5b5a0; - Node0x5b5de30; - Node0x5b5df90; + Node0x7fffc6c46ea0; + Node0x7fffc6c46f20; + Node0x7fffc6c48b10; } } diff --git a/polly/docs/experiments/matmul/scops.init_array.dot.png b/polly/docs/experiments/matmul/scops.init_array.dot.png Binary files differindex 48a9f38946a..3cd5eb8a3c8 100644 --- a/polly/docs/experiments/matmul/scops.init_array.dot.png +++ b/polly/docs/experiments/matmul/scops.init_array.dot.png diff --git a/polly/docs/experiments/matmul/scops.main.dot b/polly/docs/experiments/matmul/scops.main.dot index e4abe8fbec8..7c20cbf220f 100644 --- a/polly/docs/experiments/matmul/scops.main.dot +++ b/polly/docs/experiments/matmul/scops.main.dot @@ -1,50 +1,50 @@ digraph "Scop Graph for 'main' function" { label="Scop Graph for 'main' function"; - Node0x5b5c850 [shape=record,label="{entry:\l br label %entry.split\l}"]; - Node0x5b5c850 -> Node0x5b5a440; - Node0x5b5a440 [shape=record,label="{entry.split: \l tail call void @init_array()\l br label %for.cond1.preheader\l}"]; - Node0x5b5a440 -> Node0x5b38cd0; - Node0x5b38cd0 [shape=record,label="{for.cond1.preheader: \l %indvars.iv7 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next8, %for.inc28 ]\l br label %for.body3\l}"]; - Node0x5b38cd0 -> Node0x5b4bd30; - Node0x5b4bd30 [shape=record,label="{for.body3: \l %indvars.iv4 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next5,\l... %for.inc25 ]\l %arrayidx5 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x\l... float]]* @C, i64 0, i64 %indvars.iv7, i64 %indvars.iv4\l store float 0.000000e+00, float* %arrayidx5, align 4\l br label %for.body8\l}"]; - Node0x5b4bd30 -> Node0x5b38c50; - Node0x5b38c50 [shape=record,label="{for.body8: \l %indvars.iv = phi i64 [ 0, %for.body3 ], [ %indvars.iv.next, %for.body8 ]\l %arrayidx12 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536\l... x float]]* @C, i64 0, i64 %indvars.iv7, i64 %indvars.iv4\l %0 = load float, float* %arrayidx12, align 4\l %arrayidx16 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536\l... x float]]* @A, i64 0, i64 %indvars.iv7, i64 %indvars.iv\l %1 = load float, float* %arrayidx16, align 4\l %arrayidx20 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536\l... x float]]* @B, i64 0, i64 %indvars.iv, i64 %indvars.iv4\l %2 = load float, float* %arrayidx20, align 4\l %mul = fmul float %1, %2\l %add = fadd float %0, %mul\l %arrayidx24 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536\l... x float]]* @C, i64 0, i64 %indvars.iv7, i64 %indvars.iv4\l store float %add, float* %arrayidx24, align 4\l %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\l %exitcond = icmp ne i64 %indvars.iv.next, 1536\l br i1 %exitcond, label %for.body8, label %for.inc25\l}"]; - Node0x5b38c50 -> Node0x5b38c50[constraint=false]; - Node0x5b38c50 -> Node0x5b5a290; - Node0x5b5a290 [shape=record,label="{for.inc25: \l %indvars.iv.next5 = add nuw nsw i64 %indvars.iv4, 1\l %exitcond6 = icmp ne i64 %indvars.iv.next5, 1536\l br i1 %exitcond6, label %for.body3, label %for.inc28\l}"]; - Node0x5b5a290 -> Node0x5b4bd30[constraint=false]; - Node0x5b5a290 -> Node0x5b5a340; - Node0x5b5a340 [shape=record,label="{for.inc28: \l %indvars.iv.next8 = add nuw nsw i64 %indvars.iv7, 1\l %exitcond9 = icmp ne i64 %indvars.iv.next8, 1536\l br i1 %exitcond9, label %for.cond1.preheader, label %for.end30\l}"]; - Node0x5b5a340 -> Node0x5b38cd0[constraint=false]; - Node0x5b5a340 -> Node0x5b5a3a0; - Node0x5b5a3a0 [shape=record,label="{for.end30: \l ret i32 0\l}"]; + Node0x7fffc6c4cb90 [shape=record,label="{entry:\l br label %entry.split\l}"]; + Node0x7fffc6c4cb90 -> Node0x7fffc6c47b10; + Node0x7fffc6c47b10 [shape=record,label="{entry.split: \l tail call void @init_array()\l br label %for.cond1.preheader\l}"]; + Node0x7fffc6c47b10 -> Node0x7fffc6c456e0; + Node0x7fffc6c456e0 [shape=record,label="{for.cond1.preheader: \l %indvars.iv7 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next8, %for.inc28 ]\l br label %for.body3\l}"]; + Node0x7fffc6c456e0 -> Node0x7fffc6c3f080; + Node0x7fffc6c3f080 [shape=record,label="{for.body3: \l %indvars.iv4 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next5,\l... %for.inc25 ]\l %arrayidx5 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x\l... float]]* @C, i64 0, i64 %indvars.iv7, i64 %indvars.iv4\l store float 0.000000e+00, float* %arrayidx5, align 4\l br label %for.body8\l}"]; + Node0x7fffc6c3f080 -> Node0x7fffc6c3f220; + Node0x7fffc6c3f220 [shape=record,label="{for.body8: \l %indvars.iv = phi i64 [ 0, %for.body3 ], [ %indvars.iv.next, %for.body8 ]\l %0 = load float, float* %arrayidx5, align 4\l %arrayidx16 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536\l... x float]]* @A, i64 0, i64 %indvars.iv7, i64 %indvars.iv\l %1 = load float, float* %arrayidx16, align 4\l %arrayidx20 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536\l... x float]]* @B, i64 0, i64 %indvars.iv, i64 %indvars.iv4\l %2 = load float, float* %arrayidx20, align 4\l %mul = fmul float %1, %2\l %add = fadd float %0, %mul\l store float %add, float* %arrayidx5, align 4\l %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\l %exitcond = icmp ne i64 %indvars.iv.next, 1536\l br i1 %exitcond, label %for.body8, label %for.inc25\l}"]; + Node0x7fffc6c3f220 -> Node0x7fffc6c3f220[constraint=false]; + Node0x7fffc6c3f220 -> Node0x7fffc6c40480; + Node0x7fffc6c40480 [shape=record,label="{for.inc25: \l %indvars.iv.next5 = add nuw nsw i64 %indvars.iv4, 1\l %exitcond6 = icmp ne i64 %indvars.iv.next5, 1536\l br i1 %exitcond6, label %for.body3, label %for.inc28\l}"]; + Node0x7fffc6c40480 -> Node0x7fffc6c3f080[constraint=false]; + Node0x7fffc6c40480 -> Node0x7fffc6c404e0; + Node0x7fffc6c404e0 [shape=record,label="{for.inc28: \l %indvars.iv.next8 = add nuw nsw i64 %indvars.iv7, 1\l %exitcond9 = icmp ne i64 %indvars.iv.next8, 1536\l br i1 %exitcond9, label %for.cond1.preheader, label %for.end30\l}"]; + Node0x7fffc6c404e0 -> Node0x7fffc6c456e0[constraint=false]; + Node0x7fffc6c404e0 -> Node0x7fffc6c40540; + Node0x7fffc6c40540 [shape=record,label="{for.end30: \l ret i32 0\l}"]; colorscheme = "paired12" - subgraph cluster_0x5b5c970 { + subgraph cluster_0x7fffc6c32540 { label = ""; style = solid; color = 1 - subgraph cluster_0x5b5c5a0 { + subgraph cluster_0x7fffc6c32f30 { label = ""; style = filled; - color = 3 subgraph cluster_0x5b5c9f0 { + color = 3 subgraph cluster_0x7fffc6c32690 { label = ""; style = solid; color = 5 - subgraph cluster_0x5b5c110 { + subgraph cluster_0x7fffc6c32dc0 { label = ""; style = solid; color = 7 - Node0x5b38c50; + Node0x7fffc6c3f220; } - Node0x5b4bd30; - Node0x5b5a290; + Node0x7fffc6c3f080; + Node0x7fffc6c40480; } - Node0x5b38cd0; - Node0x5b5a340; + Node0x7fffc6c456e0; + Node0x7fffc6c404e0; } - Node0x5b5c850; - Node0x5b5a440; - Node0x5b5a3a0; + Node0x7fffc6c4cb90; + Node0x7fffc6c47b10; + Node0x7fffc6c40540; } } diff --git a/polly/docs/experiments/matmul/scops.main.dot.png b/polly/docs/experiments/matmul/scops.main.dot.png Binary files differindex 4e73701a08d..241ddaa39f2 100644 --- a/polly/docs/experiments/matmul/scops.main.dot.png +++ b/polly/docs/experiments/matmul/scops.main.dot.png diff --git a/polly/docs/experiments/matmul/scops.print_array.dot b/polly/docs/experiments/matmul/scops.print_array.dot index 748ccb170cd..5c5c0fca145 100644 --- a/polly/docs/experiments/matmul/scops.print_array.dot +++ b/polly/docs/experiments/matmul/scops.print_array.dot @@ -1,51 +1,51 @@ digraph "Scop Graph for 'print_array' function" { label="Scop Graph for 'print_array' function"; - Node0x5b5ee00 [shape=record,label="{entry:\l br label %entry.split\l}"]; - Node0x5b5ee00 -> Node0x5b5ee50; - Node0x5b5ee50 [shape=record,label="{entry.split: \l br label %for.cond1.preheader\l}"]; - Node0x5b5ee50 -> Node0x5b5ee70; - Node0x5b5ee70 [shape=record,label="{for.cond1.preheader: \l %indvars.iv6 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next7, %for.end ]\l %0 = load %struct._IO_FILE*, %struct._IO_FILE** @stdout, align 8\l br label %for.body3\l}"]; - Node0x5b5ee70 -> Node0x5b5ee20; - Node0x5b5ee20 [shape=record,label="{for.body3: \l %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next,\l... %for.inc ]\l %1 = phi %struct._IO_FILE* [ %0, %for.cond1.preheader ], [ %5, %for.inc ]\l %arrayidx5 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x\l... float]]* @C, i64 0, i64 %indvars.iv6, i64 %indvars.iv\l %2 = load float, float* %arrayidx5, align 4\l %conv = fpext float %2 to double\l %call = tail call i32 (%struct._IO_FILE*, i8*, ...)\l... @fprintf(%struct._IO_FILE* %1, i8* getelementptr inbounds ([5 x i8], [5 x\l... i8]* @.str, i64 0, i64 0), double %conv) #2\l %3 = trunc i64 %indvars.iv to i32\l %rem = srem i32 %3, 80\l %cmp6 = icmp eq i32 %rem, 79\l br i1 %cmp6, label %if.then, label %for.inc\l}"]; - Node0x5b5ee20 -> Node0x5b60d10; - Node0x5b5ee20 -> Node0x5b60d70; - Node0x5b60d10 [shape=record,label="{if.then: \l %4 = load %struct._IO_FILE*, %struct._IO_FILE** @stdout, align 8\l %fputc3 = tail call i32 @fputc(i32 10, %struct._IO_FILE* %4)\l br label %for.inc\l}"]; - Node0x5b60d10 -> Node0x5b60d70; - Node0x5b60d70 [shape=record,label="{for.inc: \l %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\l %5 = load %struct._IO_FILE*, %struct._IO_FILE** @stdout, align 8\l %exitcond = icmp ne i64 %indvars.iv.next, 1536\l br i1 %exitcond, label %for.body3, label %for.end\l}"]; - Node0x5b60d70 -> Node0x5b5ee20[constraint=false]; - Node0x5b60d70 -> Node0x5b60e10; - Node0x5b60e10 [shape=record,label="{for.end: \l %.lcssa = phi %struct._IO_FILE* [ %5, %for.inc ]\l %fputc = tail call i32 @fputc(i32 10, %struct._IO_FILE* %.lcssa)\l %indvars.iv.next7 = add nuw nsw i64 %indvars.iv6, 1\l %exitcond8 = icmp ne i64 %indvars.iv.next7, 1536\l br i1 %exitcond8, label %for.cond1.preheader, label %for.end12\l}"]; - Node0x5b60e10 -> Node0x5b5ee70[constraint=false]; - Node0x5b60e10 -> Node0x5b60e70; - Node0x5b60e70 [shape=record,label="{for.end12: \l ret void\l}"]; + Node0x7fffc6c42bf0 [shape=record,label="{entry:\l br label %entry.split\l}"]; + Node0x7fffc6c42bf0 -> Node0x7fffc6c42f10; + Node0x7fffc6c42f10 [shape=record,label="{entry.split: \l br label %for.cond1.preheader\l}"]; + Node0x7fffc6c42f10 -> Node0x7fffc6c4abb0; + Node0x7fffc6c4abb0 [shape=record,label="{for.cond1.preheader: \l %indvars.iv6 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next7, %for.end ]\l %0 = load %struct._IO_FILE*, %struct._IO_FILE** @stdout, align 8\l br label %for.body3\l}"]; + Node0x7fffc6c4abb0 -> Node0x7fffc6c4ac10; + Node0x7fffc6c4ac10 [shape=record,label="{for.body3: \l %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next,\l... %for.inc ]\l %1 = phi %struct._IO_FILE* [ %0, %for.cond1.preheader ], [ %5, %for.inc ]\l %arrayidx5 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x\l... float]]* @C, i64 0, i64 %indvars.iv6, i64 %indvars.iv\l %2 = load float, float* %arrayidx5, align 4\l %conv = fpext float %2 to double\l %call = tail call i32 (%struct._IO_FILE*, i8*, ...)\l... @fprintf(%struct._IO_FILE* %1, i8* getelementptr inbounds ([5 x i8], [5 x\l... i8]* @.str, i64 0, i64 0), double %conv) #2\l %3 = trunc i64 %indvars.iv to i32\l %rem = urem i32 %3, 80\l %cmp6 = icmp eq i32 %rem, 79\l br i1 %cmp6, label %if.then, label %for.inc\l}"]; + Node0x7fffc6c4ac10 -> Node0x7fffc6c4af80; + Node0x7fffc6c4ac10 -> Node0x7fffc6c4afe0; + Node0x7fffc6c4af80 [shape=record,label="{if.then: \l %4 = load %struct._IO_FILE*, %struct._IO_FILE** @stdout, align 8\l %fputc3 = tail call i32 @fputc(i32 10, %struct._IO_FILE* %4)\l br label %for.inc\l}"]; + Node0x7fffc6c4af80 -> Node0x7fffc6c4afe0; + Node0x7fffc6c4afe0 [shape=record,label="{for.inc: \l %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\l %5 = load %struct._IO_FILE*, %struct._IO_FILE** @stdout, align 8\l %exitcond = icmp ne i64 %indvars.iv.next, 1536\l br i1 %exitcond, label %for.body3, label %for.end\l}"]; + Node0x7fffc6c4afe0 -> Node0x7fffc6c4ac10[constraint=false]; + Node0x7fffc6c4afe0 -> Node0x7fffc6c4b3b0; + Node0x7fffc6c4b3b0 [shape=record,label="{for.end: \l %.lcssa = phi %struct._IO_FILE* [ %5, %for.inc ]\l %fputc = tail call i32 @fputc(i32 10, %struct._IO_FILE* %.lcssa)\l %indvars.iv.next7 = add nuw nsw i64 %indvars.iv6, 1\l %exitcond8 = icmp ne i64 %indvars.iv.next7, 1536\l br i1 %exitcond8, label %for.cond1.preheader, label %for.end12\l}"]; + Node0x7fffc6c4b3b0 -> Node0x7fffc6c4abb0[constraint=false]; + Node0x7fffc6c4b3b0 -> Node0x7fffc6c4b580; + Node0x7fffc6c4b580 [shape=record,label="{for.end12: \l ret void\l}"]; colorscheme = "paired12" - subgraph cluster_0x5b349a0 { + subgraph cluster_0x7fffc6c32540 { label = ""; style = solid; color = 1 - subgraph cluster_0x5b5c2c0 { + subgraph cluster_0x7fffc6c32dc0 { label = "Call instruction: %call = tail call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %1, i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i64 0, i64 0), double %conv) #2"; style = solid; color = 6 - subgraph cluster_0x5b5c240 { + subgraph cluster_0x7fffc6c32690 { label = "Call instruction: %call = tail call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %1, i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i64 0, i64 0), double %conv) #2"; style = solid; color = 5 - subgraph cluster_0x5b34a20 { + subgraph cluster_0x7fffc6c32f30 { label = "Region can not profitably be optimized!"; style = solid; color = 7 - Node0x5b5ee20; - Node0x5b60d10; + Node0x7fffc6c4ac10; + Node0x7fffc6c4af80; } - Node0x5b60d70; + Node0x7fffc6c4afe0; } - Node0x5b5ee70; - Node0x5b60e10; + Node0x7fffc6c4abb0; + Node0x7fffc6c4b3b0; } - Node0x5b5ee00; - Node0x5b5ee50; - Node0x5b60e70; + Node0x7fffc6c42bf0; + Node0x7fffc6c42f10; + Node0x7fffc6c4b580; } } diff --git a/polly/docs/experiments/matmul/scops.print_array.dot.png b/polly/docs/experiments/matmul/scops.print_array.dot.png Binary files differindex e3b973b131a..75c4db88de6 100644 --- a/polly/docs/experiments/matmul/scops.print_array.dot.png +++ b/polly/docs/experiments/matmul/scops.print_array.dot.png diff --git a/polly/docs/experiments/matmul/scopsonly.init_array.dot b/polly/docs/experiments/matmul/scopsonly.init_array.dot index 3d2092b21c9..a8907a9be82 100644 --- a/polly/docs/experiments/matmul/scopsonly.init_array.dot +++ b/polly/docs/experiments/matmul/scopsonly.init_array.dot @@ -1,39 +1,39 @@ digraph "Scop Graph for 'init_array' function" { label="Scop Graph for 'init_array' function"; - Node0x5ae2570 [shape=record,label="{entry}"]; - Node0x5ae2570 -> Node0x5ae4e90; - Node0x5ae4e90 [shape=record,label="{entry.split}"]; - Node0x5ae4e90 -> Node0x5ae4f50; - Node0x5ae4f50 [shape=record,label="{for.cond1.preheader}"]; - Node0x5ae4f50 -> Node0x5ae50e0; - Node0x5ae50e0 [shape=record,label="{for.body3}"]; - Node0x5ae50e0 -> Node0x5ae50e0[constraint=false]; - Node0x5ae50e0 -> Node0x5ae5100; - Node0x5ae5100 [shape=record,label="{for.inc17}"]; - Node0x5ae5100 -> Node0x5ae4f50[constraint=false]; - Node0x5ae5100 -> Node0x5ae4ff0; - Node0x5ae4ff0 [shape=record,label="{for.end19}"]; + Node0x7fffdb5cceb0 [shape=record,label="{entry}"]; + Node0x7fffdb5cceb0 -> Node0x7fffdb5ccf00; + Node0x7fffdb5ccf00 [shape=record,label="{entry.split}"]; + Node0x7fffdb5ccf00 -> Node0x7fffdb5ccf80; + Node0x7fffdb5ccf80 [shape=record,label="{for.cond1.preheader}"]; + Node0x7fffdb5ccf80 -> Node0x7fffdb5cd090; + Node0x7fffdb5cd090 [shape=record,label="{for.body3}"]; + Node0x7fffdb5cd090 -> Node0x7fffdb5cd090[constraint=false]; + Node0x7fffdb5cd090 -> Node0x7fffdb5cd0b0; + Node0x7fffdb5cd0b0 [shape=record,label="{for.inc17}"]; + Node0x7fffdb5cd0b0 -> Node0x7fffdb5ccf80[constraint=false]; + Node0x7fffdb5cd0b0 -> Node0x7fffdb5cd2a0; + Node0x7fffdb5cd2a0 [shape=record,label="{for.end19}"]; colorscheme = "paired12" - subgraph cluster_0x5ad2dd0 { + subgraph cluster_0x7fffdb5b8530 { label = ""; style = solid; color = 1 - subgraph cluster_0x5ad2f50 { + subgraph cluster_0x7fffdb5b8f40 { label = "Region can not profitably be optimized!"; style = solid; color = 6 - subgraph cluster_0x5ad30d0 { + subgraph cluster_0x7fffdb5b86a0 { label = ""; style = solid; color = 5 - Node0x5ae50e0; + Node0x7fffdb5cd090; } - Node0x5ae4f50; - Node0x5ae5100; + Node0x7fffdb5ccf80; + Node0x7fffdb5cd0b0; } - Node0x5ae2570; - Node0x5ae4e90; - Node0x5ae4ff0; + Node0x7fffdb5cceb0; + Node0x7fffdb5ccf00; + Node0x7fffdb5cd2a0; } } diff --git a/polly/docs/experiments/matmul/scopsonly.init_array.dot.png b/polly/docs/experiments/matmul/scopsonly.init_array.dot.png Binary files differindex f101d4d3081..bdfae23e0ff 100644 --- a/polly/docs/experiments/matmul/scopsonly.init_array.dot.png +++ b/polly/docs/experiments/matmul/scopsonly.init_array.dot.png diff --git a/polly/docs/experiments/matmul/scopsonly.main.dot b/polly/docs/experiments/matmul/scopsonly.main.dot index c2d60c7ded6..9793a24baef 100644 --- a/polly/docs/experiments/matmul/scopsonly.main.dot +++ b/polly/docs/experiments/matmul/scopsonly.main.dot @@ -1,50 +1,50 @@ digraph "Scop Graph for 'main' function" { label="Scop Graph for 'main' function"; - Node0x5abfcf0 [shape=record,label="{entry}"]; - Node0x5abfcf0 -> Node0x5ade060; - Node0x5ade060 [shape=record,label="{entry.split}"]; - Node0x5ade060 -> Node0x5ade0e0; - Node0x5ade0e0 [shape=record,label="{for.cond1.preheader}"]; - Node0x5ade0e0 -> Node0x5ade100; - Node0x5ade100 [shape=record,label="{for.body3}"]; - Node0x5ade100 -> Node0x5ae0020; - Node0x5ae0020 [shape=record,label="{for.body8}"]; - Node0x5ae0020 -> Node0x5ae0020[constraint=false]; - Node0x5ae0020 -> Node0x5ae0080; - Node0x5ae0080 [shape=record,label="{for.inc25}"]; - Node0x5ae0080 -> Node0x5ade100[constraint=false]; - Node0x5ae0080 -> Node0x5adfef0; - Node0x5adfef0 [shape=record,label="{for.inc28}"]; - Node0x5adfef0 -> Node0x5ade0e0[constraint=false]; - Node0x5adfef0 -> Node0x5adff50; - Node0x5adff50 [shape=record,label="{for.end30}"]; + Node0x7fffdb5cbd10 [shape=record,label="{entry}"]; + Node0x7fffdb5cbd10 -> Node0x7fffdb5c7140; + Node0x7fffdb5c7140 [shape=record,label="{entry.split}"]; + Node0x7fffdb5c7140 -> Node0x7fffdb5c7200; + Node0x7fffdb5c7200 [shape=record,label="{for.cond1.preheader}"]; + Node0x7fffdb5c7200 -> Node0x7fffdb5ccd60; + Node0x7fffdb5ccd60 [shape=record,label="{for.body3}"]; + Node0x7fffdb5ccd60 -> Node0x7fffdb5ccd80; + Node0x7fffdb5ccd80 [shape=record,label="{for.body8}"]; + Node0x7fffdb5ccd80 -> Node0x7fffdb5ccd80[constraint=false]; + Node0x7fffdb5ccd80 -> Node0x7fffdb5cce20; + Node0x7fffdb5cce20 [shape=record,label="{for.inc25}"]; + Node0x7fffdb5cce20 -> Node0x7fffdb5ccd60[constraint=false]; + Node0x7fffdb5cce20 -> Node0x7fffdb5cce80; + Node0x7fffdb5cce80 [shape=record,label="{for.inc28}"]; + Node0x7fffdb5cce80 -> Node0x7fffdb5c7200[constraint=false]; + Node0x7fffdb5cce80 -> Node0x7fffdb5ccee0; + Node0x7fffdb5ccee0 [shape=record,label="{for.end30}"]; colorscheme = "paired12" - subgraph cluster_0x5ad2c80 { + subgraph cluster_0x7fffdb5b8530 { label = ""; style = solid; color = 1 - subgraph cluster_0x5ad2e50 { + subgraph cluster_0x7fffdb5b8f40 { label = ""; style = filled; - color = 3 subgraph cluster_0x5ad2d00 { + color = 3 subgraph cluster_0x7fffdb5b86a0 { label = ""; style = solid; color = 5 - subgraph cluster_0x5ad2dd0 { + subgraph cluster_0x7fffdb5cc3c0 { label = ""; style = solid; color = 7 - Node0x5ae0020; + Node0x7fffdb5ccd80; } - Node0x5ade100; - Node0x5ae0080; + Node0x7fffdb5ccd60; + Node0x7fffdb5cce20; } - Node0x5ade0e0; - Node0x5adfef0; + Node0x7fffdb5c7200; + Node0x7fffdb5cce80; } - Node0x5abfcf0; - Node0x5ade060; - Node0x5adff50; + Node0x7fffdb5cbd10; + Node0x7fffdb5c7140; + Node0x7fffdb5ccee0; } } diff --git a/polly/docs/experiments/matmul/scopsonly.main.dot.png b/polly/docs/experiments/matmul/scopsonly.main.dot.png Binary files differindex 32634243888..92124ee467e 100644 --- a/polly/docs/experiments/matmul/scopsonly.main.dot.png +++ b/polly/docs/experiments/matmul/scopsonly.main.dot.png diff --git a/polly/docs/experiments/matmul/scopsonly.print_array.dot b/polly/docs/experiments/matmul/scopsonly.print_array.dot index 0f7de45e877..7d9a8aea882 100644 --- a/polly/docs/experiments/matmul/scopsonly.print_array.dot +++ b/polly/docs/experiments/matmul/scopsonly.print_array.dot @@ -1,51 +1,51 @@ digraph "Scop Graph for 'print_array' function" { label="Scop Graph for 'print_array' function"; - Node0x5ae5e30 [shape=record,label="{entry}"]; - Node0x5ae5e30 -> Node0x5ae5f50; - Node0x5ae5f50 [shape=record,label="{entry.split}"]; - Node0x5ae5f50 -> Node0x5ae7d90; - Node0x5ae7d90 [shape=record,label="{for.cond1.preheader}"]; - Node0x5ae7d90 -> Node0x5ae7f20; - Node0x5ae7f20 [shape=record,label="{for.body3}"]; - Node0x5ae7f20 -> Node0x5ae7f40; - Node0x5ae7f20 -> Node0x5ae7f60; - Node0x5ae7f40 [shape=record,label="{if.then}"]; - Node0x5ae7f40 -> Node0x5ae7f60; - Node0x5ae7f60 [shape=record,label="{for.inc}"]; - Node0x5ae7f60 -> Node0x5ae7f20[constraint=false]; - Node0x5ae7f60 -> Node0x5ae7e30; - Node0x5ae7e30 [shape=record,label="{for.end}"]; - Node0x5ae7e30 -> Node0x5ae7d90[constraint=false]; - Node0x5ae7e30 -> Node0x5ae8110; - Node0x5ae8110 [shape=record,label="{for.end12}"]; + Node0x7fffdb5c9180 [shape=record,label="{entry}"]; + Node0x7fffdb5c9180 -> Node0x7fffdb5b7940; + Node0x7fffdb5b7940 [shape=record,label="{entry.split}"]; + Node0x7fffdb5b7940 -> Node0x7fffdb5b7960; + Node0x7fffdb5b7960 [shape=record,label="{for.cond1.preheader}"]; + Node0x7fffdb5b7960 -> Node0x7fffdb5b79c0; + Node0x7fffdb5b79c0 [shape=record,label="{for.body3}"]; + Node0x7fffdb5b79c0 -> Node0x7fffdb5b79e0; + Node0x7fffdb5b79c0 -> Node0x7fffdb5b7a80; + Node0x7fffdb5b79e0 [shape=record,label="{if.then}"]; + Node0x7fffdb5b79e0 -> Node0x7fffdb5b7a80; + Node0x7fffdb5b7a80 [shape=record,label="{for.inc}"]; + Node0x7fffdb5b7a80 -> Node0x7fffdb5b79c0[constraint=false]; + Node0x7fffdb5b7a80 -> Node0x7fffdb5b7ae0; + Node0x7fffdb5b7ae0 [shape=record,label="{for.end}"]; + Node0x7fffdb5b7ae0 -> Node0x7fffdb5b7960[constraint=false]; + Node0x7fffdb5b7ae0 -> Node0x7fffdb5b7b40; + Node0x7fffdb5b7b40 [shape=record,label="{for.end12}"]; colorscheme = "paired12" - subgraph cluster_0x5abb9a0 { + subgraph cluster_0x7fffdb5b8530 { label = ""; style = solid; color = 1 - subgraph cluster_0x5ae32c0 { + subgraph cluster_0x7fffdb5cc3c0 { label = "Call instruction: %call = tail call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %1, i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i64 0, i64 0), double %conv) #2"; style = solid; color = 6 - subgraph cluster_0x5ae3240 { + subgraph cluster_0x7fffdb5b86a0 { label = "Call instruction: %call = tail call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %1, i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i64 0, i64 0), double %conv) #2"; style = solid; color = 5 - subgraph cluster_0x5abba20 { + subgraph cluster_0x7fffdb5b8f40 { label = "Region can not profitably be optimized!"; style = solid; color = 7 - Node0x5ae7f20; - Node0x5ae7f40; + Node0x7fffdb5b79c0; + Node0x7fffdb5b79e0; } - Node0x5ae7f60; + Node0x7fffdb5b7a80; } - Node0x5ae7d90; - Node0x5ae7e30; + Node0x7fffdb5b7960; + Node0x7fffdb5b7ae0; } - Node0x5ae5e30; - Node0x5ae5f50; - Node0x5ae8110; + Node0x7fffdb5c9180; + Node0x7fffdb5b7940; + Node0x7fffdb5b7b40; } } diff --git a/polly/docs/experiments/matmul/scopsonly.print_array.dot.png b/polly/docs/experiments/matmul/scopsonly.print_array.dot.png Binary files differindex b0d4b45aace..f5b8e2eddb0 100644 --- a/polly/docs/experiments/matmul/scopsonly.print_array.dot.png +++ b/polly/docs/experiments/matmul/scopsonly.print_array.dot.png |