aboutsummaryrefslogtreecommitdiff
path: root/src/llvmopencl/WorkitemLoops.cc
diff options
context:
space:
mode:
Diffstat (limited to 'src/llvmopencl/WorkitemLoops.cc')
-rw-r--r--src/llvmopencl/WorkitemLoops.cc1061
1 files changed, 1061 insertions, 0 deletions
diff --git a/src/llvmopencl/WorkitemLoops.cc b/src/llvmopencl/WorkitemLoops.cc
new file mode 100644
index 0000000..91eb055
--- /dev/null
+++ b/src/llvmopencl/WorkitemLoops.cc
@@ -0,0 +1,1061 @@
+// LLVM function pass to create loops that run all the work items
+// in a work group while respecting barrier synchronization points.
+//
+// Copyright (c) 2012-2014 Pekka Jääskeläinen / Tampere University of Technology
+// Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#define DEBUG_TYPE "workitem-loops"
+
+#include "WorkitemLoops.h"
+#include "Workgroup.h"
+#include "Barrier.h"
+#include "Kernel.h"
+#include "config.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Support/CommandLine.h"
+#ifdef LLVM_3_1
+#include "llvm/Support/IRBuilder.h"
+#include "llvm/Support/TypeBuilder.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Instructions.h"
+#include "llvm/Module.h"
+#include "llvm/ValueSymbolTable.h"
+#elif defined LLVM_3_2
+#include "llvm/IRBuilder.h"
+#include "llvm/TypeBuilder.h"
+#include "llvm/DataLayout.h"
+#include "llvm/Instructions.h"
+#include "llvm/Module.h"
+#include "llvm/ValueSymbolTable.h"
+#else
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/TypeBuilder.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ValueSymbolTable.h"
+#endif
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+#include <llvm/Support/InstIterator.h>
+#include "WorkitemHandlerChooser.h"
+
+#include <iostream>
+#include <map>
+#include <sstream>
+#include <vector>
+
+//#define DUMP_RESULT_CFG
+
+#ifdef DUMP_RESULT_CFG
+#include "llvm/Analysis/CFGPrinter.h"
+#endif
+
+//#define DEBUG_WORK_ITEM_LOOPS
+
+using namespace llvm;
+using namespace pocl;
+
+// Register the pass in LLVM's pass registry under the command-line
+// name "workitemloops" (anonymous namespace keeps X file-local).
+namespace {
+  static
+  RegisterPass<WorkitemLoops> X("workitemloops",
+                                "Workitem loop generation pass");
+}
+
+// Unique address used by the legacy pass manager to identify the pass.
+char WorkitemLoops::ID = 0;
+
+// Declares the analyses this pass depends on: dominator and
+// post-dominator trees plus loop info for CFG reasoning, and the
+// WorkitemHandlerChooser which decides whether the loop-based
+// work-item handler should run for a kernel at all.
+void
+WorkitemLoops::getAnalysisUsage(AnalysisUsage &AU) const
+{
+  AU.addRequired<DominatorTree>();
+  AU.addRequired<PostDominatorTree>();
+  AU.addRequired<LoopInfo>();
+// TODO - Removed due to compilation error
+#if 0
+#ifdef LLVM_3_1
+  AU.addRequired<TargetData>();
+#else
+  AU.addRequired<DataLayout>();
+#endif
+#endif
+  AU.addRequired<pocl::WorkitemHandlerChooser>();
+}
+
+// Pass entry point for a single function. Bails out unless F is an
+// unprocessed OpenCL kernel and the loop-based handler was chosen by
+// WorkitemHandlerChooser. Returns true when the IR was modified.
+bool
+WorkitemLoops::runOnFunction(Function &F)
+{
+  if (!Workgroup::isKernelToProcess(F))
+    return false;
+
+  if (getAnalysis<pocl::WorkitemHandlerChooser>().chosenHandler() !=
+      pocl::WorkitemHandlerChooser::POCL_WIH_LOOPS)
+    return false;
+
+  // Cache the analyses consulted while restructuring the CFG.
+  DT = &getAnalysis<DominatorTree>();
+  LI = &getAnalysis<LoopInfo>();
+  PDT = &getAnalysis<PostDominatorTree>();
+
+  // Running counter used to name context arrays of unnamed temporaries
+  // (see GetContextArray()).
+  tempInstructionIndex = 0;
+
+#if 0
+  std::cerr << "### original:" << std::endl;
+  F.viewCFG();
+#endif
+
+  bool changed = ProcessFunction(F);
+#ifdef DUMP_RESULT_CFG
+  FunctionPass* cfgPrinter = createCFGOnlyPrinterPass();
+  cfgPrinter->runOnFunction(F);
+#endif
+
+#if 0
+  std::cerr << "### after:" << std::endl;
+  F.viewCFG();
+#endif
+
+  // Repair variable uses left undominated by their definitions after
+  // the region replication and loop creation above.
+  changed |= fixUndominatedVariableUses(DT, F);
+
+#if 0
+  /* Split large BBs so we can print the Dot without it crashing. */
+  bool fchanged = false;
+  const int MAX_INSTRUCTIONS_PER_BB = 70;
+  do {
+    fchanged = false;
+    for (Function::iterator i = F.begin(), e = F.end(); i != e; ++i) {
+      BasicBlock *b = i;
+
+      if (b->size() > MAX_INSTRUCTIONS_PER_BB + 1)
+      {
+        int count = 0;
+        BasicBlock::iterator splitPoint = b->begin();
+        while (count < MAX_INSTRUCTIONS_PER_BB || isa<PHINode>(splitPoint))
+        {
+          ++splitPoint;
+          ++count;
+        }
+        SplitBlock(b, splitPoint, this);
+        fchanged = true;
+        break;
+      }
+    }
+
+  } while (fchanged);
+
+  F.viewCFG();
+#endif
+  // Per-function caches; must not leak into the next kernel.
+  contextArrays.clear();
+  tempInstructionIds.clear();
+
+  return changed;
+}
+
+// Wraps one work-item loop dimension around the parallel-region blocks
+// [entryBB, exitBB]. localIdVar is the iteration variable; the trip
+// count is the compile-time LocalSizeForDim, or the runtime
+// get_local_size() call lsizeDim when that is non-NULL. When
+// peeledFirst is set, iteration (0,0,0) is assumed to have already run
+// in a peeled copy of the region, so the loop starts from the value in
+// localIdXFirstVar. Returns the (for.init, for.end) block pair so the
+// caller can nest further loops around it.
+std::pair<llvm::BasicBlock *, llvm::BasicBlock *>
+WorkitemLoops::CreateLoopAround
+(ParallelRegion &region,
+ llvm::BasicBlock *entryBB, llvm::BasicBlock *exitBB,
+ bool peeledFirst, llvm::Value *localIdVar, size_t LocalSizeForDim,
+ bool addIncBlock, llvm::Instruction *lsizeDim)
+{
+  assert (localIdVar != NULL);
+
+  /*
+
+    Generate a structure like this for each loop level (x,y,z):
+
+    for.init:
+
+    ; if peeledFirst is false:
+    store i32 0, i32* %_local_id_x, align 4
+
+    ; if peeledFirst is true (assume the 0,0,0 iteration has been executed earlier)
+    ; assume _local_id_x_first is is initialized to 1 in the peeled pregion copy
+    store _local_id_x_first, i32* %_local_id_x, align 4
+    store i32 0, %_local_id_x_first
+
+    br label %for.body
+
+    for.body:
+
+    ; the parallel region code here
+
+    br label %for.inc
+
+    for.inc:
+
+    ; Separated inc and cond check blocks for easier loop unrolling later on.
+    ; Can then chain N times for.body+for.inc to unroll.
+
+    %2 = load i32* %_local_id_x, align 4
+    %inc = add nsw i32 %2, 1
+
+    store i32 %inc, i32* %_local_id_x, align 4
+    br label %for.cond
+
+    for.cond:
+
+    ; loop header, compare the id to the local size
+    %0 = load i32* %_local_id_x, align 4
+    %cmp = icmp ult i32 %0, i32 123
+    br i1 %cmp, label %for.body, label %for.end
+
+    for.end:
+
+    OPTIMIZE: Use a separate iteration variable across all the loops to iterate the context
+    data arrays to avoid needing multiplications to find the correct location, and to
+    enable easy vectorization of loading the context data when there are parallel iterations.
+  */
+
+  llvm::BasicBlock *loopBodyEntryBB = entryBB;
+  llvm::LLVMContext &C = loopBodyEntryBB->getContext();
+  llvm::Function *F = loopBodyEntryBB->getParent();
+  loopBodyEntryBB->setName("pregion.for.body");
+
+  assert (exitBB->getTerminator()->getNumSuccessors() == 1);
+
+  llvm::BasicBlock *oldExit = exitBB->getTerminator()->getSuccessor(0);
+
+  llvm::BasicBlock *forInitBB =
+    BasicBlock::Create(C, "pregion.for.init", F, loopBodyEntryBB);
+
+  llvm::BasicBlock *loopEndBB =
+    BasicBlock::Create(C, "pregion.for.end", F, exitBB);
+
+  llvm::BasicBlock *forCondBB =
+    BasicBlock::Create(C, "pregion.for.cond", F, exitBB);
+
+  /* Recompute dominance info: new blocks were just created and earlier
+     CreateLoopAround calls may have restructured the CFG. */
+  DT->runOnFunction(*F);
+
+  // F->viewCFG();
+  /* Fix the old edges jumping to the region to jump to the basic block
+     that starts the created loop. Back edges should still point to the
+     old basic block so we preserve the old loops. */
+  BasicBlockVector preds;
+  llvm::pred_iterator PI =
+    llvm::pred_begin(entryBB),
+    E = llvm::pred_end(entryBB);
+
+  for (; PI != E; ++PI)
+  {
+    llvm::BasicBlock *bb = *PI;
+    preds.push_back(bb);
+  }
+
+  for (BasicBlockVector::iterator i = preds.begin();
+       i != preds.end(); ++i)
+  {
+    llvm::BasicBlock *bb = *i;
+    /* Do not fix loop edges inside the region. The loop
+       is replicated as a whole to the body of the wi-loop.*/
+    if (DT->dominates(loopBodyEntryBB, bb))
+      continue;
+    bb->getTerminator()->replaceUsesOfWith(loopBodyEntryBB, forInitBB);
+  }
+
+  IRBuilder<> builder(forInitBB);
+
+  if (peeledFirst)
+  {
+    /* Start from the id stored by the peeled iteration (1), then reset
+       the flag so outer dimensions start their loops from 0. */
+    builder.CreateStore(builder.CreateLoad(localIdXFirstVar), localIdVar);
+    builder.CreateStore
+      (ConstantInt::get(IntegerType::get(C, size_t_width), 0), localIdXFirstVar);
+  }
+  else
+  {
+    builder.CreateStore
+      (ConstantInt::get(IntegerType::get(C, size_t_width), 0), localIdVar);
+  }
+
+  builder.CreateBr(loopBodyEntryBB);
+
+  exitBB->getTerminator()->replaceUsesOfWith(oldExit, forCondBB);
+  if (addIncBlock)
+  {
+    AppendIncBlock(exitBB, localIdVar);
+  }
+
+  builder.SetInsertPoint(forCondBB);
+
+  /* Loop condition: local id < local size (constant or runtime value). */
+  llvm::Value *cmpResult;
+  if (lsizeDim == NULL)
+  {
+    cmpResult =
+      builder.CreateICmpULT
+      (builder.CreateLoad(localIdVar),
+       (ConstantInt::get
+        (IntegerType::get(C, size_t_width),
+         LocalSizeForDim))
+       );
+  }
+  else
+  {
+    cmpResult =
+      builder.CreateICmpULT
+      (builder.CreateLoad(localIdVar),
+       lsizeDim
+       );
+  }
+
+  Instruction *loopBranch =
+      builder.CreateCondBr(cmpResult, loopBodyEntryBB, loopEndBB);
+
+  /* Add the metadata to mark a parallel loop. The metadata
+     refer to a loop-unique dummy metadata that is not merged
+     automatically. */
+
+  /* This creation of the identifier metadata is copied from
+     LLVM's MDBuilder::createAnonymousTBAARoot(). */
+  MDNode *Dummy = MDNode::getTemporary(C, ArrayRef<Value*>());
+  MDNode *Root = MDNode::get(C, Dummy);
+  // At this point we have
+  // !0 = metadata !{} <- dummy
+  // !1 = metadata !{metadata !0} <- root
+  // Replace the dummy operand with the root node itself and delete the dummy.
+  Root->replaceOperandWith(0, Root);
+  MDNode::deleteTemporary(Dummy);
+  // We now have
+  // !1 = metadata !{metadata !1} <- self-referential root
+
+  loopBranch->setMetadata("llvm.loop.parallel", Root);
+  region.AddParallelLoopMetadata(Root);
+
+  builder.SetInsertPoint(loopEndBB);
+  builder.CreateBr(oldExit);
+
+  return std::make_pair(forInitBB, loopEndBB);
+}
+
+/* Looks up the parallel region that contains the basic block bb.
+   Returns NULL when bb does not belong to any known region. */
+ParallelRegion*
+WorkitemLoops::RegionOfBlock(llvm::BasicBlock *bb)
+{
+  ParallelRegion::ParallelRegionVector::iterator it =
+    original_parallel_regions->begin();
+  ParallelRegion::ParallelRegionVector::iterator end =
+    original_parallel_regions->end();
+  while (it != end)
+  {
+    ParallelRegion *candidate = *it;
+    if (candidate->HasBlock(bb))
+      return candidate;
+    ++it;
+  }
+  return NULL;
+}
+
+// PreAnalyze kernel function, find out dimension (borrowed from wga)
+// PreCreate local sizes which are workgroup invariant
+//
+// Sets maxDim to the highest work-item dimension the kernel queries
+// via get_local_id()/get_global_id(), and creates (but does not yet
+// insert) the get_local_size() calls lsizeX/Y/Z; ProcessFunction()
+// later inserts them into the entry block.
+void WorkitemLoops::FindKernelDim(Function &F)
+{
+  maxDim = 1;
+  for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ++I)
+    if (CallInst * callInst = dyn_cast<CallInst>(&*I))
+    {
+      // Indirect calls have no called function; skip them.
+      if (!callInst->getCalledFunction()) continue;
+      std::string functionName(callInst->getCalledFunction()->getName());
+
+      if (functionName == "get_local_id" ||
+          functionName == "get_global_id")
+      {
+        Value *arg = callInst->getArgOperand(0);
+        if (ConstantInt * constInt = dyn_cast<ConstantInt>(arg))
+        {
+          // Clamp the queried dimension into [0, MAX_DIMENSIONS-1];
+          // an out-of-range (or sign-wrapped) constant ends up capped.
+          unsigned int dimIdx = constInt->getSExtValue();
+          dimIdx = (MAX_DIMENSIONS-1 < dimIdx) ? MAX_DIMENSIONS-1 : dimIdx;
+          maxDim = (maxDim < dimIdx + 1) ? dimIdx+1 : maxDim;
+        }
+
+        /*-------------------------------------------------------------
+         * if the work group function has a variable argument, then
+         * assume worst case and return 3 loop levels are needed.
+         *------------------------------------------------------------*/
+        else
+        {
+          maxDim = 3;
+          break;
+        }
+      }
+    }
+
+  llvm::Module *M = F.getParent();
+  llvm::Type *Int32 = IntegerType::get(M->getContext(), 32);
+  FunctionType *ft = FunctionType::get
+    (/*Result=*/ Int32,
+     /*Params=*/ Int32,
+     /*isVarArg=*/ false);
+  Function *f_localsize =
+    dyn_cast<Function>(M->getOrInsertFunction("get_local_size", ft));
+  SmallVector<Value *, 4> argsx, argsy, argsz;
+  // Calls are created detached here; only the dimensions actually used
+  // get a get_local_size() query.
+  argsx.push_back(ConstantInt::get(Int32, 0));
+  lsizeX = CallInst::Create(f_localsize, ArrayRef<Value *>(argsx));
+  if (maxDim > 1)
+  {
+    argsy.push_back(ConstantInt::get(Int32, 1));
+    lsizeY = CallInst::Create(f_localsize, ArrayRef<Value *>(argsy));
+  }
+  if (maxDim > 2)
+  {
+    argsz.push_back(ConstantInt::get(Int32, 2));
+    lsizeZ = CallInst::Create(f_localsize, ArrayRef<Value *>(argsz));
+  }
+}
+
+// Performs the actual kernel transformation: finds the parallel
+// regions between barriers, adds context save/restore for variables
+// that cross region boundaries, and wraps each region in up to maxDim
+// nested work-item loops (x innermost, then y, then z).
+bool
+WorkitemLoops::ProcessFunction(Function &F)
+{
+  Kernel *K = cast<Kernel> (&F);
+  Initialize(K);
+
+#if 0 // TODO: do something for reqd_work_group_size
+  unsigned workItemCount = LocalSizeX*LocalSizeY*LocalSizeZ;
+  if (workItemCount == 1)
+    {
+      K->addLocalSizeInitCode(LocalSizeX, LocalSizeY, LocalSizeZ);
+      ParallelRegion::insertLocalIdInit(&F.getEntryBlock(), 0, 0, 0);
+      return true;
+    }
+#endif
+
+  // Detect how many dimensions are used and create the (not yet
+  // inserted) get_local_size() calls; they are placed into the entry
+  // block near the end of this function.
+  FindKernelDim(F);
+
+  original_parallel_regions =
+    K->getParallelRegions(LI);
+
+  // Flag telling the innermost (x) loop whether iteration 0 already ran
+  // in a peeled copy of the region (see CreateLoopAround()).
+  IRBuilder<> builder(F.getEntryBlock().getFirstInsertionPt());
+  localIdXFirstVar =
+    builder.CreateAlloca
+    (IntegerType::get(F.getContext(), size_t_width), 0, ".pocl.local_id_x_init");
+
+  //  F.viewCFGOnly();
+
+#if 0
+  std::cerr << "### Original" << std::endl;
+  F.viewCFG();
+#endif
+
+#if 0
+  for (ParallelRegion::ParallelRegionVector::iterator
+         i = original_parallel_regions->begin(),
+         e = original_parallel_regions->end();
+       i != e; ++i)
+  {
+    ParallelRegion *region = (*i);
+    region->InjectRegionPrintF();
+    region->InjectVariablePrintouts();
+  }
+#endif
+
+  /* Count how many parallel regions share each entry node to
+     detect diverging regions that need to be peeled. */
+  std::map<llvm::BasicBlock*, int> entryCounts;
+
+  for (ParallelRegion::ParallelRegionVector::iterator
+         i = original_parallel_regions->begin(),
+         e = original_parallel_regions->end();
+       i != e; ++i)
+  {
+    ParallelRegion *region = (*i);
+#ifdef DEBUG_WORK_ITEM_LOOPS
+    std::cerr << "### Adding context save/restore for PR: ";
+    region->dumpNames();
+#endif
+    FixMultiRegionVariables(region);
+    entryCounts[region->entryBB()]++;
+  }
+
+#if 0
+  std::cerr << "### After context code addition:" << std::endl;
+  F.viewCFG();
+#endif
+  std::map<ParallelRegion*, bool> peeledRegion;
+  for (ParallelRegion::ParallelRegionVector::iterator
+         i = original_parallel_regions->begin(),
+         e = original_parallel_regions->end();
+       i != e; ++i)
+  {
+
+    llvm::ValueToValueMapTy reference_map;
+    ParallelRegion *original = (*i);
+
+#ifdef DEBUG_WORK_ITEM_LOOPS
+    std::cerr << "### handling region:" << std::endl;
+    original->dumpNames();
+    //F.viewCFGOnly();
+#endif
+
+    /* In case of conditional barriers, the first iteration
+       has to be peeled so we know which branch to execute
+       with the work item loop. In case there are more than one
+       parallel region sharing an entry BB, it's a diverging
+       region.
+
+       Post dominance of entry by exit does not work in case the
+       region is inside a loop and the exit block is in the path
+       towards the loop exit (and the function exit).
+    */
+    bool peelFirst = entryCounts[original->entryBB()] > 1;
+
+    peeledRegion[original] = peelFirst;
+
+    std::pair<llvm::BasicBlock *, llvm::BasicBlock *> l;
+    // the original predecessor nodes of which successor
+    // should be fixed if not peeling
+    BasicBlockVector preds;
+
+    // NOTE(review): 'unrolled' is only assigned inside the disabled
+    // #if 0 unroll experiment below and is otherwise unused.
+    bool unrolled = false;
+    if (peelFirst)
+    {
+#ifdef DEBUG_WORK_ITEM_LOOPS
+        std::cerr << "### conditional region, peeling the first iteration" << std::endl;
+#endif
+        ParallelRegion *replica =
+          original->replicate(reference_map, ".peeled_wi");
+        replica->chainAfter(original);
+        replica->purge();
+
+        l = std::make_pair(replica->entryBB(), replica->exitBB());
+    }
+    else
+    {
+        llvm::pred_iterator PI =
+          llvm::pred_begin(original->entryBB()),
+          E = llvm::pred_end(original->entryBB());
+
+        for (; PI != E; ++PI)
+        {
+            llvm::BasicBlock *bb = *PI;
+            if (DT->dominates(original->entryBB(), bb) &&
+                (RegionOfBlock(original->entryBB()) ==
+                 RegionOfBlock(bb)))
+                continue;
+            preds.push_back(bb);
+        }
+
+#if 0
+        int unrollCount;
+        if (getenv("POCL_WILOOPS_MAX_UNROLL_COUNT") != NULL)
+          unrollCount = atoi(getenv("POCL_WILOOPS_MAX_UNROLL_COUNT"));
+        else
+          unrollCount = 1;
+        /* Find a two's exponent unroll count, if available. */
+        while (unrollCount >= 1)
+        {
+            if (LocalSizeX % unrollCount == 0 &&
+                unrollCount <= LocalSizeX)
+            {
+                break;
+            }
+            unrollCount /= 2;
+        }
+
+        if (unrollCount > 1) {
+            ParallelRegion *prev = original;
+            llvm::BasicBlock *lastBB =
+                AppendIncBlock(original->exitBB(), localIdX);
+            original->AddBlockAfter(lastBB, original->exitBB());
+            original->SetExitBB(lastBB);
+
+            if (AddWIMetadata)
+                original->AddIDMetadata(F.getContext(), 0);
+
+            for (int c = 1; c < unrollCount; ++c)
+            {
+                ParallelRegion *unrolled =
+                  original->replicate(reference_map, ".unrolled_wi");
+                unrolled->chainAfter(prev);
+                prev = unrolled;
+                lastBB = unrolled->exitBB();
+                if (AddWIMetadata)
+                    unrolled->AddIDMetadata(F.getContext(), c);
+            }
+            unrolled = true;
+            l = std::make_pair(original->entryBB(), lastBB);
+        } else {
+            l = std::make_pair(original->entryBB(), original->exitBB());
+        }
+#else
+        l = std::make_pair(original->entryBB(), original->exitBB());
+#endif
+    }
+
+    // Nest the work-item loops: x innermost, z outermost.
+    l = CreateLoopAround(*original, l.first, l.second, peelFirst, localIdX,
+                         LocalSizeX, true, lsizeX);
+    if (maxDim > 1)
+      l = CreateLoopAround(*original, l.first, l.second, false, localIdY,
+                           LocalSizeY, true, lsizeY);
+    if (maxDim > 2)
+      l = CreateLoopAround(*original, l.first, l.second, false, localIdZ,
+                           LocalSizeZ, true, lsizeZ);
+
+    /* Loop edges coming from another region mean B-loops which means
+       we have to fix the loop edge to jump to the beginning of the wi-loop
+       structure, not its body. This has to be done only for non-peeled
+       blocks as the semantics is correct in the other case (the jump is
+       to the beginning of the peeled iteration). */
+    if (!peelFirst)
+    {
+        for (BasicBlockVector::iterator i = preds.begin();
+             i != preds.end(); ++i)
+        {
+            llvm::BasicBlock *bb = *i;
+            bb->getTerminator()->replaceUsesOfWith
+                (original->entryBB(), l.first);
+        }
+    }
+  }
+
+  // for the peeled regions we need to add a prologue
+  // that initializes the local ids and the first iteration
+  // counter
+  for (ParallelRegion::ParallelRegionVector::iterator
+         i = original_parallel_regions->begin(),
+         e = original_parallel_regions->end();
+       i != e; ++i)
+  {
+    ParallelRegion *pr = (*i);
+
+    if (!peeledRegion[pr]) continue;
+    pr->insertPrologue(0, 0, 0);
+    builder.SetInsertPoint(pr->entryBB()->getFirstInsertionPt());
+    builder.CreateStore
+      (ConstantInt::get(IntegerType::get(F.getContext(), size_t_width), 1),
+       localIdXFirstVar);
+  }
+
+  // Creating lsize* values have been hoisted up
+  // K->addLocalSizeInitCode(LocalSizeX, LocalSizeY, LocalSizeZ);
+  // Insert the detached get_local_size() calls created in
+  // FindKernelDim() at the top of the entry block.
+  llvm::Instruction *inspt = F.getEntryBlock().getFirstNonPHI();
+  inspt->getParent()->getInstList().insert(inspt, lsizeX);
+  if (maxDim > 1)
+    inspt->getParent()->getInstList().insert(inspt, lsizeY);
+  if (maxDim > 2)
+    inspt->getParent()->getInstList().insert(inspt, lsizeZ);
+  // llvm::GlobalVariable *gvx = M->getGlobalVariable("_local_size_x");
+  // llvm::GlobalVariable *gvy = M->getGlobalVariable("_local_size_y");
+  // llvm::GlobalVariable *gvz = M->getGlobalVariable("_local_size_z");
+  // llvm::Instruction *storex = new StoreInst(lsizeX, gvx, inspt);
+  // llvm::Instruction *storey = new StoreInst(lsizeY, gvy, inspt);
+  // llvm::Instruction *storez = new StoreInst(lsizeZ, gvz, inspt);
+
+
+  ParallelRegion::insertLocalIdInit(&F.getEntryBlock(), 0, 0, 0);
+
+#if 0
+  F.viewCFG();
+#endif
+
+  return true;
+}
+
+/*
+ * Add context save/restore code to variables that are defined in
+ * the given region and are used outside the region.
+ *
+ * Each such variable gets a slot in the stack frame. The variable
+ * is restored from the stack whenever it's used.
+ *
+ */
+void
+WorkitemLoops::FixMultiRegionVariables(ParallelRegion *region)
+{
+  InstructionIndex instructionsInRegion;
+  InstructionVec instructionsToFix;
+
+  /* Construct an index of the region's instructions so it's
+     fast to figure out if the variable uses are all
+     in the region. */
+  for (BasicBlockVector::iterator i = region->begin();
+       i != region->end(); ++i)
+  {
+    llvm::BasicBlock *bb = *i;
+    for (llvm::BasicBlock::iterator instr = bb->begin();
+         instr != bb->end(); ++instr)
+    {
+      llvm::Instruction *instruction = instr;
+      instructionsInRegion.insert(instruction);
+    }
+  }
+
+  /* Find all the instructions that define new values and
+     check if they need to be context saved. */
+  for (BasicBlockVector::iterator i = region->begin();
+       i != region->end(); ++i)
+  {
+    llvm::BasicBlock *bb = *i;
+    for (llvm::BasicBlock::iterator instr = bb->begin();
+         instr != bb->end(); ++instr)
+    {
+      llvm::Instruction *instruction = instr;
+
+      // Local id loads are deliberately never context saved (see
+      // ShouldNotBeContextSaved()).
+      if (ShouldNotBeContextSaved(instr)) continue;
+
+      for (Instruction::use_iterator ui = instruction->use_begin(),
+             ue = instruction->use_end();
+           ui != ue; ++ui)
+      {
+        Instruction *user;
+        if ((user = dyn_cast<Instruction> (*ui)) == NULL) continue;
+        // if the instruction is used outside this region inside another
+        // region (not in a regionless BB like the B-loop construct BBs),
+        // need to context save it.
+        if (instructionsInRegion.find(user) == instructionsInRegion.end() &&
+            RegionOfBlock(user->getParent()) != NULL)
+        {
+          instructionsToFix.push_back(instruction);
+          break;
+        }
+      }
+    }
+  }
+
+  /* Finally, fix the instructions. Collected first because fixing
+     mutates the use lists being iterated above. */
+  for (InstructionVec::iterator i = instructionsToFix.begin();
+       i != instructionsToFix.end(); ++i)
+  {
+#ifdef DEBUG_WORK_ITEM_LOOPS
+    std::cerr << "### adding context/save restore for" << std::endl;
+    (*i)->dump();
+#endif
+    llvm::Instruction *instructionToFix = *i;
+    AddContextSaveRestore(instructionToFix);
+  }
+}
+
+// Stores the value produced by 'instruction' into its per-work-item
+// slot of the context array 'alloca', immediately after the definition
+// (skipping any PHI nodes). Returns the created store, or NULL when
+// the value is itself an alloca (the context array then stands in for
+// it directly).
+llvm::Instruction *
+WorkitemLoops::AddContextSave
+(llvm::Instruction *instruction, llvm::Instruction *alloca)
+{
+
+  if (isa<AllocaInst>(instruction))
+  {
+    /* If the variable to be context saved is itself an alloca,
+       we have created one big alloca that stores the data of all the
+       work-items and return pointers to that array. Thus, we need
+       no initialization code other than the context data alloca itself. */
+    return NULL;
+  }
+
+  /* Save the produced variable to the array. */
+  BasicBlock::iterator definition = dyn_cast<Instruction>(instruction);
+
+  // Insert after the definition; PHIs must stay at the block top, so
+  // step past them.
+  ++definition;
+  while (isa<PHINode>(definition)) ++definition;
+
+  IRBuilder<> builder(definition);
+  std::vector<llvm::Value *> gepArgs;
+
+  /* Reuse the id loads earlier in the region, if possible, to
+     avoid messy output with lots of redundant loads. */
+  ParallelRegion *region = RegionOfBlock(instruction->getParent());
+  assert ("Adding context save outside any region produces illegal code." &&
+          region != NULL);
+
+// linearize index computation for store into alloca
+// alloca[idz * sizey*sizex + idy * sizex + idx]
+  llvm::Value *linear_index = region->LocalIDXLoad();
+  if (maxDim > 1)
+    linear_index = builder.CreateAdd(linear_index,
+                     builder.CreateMul(region->LocalIDYLoad(),
+                                       lsizeX) );
+  if (maxDim > 2)
+    linear_index = builder.CreateAdd(linear_index,
+                     builder.CreateMul(region->LocalIDZLoad(),
+                       builder.CreateMul(lsizeY, lsizeX)) );
+  gepArgs.push_back(linear_index);
+
+  return builder.CreateStore(instruction, builder.CreateGEP(alloca, gepArgs));
+
+}
+
+// Restores the per-work-item copy of 'val' from the context array
+// 'alloca'. The restore code is inserted before 'before' (or before
+// 'val' itself when before is NULL). For emulated allocas (isAlloca)
+// the GEP into the context array is returned instead of a load, to
+// preserve pointer semantics.
+llvm::Instruction *
+WorkitemLoops::AddContextRestore
+(llvm::Value *val, llvm::Instruction *alloca, llvm::Instruction *before,
+ bool isAlloca)
+{
+  assert (val != NULL);
+  IRBuilder<> builder(alloca);
+  if (before != NULL)
+  {
+    builder.SetInsertPoint(before);
+  }
+  else if (isa<Instruction>(val))
+  {
+    builder.SetInsertPoint(dyn_cast<Instruction>(val));
+    before = dyn_cast<Instruction>(val);
+  }
+  else
+  {
+    assert (false && "Unknown context restore location!");
+  }
+
+
+  std::vector<llvm::Value *> gepArgs;
+
+  /* Reuse the id loads earlier in the region, if possible, to
+     avoid messy output with lots of redundant loads. */
+  ParallelRegion *region = RegionOfBlock(before->getParent());
+  assert ("Adding context save outside any region produces illegal code." &&
+          region != NULL);
+
+// linearize alloca loads
+// idz * _local_size_x * _local_size_y + idy * _local_size_x + idx
+  llvm::Value *linear_index = region->LocalIDXLoad();
+  if (maxDim > 1)
+    linear_index = builder.CreateAdd(linear_index,
+                     builder.CreateMul(region->LocalIDYLoad(),
+                                       lsizeX) );
+  if (maxDim > 2)
+    linear_index = builder.CreateAdd(linear_index,
+                     builder.CreateMul(region->LocalIDZLoad(),
+                       builder.CreateMul(lsizeY, lsizeX)) );
+  gepArgs.push_back(linear_index);
+
+  llvm::Instruction *gep =
+    dyn_cast<Instruction>(builder.CreateGEP(alloca, gepArgs));
+
+  if (isAlloca) {
+    /* In case the context saved instruction was an alloca, we created a
+       context array with pointed-to elements, and now want to return a pointer
+       to the elements to emulate the original alloca. */
+    return gep;
+  }
+  return builder.CreateLoad(gep);
+}
+
+/**
+ * Returns the context array (alloca) for the given Value, creates it if not
+ * found.
+ *
+ * The array lives in the function entry block and holds one element per
+ * work-item; its element count is computed at runtime from the
+ * get_local_size() calls created in FindKernelDim().
+ */
+llvm::Instruction *
+WorkitemLoops::GetContextArray(llvm::Instruction *instruction)
+{
+
+  /*
+   * Unnamed temp instructions need a generated name for the
+   * context array. Create one using a running integer.
+   */
+  std::ostringstream var;
+  var << ".";
+
+  if (!instruction->getName().empty())
+    {
+      var << instruction->getName().str();
+    }
+  else if (tempInstructionIds.find(instruction) != tempInstructionIds.end())
+    {
+      var << tempInstructionIds[instruction];
+    }
+  else
+    {
+      /* First encounter of this unnamed temporary: assign it a fresh
+         running id so later lookups reuse the same array. */
+      tempInstructionIds[instruction] = tempInstructionIndex++;
+      var << tempInstructionIds[instruction];
+    }
+
+  var << ".pocl_context";
+  std::string varName = var.str();
+
+  /* Reuse an already created context array for this variable. */
+  if (contextArrays.find(varName) != contextArrays.end())
+    return contextArrays[varName];
+
+  /* Allocate in the entry block so the alloca is static and is not
+     replicated per wi-loop iteration. */
+  IRBuilder<> builder(instruction->getParent()->getParent()->getEntryBlock().getFirstInsertionPt());
+
+  llvm::Type *elementType;
+  if (isa<AllocaInst>(instruction))
+    {
+      /* If the variable to be context saved was itself an alloca,
+         create one big alloca that stores the data of all the
+         work-items and directly return pointers to that array.
+         This enables moving all the allocas to the entry node without
+         breaking the parallel loop.
+         Otherwise we would rely on a dynamic alloca to allocate
+         unique stack space to all the work-items when its wiloop
+         iteration is executed. */
+      elementType =
+        dyn_cast<AllocaInst>(instruction)->getType()->getElementType();
+    }
+  else
+    {
+      elementType = instruction->getType();
+    }
+
+// parameterize alloca to be based on _local_size_{x,y,z}
+  llvm::Value *wgsize = lsizeX;
+  if (maxDim > 1) wgsize = builder.CreateMul(wgsize, lsizeY);
+  if (maxDim > 2) wgsize = builder.CreateMul(wgsize, lsizeZ);
+  llvm::Instruction *alloca =
+    builder.CreateAlloca(elementType, wgsize, varName);
+
+  contextArrays[varName] = alloca;
+  return alloca;
+}
+
+
+/**
+ * Adds context save/restore code for the value produced by the
+ * given instruction.
+ *
+ * TODO: add only one restore per variable per region.
+ * TODO: add only one load of the id variables per region.
+ * Could be done by having a context restore BB in the beginning of the
+ * region and a context save BB at the end.
+ * TODO: ignore work group variables completely (the iteration variables)
+ * The LLVM should optimize these away but it would improve
+ * the readability of the output during debugging.
+ * TODO: rematerialize some values such as extended values of global
+ * variables (especially global id which is computed from local id) or kernel
+ * argument values instead of allocating stack space for them
+ */
+void
+WorkitemLoops::AddContextSaveRestore
+(llvm::Instruction *instruction) {
+
+  /* Allocate the context data array for the variable. */
+  llvm::Instruction *alloca = GetContextArray(instruction);
+  llvm::Instruction *theStore = AddContextSave(instruction, alloca);
+
+  InstructionVec uses;
+  /* Restore the produced variable before each use to ensure the correct context
+     copy is used.
+
+     We could add the restore only to other regions outside the
+     variable defining region and use the original variable in the defining
+     region due to the SSA virtual registers being unique. However,
+     alloca variables can be redefined also in the same region, thus we
+     need to ensure the correct alloca context position is written, not
+     the original unreplicated one. These variables can be generated by
+     volatile variables, private arrays, and due to the PHIs to allocas
+     pass.
+  */
+
+  /* Find out the uses to fix first as fixing them invalidates
+     the iterator. */
+  for (Instruction::use_iterator ui = instruction->use_begin(),
+         ue = instruction->use_end();
+       ui != ue; ++ui)
+  {
+    Instruction *user;
+    if ((user = dyn_cast<Instruction> (*ui)) == NULL) continue;
+    // The store we just created reads the value by design; skip it.
+    if (user == theStore) continue;
+    uses.push_back(user);
+  }
+
+  for (InstructionVec::iterator i = uses.begin(); i != uses.end(); ++i)
+  {
+    Instruction *user = *i;
+    Instruction *contextRestoreLocation = user;
+    /* If the user is in a block that doesn't belong to a region,
+       the variable itself must be a "work group variable", that is,
+       not dependent on the work item. Most likely an iteration
+       variable of a for loop with a barrier. */
+    if (RegionOfBlock(user->getParent()) == NULL) continue;
+
+    PHINode* phi = dyn_cast<PHINode>(user);
+    if (phi != NULL)
+    {
+      /* In case of PHI nodes, we cannot just insert the context
+         restore code before it in the same basic block because it is
+         assumed there are no non-phi Instructions before PHIs which
+         the context restore code constitutes to. Add the context
+         restore to the incomingBB instead.
+
+         There can be values in the PHINode that are incoming
+         from another region even though the decision BB is within the region.
+         For those values we need to add the context restore code in the
+         incoming BB (which is known to be inside the region due to the
+         assumption of not having to touch PHI nodes in PRentry BBs).
+      */
+
+      /* PHINodes at region entries are broken down earlier. */
+      assert ("Cannot add context restore for a PHI node at the region entry!" &&
+              RegionOfBlock(phi->getParent())->entryBB() != phi->getParent());
+#ifdef DEBUG_WORK_ITEM_LOOPS
+      std::cerr << "### adding context restore code before PHI" << std::endl;
+      user->dump();
+      std::cerr << "### in BB:" << std::endl;
+      user->getParent()->dump();
+#endif
+      BasicBlock *incomingBB = NULL;
+      for (unsigned incoming = 0; incoming < phi->getNumIncomingValues();
+           ++incoming)
+      {
+        Value *val = phi->getIncomingValue(incoming);
+        BasicBlock *bb = phi->getIncomingBlock(incoming);
+        if (val == instruction) incomingBB = bb;
+      }
+      // NOTE(review): if the value flows in from several incoming
+      // blocks, only the last one found gets the restore; presumably
+      // fixUndominatedVariableUses() repairs the others -- verify.
+      assert (incomingBB != NULL);
+      contextRestoreLocation = incomingBB->getTerminator();
+    }
+    llvm::Value *loadedValue =
+      AddContextRestore
+      (user, alloca, contextRestoreLocation, isa<AllocaInst>(instruction));
+    user->replaceUsesOfWith(instruction, loadedValue);
+#ifdef DEBUG_WORK_ITEM_LOOPS
+    std::cerr << "### done, the user was converted to:" << std::endl;
+    user->dump();
+#endif
+  }
+}
+
+/* Decides whether an instruction must be excluded from context
+   saving. Loads of the _local_id variables must never be replicated:
+   in the conditional barrier case the region header is shared across
+   the branches, and context saving its ID loads would create a
+   chicken-and-egg dependency on the ids themselves. */
+bool
+WorkitemLoops::ShouldNotBeContextSaved(llvm::Instruction *instr)
+{
+  llvm::LoadInst *load = dyn_cast<llvm::LoadInst>(instr);
+  if (load == NULL)
+    return false;
+
+  llvm::Value *ptr = load->getPointerOperand();
+  return ptr == localIdX || ptr == localIdY || ptr == localIdZ;
+}
+
+// Appends a "pregion.for.inc" block between 'after' and its single
+// successor. The block increments localIdVar by one and branches on to
+// the old successor (the loop's condition block). Returns the new
+// increment block.
+llvm::BasicBlock *
+WorkitemLoops::AppendIncBlock
+(llvm::BasicBlock* after, llvm::Value *localIdVar)
+{
+  llvm::LLVMContext &C = after->getContext();
+
+  llvm::BasicBlock *oldExit = after->getTerminator()->getSuccessor(0);
+  assert (oldExit != NULL);
+
+  llvm::BasicBlock *forIncBB =
+    BasicBlock::Create(C, "pregion.for.inc", after->getParent());
+
+  after->getTerminator()->replaceUsesOfWith(oldExit, forIncBB);
+
+  /* Build directly in the new block (the old code first anchored the
+     builder at oldExit only to immediately move it). */
+  IRBuilder<> builder(forIncBB);
+
+  /* Create the iteration variable increment */
+  builder.CreateStore
+    (builder.CreateAdd
+     (builder.CreateLoad(localIdVar),
+      ConstantInt::get(IntegerType::get(C, size_t_width), 1)),
+     localIdVar);
+
+  builder.CreateBr(oldExit);
+
+  return forIncBB;
+}