diff options
Diffstat (limited to 'src/llvmopencl/WorkitemLoops.cc')
-rw-r--r-- | src/llvmopencl/WorkitemLoops.cc | 1061 |
1 file changed, 1061 insertions, 0 deletions
diff --git a/src/llvmopencl/WorkitemLoops.cc b/src/llvmopencl/WorkitemLoops.cc new file mode 100644 index 0000000..91eb055 --- /dev/null +++ b/src/llvmopencl/WorkitemLoops.cc @@ -0,0 +1,1061 @@ +// LLVM function pass to create loops that run all the work items +// in a work group while respecting barrier synchronization points. +// +// Copyright (c) 2012-2014 Pekka Jääskeläinen / Tampere University of Technology +// Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/ +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +#define DEBUG_TYPE "workitem-loops" + +#include "WorkitemLoops.h" +#include "Workgroup.h" +#include "Barrier.h" +#include "Kernel.h" +#include "config.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Support/CommandLine.h" +#ifdef LLVM_3_1 +#include "llvm/Support/IRBuilder.h" +#include "llvm/Support/TypeBuilder.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Instructions.h" +#include "llvm/Module.h" +#include "llvm/ValueSymbolTable.h" +#elif defined LLVM_3_2 +#include "llvm/IRBuilder.h" +#include "llvm/TypeBuilder.h" +#include "llvm/DataLayout.h" +#include "llvm/Instructions.h" +#include "llvm/Module.h" +#include "llvm/ValueSymbolTable.h" +#else +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/TypeBuilder.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/ValueSymbolTable.h" +#endif +#include "llvm/Analysis/PostDominators.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +#include <llvm/Support/InstIterator.h> +#include "WorkitemHandlerChooser.h" + +#include <iostream> +#include <map> +#include <sstream> +#include <vector> + +//#define DUMP_RESULT_CFG + +#ifdef DUMP_RESULT_CFG +#include "llvm/Analysis/CFGPrinter.h" +#endif + +//#define DEBUG_WORK_ITEM_LOOPS + +using namespace llvm; +using namespace pocl; + +namespace { + static + RegisterPass<WorkitemLoops> X("workitemloops", + "Workitem loop generation pass"); +} + +char WorkitemLoops::ID = 0; + +void +WorkitemLoops::getAnalysisUsage(AnalysisUsage &AU) const +{ + AU.addRequired<DominatorTree>(); + AU.addRequired<PostDominatorTree>(); + AU.addRequired<LoopInfo>(); +// TODO - Removed due to compilation error +#if 0 +#ifdef LLVM_3_1 + AU.addRequired<TargetData>(); +#else + AU.addRequired<DataLayout>(); +#endif +#endif + AU.addRequired<pocl::WorkitemHandlerChooser>(); +} + +bool +WorkitemLoops::runOnFunction(Function &F) +{ + if (!Workgroup::isKernelToProcess(F)) + return false; + + if 
(getAnalysis<pocl::WorkitemHandlerChooser>().chosenHandler() != + pocl::WorkitemHandlerChooser::POCL_WIH_LOOPS) + return false; + + DT = &getAnalysis<DominatorTree>(); + LI = &getAnalysis<LoopInfo>(); + PDT = &getAnalysis<PostDominatorTree>(); + + tempInstructionIndex = 0; + +#if 0 + std::cerr << "### original:" << std::endl; + F.viewCFG(); +#endif + + bool changed = ProcessFunction(F); +#ifdef DUMP_RESULT_CFG + FunctionPass* cfgPrinter = createCFGOnlyPrinterPass(); + cfgPrinter->runOnFunction(F); +#endif + +#if 0 + std::cerr << "### after:" << std::endl; + F.viewCFG(); +#endif + + changed |= fixUndominatedVariableUses(DT, F); + +#if 0 + /* Split large BBs so we can print the Dot without it crashing. */ + bool fchanged = false; + const int MAX_INSTRUCTIONS_PER_BB = 70; + do { + fchanged = false; + for (Function::iterator i = F.begin(), e = F.end(); i != e; ++i) { + BasicBlock *b = i; + + if (b->size() > MAX_INSTRUCTIONS_PER_BB + 1) + { + int count = 0; + BasicBlock::iterator splitPoint = b->begin(); + while (count < MAX_INSTRUCTIONS_PER_BB || isa<PHINode>(splitPoint)) + { + ++splitPoint; + ++count; + } + SplitBlock(b, splitPoint, this); + fchanged = true; + break; + } + } + + } while (fchanged); + + F.viewCFG(); +#endif + contextArrays.clear(); + tempInstructionIds.clear(); + + return changed; +} + +std::pair<llvm::BasicBlock *, llvm::BasicBlock *> +WorkitemLoops::CreateLoopAround +(ParallelRegion ®ion, + llvm::BasicBlock *entryBB, llvm::BasicBlock *exitBB, + bool peeledFirst, llvm::Value *localIdVar, size_t LocalSizeForDim, + bool addIncBlock, llvm::Instruction *lsizeDim) +{ + assert (localIdVar != NULL); + + /* + + Generate a structure like this for each loop level (x,y,z): + + for.init: + + ; if peeledFirst is false: + store i32 0, i32* %_local_id_x, align 4 + + ; if peeledFirst is true (assume the 0,0,0 iteration has been executed earlier) + ; assume _local_id_x_first is is initialized to 1 in the peeled pregion copy + store _local_id_x_first, i32* 
%_local_id_x, align 4 + store i32 0, %_local_id_x_first + + br label %for.body + + for.body: + + ; the parallel region code here + + br label %for.inc + + for.inc: + + ; Separated inc and cond check blocks for easier loop unrolling later on. + ; Can then chain N times for.body+for.inc to unroll. + + %2 = load i32* %_local_id_x, align 4 + %inc = add nsw i32 %2, 1 + + store i32 %inc, i32* %_local_id_x, align 4 + br label %for.cond + + for.cond: + + ; loop header, compare the id to the local size + %0 = load i32* %_local_id_x, align 4 + %cmp = icmp ult i32 %0, i32 123 + br i1 %cmp, label %for.body, label %for.end + + for.end: + + OPTIMIZE: Use a separate iteration variable across all the loops to iterate the context + data arrays to avoid needing multiplications to find the correct location, and to + enable easy vectorization of loading the context data when there are parallel iterations. + */ + + llvm::BasicBlock *loopBodyEntryBB = entryBB; + llvm::LLVMContext &C = loopBodyEntryBB->getContext(); + llvm::Function *F = loopBodyEntryBB->getParent(); + loopBodyEntryBB->setName("pregion.for.body"); + + assert (exitBB->getTerminator()->getNumSuccessors() == 1); + + llvm::BasicBlock *oldExit = exitBB->getTerminator()->getSuccessor(0); + + llvm::BasicBlock *forInitBB = + BasicBlock::Create(C, "pregion.for.init", F, loopBodyEntryBB); + + llvm::BasicBlock *loopEndBB = + BasicBlock::Create(C, "pregion.for.end", F, exitBB); + + llvm::BasicBlock *forCondBB = + BasicBlock::Create(C, "pregion.for.cond", F, exitBB); + + DT->runOnFunction(*F); + + // F->viewCFG(); + /* Fix the old edges jumping to the region to jump to the basic block + that starts the created loop. Back edges should still point to the + old basic block so we preserve the old loops. 
*/ + BasicBlockVector preds; + llvm::pred_iterator PI = + llvm::pred_begin(entryBB), + E = llvm::pred_end(entryBB); + + for (; PI != E; ++PI) + { + llvm::BasicBlock *bb = *PI; + preds.push_back(bb); + } + + for (BasicBlockVector::iterator i = preds.begin(); + i != preds.end(); ++i) + { + llvm::BasicBlock *bb = *i; + /* Do not fix loop edges inside the region. The loop + is replicated as a whole to the body of the wi-loop.*/ + if (DT->dominates(loopBodyEntryBB, bb)) + continue; + bb->getTerminator()->replaceUsesOfWith(loopBodyEntryBB, forInitBB); + } + + IRBuilder<> builder(forInitBB); + + if (peeledFirst) + { + builder.CreateStore(builder.CreateLoad(localIdXFirstVar), localIdVar); + builder.CreateStore + (ConstantInt::get(IntegerType::get(C, size_t_width), 0), localIdXFirstVar); + } + else + { + builder.CreateStore + (ConstantInt::get(IntegerType::get(C, size_t_width), 0), localIdVar); + } + + builder.CreateBr(loopBodyEntryBB); + + exitBB->getTerminator()->replaceUsesOfWith(oldExit, forCondBB); + if (addIncBlock) + { + AppendIncBlock(exitBB, localIdVar); + } + + builder.SetInsertPoint(forCondBB); + + llvm::Value *cmpResult; + if (lsizeDim == NULL) + { + cmpResult = + builder.CreateICmpULT + (builder.CreateLoad(localIdVar), + (ConstantInt::get + (IntegerType::get(C, size_t_width), + LocalSizeForDim)) + ); + } + else + { + cmpResult = + builder.CreateICmpULT + (builder.CreateLoad(localIdVar), + lsizeDim + ); + } + + Instruction *loopBranch = + builder.CreateCondBr(cmpResult, loopBodyEntryBB, loopEndBB); + + /* Add the metadata to mark a parallel loop. The metadata + refer to a loop-unique dummy metadata that is not merged + automatically. */ + + /* This creation of the identifier metadata is copied from + LLVM's MDBuilder::createAnonymousTBAARoot(). 
*/ + MDNode *Dummy = MDNode::getTemporary(C, ArrayRef<Value*>()); + MDNode *Root = MDNode::get(C, Dummy); + // At this point we have + // !0 = metadata !{} <- dummy + // !1 = metadata !{metadata !0} <- root + // Replace the dummy operand with the root node itself and delete the dummy. + Root->replaceOperandWith(0, Root); + MDNode::deleteTemporary(Dummy); + // We now have + // !1 = metadata !{metadata !1} <- self-referential root + + loopBranch->setMetadata("llvm.loop.parallel", Root); + region.AddParallelLoopMetadata(Root); + + builder.SetInsertPoint(loopEndBB); + builder.CreateBr(oldExit); + + return std::make_pair(forInitBB, loopEndBB); +} + +ParallelRegion* +WorkitemLoops::RegionOfBlock(llvm::BasicBlock *bb) +{ + for (ParallelRegion::ParallelRegionVector::iterator + i = original_parallel_regions->begin(), + e = original_parallel_regions->end(); + i != e; ++i) + { + ParallelRegion *region = (*i); + if (region->HasBlock(bb)) return region; + } + return NULL; +} + +// PreAnalyze kernel function, find out dimension (borrowed from wga) +// PreCreate local sizes which are workgroup invariant +void WorkitemLoops::FindKernelDim(Function &F) +{ + maxDim = 1; + for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ++I) + if (CallInst * callInst = dyn_cast<CallInst>(&*I)) + { + if (!callInst->getCalledFunction()) continue; + std::string functionName(callInst->getCalledFunction()->getName()); + + if (functionName == "get_local_id" || + functionName == "get_global_id") + { + Value *arg = callInst->getArgOperand(0); + if (ConstantInt * constInt = dyn_cast<ConstantInt>(arg)) + { + unsigned int dimIdx = constInt->getSExtValue(); + dimIdx = (MAX_DIMENSIONS-1 < dimIdx) ? MAX_DIMENSIONS-1 : dimIdx; + maxDim = (maxDim < dimIdx + 1) ? dimIdx+1 : maxDim; + } + + /*------------------------------------------------------------- + * if the work group function has a variable argument, then + * assume worst case and return 3 loop levels are needed. 
+ *------------------------------------------------------------*/ + else + { + maxDim = 3; + break; + } + } + } + + llvm::Module *M = F.getParent(); + llvm::Type *Int32 = IntegerType::get(M->getContext(), 32); + FunctionType *ft = FunctionType::get + (/*Result=*/ Int32, + /*Params=*/ Int32, + /*isVarArg=*/ false); + Function *f_localsize = + dyn_cast<Function>(M->getOrInsertFunction("get_local_size", ft)); + SmallVector<Value *, 4> argsx, argsy, argsz; + argsx.push_back(ConstantInt::get(Int32, 0)); + lsizeX = CallInst::Create(f_localsize, ArrayRef<Value *>(argsx)); + if (maxDim > 1) + { + argsy.push_back(ConstantInt::get(Int32, 1)); + lsizeY = CallInst::Create(f_localsize, ArrayRef<Value *>(argsy)); + } + if (maxDim > 2) + { + argsz.push_back(ConstantInt::get(Int32, 2)); + lsizeZ = CallInst::Create(f_localsize, ArrayRef<Value *>(argsz)); + } +} + +bool +WorkitemLoops::ProcessFunction(Function &F) +{ + Kernel *K = cast<Kernel> (&F); + Initialize(K); + +#if 0 // TODO: do something for reqd_work_group_size + unsigned workItemCount = LocalSizeX*LocalSizeY*LocalSizeZ; + if (workItemCount == 1) + { + K->addLocalSizeInitCode(LocalSizeX, LocalSizeY, LocalSizeZ); + ParallelRegion::insertLocalIdInit(&F.getEntryBlock(), 0, 0, 0); + return true; + } +#endif + + FindKernelDim(F); + + original_parallel_regions = + K->getParallelRegions(LI); + + IRBuilder<> builder(F.getEntryBlock().getFirstInsertionPt()); + localIdXFirstVar = + builder.CreateAlloca + (IntegerType::get(F.getContext(), size_t_width), 0, ".pocl.local_id_x_init"); + + // F.viewCFGOnly(); + +#if 0 + std::cerr << "### Original" << std::endl; + F.viewCFG(); +#endif + +#if 0 + for (ParallelRegion::ParallelRegionVector::iterator + i = original_parallel_regions->begin(), + e = original_parallel_regions->end(); + i != e; ++i) + { + ParallelRegion *region = (*i); + region->InjectRegionPrintF(); + region->InjectVariablePrintouts(); + } +#endif + + /* Count how many parallel regions share each entry node to + detect diverging 
regions that need to be peeled. */ + std::map<llvm::BasicBlock*, int> entryCounts; + + for (ParallelRegion::ParallelRegionVector::iterator + i = original_parallel_regions->begin(), + e = original_parallel_regions->end(); + i != e; ++i) + { + ParallelRegion *region = (*i); +#ifdef DEBUG_WORK_ITEM_LOOPS + std::cerr << "### Adding context save/restore for PR: "; + region->dumpNames(); +#endif + FixMultiRegionVariables(region); + entryCounts[region->entryBB()]++; + } + +#if 0 + std::cerr << "### After context code addition:" << std::endl; + F.viewCFG(); +#endif + std::map<ParallelRegion*, bool> peeledRegion; + for (ParallelRegion::ParallelRegionVector::iterator + i = original_parallel_regions->begin(), + e = original_parallel_regions->end(); + i != e; ++i) + { + + llvm::ValueToValueMapTy reference_map; + ParallelRegion *original = (*i); + +#ifdef DEBUG_WORK_ITEM_LOOPS + std::cerr << "### handling region:" << std::endl; + original->dumpNames(); + //F.viewCFGOnly(); +#endif + + /* In case of conditional barriers, the first iteration + has to be peeled so we know which branch to execute + with the work item loop. In case there are more than one + parallel region sharing an entry BB, it's a diverging + region. + + Post dominance of entry by exit does not work in case the + region is inside a loop and the exit block is in the path + towards the loop exit (and the function exit). 
+ */ + bool peelFirst = entryCounts[original->entryBB()] > 1; + + peeledRegion[original] = peelFirst; + + std::pair<llvm::BasicBlock *, llvm::BasicBlock *> l; + // the original predecessor nodes of which successor + // should be fixed if not peeling + BasicBlockVector preds; + + bool unrolled = false; + if (peelFirst) + { +#ifdef DEBUG_WORK_ITEM_LOOPS + std::cerr << "### conditional region, peeling the first iteration" << std::endl; +#endif + ParallelRegion *replica = + original->replicate(reference_map, ".peeled_wi"); + replica->chainAfter(original); + replica->purge(); + + l = std::make_pair(replica->entryBB(), replica->exitBB()); + } + else + { + llvm::pred_iterator PI = + llvm::pred_begin(original->entryBB()), + E = llvm::pred_end(original->entryBB()); + + for (; PI != E; ++PI) + { + llvm::BasicBlock *bb = *PI; + if (DT->dominates(original->entryBB(), bb) && + (RegionOfBlock(original->entryBB()) == + RegionOfBlock(bb))) + continue; + preds.push_back(bb); + } + +#if 0 + int unrollCount; + if (getenv("POCL_WILOOPS_MAX_UNROLL_COUNT") != NULL) + unrollCount = atoi(getenv("POCL_WILOOPS_MAX_UNROLL_COUNT")); + else + unrollCount = 1; + /* Find a two's exponent unroll count, if available. 
*/ + while (unrollCount >= 1) + { + if (LocalSizeX % unrollCount == 0 && + unrollCount <= LocalSizeX) + { + break; + } + unrollCount /= 2; + } + + if (unrollCount > 1) { + ParallelRegion *prev = original; + llvm::BasicBlock *lastBB = + AppendIncBlock(original->exitBB(), localIdX); + original->AddBlockAfter(lastBB, original->exitBB()); + original->SetExitBB(lastBB); + + if (AddWIMetadata) + original->AddIDMetadata(F.getContext(), 0); + + for (int c = 1; c < unrollCount; ++c) + { + ParallelRegion *unrolled = + original->replicate(reference_map, ".unrolled_wi"); + unrolled->chainAfter(prev); + prev = unrolled; + lastBB = unrolled->exitBB(); + if (AddWIMetadata) + unrolled->AddIDMetadata(F.getContext(), c); + } + unrolled = true; + l = std::make_pair(original->entryBB(), lastBB); + } else { + l = std::make_pair(original->entryBB(), original->exitBB()); + } +#else + l = std::make_pair(original->entryBB(), original->exitBB()); +#endif + } + + l = CreateLoopAround(*original, l.first, l.second, peelFirst, localIdX, + LocalSizeX, true, lsizeX); + if (maxDim > 1) + l = CreateLoopAround(*original, l.first, l.second, false, localIdY, + LocalSizeY, true, lsizeY); + if (maxDim > 2) + l = CreateLoopAround(*original, l.first, l.second, false, localIdZ, + LocalSizeZ, true, lsizeZ); + + /* Loop edges coming from another region mean B-loops which means + we have to fix the loop edge to jump to the beginning of the wi-loop + structure, not its body. This has to be done only for non-peeled + blocks as the semantics is correct in the other case (the jump is + to the beginning of the peeled iteration). 
*/ + if (!peelFirst) + { + for (BasicBlockVector::iterator i = preds.begin(); + i != preds.end(); ++i) + { + llvm::BasicBlock *bb = *i; + bb->getTerminator()->replaceUsesOfWith + (original->entryBB(), l.first); + } + } + } + + // for the peeled regions we need to add a prologue + // that initializes the local ids and the first iteration + // counter + for (ParallelRegion::ParallelRegionVector::iterator + i = original_parallel_regions->begin(), + e = original_parallel_regions->end(); + i != e; ++i) + { + ParallelRegion *pr = (*i); + + if (!peeledRegion[pr]) continue; + pr->insertPrologue(0, 0, 0); + builder.SetInsertPoint(pr->entryBB()->getFirstInsertionPt()); + builder.CreateStore + (ConstantInt::get(IntegerType::get(F.getContext(), size_t_width), 1), + localIdXFirstVar); + } + + // Creating lsize* values have been hoisted up + // K->addLocalSizeInitCode(LocalSizeX, LocalSizeY, LocalSizeZ); + llvm::Instruction *inspt = F.getEntryBlock().getFirstNonPHI(); + inspt->getParent()->getInstList().insert(inspt, lsizeX); + if (maxDim > 1) + inspt->getParent()->getInstList().insert(inspt, lsizeY); + if (maxDim > 2) + inspt->getParent()->getInstList().insert(inspt, lsizeZ); + // llvm::GlobalVariable *gvx = M->getGlobalVariable("_local_size_x"); + // llvm::GlobalVariable *gvy = M->getGlobalVariable("_local_size_y"); + // llvm::GlobalVariable *gvz = M->getGlobalVariable("_local_size_z"); + // llvm::Instruction *storex = new StoreInst(lsizeX, gvx, inspt); + // llvm::Instruction *storey = new StoreInst(lsizeY, gvy, inspt); + // llvm::Instruction *storez = new StoreInst(lsizeZ, gvz, inspt); + + + ParallelRegion::insertLocalIdInit(&F.getEntryBlock(), 0, 0, 0); + +#if 0 + F.viewCFG(); +#endif + + return true; +} + +/* + * Add context save/restore code to variables that are defined in + * the given region and are used outside the region. + * + * Each such variable gets a slot in the stack frame. The variable + * is restored from the stack whenever it's used. 
+ * + */ +void +WorkitemLoops::FixMultiRegionVariables(ParallelRegion *region) +{ + InstructionIndex instructionsInRegion; + InstructionVec instructionsToFix; + + /* Construct an index of the region's instructions so it's + fast to figure out if the variable uses are all + in the region. */ + for (BasicBlockVector::iterator i = region->begin(); + i != region->end(); ++i) + { + llvm::BasicBlock *bb = *i; + for (llvm::BasicBlock::iterator instr = bb->begin(); + instr != bb->end(); ++instr) + { + llvm::Instruction *instruction = instr; + instructionsInRegion.insert(instruction); + } + } + + /* Find all the instructions that define new values and + check if they need to be context saved. */ + for (BasicBlockVector::iterator i = region->begin(); + i != region->end(); ++i) + { + llvm::BasicBlock *bb = *i; + for (llvm::BasicBlock::iterator instr = bb->begin(); + instr != bb->end(); ++instr) + { + llvm::Instruction *instruction = instr; + + if (ShouldNotBeContextSaved(instr)) continue; + + for (Instruction::use_iterator ui = instruction->use_begin(), + ue = instruction->use_end(); + ui != ue; ++ui) + { + Instruction *user; + if ((user = dyn_cast<Instruction> (*ui)) == NULL) continue; + // if the instruction is used outside this region inside another + // region (not in a regionless BB like the B-loop construct BBs), + // need to context save it. + if (instructionsInRegion.find(user) == instructionsInRegion.end() && + RegionOfBlock(user->getParent()) != NULL) + { + instructionsToFix.push_back(instruction); + break; + } + } + } + } + + /* Finally, fix the instructions. 
*/ + for (InstructionVec::iterator i = instructionsToFix.begin(); + i != instructionsToFix.end(); ++i) + { +#ifdef DEBUG_WORK_ITEM_LOOPS + std::cerr << "### adding context/save restore for" << std::endl; + (*i)->dump(); +#endif + llvm::Instruction *instructionToFix = *i; + AddContextSaveRestore(instructionToFix); + } +} + +llvm::Instruction * +WorkitemLoops::AddContextSave +(llvm::Instruction *instruction, llvm::Instruction *alloca) +{ + + if (isa<AllocaInst>(instruction)) + { + /* If the variable to be context saved is itself an alloca, + we have created one big alloca that stores the data of all the + work-items and return pointers to that array. Thus, we need + no initialization code other than the context data alloca itself. */ + return NULL; + } + + /* Save the produced variable to the array. */ + BasicBlock::iterator definition = dyn_cast<Instruction>(instruction); + + ++definition; + while (isa<PHINode>(definition)) ++definition; + + IRBuilder<> builder(definition); + std::vector<llvm::Value *> gepArgs; + + /* Reuse the id loads earlier in the region, if possible, to + avoid messy output with lots of redundant loads. */ + ParallelRegion *region = RegionOfBlock(instruction->getParent()); + assert ("Adding context save outside any region produces illegal code." 
&& + region != NULL); + +// linearize index computation for store into alloca +// alloca[idz * sizey*sizex + idy * sizex + idx] + llvm::Value *linear_index = region->LocalIDXLoad(); + if (maxDim > 1) + linear_index = builder.CreateAdd(linear_index, + builder.CreateMul(region->LocalIDYLoad(), + lsizeX) ); + if (maxDim > 2) + linear_index = builder.CreateAdd(linear_index, + builder.CreateMul(region->LocalIDZLoad(), + builder.CreateMul(lsizeY, lsizeX)) ); + gepArgs.push_back(linear_index); + + return builder.CreateStore(instruction, builder.CreateGEP(alloca, gepArgs)); + +} + +llvm::Instruction * +WorkitemLoops::AddContextRestore +(llvm::Value *val, llvm::Instruction *alloca, llvm::Instruction *before, + bool isAlloca) +{ + assert (val != NULL); + IRBuilder<> builder(alloca); + if (before != NULL) + { + builder.SetInsertPoint(before); + } + else if (isa<Instruction>(val)) + { + builder.SetInsertPoint(dyn_cast<Instruction>(val)); + before = dyn_cast<Instruction>(val); + } + else + { + assert (false && "Unknown context restore location!"); + } + + + std::vector<llvm::Value *> gepArgs; + + /* Reuse the id loads earlier in the region, if possible, to + avoid messy output with lots of redundant loads. */ + ParallelRegion *region = RegionOfBlock(before->getParent()); + assert ("Adding context save outside any region produces illegal code." 
&& + region != NULL); + +// linearize alloca loads +// idz * _local_size_x * _local_size_y + idy * _local_size_x + idx + llvm::Value *linear_index = region->LocalIDXLoad(); + if (maxDim > 1) + linear_index = builder.CreateAdd(linear_index, + builder.CreateMul(region->LocalIDYLoad(), + lsizeX) ); + if (maxDim > 2) + linear_index = builder.CreateAdd(linear_index, + builder.CreateMul(region->LocalIDZLoad(), + builder.CreateMul(lsizeY, lsizeX)) ); + gepArgs.push_back(linear_index); + + llvm::Instruction *gep = + dyn_cast<Instruction>(builder.CreateGEP(alloca, gepArgs)); + + if (isAlloca) { + /* In case the context saved instruction was an alloca, we created a + context array with pointed-to elements, and now want to return a pointer + to the elements to emulate the original alloca. */ + return gep; + } + return builder.CreateLoad(gep); +} + +/** + * Returns the context array (alloca) for the given Value, creates it if not + * found. + */ +llvm::Instruction * +WorkitemLoops::GetContextArray(llvm::Instruction *instruction) +{ + + /* + * Unnamed temp instructions need a generated name for the + * context array. Create one using a running integer. 
+ */ + std::ostringstream var; + var << "."; + + if (std::string(instruction->getName().str()) != "") + { + var << instruction->getName().str(); + } + else if (tempInstructionIds.find(instruction) != tempInstructionIds.end()) + { + var << tempInstructionIds[instruction]; + } + else + { + tempInstructionIds[instruction] = tempInstructionIndex++; + var << tempInstructionIds[instruction]; + } + + var << ".pocl_context"; + std::string varName = var.str(); + + if (contextArrays.find(varName) != contextArrays.end()) + return contextArrays[varName]; + + IRBuilder<> builder(instruction->getParent()->getParent()->getEntryBlock().getFirstInsertionPt()); + + llvm::Type *elementType; + if (isa<AllocaInst>(instruction)) + { + /* If the variable to be context saved was itself an alloca, + create one big alloca that stores the data of all the + work-items and directly return pointers to that array. + This enables moving all the allocas to the entry node without + breaking the parallel loop. + Otherwise we would rely on a dynamic alloca to allocate + unique stack space to all the work-items when its wiloop + iteration is executed. */ + elementType = + dyn_cast<AllocaInst>(instruction)->getType()->getElementType(); + } + else + { + elementType = instruction->getType(); + } + +// parameterize alloca to be based on _local_size_{x,y,z} + llvm::Value *wgsize = lsizeX; + if (maxDim > 1) wgsize = builder.CreateMul(wgsize, lsizeY); + if (maxDim > 2) wgsize = builder.CreateMul(wgsize, lsizeZ); + llvm::Type *contextArrayType = ArrayType::get(elementType, 1); + llvm::Instruction *alloca = + builder.CreateAlloca(elementType, wgsize, varName); + + contextArrays[varName] = alloca; + return alloca; +} + + +/** + * Adds context save/restore code for the value produced by the + * given instruction. + * + * TODO: add only one restore per variable per region. + * TODO: add only one load of the id variables per region. 
+ * Could be done by having a context restore BB in the beginning of the + * region and a context save BB at the end. + * TODO: ignore work group variables completely (the iteration variables) + * The LLVM should optimize these away but it would improve + * the readability of the output during debugging. + * TODO: rematerialize some values such as extended values of global + * variables (especially global id which is computed from local id) or kernel + * argument values instead of allocating stack space for them + */ +void +WorkitemLoops::AddContextSaveRestore +(llvm::Instruction *instruction) { + + /* Allocate the context data array for the variable. */ + llvm::Instruction *alloca = GetContextArray(instruction); + llvm::Instruction *theStore = AddContextSave(instruction, alloca); + + InstructionVec uses; + /* Restore the produced variable before each use to ensure the correct context + copy is used. + + We could add the restore only to other regions outside the + variable defining region and use the original variable in the defining + region due to the SSA virtual registers being unique. However, + alloca variables can be redefined also in the same region, thus we + need to ensure the correct alloca context position is written, not + the original unreplicated one. These variables can be generated by + volatile variables, private arrays, and due to the PHIs to allocas + pass. + */ + + /* Find out the uses to fix first as fixing them invalidates + the iterator. 
*/ + for (Instruction::use_iterator ui = instruction->use_begin(), + ue = instruction->use_end(); + ui != ue; ++ui) + { + Instruction *user; + if ((user = dyn_cast<Instruction> (*ui)) == NULL) continue; + if (user == theStore) continue; + uses.push_back(user); + } + + for (InstructionVec::iterator i = uses.begin(); i != uses.end(); ++i) + { + Instruction *user = *i; + Instruction *contextRestoreLocation = user; + /* If the user is in a block that doesn't belong to a region, + the variable itself must be a "work group variable", that is, + not dependent on the work item. Most likely an iteration + variable of a for loop with a barrier. */ + if (RegionOfBlock(user->getParent()) == NULL) continue; + + PHINode* phi = dyn_cast<PHINode>(user); + if (phi != NULL) + { + /* In case of PHI nodes, we cannot just insert the context + restore code before it in the same basic block because it is + assumed there are no non-phi Instructions before PHIs which + the context restore code constitutes to. Add the context + restore to the incomingBB instead. + + There can be values in the PHINode that are incoming + from another region even though the decision BB is within the region. + For those values we need to add the context restore code in the + incoming BB (which is known to be inside the region due to the + assumption of not having to touch PHI nodes in PRentry BBs). + */ + + /* PHINodes at region entries are broken down earlier. */ + assert ("Cannot add context restore for a PHI node at the region entry!" 
&& + RegionOfBlock(phi->getParent())->entryBB() != phi->getParent()); +#ifdef DEBUG_WORK_ITEM_LOOPS + std::cerr << "### adding context restore code before PHI" << std::endl; + user->dump(); + std::cerr << "### in BB:" << std::endl; + user->getParent()->dump(); +#endif + BasicBlock *incomingBB = NULL; + for (unsigned incoming = 0; incoming < phi->getNumIncomingValues(); + ++incoming) + { + Value *val = phi->getIncomingValue(incoming); + BasicBlock *bb = phi->getIncomingBlock(incoming); + if (val == instruction) incomingBB = bb; + } + assert (incomingBB != NULL); + contextRestoreLocation = incomingBB->getTerminator(); + } + llvm::Value *loadedValue = + AddContextRestore + (user, alloca, contextRestoreLocation, isa<AllocaInst>(instruction)); + user->replaceUsesOfWith(instruction, loadedValue); +#ifdef DEBUG_WORK_ITEM_LOOPS + std::cerr << "### done, the user was converted to:" << std::endl; + user->dump(); +#endif + } +} + +bool +WorkitemLoops::ShouldNotBeContextSaved(llvm::Instruction *instr) +{ + /* + _local_id loads should not be replicated as it leads to + problems in conditional branch case where the header node + of the region is shared across the branches and thus the + header node's ID loads might get context saved which leads + to egg-chicken problems. 
+ */ + llvm::LoadInst *load = dyn_cast<llvm::LoadInst>(instr); + if (load != NULL && + (load->getPointerOperand() == localIdZ || + load->getPointerOperand() == localIdY || + load->getPointerOperand() == localIdX)) + return true; + return false; +} + +llvm::BasicBlock * +WorkitemLoops::AppendIncBlock +(llvm::BasicBlock* after, llvm::Value *localIdVar) +{ + llvm::LLVMContext &C = after->getContext(); + + llvm::BasicBlock *oldExit = after->getTerminator()->getSuccessor(0); + assert (oldExit != NULL); + + llvm::BasicBlock *forIncBB = + BasicBlock::Create(C, "pregion.for.inc", after->getParent()); + + after->getTerminator()->replaceUsesOfWith(oldExit, forIncBB); + + IRBuilder<> builder(oldExit); + + builder.SetInsertPoint(forIncBB); + /* Create the iteration variable increment */ + builder.CreateStore + (builder.CreateAdd + (builder.CreateLoad(localIdVar), + ConstantInt::get(IntegerType::get(C, size_t_width), 1)), + localIdVar); + + builder.CreateBr(oldExit); + + return forIncBB; +} |