diff options
Diffstat (limited to 'src/llvmopencl/ParallelRegion.cc')
-rw-r--r-- | src/llvmopencl/ParallelRegion.cc | 809 |
1 files changed, 809 insertions, 0 deletions
diff --git a/src/llvmopencl/ParallelRegion.cc b/src/llvmopencl/ParallelRegion.cc new file mode 100644 index 0000000..72d89c1 --- /dev/null +++ b/src/llvmopencl/ParallelRegion.cc @@ -0,0 +1,809 @@ +// Class definition for parallel regions, a group of BasicBlocks that +// each kernel should run in parallel. +// +// Copyright (c) 2011 Universidad Rey Juan Carlos and +// 2012 Pekka Jääskeläinen / TUT +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#include "ParallelRegion.h" +#include "Barrier.h" +#include "Kernel.h" +#include "config.h" +#ifdef LLVM_3_1 +#include "llvm/Support/IRBuilder.h" +#include "llvm/ValueSymbolTable.h" +#elif defined LLVM_3_2 +#include "llvm/IRBuilder.h" +#include "llvm/ValueSymbolTable.h" +#else +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/ValueSymbolTable.h" +#endif +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" + +#include <set> +#include <sstream> +#include <map> +#include <algorithm> + +using namespace std; +using namespace llvm; +using namespace pocl; + +//#define DEBUG_REMAP +//#define DEBUG_REPLICATE +//#define DEBUG_PURGE + +#include <iostream> + +int ParallelRegion::idGen = 0; + + +ParallelRegion::ParallelRegion(int forcedRegionId) : + std::vector<llvm::BasicBlock *>(), + LocalIDXLoadInstr(NULL), LocalIDYLoadInstr(NULL), LocalIDZLoadInstr(NULL), + exitIndex_(0), entryIndex_(0), pRegionId(forcedRegionId) +{ + if (forcedRegionId == -1) + pRegionId = idGen++; +} + +/** + * Ensure all variables are named so they will be replicated and renamed + * correctly. + */ +void +ParallelRegion::GenerateTempNames(llvm::BasicBlock *bb) +{ + for (llvm::BasicBlock::iterator i = bb->begin(), e = bb->end(); i != e; ++i) + { + llvm::Instruction *instr = i; + if (instr->hasName() || !instr->isUsedOutsideOfBlock(bb)) continue; + int tempCounter = 0; + std::string tempName = ""; + do { + std::ostringstream name; + name << ".pocl_temp." << tempCounter; + ++tempCounter; + tempName = name.str(); + } while (bb->getParent()->getValueSymbolTable().lookup(tempName) != NULL); + instr->setName(tempName); + } +} + +// BarrierBlock * +// ParallelRegion::getEntryBarrier() +// { +// BasicBlock *entry = front(); +// BasicBlock *barrier = entry->getSinglePredecessor(); + +// return cast<BarrierBlock> (barrier); +// } + +ParallelRegion * +ParallelRegion::replicate(ValueToValueMapTy &map, + const Twine &suffix = "") +{ + ParallelRegion *new_region = new ParallelRegion(pRegionId); + + /* Because ParallelRegions are all replicated before they + are attached to the function, it can happen that + the same BB is replicated multiple times and it gets + the same name (only the BB name will be autorenamed + by LLVM). This causes the variable references to become + broken. This hack ensures the BB suffixes are unique + before cloning so each path gets their own value + names. Split points can be such paths.*/ + static std::map<std::string, int> cloneCounts; + + for (iterator i = begin(), e = end(); i != e; ++i) { + BasicBlock *block = *i; + GenerateTempNames(block); + std::ostringstream suf; + suf << suffix.str(); + std::string block_name = block->getName().str() + "." + suffix.str(); + if (cloneCounts[block_name] > 0) + { + suf << ".pocl_" << cloneCounts[block_name]; + } + BasicBlock *new_block = CloneBasicBlock(block, map, suf.str()); + cloneCounts[block_name]++; + // Insert the block itself into the map. + map[block] = new_block; + new_region->push_back(new_block); + +#ifdef DEBUG_REPLICATE + std::cerr << "### clonee block:" << std::endl; + block->dump(); + std::cerr << endl << "### cloned block: " << std::endl; + new_block->dump(); +#endif + } + + new_region->exitIndex_ = exitIndex_; + new_region->entryIndex_ = entryIndex_; + /* Remap here to get local variables fixed before they + are (possibly) overwritten by another clone of the + same BB. */ + new_region->remap(map); + +#ifdef DEBUG_REPLICATE + Verify(); +#endif + LocalizeIDLoads(); + + return new_region; +} + +void +ParallelRegion::remap(ValueToValueMapTy &map) +{ + for (iterator i = begin(), e = end(); i != e; ++i) { + +#ifdef DEBUG_REMAP + std::cerr << "### block before remap:" << std::endl; + (*i)->dump(); +#endif + + for (BasicBlock::iterator ii = (*i)->begin(), ee = (*i)->end(); + ii != ee; ++ii) + RemapInstruction(ii, map, + RF_IgnoreMissingEntries | RF_NoModuleLevelChanges); + +#ifdef DEBUG_REMAP + std::cerr << endl << "### block after remap: " << std::endl; + (*i)->dump(); +#endif + } +} + +void +ParallelRegion::chainAfter(ParallelRegion *region) +{ + /* If we are replicating a conditional barrier + region, the last block can be an unreachable + block to mark the impossible path. Skip + it and choose the correct branch instead. + + TODO: why have the unreachable block there the + first place? Could we just not add it and fix + the branch? */ + BasicBlock *tail = region->exitBB(); + TerminatorInst *t = tail->getTerminator(); + if (isa<UnreachableInst>(t)) + { + tail = region->at(region->size() - 2); + t = tail->getTerminator(); + } + if (t->getNumSuccessors() != 1) + { + std::cout << "!!! trying to chain region" << std::endl; + this->dumpNames(); + std::cout << "!!! after region" << std::endl; + region->dumpNames(); + t->getParent()->dump(); + + assert (t->getNumSuccessors() == 1); + } + + BasicBlock *successor = t->getSuccessor(0); + Function::BasicBlockListType &bb_list = + successor->getParent()->getBasicBlockList(); + + for (iterator i = begin(), e = end(); i != e; ++i) + bb_list.insertAfter(tail, *i); + + t->setSuccessor(0, entryBB()); + + t = exitBB()->getTerminator(); + assert (t->getNumSuccessors() == 1); + t->setSuccessor(0, successor); +} + +void +ParallelRegion::purge() +{ + SmallVector<BasicBlock *, 4> new_blocks; + + for (iterator i = begin(), e = end(); i != e; ++i) { + + // Exit block has a successor out of the region. + if (*i == exitBB()) + continue; + +#ifdef DEBUG_PURGE + std::cerr << "### block before purge:" << std::endl; + (*i)->dump(); +#endif + TerminatorInst *t = (*i)->getTerminator(); + for (unsigned ii = 0, ee = t->getNumSuccessors(); ii != ee; ++ii) { + BasicBlock *successor = t->getSuccessor(ii); + if (count(begin(), end(), successor) == 0) { + // This successor is not on the parallel region, purge. + iterator next_block = i; + ++next_block; + assert ((*i)->getParent() != NULL && *next_block != NULL); + BasicBlock *unreachable = + BasicBlock::Create((*i)->getContext(), + (*i)->getName() + ".unreachable", + (*i)->getParent(), + *next_block); + new UnreachableInst(unreachable->getContext(), + unreachable); + t->setSuccessor(ii, unreachable); + new_blocks.push_back(unreachable); + } + } +#ifdef DEBUG_PURGE + std::cerr << std::endl << "### block after purge:" << std::endl; + (*i)->dump(); +#endif + } + + // Add the new "unreachable" blocks to the + // region. We cannot do in the loop as it + // corrupts iterators. + insert(end(), new_blocks.begin(), new_blocks.end()); +} + +void +ParallelRegion::insertLocalIdInit(llvm::BasicBlock* entry, + unsigned x, + unsigned y, + unsigned z) +{ + IRBuilder<> builder(entry, entry->getFirstInsertionPt()); + + Module *M = entry->getParent()->getParent(); + + int size_t_width = 32; + if (M->getPointerSize() == llvm::Module::Pointer64) + size_t_width = 64; + + GlobalVariable *gvx = M->getGlobalVariable(POCL_LOCAL_ID_X_GLOBAL); + if (gvx != NULL) + builder.CreateStore(ConstantInt::get(IntegerType:: + get(M->getContext(), size_t_width), + x), gvx); + + GlobalVariable *gvy = M->getGlobalVariable(POCL_LOCAL_ID_Y_GLOBAL); + if (gvy != NULL) + builder.CreateStore(ConstantInt::get(IntegerType:: + get(M->getContext(), size_t_width), + y), gvy); + + GlobalVariable *gvz = M->getGlobalVariable(POCL_LOCAL_ID_Z_GLOBAL); + if (gvz != NULL) + builder.CreateStore(ConstantInt::get(IntegerType:: + get(M->getContext(), size_t_width), + z), gvz); +} + +void +ParallelRegion::insertPrologue(unsigned x, + unsigned y, + unsigned z) +{ + BasicBlock *entry = entryBB(); + ParallelRegion::insertLocalIdInit(entry, x, y, z); +} + +void +ParallelRegion::dump() +{ + for (iterator i = begin(), e = end(); i != e; ++i) + (*i)->dump(); +} + +void +ParallelRegion::dumpNames() +{ + for (iterator i = begin(), e = end(); i != e; ++i) + { + std::cout << (*i)->getName().str(); + if (entryBB() == (*i)) + std::cout << "(EN)"; + if (exitBB() == (*i)) + std::cout << "(EX)"; + std::cout << " "; + } + std::cout << std::endl; +} + +ParallelRegion * +ParallelRegion::Create(const SmallPtrSet<BasicBlock *, 8>& bbs, BasicBlock *entry, BasicBlock *exit) +{ + ParallelRegion *new_region = new ParallelRegion(); + + assert (entry != NULL); + assert (exit != NULL); + + // This is done in two steps so order of the vector + // is the same as original function order. + Function *F = entry->getParent(); + for (Function::iterator i = F->begin(), e = F->end(); i != e; ++i) { + BasicBlock *b = i; + for (SmallPtrSetIterator<BasicBlock *> j = bbs.begin(); j != bbs.end(); ++j) { + if (*j == b) { + new_region->push_back(i); + if (entry == *j) + new_region->setEntryBBIndex(new_region->size() - 1); + else if (exit == *j) + new_region->setExitBBIndex(new_region->size() - 1); + break; + } + } + } + + new_region->LocalizeIDLoads(); + + assert(new_region->Verify()); + + return new_region; +} + +bool +ParallelRegion::Verify() +{ + // Parallel region conditions: + // 1) Single entry, in entry block. + // 2) Single outgoing edge from exit block + // (other outgoing edges allowed, will be purged in replicas). + // 3) No barriers inside the region. + + int entry_edges = 0; + + for (iterator i = begin(), e = end(); i != e; ++i) { + for (pred_iterator ii(*i), ee(*i, true); ii != ee; ++ii) { + if (count(begin(), end(), *ii) == 0) { + if ((*i) != entryBB()) { + dumpNames(); + std::cerr << "suspicious block: " << (*i)->getName().str() << std::endl; + std::cerr << "the entry is: " << entryBB()->getName().str() << std::endl; + +#if 0 + (*i)->getParent()->viewCFG(); +#endif + assert(0 && "Incoming edges to non-entry block!"); + return false; + } else if (!Barrier::hasBarrier(*ii)) { + (*i)->getParent()->viewCFG(); + assert (0 && "Entry has edges from non-barrier blocks!"); + return false; + } + ++entry_edges; + } + } + + // if (entry_edges != 1) { + // assert(0 && "Parallel regions must be single entry!"); + // return false; + // } + + if (exitBB()->getTerminator()->getNumSuccessors() != 1) { + assert(0 && "Multiple outgoing edges from exit block!"); + return false; + } + + for (BasicBlock::iterator ii = (*i)->begin(), ee = (*i)->end(); + ii != ee; ++ii) { + if (isa<Barrier> (ii)) { + assert(0 && "Barrier found inside parallel region!"); + return false; + } + } + } + + return true; +} + +/** + * Adds metadata to all the memory instructions to denote + * they originate from a parallel loop. + * + * Due to nested parallel loops, there can be multiple loop + * references. + * + * Format: + * llvm.mem.parallel_loop_access !0 + * + * !0 { metadata !0 } + * + * In a 2-nested loop: + * + * llvm.mem.parallel_loop_access !0 + * + * !0 { metadata !1, metadata !2} + * !1 { metadata !1 } + * !2 { metadata !2 } + */ +void +ParallelRegion::AddParallelLoopMetadata(llvm::MDNode *identifier) { + + for (iterator i = begin(), e = end(); i != e; ++i) { + BasicBlock* bb = *i; + for (BasicBlock::iterator ii = bb->begin(), ee = bb->end(); + ii != ee; ii++) { + if (ii->mayReadOrWriteMemory()) { + std::vector<Value*> loopIds; + MDNode *oldIds = ii->getMetadata("llvm.mem.parallel_loop_access"); + if (oldIds != NULL) { + for (unsigned i = 0; i < oldIds->getNumOperands(); ++i) { + loopIds.push_back(oldIds->getOperand(i)); + } + } + loopIds.push_back(identifier); + ii->setMetadata("llvm.mem.parallel_loop_access", + MDNode::get(bb->getContext(), loopIds)); + } + } + } +} + +void +ParallelRegion::AddIDMetadata( + llvm::LLVMContext& context, + std::size_t x, + std::size_t y, + std::size_t z) { + + int counter = 1; + Value *v1[] = { + MDString::get(context, "WI_region"), + ConstantInt::get(Type::getInt32Ty(context), pRegionId)}; + MDNode* mdRegion = MDNode::get(context, v1); + Value *v2[] = { + MDString::get(context, "WI_xyz"), + ConstantInt::get(Type::getInt32Ty(context), x), + ConstantInt::get(Type::getInt32Ty(context), y), + ConstantInt::get(Type::getInt32Ty(context), z)}; + MDNode* mdXYZ = MDNode::get(context, v2); + Value *v[] = { + MDString::get(context, "WI_data"), + mdRegion, + mdXYZ}; + MDNode* md = MDNode::get(context, v); + + for (iterator i = begin(), e = end(); i != e; ++i) { + BasicBlock* bb = *i; + for (BasicBlock::iterator ii = bb->begin(); + ii != bb->end(); ii++) { + Value *v3[] = { + MDString::get(context, "WI_counter"), + ConstantInt::get(Type::getInt32Ty(context), counter)}; + MDNode* mdCounter = MDNode::get(context, v3); + counter++; + ii->setMetadata("wi", md); + ii->setMetadata("wi_counter", mdCounter); + } + } +} + + +/** + * Inserts a new basic block to the region, before an old basic block in + * the region. + * + * Assumes the inserted block to be before the other block in control + * flow, that is, there should be direct CFG edge from the block to the + * other. + */ +void +ParallelRegion::AddBlockBefore(llvm::BasicBlock *block, llvm::BasicBlock *before) +{ + llvm::BasicBlock *oldExit = exitBB(); + ParallelRegion::iterator beforePos = find(begin(), end(), before); + ParallelRegion::iterator oldExitPos = find(begin(), end(), oldExit); + assert (beforePos != end()); + + /* The old exit node might is now pushed further, at most one position. + Whether this is the case, depends if the node was inserted before or + after that node in the vector. That is, if indexof(before) < indexof(oldExit). */ + if (beforePos < oldExitPos) ++exitIndex_; + + insert(beforePos, block); + /* The entryIndex_ should be still correct. In case the 'before' block + was an old entry node, the new one replaces it as an entry node at + the same index and the old one gets pushed forward. */ +} + + +void +ParallelRegion::AddBlockAfter(llvm::BasicBlock *block, llvm::BasicBlock *after) +{ + llvm::BasicBlock *oldExit = exitBB(); + ParallelRegion::iterator afterPos = find(begin(), end(), after); + ParallelRegion::iterator oldExitPos = find(begin(), end(), oldExit); + assert (afterPos != end()); + + /* The old exit node might be pushed further, at most one position. + Whether this is the case, depends if the node was inserted before or + after that node in the vector. That is, if indexof(before) < indexof(oldExit). */ + if (afterPos < oldExitPos) ++exitIndex_; + afterPos++; + insert(afterPos, block); +} + +bool +ParallelRegion::HasBlock(llvm::BasicBlock *bb) +{ + return find(begin(), end(), bb) != end(); +} + +/** + * Find the instruction that loads the Z dimension of the work item + * in the beginning of the parallel region, if not found, creates it. + */ +llvm::Instruction* +ParallelRegion::LocalIDZLoad() +{ + if (LocalIDZLoadInstr != NULL) return LocalIDZLoadInstr; + IRBuilder<> builder(entryBB()->getFirstInsertionPt()); + return LocalIDZLoadInstr = + builder.CreateLoad + (entryBB()->getParent()->getParent()->getGlobalVariable(POCL_LOCAL_ID_Z_GLOBAL)); +} + +/** + * Find the instruction that loads the Y dimension of the work item + * in the beginning of the parallel region, if not found, creates it. + */ +llvm::Instruction* +ParallelRegion::LocalIDYLoad() +{ + if (LocalIDYLoadInstr != NULL) return LocalIDYLoadInstr; + IRBuilder<> builder(entryBB()->getFirstInsertionPt()); + return LocalIDYLoadInstr = + builder.CreateLoad + (entryBB()->getParent()->getParent()->getGlobalVariable(POCL_LOCAL_ID_Y_GLOBAL)); +} + +/** + * Find the instruction that loads the X dimension of the work item + * in the beginning of the parallel region, if not found, creates it. + */ +llvm::Instruction* +ParallelRegion::LocalIDXLoad() +{ + if (LocalIDXLoadInstr != NULL) return LocalIDXLoadInstr; + IRBuilder<> builder(entryBB()->getFirstInsertionPt()); + return LocalIDXLoadInstr = + builder.CreateLoad + (entryBB()->getParent()->getParent()->getGlobalVariable(POCL_LOCAL_ID_X_GLOBAL)); +} + +void +ParallelRegion::InjectPrintF +(llvm::Instruction *before, std::string formatStr, + std::vector<Value*>& params) +{ + IRBuilder<> builder(before); + llvm::Module *M = before->getParent()->getParent()->getParent(); + + llvm::Value *stringArg = + builder.CreateGlobalString(formatStr); + + /* generated with help from http://llvm.org/demo/index.cgi */ + Function* printfFunc = M->getFunction("printf"); + if (printfFunc == NULL) { + PointerType* PointerTy_4 = PointerType::get(IntegerType::get(M->getContext(), 8), 0); + + std::vector<Type*> FuncTy_6_args; + FuncTy_6_args.push_back(PointerTy_4); + + FunctionType* FuncTy_6 = + FunctionType::get + (/*Result=*/IntegerType::get(M->getContext(), 32), + /*Params=*/FuncTy_6_args, + /*isVarArg=*/true); + + printfFunc = + Function::Create + (/*Type=*/FuncTy_6, + /*Linkage=*/GlobalValue::ExternalLinkage, + /*Name=*/"printf", M); + printfFunc->setCallingConv(CallingConv::C); + +#if (defined LLVM_3_1 or defined LLVM_3_2) + AttrListPtr func_printf_PAL; +#else + AttributeSet func_printf_PAL; +#endif + { +#ifdef LLVM_3_1 + SmallVector<AttributeWithIndex, 4> Attrs; + AttributeWithIndex PAWI; + PAWI.Index = 1U; + PAWI.Attrs = Attribute::NoCapture; + Attrs.push_back(PAWI); + PAWI.Index = 4294967295U; + PAWI.Attrs = Attribute::NoUnwind; + Attrs.push_back(PAWI); + func_printf_PAL = AttrListPtr::get(Attrs.begin(), Attrs.end()); +#elif defined LLVM_3_2 + SmallVector<AttributeWithIndex, 4> Attrs; + Attrs.push_back(AttributeWithIndex::get(M->getContext(), 1U, Attributes::NoCapture)); + Attrs.push_back(AttributeWithIndex::get(M->getContext(), 4294967295U, Attributes::NoUnwind)); + func_printf_PAL = AttrListPtr::get(M->getContext(), Attrs); +#else + func_printf_PAL.addAttribute( M->getContext(), 1U, Attribute::NoCapture); + func_printf_PAL.addAttribute( M->getContext(), 4294967295U, Attribute::NoUnwind); +#endif + } + printfFunc->setAttributes(func_printf_PAL); + } + + std::vector<Constant*> const_ptr_8_indices; + + ConstantInt* const_int64_9 = ConstantInt::get(M->getContext(), APInt(64, StringRef("0"), 10)); + const_ptr_8_indices.push_back(const_int64_9); + const_ptr_8_indices.push_back(const_int64_9); + assert (isa<Constant>(stringArg)); + Constant* const_ptr_8 = + ConstantExpr::getGetElementPtr + (cast<Constant>(stringArg), const_ptr_8_indices); + + std::vector<Value*> args; + args.push_back(const_ptr_8); + args.insert(args.end(), params.begin(), params.end()); + + CallInst::Create(printfFunc, args, "", before); +} + +void +ParallelRegion::SetExitBB(llvm::BasicBlock *block) +{ + for (size_t i = 0; i < size(); ++i) + { + if (at(i) == block) + { + setExitBBIndex(i); + return; + } + } + assert (false && "The block was not found in the PRegion!"); +} + +/** + * Adds a printf to the end of the parallel region that prints the + * region ID and the work item ID. + * + * Useful for debugging control flow bugs. + */ +void +ParallelRegion::InjectRegionPrintF() +{ + llvm::Module *M = entryBB()->getParent()->getParent(); + +#if 0 + // it should reuse equal strings anyways + const char* FORMAT_STR_VAR = ".pocl.pRegion_debug_str"; + llvm::Value *stringArg = M->getGlobalVariable(FORMAT_STR_VAR); + if (stringArg == NULL) + { + IRBuilder<> builder(entryBB()); + stringArg = builder.CreateGlobalString("PR %d WI %u %u %u\n", FORMAT_STR_VAR); + } +#endif + + ConstantInt* pRID = ConstantInt::get(M->getContext(), APInt(32, pRegionId, 10)); + std::vector<Value*> params; + params.push_back(pRID); + params.push_back(LocalIDXLoad()); + params.push_back(LocalIDYLoad()); + params.push_back(LocalIDZLoad()); + + InjectPrintF(exitBB()->getTerminator(), "PR %d WI %u %u %u\n", params); + +} + +/** + * Adds a printf to the end of the parallel region that prints the + * hex contents of all named non-pointer variables. + * + * Useful for debugging data flow bugs. + */ +void +ParallelRegion::InjectVariablePrintouts() +{ + for (ParallelRegion::iterator i = begin(); + i != end(); ++i) + { + llvm::BasicBlock *bb = *i; + for (llvm::BasicBlock::iterator instr = bb->begin(); + instr != bb->end(); ++instr) + { + llvm::Instruction *instruction = instr; + if (isa<PointerType>(instruction->getType()) || + !instruction->hasName()) continue; + std::string name = instruction->getName().str(); + std::vector<Value*> args; + IRBuilder<> builder(exitBB()->getTerminator()); + args.push_back(builder.CreateGlobalString(name)); + args.push_back(instruction); + InjectPrintF(instruction->getParent()->getTerminator(), "variable %s == %x\n", args); + } + } +} + +/** + * Localizes all the loads to the the work-item identifiers. + * + * In case the code inside the region queries the WI id, it + * should not (re)use one that is loaded in another region, but + * one that is loaded in the same region. Otherwise, it ends + * up using the last id the previous PR work-item loop got. + * This caused problems in cases where the local id was stored + * to a temporary variable in an earlier region and that temp + * was reused later. + * + * The function scans for all loads from the local id variables + * and converts them to loads inside the parallel region. + */ +void +ParallelRegion::LocalizeIDLoads() +{ + /* The local id loads inside the parallel region. */ + llvm::Instruction* LocalIDXLoadInstr = LocalIDXLoad(); + llvm::Instruction* LocalIDYLoadInstr = LocalIDYLoad(); + llvm::Instruction* LocalIDZLoadInstr = LocalIDZLoad(); + llvm::Module *M = LocalIDXLoadInstr->getParent()->getParent()->getParent(); + llvm::Value *localIdZ = M->getNamedGlobal(POCL_LOCAL_ID_Z_GLOBAL); + llvm::Value *localIdY = M->getNamedGlobal(POCL_LOCAL_ID_Y_GLOBAL); + llvm::Value *localIdX = M->getNamedGlobal(POCL_LOCAL_ID_X_GLOBAL); + + assert (localIdZ != NULL && localIdY != NULL && localIdX != NULL && + "The local id globals were not created."); + + for (ParallelRegion::iterator i = begin(); + i != end(); ++i) + { + llvm::BasicBlock *bb = *i; + for (llvm::BasicBlock::iterator instrI = bb->begin(); + instrI != bb->end(); ++instrI) + { + llvm::Instruction *instr = instrI; + if (instr == LocalIDXLoadInstr || + instr == LocalIDYLoadInstr || + instr == LocalIDZLoadInstr) continue; + + /* Search all operands of the instruction. If any of them is + using a local id, replace it with the intra-PR load from the + id variable. */ + for (unsigned opr = 0; opr < instr->getNumOperands(); ++opr) + { + llvm::LoadInst *load = + dyn_cast<llvm::LoadInst>(instr->getOperand(opr)); + if (load == NULL) continue; + if (load == LocalIDXLoadInstr || + load == LocalIDYLoadInstr || + load == LocalIDZLoadInstr) continue; + + if (load->getPointerOperand() == localIdZ) + instr->setOperand(opr, LocalIDZLoadInstr); + if (load->getPointerOperand() == localIdY) + instr->setOperand(opr, LocalIDYLoadInstr); + if (load->getPointerOperand() == localIdX) + instr->setOperand(opr, LocalIDXLoadInstr); + } + } + } +} |