diff options
Diffstat (limited to 'src/core/dsp/program.cpp')
-rw-r--r-- | src/core/dsp/program.cpp | 633 |
1 files changed, 633 insertions, 0 deletions
diff --git a/src/core/dsp/program.cpp b/src/core/dsp/program.cpp new file mode 100644 index 0000000..6495ec9 --- /dev/null +++ b/src/core/dsp/program.cpp @@ -0,0 +1,633 @@ +/****************************************************************************** + * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ +#include "program.h" +#include "device.h" +#include "kernel.h" + +#include "../program.h" + +#include <llvm/PassManager.h> +#include <llvm/Analysis/Passes.h> +#include <llvm/Analysis/Verifier.h> +#include <llvm/Transforms/Scalar.h> +#include <llvm/Transforms/IPO.h> +#include <llvm/Transforms/Utils/UnifyFunctionExitNodes.h> +#include <llvm/Support/raw_ostream.h> +#include <llvm/Bitcode/ReaderWriter.h> +#include "wga.h" + +#include <llvm/LinkAllPasses.h> +#include <WorkitemHandlerChooser.h> +#include <BreakConstantGEPs.h> +#include <Flatten.h> +#include <PHIsToAllocas.h> +#include <IsolateRegions.h> +#include <VariableUniformityAnalysis.h> +#include <ImplicitLoopBarriers.h> +#include <LoopBarriers.h> +#include <BarrierTailReplication.h> +#include <CanonicalizeBarriers.h> +#include <WorkItemAliasAnalysis.h> +#include <WorkitemReplication.h> +#include <WorkitemLoops.h> +#include <AllocasToEntry.h> +#include <Workgroup.h> +#include <TargetAddressSpaces.h> + +#include <string> +#include <iostream> +#include <fstream> +#include <sstream> +#include <stdio.h> +#include <stdlib.h> +#include <vector> +#include <sys/types.h> +#include <sys/time.h> +#include <sys/wait.h> +#include <sys/stat.h> +#include <unistd.h> + +#include <elf.h> + +#include "genfile_cache.h" + +genfile_cache * genfile_cache::pInstance = 0; + +timespec getTime() +{ + struct timespec tp; + if (clock_gettime(CLOCK_MONOTONIC, &tp) != 0) + clock_gettime(CLOCK_REALTIME, &tp); + return tp; +} + +double ts_to_double(const timespec &t) + { return ((double)t.tv_nsec) /1000000000.0 + (double)t.tv_sec; } + +double tsdiff (const timespec& start, const timespec& end) + { return ts_to_double(end) - ts_to_double(start); } + + +using namespace Coal; + +DSPProgram::DSPProgram(DSPDevice *device, Program *program) +: DeviceProgram(), p_device(device), p_program(program), p_program_handle(-1), p_loaded(false), p_keep_files(false), + p_cache_kernels(true) +{ + char *keep = getenv("TI_OCL_KEEP_FILES"); + if (keep) p_keep_files = true; + + char *cache = getenv("TI_OCL_CACHE_KERNELS_OFF"); + if (cache) p_cache_kernels = false; +} + +DSPProgram::~DSPProgram() +{ + p_device->unload(p_program_handle); + if (!p_keep_files && !p_cache_kernels) unlink(p_outfile); +} + +DSPProgram::segment_list *segments; + +bool DSPProgram::load() +{ + segments = &p_segments_written; + + p_program_handle = p_device->load(p_outfile); + if (!p_program_handle) return false; + + segments = NULL; + p_loaded = true; + + char *debug_kernel = getenv("TI_OCL_DEBUG_KERNEL"); + + /*------------------------------------------------------------------------- + * ensure that the newly populated areas are not stale in device caches + *------------------------------------------------------------------------*/ + Msg_t msg; + int segNum = p_segments_written.size(); + + assert(segNum <= MAX_FLUSH_BUF_SIZE/2); + + msg.command = CACHEINV; + msg.u.k.flush.numBuffers = segNum; + msg.u.k.flush.num_mpaxs = 0; + for (int i=0; i < segNum; ++i) + { + msg.u.k.flush.buffers[2*i] = p_segments_written[i].ptr; + msg.u.k.flush.buffers[2*i+1] = p_segments_written[i].size; + + uint32_t flags = p_segments_written[i].flags & + (DLOAD_SF_executable | DLOAD_SF_writable); + + const char *seg_desc; + switch (flags) + { + case 0: seg_desc = "Read Only"; break; + case DLOAD_SF_executable: seg_desc = "Executable"; break; + case DLOAD_SF_writable: seg_desc = "Writable"; break; + default: seg_desc = "Writable & Executable"; break; + } + + if (debug_kernel) + printf("%s segment loaded to 0x%08x with size 0x%x\n", + seg_desc, p_segments_written[i].ptr, p_segments_written[i].size); + } + + /*------------------------------------------------------------------------- + * Send the command and wait for the ready response. + *------------------------------------------------------------------------*/ + p_device->mail_to(msg); + + /*------------------------------------------------------------------------- + * We do not wait here. The wait will be handled by the standard wait loop + * int the worker thread. + *------------------------------------------------------------------------*/ + return true; +} + +bool DSPProgram::is_loaded() const +{ + return p_loaded; +} + +bool DSPProgram::linkStdLib() const +{ + return false; +} + +const char* DSPProgram::outfile_name() const +{ + return p_outfile; +} + +DSPDevicePtr DSPProgram::data_page_ptr() +{ + DSPDevicePtr p; + + if (!is_loaded()) load(); + + DLOAD_get_static_base(p_device->dload_handle(), p_program_handle, &p); + return p; +} + +void DSPProgram::createOptimizationPasses(llvm::PassManager *manager, + bool optimize, bool hasBarrier) +{ + if (hasBarrier) + { + manager->add(new llvm::DominatorTree()); + manager->add(new pocl::WorkitemHandlerChooser()); + manager->add(new BreakConstantGEPs()); // from pocl + // add(new GenerateHeader()); // no need + manager->add(new pocl::Flatten()); + manager->add( llvm::createAlwaysInlinerPass()); + manager->add( llvm::createGlobalDCEPass()); + manager->add( llvm::createCFGSimplificationPass()); + manager->add( llvm::createLoopSimplifyPass()); + manager->add(new pocl::PHIsToAllocas()); + manager->add( llvm::createRegionInfoPass()); + manager->add(new pocl::IsolateRegions()); + manager->add(new pocl::VariableUniformityAnalysis()); // TODO + manager->add(new pocl::ImplicitLoopBarriers()); + manager->add(new pocl::LoopBarriers()); + manager->add(new pocl::BarrierTailReplication()); + manager->add(new pocl::CanonicalizeBarriers()); + manager->add(new pocl::IsolateRegions()); + manager->add(new pocl::WorkItemAliasAnalysis()); + // add(new pocl::WorkitemReplication()); // no need + manager->add(new pocl::WorkitemLoops()); + manager->add(new pocl::AllocasToEntry()); + // add(new pocl::Workgroup()); // no need + manager->add(new pocl::TargetAddressSpaces()); + } + + if (optimize) + { + /* + * Inspired by code from "The LLVM Compiler Infrastructure" + */ + manager->add(llvm::createDeadArgEliminationPass()); + manager->add(llvm::createInstructionCombiningPass()); + manager->add(llvm::createFunctionInliningPass()); + manager->add(llvm::createPruneEHPass()); // Remove dead EH info. + manager->add(llvm::createGlobalOptimizerPass()); + manager->add(llvm::createGlobalDCEPass()); // Remove dead functions. + manager->add(llvm::createArgumentPromotionPass()); + manager->add(llvm::createInstructionCombiningPass()); + manager->add(llvm::createJumpThreadingPass()); + + //ASW TODO maybe turn off re: pete. might gen bad xlator input + //manager->add(llvm::createScalarReplAggregatesPass()); + + manager->add(llvm::createFunctionAttrsPass()); // Add nocapture. + manager->add(llvm::createGlobalsModRefPass()); // IP alias analysis. + manager->add(llvm::createLICMPass()); // Hoist loop invariants. + manager->add(llvm::createGVNPass()); // Remove redundancies. + manager->add(llvm::createMemCpyOptPass()); // Remove dead memcpys. + manager->add(llvm::createDeadStoreEliminationPass()); + manager->add(llvm::createInstructionCombiningPass()); + manager->add(llvm::createJumpThreadingPass()); + manager->add(llvm::createCFGSimplificationPass()); + } + + manager->add(llvm::createUnifyFunctionExitNodesPass()); + manager->add(llvm::createTIOpenclWorkGroupAggregationPass(hasBarrier)); + + /*------------------------------------------------------------------------- + * Borrow the pocl alloca hoister for the TI simplistic WGA pass as well + *------------------------------------------------------------------------*/ + if (!hasBarrier) + manager->add(new pocl::AllocasToEntry()); +} + + +std::string process_cl6x_options(std::string options) +{ + std::istringstream options_stream(options); + std::string token; + std::string result; + + while (options_stream >> token) + { + if ((token.find(".obj") != std::string::npos) || + (token.find(".dll") != std::string::npos) || + (token.find(".ae66") != std::string::npos) || + (token.find(".a66") != std::string::npos) || + (token.find(".out") != std::string::npos) || + (token.find(".lib") != std::string::npos) || + (token.find(".o") != std::string::npos) || + (token.find(".o66") != std::string::npos) || + (token.find(".oe66") != std::string::npos) || + (token.find(".a") != std::string::npos) || + (token.find(".cmd") != std::string::npos)) + result += token + " "; + } + return result; +} + +/****************************************************************************** +* Find the C6000 CGT installation +******************************************************************************/ +char *get_cgt_install() +{ + char *install = getenv("TI_OCL_CGT_INSTALL"); + if (!install) + { + std::cout << + "The environment variable TI_OCL_CGT_INSTALL must be set to a " + << std::endl << + "directory path where the C6000 compiler tools are installed. " + << std::endl; + + abort(); + } + + return install; +} + +/****************************************************************************** +* Find the OpenCL installation +******************************************************************************/ +char *get_ocl_install() +{ + char *install = getenv("TI_OCL_INSTALL"); + if (!install) + { + std::cout << + "The environment variable TI_OCL_INSTALL must be set to a " + << std::endl << + "directory path where the TI OpenCL product is installed. " + << std::endl; + + abort(); + } + + return install; +} + +std::string get_ocl_dsp() +{ + static std::string sinstall; + + if (sinstall.empty()) + { + struct stat st; + const char *stdpath = "/usr/share/ti/opencl/dsp"; + if (stat(stdpath, &st) == 0) + sinstall = string(stdpath); + else sinstall = string(get_ocl_install()) + "/dsp"; + } + + return sinstall; +} + +/****************************************************************************** +* run_cl6x +******************************************************************************/ +static int run_cl6x(char *filename, std::string *llvm_bitcode, + bool keep_files, std::string options) +{ + std::string command("cl6x --f -q --abi=eabi --use_g3 -mv6600 -mt -mo " + "-ft=/tmp -fs=/tmp -fr=/tmp "); + + if (keep_files) command += "-mw -k --z "; + + /*------------------------------------------------------------------------- + * Turned off for now to workaround a timing bug. Plan to re-enable later + *------------------------------------------------------------------------*/ + command += "--disable:sploop "; + + char *cl6x_debug = getenv("TI_OCL_CL6X_DEBUG"); + + if (cl6x_debug) command += "-g -o0 "; + else command += "-o3 "; + + char *no_sp = getenv("TI_OCL_SOFTWARE_PIPELINE_OFF"); + if (no_sp) command += "-mu "; + + char *cgt_install = get_cgt_install(); + + command += "-I"; command += cgt_install; command += "/include "; + command += "-I"; command += cgt_install; command += "/lib "; + command += "-I"; command += get_ocl_dsp().c_str(); command += " "; + + command += "--bc_file="; command += filename; command += " "; + + /*------------------------------------------------------------------------- + * Encode LLVM bitcode as bytes in the .llvmir section of the .asm file + *------------------------------------------------------------------------*/ + if (llvm_bitcode != NULL) + { + char bitasm_name[32]; + strcpy(bitasm_name, filename); + strcat(bitasm_name, "_bc.asm"); + std::ofstream outasmfile(bitasm_name, std::ios::out); + outasmfile << "\t.sect \".llvmir\"\n" << "\t.retain"; + int nbytes = llvm_bitcode->size(); + for (int i = 0; i < nbytes; i++) + if (i % 10 == 0) + outasmfile << "\n\t.byte " << (int) llvm_bitcode->at(i); + else + outasmfile << ", " << (int) llvm_bitcode->at(i); + outasmfile.close(); + + command += bitasm_name; command += " "; + } + + command += "-z -ldsp.syms -o "; + command += filename; command += ".out "; + + if (keep_files) + { command += "-m "; command += filename; command += ".map "; } + + /*------------------------------------------------------------------------- + * Any libraries or object files need to go last to resolve references + *------------------------------------------------------------------------*/ + command += process_cl6x_options(options); + + //timespec t0, t1; + //clock_gettime(CLOCK_MONOTONIC, &t0); + int x = system(command.c_str()); + //clock_gettime(CLOCK_MONOTONIC, &t1); + //printf("cl6x time: %6.4f secs\n", + // (float)t1.tv_sec-t0.tv_sec+(t1.tv_nsec-t0.tv_nsec)/1e9); + + if (!cl6x_debug) + { + std::string strip_command("strip6x "); + strip_command += filename; strip_command += ".out"; + x = system(strip_command.c_str()); + } +} + +/** + * Extract llvm bitcode and native binary from MixedBinary + */ +bool DSPProgram::ExtractMixedBinary(std::string *binary_str, + std::string *bitcode, std::string *native) +{ + if (binary_str == NULL) return false; + if (strncmp(&binary_str->at(0), ELFMAG, SELFMAG) != 0) return false; + + /*------------------------------------------------------------------------- + * Parse ELF file format, extract ".llvmir" section into bitcode + * Valid Assumptions: 1. cl6x only creates 32-bit ELF files (for now) + * 2. cl6x ELF file has the same endianness as the host + *------------------------------------------------------------------------*/ + if (bitcode != NULL) + { + Elf32_Ehdr ehdr; /* memcpy into here to guarantee proper alignment */ + memcpy(&ehdr, & binary_str->at(0), sizeof(Elf32_Ehdr)); + int n_sects = ehdr.e_shnum; + int shoff = ehdr.e_shoff; + int shstr_sect = ehdr.e_shstrndx; + + Elf32_Shdr shdr; /* memcpy into here to guarantee proper alignment */ + int shsize = sizeof(Elf32_Shdr); + memcpy(&shdr, & binary_str->at(shoff + shstr_sect * shsize), shsize); + char *strtab = & binary_str->at(shdr.sh_offset); + + int i; + for (i = 0; i < n_sects; i++) + { + if (i == shstr_sect) continue; + memcpy(&shdr, & binary_str->at(shoff + i * shsize), shsize); + if (strcmp(&strtab[shdr.sh_name], ".llvmir") == 0) break; + } + if (i >= n_sects) return false; + + bitcode->clear(); + bitcode->append(& binary_str->at(shdr.sh_offset), shdr.sh_size); + } + + /*------------------------------------------------------------------------- + * Return the c6x ELF file in binary_str as native binary + *------------------------------------------------------------------------*/ + if (native != NULL) + { + native->clear(); + native->append(*binary_str); + } + + return true; +} + + +/** + * Write native binary into file, create tmporary filename in p_outfile + */ +void DSPProgram::WriteNativeOut(std::string *native) +{ + try + { + char name_out[] = "/tmp/openclXXXXXX"; + int fOutfile = mkstemp(name_out); + strcpy(p_outfile, name_out); + strcat(p_outfile, ".out"); + + std::ofstream outfile(p_outfile, std::ios::out | std::ios::binary); + outfile.write(native->data(), native->size()); + outfile.close(); + close(fOutfile); + } + catch(...) { std::cout << "ERROR: Binary write out failure" << std::endl; } +} + +/** + * Native binary is stored in file, filename in p_outfile + * Input: binary_str contains only the bitcode + * Output: binary_str contains c6x ELF file with bitcode in ".llvmir" section + */ +void DSPProgram::ReadEmbeddedBinary(std::string *binary_str) +{ + if (binary_str == NULL) return; + + int length; + char *buffer = NULL; + + try + { + std::ifstream is; + is.open(p_outfile, std::ios::binary); + is.seekg(0, std::ios::end); + length = is.tellg(); + is.seekg(0, std::ios::beg); + buffer = new char[length]; + is.read(buffer, length); + is.close(); + + binary_str->clear(); + binary_str->append(buffer, length); + delete [] buffer; + } + catch(...) { std::cout << "ERROR: Binary read in failure" << std::endl; } +} + +bool DSPProgram::build(llvm::Module *module, std::string *binary_str) +{ + p_module = module; + + /*------------------------------------------------------------------------ + * The input binary_str could be any of the following: + * 1. Mixed C6x binary embedded with LLVM bitcode, extract C6x native + * binary and return. There is no need to rebuild from LLVM module. + * 2. LLVM bitcode, proceed to the regular build: + * 2.1 return a corresponding cached c6x binary, if found + * 2.2 invoke c6x compiler toolchain, embed LLVM bitcode, build + * In either case, put c6x binary in binary_str when return + *------------------------------------------------------------------------*/ + std::string native; + if (ExtractMixedBinary(binary_str, NULL, &native)) + { + WriteNativeOut(&native); + return true; + } + + if (p_cache_kernels) + { + string cached_outfile = genfile_cache::instance()->lookup + (p_module, p_program->deviceDependentCompilerOptions(p_device)); + + if (!cached_outfile.empty()) + { + strcpy(p_outfile, cached_outfile.c_str()); + ReadEmbeddedBinary(binary_str); + return true; + } + } + + char name_template[] = "/tmp/openclXXXXXX"; + int pFile = mkstemp(name_template); + + strcpy(p_outfile, name_template); + strcat(p_outfile, ".out"); + + if (pFile != -1) + { + if (p_keep_files) + { + //write out the source as well + + std::string filename(name_template); + filename += ".cl"; + std::ofstream out(filename.c_str()); + out << p_program->source(); + out.close(); + } + + llvm::raw_fd_ostream ostream(pFile, false); + llvm::WriteBitcodeToFile(p_module, ostream); + ostream.flush(); + + run_cl6x(name_template, binary_str, p_keep_files, + p_program->deviceDependentCompilerOptions(p_device)); + + if (!p_keep_files) + { + unlink(name_template); + + char objfile[32]; + strcpy(objfile, name_template); + strcat(objfile, ".obj"); + unlink(objfile); + + if (binary_str != NULL) + { + strcpy(objfile, name_template); + strcat(objfile, "_bc.asm"); + unlink(objfile); + + strcpy(objfile, name_template); + strcat(objfile, "_bc.obj"); + unlink(objfile); + } + } + + if (p_cache_kernels) + genfile_cache::instance()->remember(p_outfile, p_module, + p_program->deviceDependentCompilerOptions(p_device)); + + ReadEmbeddedBinary(binary_str); + } + + if (pFile != -1) close(pFile); + + return true; +} + +DSPDevicePtr DSPProgram::query_symbol(const char *symname) +{ + DSPDevicePtr addr; + + bool found = DLOAD_query_symbol(p_device->dload_handle(), p_program_handle, + symname, &addr); + + return (found) ? addr : 0; +} + |