/* * Copyright (c) 2011, Denis Steckelmacher * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the copyright holder nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /** * \file cpu/kernel.h * \brief CPU kernel */ #ifndef __CPU_KERNEL_H__ #define __CPU_KERNEL_H__ #include "../deviceinterface.h" #include #include #include #include #include #include #include namespace llvm { class Function; } namespace Coal { class CPUDevice; class Kernel; class KernelEvent; class Image2D; class Image3D; /** * \brief CPU kernel * * This class holds passive information about a kernel (\c Coal::Kernel object * and device on which it is run) and provides the \c callFunction() function. * * This function is described at the end of \ref llvm . * * \see Coal::CPUKernelWorkGroup */ class CPUKernel : public DeviceKernel { public: /** * \brief Constructor * \param device device on which the kernel will be run * \param kernel \c Coal::Kernel object holding information about this * kernel * \param function \c llvm::Function to run */ CPUKernel(CPUDevice *device, Kernel *kernel, llvm::Function *function); ~CPUKernel(); size_t workGroupSize(); cl_ulong localMemSize() const; cl_ulong privateMemSize() const; size_t preferredWorkGroupSizeMultiple() const; size_t guessWorkGroupSize(cl_uint num_dims, cl_uint dim, size_t global_work_size) const; Kernel *kernel() const; /*!< \brief \c Coal::Kernel object this kernel will run */ CPUDevice *device() const; /*!< \brief device on which the kernel will be run */ llvm::Function *function() const; /*!< \brief \c llvm::Function representing the kernel but not to be run */ llvm::Function *callFunction(); /*!< \brief stub function used to run the kernel, see \ref llvm */ /** * \brief Calculate where to place a value in an array * * This function is used to calculate where to place a value in an * array given its size, properly aligning it. * * This function is called repeatedly to obtain the aligned position of * each value that must be place in the array * * \code * size_t array_len = 0, array_offset = 0; * void *array; * * // First, get the array size given alignment constraints * typeOffset(array_len, sizeof(int)); * typeOffset(array_len, sizeof(float)); * typeOffset(array_len, sizeof(void *)); * * // Then, allocate memory * array = malloc(array_len) * * // Finally, place the arguments * *(int *)((char *)array + typeOffset(array_offset, sizeof(int))) = 1337; * *(float *)((char *)array + typeOffset(array_offset, sizeof(int))) = 3.1415f; * *(void **)((char *)array + typeOffset(array_offset, sizeof(int))) = array; * \endcode * * \param offset offset at which the value will be placed. This variable * gets incremented by type_len + padding. * \param type_len size in bytes of the value that will be stored * \return offset at which the value will be stored (equal to \p offset * before incrementation. */ static size_t typeOffset(size_t &offset, size_t type_len); private: CPUDevice *p_device; Kernel *p_kernel; llvm::Function *p_function, *p_call_function; pthread_mutex_t p_call_function_mutex; }; class CPUKernelEvent; /** * \brief CPU kernel work-group * * This class represent a bulk of work-items that will be run. It is the one * to actually run the kernel of its elements. * * \see \ref llvm * \nosubgrouping */ class CPUKernelWorkGroup { public: /** * \brief Constructor * \param kernel kernel to run * \param event event containing information about the kernel run * \param cpu_event CPU-specific information and cache about \p event * \param work_group_index index of this work-group in the kernel */ CPUKernelWorkGroup(CPUKernel *kernel, KernelEvent *event, CPUKernelEvent *cpu_event, const size_t *work_group_index); ~CPUKernelWorkGroup(); /** * \brief Build a structure of arguments * * As C doesn't support calling functions with variable arguments * unknown at the compilation, this function builds the list of * arguments in memory. This array will then be passed to a LLVM stub * function reading it and passing its values to the actuel kernel. * * \see \ref llvm * \param locals_to_free if this kernel takes \c __local arguments, they * must be \c malloc()'ed for every work-group. * They are placed in this vector to be * \c free()'ed at the end of \c run(). * \return address of a memory location containing the arguments */ void *callArgs(std::vector &locals_to_free); /** * \brief Run the work-group * * This function is the core of CPU-acceleration. It runs the work-items * of this work-group given the correct arguments. * * \see \ref llvm * \see \ref barrier * \see callArgs() * \return true if success, false in case of an error */ bool run(); /** * \name Native implementation of built-in OpenCL C functions * @{ */ size_t getGlobalId(cl_uint dimindx) const; cl_uint getWorkDim() const; size_t getGlobalSize(cl_uint dimindx) const; size_t getLocalSize(cl_uint dimindx) const; size_t getLocalID(cl_uint dimindx) const; size_t getNumGroups(cl_uint dimindx) const; size_t getGroupID(cl_uint dimindx) const; size_t getGlobalOffset(cl_uint dimindx) const; void barrier(unsigned int flags); void *getImageData(Image2D *image, int x, int y, int z) const; void writeImage(Image2D *image, int x, int y, int z, float *color) const; void writeImage(Image2D *image, int x, int y, int z, int32_t *color) const; void writeImage(Image2D *image, int x, int y, int z, uint32_t *color) const; void readImage(float *result, Image2D *image, int x, int y, int z, uint32_t sampler) const; void readImage(int32_t *result, Image2D *image, int x, int y, int z, uint32_t sampler) const; void readImage(uint32_t *result, Image2D *image, int x, int y, int z, uint32_t sampler) const; void readImage(float *result, Image2D *image, float x, float y, float z, uint32_t sampler) const; void readImage(int32_t *result, Image2D *image, float x, float y, float z, uint32_t sampler) const; void readImage(uint32_t *result, Image2D *image, float x, float y, float z, uint32_t sampler) const; /** * @} */ /** * \brief Function called when a built-in name cannot be found */ void builtinNotFound(const std::string &name) const; private: template void writeImageImpl(Image2D *image, int x, int y, int z, T *color) const; template void readImageImplI(T *result, Image2D *image, int x, int y, int z, uint32_t sampler) const; template void readImageImplF(T *result, Image2D *image, float x, float y, float z, uint32_t sampler) const; template void linear3D(T *result, float a, float b, float c, int i0, int j0, int k0, int i1, int j1, int k1, Image3D *image) const; template void linear2D(T *result, float a, float b, float c, int i0, int j0, int i1, int j1, Image2D *image) const; private: CPUKernel *p_kernel; CPUKernelEvent *p_cpu_event; KernelEvent *p_event; cl_uint p_work_dim; size_t p_index[MAX_WORK_DIMS], p_max_local_id[MAX_WORK_DIMS], p_global_id_start_offset[MAX_WORK_DIMS]; void (*p_kernel_func_addr)(void *); void *p_args; // Machinery to have barrier() working struct Context { size_t local_id[MAX_WORK_DIMS]; ucontext_t context; unsigned int initialized; }; Context *getContextAddr(unsigned int index); Context *p_current_context; Context p_dummy_context; void *p_contexts; size_t p_stack_size; unsigned int p_num_work_items, p_current_work_item; bool p_had_barrier; }; /** * \brief CPU-specific information about a kernel event * * This class put in a \c Coal::KernelEvent device-data field * (see \c Coal::Event::setDeviceData()) is responsible for dispatching the * \c Coal::CPUKernelWorkGroup objects between the CPU worker threads. */ class CPUKernelEvent { public: /** * \brief Constructor * \param device device running the kernel * \param event \c Coal::KernelEvent holding device-agnostic data * about the event */ CPUKernelEvent(CPUDevice *device, KernelEvent *event); ~CPUKernelEvent(); bool reserve(); /*!< \brief The next Work Group that will execute will be the last. Locks the event */ bool finished(); /*!< \brief All the work groups have finished */ CPUKernelWorkGroup *takeInstance(); /*!< \brief Must be called exactly one time after reserve(). Unlocks the event */ void *kernelArgs() const; /*!< \brief Return the cached kernel arguments */ void cacheKernelArgs(void *args); /*!< \brief Cache pre-built kernel arguments */ void workGroupFinished(); /*!< \brief A work-group has just finished */ private: CPUDevice *p_device; KernelEvent *p_event; size_t p_current_work_group[MAX_WORK_DIMS], p_max_work_groups[MAX_WORK_DIMS]; size_t p_current_wg, p_finished_wg, p_num_wg; pthread_mutex_t p_mutex; void *p_kernel_args; }; } #endif