diff options
Diffstat (limited to 'src/core/cpu/kernel.h')
-rw-r--r-- | src/core/cpu/kernel.h | 325 |
1 files changed, 325 insertions, 0 deletions
diff --git a/src/core/cpu/kernel.h b/src/core/cpu/kernel.h new file mode 100644 index 0000000..ab4d1ac --- /dev/null +++ b/src/core/cpu/kernel.h @@ -0,0 +1,325 @@ +/* + * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the copyright holder nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \file cpu/kernel.h + * \brief CPU kernel + */ + +#ifndef __CPU_KERNEL_H__ +#define __CPU_KERNEL_H__ + +#include "../deviceinterface.h" +#include <core/config.h> + +#include <llvm/ExecutionEngine/GenericValue.h> +#include <vector> +#include <string> + +#include <ucontext.h> +#include <pthread.h> +#include <stdint.h> + +namespace llvm +{ + class Function; +} + +namespace Coal +{ + +class CPUDevice; +class Kernel; +class KernelEvent; +class Image2D; +class Image3D; + +/** + * \brief CPU kernel + * + * This class holds passive information about a kernel (\c Coal::Kernel object + * and device on which it is run) and provides the \c callFunction() function. + * + * This function is described at the end of \ref llvm . + * + * \see Coal::CPUKernelWorkGroup + */ +class CPUKernel : public DeviceKernel +{ + public: + /** + * \brief Constructor + * \param device device on which the kernel will be run + * \param kernel \c Coal::Kernel object holding information about this + * kernel + * \param function \c llvm::Function to run + */ + CPUKernel(CPUDevice *device, Kernel *kernel, llvm::Function *function); + ~CPUKernel(); + + size_t workGroupSize(); + cl_ulong localMemSize() const; + cl_ulong privateMemSize() const; + size_t preferredWorkGroupSizeMultiple() const; + size_t guessWorkGroupSize(cl_uint num_dims, cl_uint dim, + size_t global_work_size) const; + + Kernel *kernel() const; /*!< \brief \c Coal::Kernel object this kernel will run */ + CPUDevice *device() const; /*!< \brief device on which the kernel will be run */ + + llvm::Function *function() const; /*!< \brief \c llvm::Function representing the kernel but <strong>not to be run</strong> */ + llvm::Function *callFunction(); /*!< \brief stub function used to run the kernel, see \ref llvm */ + + /** + * \brief Calculate where to place a value in an array + * + * This function is used to calculate where to place a value in an + * array given its size, properly aligning it. + * + * This function is called repeatedly to obtain the aligned position of + * each value that must be place in the array + * + * \code + * size_t array_len = 0, array_offset = 0; + * void *array; + * + * // First, get the array size given alignment constraints + * typeOffset(array_len, sizeof(int)); + * typeOffset(array_len, sizeof(float)); + * typeOffset(array_len, sizeof(void *)); + * + * // Then, allocate memory + * array = malloc(array_len) + * + * // Finally, place the arguments + * *(int *)((char *)array + typeOffset(array_offset, sizeof(int))) = 1337; + * *(float *)((char *)array + typeOffset(array_offset, sizeof(int))) = 3.1415f; + * *(void **)((char *)array + typeOffset(array_offset, sizeof(int))) = array; + * \endcode + * + * \param offset offset at which the value will be placed. This variable + * gets incremented by <tt>type_len + padding</tt>. + * \param type_len size in bytes of the value that will be stored + * \return offset at which the value will be stored (equal to \p offset + * before incrementation. + */ + static size_t typeOffset(size_t &offset, size_t type_len); + + private: + CPUDevice *p_device; + Kernel *p_kernel; + llvm::Function *p_function, *p_call_function; + pthread_mutex_t p_call_function_mutex; +}; + +class CPUKernelEvent; + +/** + * \brief CPU kernel work-group + * + * This class represent a bulk of work-items that will be run. It is the one + * to actually run the kernel of its elements. + * + * \see \ref llvm + * \nosubgrouping + */ +class CPUKernelWorkGroup +{ + public: + /** + * \brief Constructor + * \param kernel kernel to run + * \param event event containing information about the kernel run + * \param cpu_event CPU-specific information and cache about \p event + * \param work_group_index index of this work-group in the kernel + */ + CPUKernelWorkGroup(CPUKernel *kernel, KernelEvent *event, + CPUKernelEvent *cpu_event, + const size_t *work_group_index); + ~CPUKernelWorkGroup(); + + /** + * \brief Build a structure of arguments + * + * As C doesn't support calling functions with variable arguments + * unknown at the compilation, this function builds the list of + * arguments in memory. This array will then be passed to a LLVM stub + * function reading it and passing its values to the actuel kernel. + * + * \see \ref llvm + * \param locals_to_free if this kernel takes \c __local arguments, they + * must be \c malloc()'ed for every work-group. + * They are placed in this vector to be + * \c free()'ed at the end of \c run(). + * \return address of a memory location containing the arguments + */ + void *callArgs(std::vector<void *> &locals_to_free); + + /** + * \brief Run the work-group + * + * This function is the core of CPU-acceleration. It runs the work-items + * of this work-group given the correct arguments. + * + * \see \ref llvm + * \see \ref barrier + * \see callArgs() + * \return true if success, false in case of an error + */ + bool run(); + + /** + * \name Native implementation of built-in OpenCL C functions + * @{ + */ + size_t getGlobalId(cl_uint dimindx) const; + cl_uint getWorkDim() const; + size_t getGlobalSize(cl_uint dimindx) const; + size_t getLocalSize(cl_uint dimindx) const; + size_t getLocalID(cl_uint dimindx) const; + size_t getNumGroups(cl_uint dimindx) const; + size_t getGroupID(cl_uint dimindx) const; + size_t getGlobalOffset(cl_uint dimindx) const; + + void barrier(unsigned int flags); + + void *getImageData(Image2D *image, int x, int y, int z) const; + + void writeImage(Image2D *image, int x, int y, int z, float *color) const; + void writeImage(Image2D *image, int x, int y, int z, int32_t *color) const; + void writeImage(Image2D *image, int x, int y, int z, uint32_t *color) const; + + void readImage(float *result, Image2D *image, int x, int y, int z, + uint32_t sampler) const; + void readImage(int32_t *result, Image2D *image, int x, int y, int z, + uint32_t sampler) const; + void readImage(uint32_t *result, Image2D *image, int x, int y, int z, + uint32_t sampler) const; + + void readImage(float *result, Image2D *image, float x, float y, float z, + uint32_t sampler) const; + void readImage(int32_t *result, Image2D *image, float x, float y, float z, + uint32_t sampler) const; + void readImage(uint32_t *result, Image2D *image, float x, float y, float z, + uint32_t sampler) const; + /** + * @} + */ + + /** + * \brief Function called when a built-in name cannot be found + */ + void builtinNotFound(const std::string &name) const; + + private: + template<typename T> + void writeImageImpl(Image2D *image, int x, int y, int z, T *color) const; + template<typename T> + void readImageImplI(T *result, Image2D *image, int x, int y, int z, + uint32_t sampler) const; + template<typename T> + void readImageImplF(T *result, Image2D *image, float x, float y, float z, + uint32_t sampler) const; + template<typename T> + void linear3D(T *result, float a, float b, float c, + int i0, int j0, int k0, int i1, int j1, int k1, + Image3D *image) const; + template<typename T> + void linear2D(T *result, float a, float b, float c, int i0, int j0, + int i1, int j1, Image2D *image) const; + + private: + CPUKernel *p_kernel; + CPUKernelEvent *p_cpu_event; + KernelEvent *p_event; + cl_uint p_work_dim; + size_t p_index[MAX_WORK_DIMS], + p_max_local_id[MAX_WORK_DIMS], + p_global_id_start_offset[MAX_WORK_DIMS]; + + void (*p_kernel_func_addr)(void *); + void *p_args; + + // Machinery to have barrier() working + struct Context + { + size_t local_id[MAX_WORK_DIMS]; + ucontext_t context; + unsigned int initialized; + }; + + Context *getContextAddr(unsigned int index); + + Context *p_current_context; + Context p_dummy_context; + void *p_contexts; + size_t p_stack_size; + unsigned int p_num_work_items, p_current_work_item; + bool p_had_barrier; +}; + +/** + * \brief CPU-specific information about a kernel event + * + * This class put in a \c Coal::KernelEvent device-data field + * (see \c Coal::Event::setDeviceData()) is responsible for dispatching the + * \c Coal::CPUKernelWorkGroup objects between the CPU worker threads. + */ +class CPUKernelEvent +{ + public: + /** + * \brief Constructor + * \param device device running the kernel + * \param event \c Coal::KernelEvent holding device-agnostic data + * about the event + */ + CPUKernelEvent(CPUDevice *device, KernelEvent *event); + ~CPUKernelEvent(); + + bool reserve(); /*!< \brief The next Work Group that will execute will be the last. Locks the event */ + bool finished(); /*!< \brief All the work groups have finished */ + CPUKernelWorkGroup *takeInstance(); /*!< \brief Must be called exactly one time after reserve(). Unlocks the event */ + + void *kernelArgs() const; /*!< \brief Return the cached kernel arguments */ + void cacheKernelArgs(void *args); /*!< \brief Cache pre-built kernel arguments */ + + void workGroupFinished(); /*!< \brief A work-group has just finished */ + + private: + CPUDevice *p_device; + KernelEvent *p_event; + size_t p_current_work_group[MAX_WORK_DIMS], + p_max_work_groups[MAX_WORK_DIMS]; + size_t p_current_wg, p_finished_wg, p_num_wg; + pthread_mutex_t p_mutex; + void *p_kernel_args; +}; + +} + +#endif |