1 files changed, 325 insertions, 0 deletions
diff --git a/src/core/cpu/kernel.h b/src/core/cpu/kernel.h
new file mode 100644
index 0000000..ab4d1ac
--- /dev/null
+++ b/src/core/cpu/kernel.h
@@ -0,0 +1,325 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the copyright holder nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file cpu/kernel.h
+ * \brief CPU kernel
+ */
+
+#ifndef __CPU_KERNEL_H__
+#define __CPU_KERNEL_H__
+
+#include "../deviceinterface.h"
+#include <core/config.h>
+
+#include <llvm/ExecutionEngine/GenericValue.h>
+#include <vector>
+#include <string>
+
+#include <ucontext.h>
+#include <pthread.h>
+#include <stdint.h>
+
+namespace llvm
+{
+    class Function; // forward declaration: this header only handles llvm::Function through pointers
+}
+
+namespace Coal
+{
+
+class CPUDevice;    // Forward declarations: every use of these types in this
+class Kernel;       // header is through a pointer, so their full definitions
+class KernelEvent;  // are not needed here.
+class Image2D;
+class Image3D;
+
+/**
+ * \brief CPU kernel
+ *
+ * This class holds passive information about a kernel (\c Coal::Kernel object
+ * and device on which it is run) and provides the \c callFunction() function.
+ *
+ * \c callFunction() is described at the end of \ref llvm .
+ *
+ * \see Coal::CPUKernelWorkGroup
+ */
+class CPUKernel : public DeviceKernel
+{
+    public:
+        /**
+         * \brief Constructor
+         * \param device device on which the kernel will be run
+         * \param kernel \c Coal::Kernel object holding information about this
+         *               kernel
+         * \param function \c llvm::Function to run
+         */
+        CPUKernel(CPUDevice *device, Kernel *kernel, llvm::Function *function);
+        ~CPUKernel();
+
+        size_t workGroupSize() const;   // const like the four queries below; NOTE(review): DeviceKernel and kernel.cpp must use the same qualifier
+        cl_ulong localMemSize() const;
+        cl_ulong privateMemSize() const;
+        size_t preferredWorkGroupSizeMultiple() const;
+        size_t guessWorkGroupSize(cl_uint num_dims, cl_uint dim,
+                                  size_t global_work_size) const;
+
+        Kernel *kernel() const;     /*!< \brief \c Coal::Kernel object this kernel will run */
+        CPUDevice *device() const;  /*!< \brief device on which the kernel will be run */
+
+        llvm::Function *function() const;   /*!< \brief \c llvm::Function representing the kernel but <strong>not to be run</strong> */
+        llvm::Function *callFunction();     /*!< \brief stub function used to run the kernel, see \ref llvm */
+
+        /**
+         * \brief Calculate where to place a value in an array
+         *
+         * This function is used to calculate where to place a value in an
+         * array given its size, properly aligning it.
+         *
+         * This function is called repeatedly to obtain the aligned position of
+         * each value that must be placed in the array.
+         *
+         * \code
+         * size_t array_len = 0, array_offset = 0;
+         * void *array;
+         *
+         * // First, get the array size given alignment constraints
+         * typeOffset(array_len, sizeof(int));
+         * typeOffset(array_len, sizeof(float));
+         * typeOffset(array_len, sizeof(void *));
+         *
+         * // Then, allocate memory
+         * array = malloc(array_len);
+         *
+         * // Finally, place the arguments
+         * *(int *)((char *)array + typeOffset(array_offset, sizeof(int))) = 1337;
+         * *(float *)((char *)array + typeOffset(array_offset, sizeof(float))) = 3.1415f;
+         * *(void **)((char *)array + typeOffset(array_offset, sizeof(void *))) = array;
+         * \endcode
+         *
+         * \param offset offset at which the value will be placed. This variable
+         *               gets incremented by <tt>type_len + padding</tt>.
+         * \param type_len size in bytes of the value that will be stored
+         * \return offset at which the value will be stored (equal to \p offset
+         *         before incrementation).
+         */
+        static size_t typeOffset(size_t &offset, size_t type_len);
+
+    private:
+        CPUDevice *p_device;
+        Kernel *p_kernel;
+        llvm::Function *p_function, *p_call_function;
+        pthread_mutex_t p_call_function_mutex;  // NOTE(review): presumably guards p_call_function inside callFunction() - confirm in kernel.cpp
+};
+
+class CPUKernelEvent;   // defined further down; CPUKernelWorkGroup only stores a pointer to it
+
+/**
+ * \brief CPU kernel work-group
+ *
+ * This class represents a bulk of work-items that will be run. It is the one
+ * that actually runs the kernel for its work-items.
+ *
+ * \see \ref llvm
+ * \nosubgrouping
+ */
+class CPUKernelWorkGroup
+{
+    public:
+        /**
+         * \brief Constructor
+         * \param kernel kernel to run
+         * \param event event containing information about the kernel run
+         * \param cpu_event CPU-specific information and cache about \p event
+         * \param work_group_index index of this work-group in the kernel
+         */
+        CPUKernelWorkGroup(CPUKernel *kernel, KernelEvent *event,
+                           CPUKernelEvent *cpu_event,
+                           const size_t *work_group_index);
+        ~CPUKernelWorkGroup();
+
+        /**
+         * \brief Build a structure of arguments
+         *
+         * As C doesn't support calling functions whose argument list is
+         * unknown at compile time, this function builds the list of
+         * arguments in memory. This array will then be passed to an LLVM stub
+         * function reading it and passing its values to the actual kernel.
+         *
+         * \see \ref llvm
+         * \param locals_to_free if this kernel takes \c __local arguments, they
+         *                       must be \c malloc()'ed for every work-group.
+         *                       They are placed in this vector to be
+         *                       \c free()'ed at the end of \c run().
+         * \return address of a memory location containing the arguments
+         */
+        void *callArgs(std::vector<void *> &locals_to_free);
+
+        /**
+         * \brief Run the work-group
+         *
+         * This function is the core of CPU-acceleration. It runs the work-items
+         * of this work-group given the correct arguments.
+         *
+         * \see \ref llvm
+         * \see \ref barrier
+         * \see callArgs()
+         * \return true if success, false in case of an error
+         */
+        bool run();
+
+        /**
+         * \name Native implementation of built-in OpenCL C functions
+         * @{
+         */
+        size_t getGlobalId(cl_uint dimindx) const;
+        cl_uint getWorkDim() const;
+        size_t getGlobalSize(cl_uint dimindx) const;
+        size_t getLocalSize(cl_uint dimindx) const;
+        size_t getLocalID(cl_uint dimindx) const;
+        size_t getNumGroups(cl_uint dimindx) const;
+        size_t getGroupID(cl_uint dimindx) const;
+        size_t getGlobalOffset(cl_uint dimindx) const;
+
+        void barrier(unsigned int flags);   // the only non-const built-in in this group
+
+        void *getImageData(Image2D *image, int x, int y, int z) const;
+
+        void writeImage(Image2D *image, int x, int y, int z, float *color) const;
+        void writeImage(Image2D *image, int x, int y, int z, int32_t *color) const;
+        void writeImage(Image2D *image, int x, int y, int z, uint32_t *color) const;
+
+        void readImage(float *result, Image2D *image, int x, int y, int z,
+                       uint32_t sampler) const;
+        void readImage(int32_t *result, Image2D *image, int x, int y, int z,
+                       uint32_t sampler) const;
+        void readImage(uint32_t *result, Image2D *image, int x, int y, int z,
+                       uint32_t sampler) const;
+
+        void readImage(float *result, Image2D *image, float x, float y, float z,
+                       uint32_t sampler) const;
+        void readImage(int32_t *result, Image2D *image, float x, float y, float z,
+                       uint32_t sampler) const;
+        void readImage(uint32_t *result, Image2D *image, float x, float y, float z,
+                       uint32_t sampler) const;
+        /**
+         * @}
+         */
+
+        /**
+         * \brief Function called when a built-in name cannot be found
+         */
+        void builtinNotFound(const std::string &name) const;
+
+    private:
+        template<typename T>
+        void writeImageImpl(Image2D *image, int x, int y, int z, T *color) const;
+        template<typename T>
+        void readImageImplI(T *result, Image2D *image, int x, int y, int z,
+                            uint32_t sampler) const;
+        template<typename T>
+        void readImageImplF(T *result, Image2D *image, float x, float y, float z,
+                            uint32_t sampler) const;
+        template<typename T>
+        void linear3D(T *result, float a, float b, float c,
+                       int i0, int j0, int k0, int i1, int j1, int k1,
+                       Image3D *image) const;
+        template<typename T>
+        void linear2D(T *result, float a, float b, float c, int i0, int j0,
+                      int i1, int j1, Image2D *image) const;
+
+    private:
+        CPUKernel *p_kernel;
+        CPUKernelEvent *p_cpu_event;
+        KernelEvent *p_event;
+        cl_uint p_work_dim;
+        size_t p_index[MAX_WORK_DIMS],
+               p_max_local_id[MAX_WORK_DIMS],
+               p_global_id_start_offset[MAX_WORK_DIMS];
+
+        void (*p_kernel_func_addr)(void *);  // function pointer taking a single untyped argument pointer
+        void *p_args;                        // NOTE(review): presumably the block built by callArgs() - confirm in kernel.cpp
+
+        // Machinery to have barrier() working (each Context embeds a ucontext_t)
+        struct Context
+        {
+            size_t local_id[MAX_WORK_DIMS];  // one slot per work dimension
+            ucontext_t context;              // user-level execution context from <ucontext.h>
+            unsigned int initialized;
+        };
+
+        Context *getContextAddr(unsigned int index);
+
+        Context *p_current_context;
+        Context p_dummy_context;
+        void *p_contexts;    // untyped storage; NOTE(review): presumably addressed through getContextAddr() - confirm
+        size_t p_stack_size;
+        unsigned int p_num_work_items, p_current_work_item;
+        bool p_had_barrier;
+};
+
+/**
+ * \brief CPU-specific information about a kernel event
+ *
+ * This class, put in a \c Coal::KernelEvent device-data field
+ * (see \c Coal::Event::setDeviceData()), is responsible for dispatching the
+ * \c Coal::CPUKernelWorkGroup objects between the CPU worker threads.
+ */
+class CPUKernelEvent
+{
+    public:
+        /**
+         * \brief Constructor
+         * \param device device running the kernel
+         * \param event \c Coal::KernelEvent holding device-agnostic data
+         *              about the event
+         */
+        CPUKernelEvent(CPUDevice *device, KernelEvent *event);
+        ~CPUKernelEvent();
+
+        bool reserve();  /*!< \brief The next Work Group that will execute will be the last. Locks the event; must be followed by takeInstance() */
+        bool finished(); /*!< \brief All the work groups have finished */
+        CPUKernelWorkGroup *takeInstance(); /*!< \brief Must be called exactly once after reserve(). Unlocks the event */
+
+        void *kernelArgs() const;           /*!< \brief Return the cached kernel arguments */
+        void cacheKernelArgs(void *args);   /*!< \brief Cache pre-built kernel arguments */
+
+        void workGroupFinished();           /*!< \brief A work-group has just finished */
+
+    private:
+        CPUDevice *p_device;
+        KernelEvent *p_event;
+        size_t p_current_work_group[MAX_WORK_DIMS],
+               p_max_work_groups[MAX_WORK_DIMS];
+        size_t p_current_wg, p_finished_wg, p_num_wg;
+        pthread_mutex_t p_mutex;    // NOTE(review): presumably the lock taken by reserve() and released by takeInstance() - confirm in kernel.cpp
+        void *p_kernel_args;        // NOTE(review): presumably the pointer handled by cacheKernelArgs()/kernelArgs() - confirm
+};
+
+}
+
+#endif