diff options
author | Jason Henline <jhen@google.com> | 2016-09-13 19:28:02 +0000 |
---|---|---|
committer | Jason Henline <jhen@google.com> | 2016-09-13 19:28:02 +0000 |
commit | cdab89469b6e971c63a7930f7ec668c1e981383e (patch) | |
tree | da102ee4afbaaa9f5f53e926c08c39b11f73e715 /parallel-libs | |
parent | c96da093c4d83771baea7df823858373ec1f7d62 (diff) |
[SE] Host platform implementation
Summary:
This implementation does not currently support multiple concurrent streams, and
it rejects kernel launches whose grid is larger than one block or whose block
is larger than one thread. These limitations could be removed in the future by
launching new threads on the host, but that is not done in this implementation.
Reviewers: jlebar
Subscribers: beanz, mgorny, jprice, parallel_libs-commits
Differential Revision: https://reviews.llvm.org/D24473
Diffstat (limited to 'parallel-libs')
8 files changed, 329 insertions, 5 deletions
diff --git a/parallel-libs/streamexecutor/examples/CMakeLists.txt b/parallel-libs/streamexecutor/examples/CMakeLists.txt index 1d09a545429..cb061d5ca96 100644 --- a/parallel-libs/streamexecutor/examples/CMakeLists.txt +++ b/parallel-libs/streamexecutor/examples/CMakeLists.txt @@ -1,2 +1,5 @@ add_executable(cuda_saxpy_example CUDASaxpy.cpp) target_link_libraries(cuda_saxpy_example streamexecutor) + +add_executable(host_saxpy_example HostSaxpy.cpp) +target_link_libraries(host_saxpy_example streamexecutor) diff --git a/parallel-libs/streamexecutor/examples/CUDASaxpy.cpp b/parallel-libs/streamexecutor/examples/CUDASaxpy.cpp index 5fb3dba26a7..0fce5ed046b 100644 --- a/parallel-libs/streamexecutor/examples/CUDASaxpy.cpp +++ b/parallel-libs/streamexecutor/examples/CUDASaxpy.cpp @@ -17,7 +17,6 @@ #include <algorithm> #include <cassert> -#include <cstdio> #include <cstdlib> #include <vector> diff --git a/parallel-libs/streamexecutor/examples/HostSaxpy.cpp b/parallel-libs/streamexecutor/examples/HostSaxpy.cpp new file mode 100644 index 00000000000..525c4453b01 --- /dev/null +++ b/parallel-libs/streamexecutor/examples/HostSaxpy.cpp @@ -0,0 +1,94 @@ +//===-- HostSaxpy.cpp - Example of host saxpy with StreamExecutor API -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains example code demonstrating the usage of the +/// StreamExecutor API for a host platform. 
+/// +//===----------------------------------------------------------------------===// + +#include <algorithm> +#include <cassert> +#include <cstdio> +#include <vector> + +#include "streamexecutor/StreamExecutor.h" + +void Saxpy(float A, float *X, float *Y, size_t N) { + for (size_t I = 0; I < N; ++I) + X[I] = A * X[I] + Y[I]; +} + +namespace __compilergen { +using SaxpyKernel = + streamexecutor::Kernel<float, streamexecutor::GlobalDeviceMemory<float>, + streamexecutor::GlobalDeviceMemory<float>, size_t>; + +// Wrapper function converts argument addresses to arguments. +void SaxpyWrapper(const void *const *ArgumentAddresses) { + Saxpy(*static_cast<const float *>(ArgumentAddresses[0]), + static_cast<float *>(const_cast<void *>(ArgumentAddresses[1])), + static_cast<float *>(const_cast<void *>(ArgumentAddresses[2])), + *static_cast<const size_t *>(ArgumentAddresses[3])); +} + +// The wrapper function is what gets registered. +static streamexecutor::MultiKernelLoaderSpec SaxpyLoaderSpec = []() { + streamexecutor::MultiKernelLoaderSpec Spec; + Spec.addHostFunction("Saxpy", SaxpyWrapper); + return Spec; +}(); +} // namespace __compilergen + +int main() { + namespace se = ::streamexecutor; + namespace cg = ::__compilergen; + + // Create some host data. + float A = 42.0f; + std::vector<float> HostX = {0, 1, 2, 3}; + std::vector<float> HostY = {4, 5, 6, 7}; + size_t ArraySize = HostX.size(); + + // Get a device object. + se::Platform *Platform = + getOrDie(se::PlatformManager::getPlatformByName("host")); + if (Platform->getDeviceCount() == 0) { + return EXIT_FAILURE; + } + se::Device *Device = getOrDie(Platform->getDevice(0)); + + // Load the kernel onto the device. 
+ cg::SaxpyKernel Kernel = + getOrDie(Device->createKernel<cg::SaxpyKernel>(cg::SaxpyLoaderSpec)); + + se::RegisteredHostMemory<float> RegisteredX = + getOrDie(Device->registerHostMemory<float>(HostX)); + se::RegisteredHostMemory<float> RegisteredY = + getOrDie(Device->registerHostMemory<float>(HostY)); + + // Allocate memory on the device. + se::GlobalDeviceMemory<float> X = + getOrDie(Device->allocateDeviceMemory<float>(ArraySize)); + se::GlobalDeviceMemory<float> Y = + getOrDie(Device->allocateDeviceMemory<float>(ArraySize)); + + // Run operations on a stream. + se::Stream Stream = getOrDie(Device->createStream()); + Stream.thenCopyH2D(RegisteredX, X) + .thenCopyH2D(RegisteredY, Y) + .thenLaunch(1, 1, Kernel, A, X, Y, ArraySize) + .thenCopyD2H(X, RegisteredX); + // Wait for the stream to complete. + se::dieIfError(Stream.blockHostUntilDone()); + + // Process output data in HostX. + std::vector<float> ExpectedX = {4, 47, 90, 133}; + assert(std::equal(ExpectedX.begin(), ExpectedX.end(), HostX.begin())); +} diff --git a/parallel-libs/streamexecutor/include/streamexecutor/KernelSpec.h b/parallel-libs/streamexecutor/include/streamexecutor/KernelSpec.h index c4b6722caf6..caf6f1bdc4f 100644 --- a/parallel-libs/streamexecutor/include/streamexecutor/KernelSpec.h +++ b/parallel-libs/streamexecutor/include/streamexecutor/KernelSpec.h @@ -65,11 +65,13 @@ #define STREAMEXECUTOR_KERNELSPEC_H #include <cassert> +#include <functional> #include <map> #include <memory> #include <string> #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" namespace streamexecutor { @@ -199,6 +201,9 @@ private: /// than doing it by hand. class MultiKernelLoaderSpec { public: + /// Type of functions used as host platform kernels. 
+ using HostFunctionTy = std::function<void(const void **)>; + std::string getKernelName() const { if (TheKernelName) return *TheKernelName; @@ -215,6 +220,7 @@ public: bool hasOpenCLTextInMemory() const { return TheOpenCLTextInMemorySpec != nullptr; } + bool hasHostFunction() const { return HostFunction != nullptr; } // Accessors for platform variant kernel load specifications. // @@ -233,6 +239,11 @@ public: return *TheOpenCLTextInMemorySpec; } + const HostFunctionTy &getHostFunction() const { + assert(hasHostFunction() && "getting spec that is not present"); + return *HostFunction; + } + // Builder-pattern-like methods for use in initializing a // MultiKernelLoaderSpec. // @@ -256,6 +267,12 @@ public: MultiKernelLoaderSpec &addOpenCLTextInMemory(llvm::StringRef KernelName, const char *OpenCLText); + MultiKernelLoaderSpec &addHostFunction(llvm::StringRef KernelName, + HostFunctionTy Function) { + HostFunction = llvm::make_unique<HostFunctionTy>(std::move(Function)); + return *this; + } + private: void setKernelName(llvm::StringRef KernelName); @@ -263,6 +280,7 @@ private: std::unique_ptr<CUDAPTXInMemorySpec> TheCUDAPTXInMemorySpec; std::unique_ptr<CUDAFatbinInMemorySpec> TheCUDAFatbinInMemorySpec; std::unique_ptr<OpenCLTextInMemorySpec> TheOpenCLTextInMemorySpec; + std::unique_ptr<HostFunctionTy> HostFunction; }; } // namespace streamexecutor diff --git a/parallel-libs/streamexecutor/include/streamexecutor/PlatformDevice.h b/parallel-libs/streamexecutor/include/streamexecutor/PlatformDevice.h index cc1ae405bbb..d55680dd58e 100644 --- a/parallel-libs/streamexecutor/include/streamexecutor/PlatformDevice.h +++ b/parallel-libs/streamexecutor/include/streamexecutor/PlatformDevice.h @@ -149,10 +149,10 @@ public: /// Similar to synchronousCopyD2H(const void *, size_t, void /// *, size_t, size_t), but copies memory from one location in device memory /// to another rather than from device to host. 
- virtual Error synchronousCopyD2D(const void *DeviceDstHandle, - size_t DstByteOffset, - const void *DeviceSrcHandle, - size_t SrcByteOffset, size_t ByteCount) { + virtual Error synchronousCopyD2D(const void *DeviceSrcHandle, + size_t SrcByteOffset, + const void *DeviceDstHandle, + size_t DstByteOffset, size_t ByteCount) { return make_error("synchronousCopyD2D not implemented for platform " + getName()); } diff --git a/parallel-libs/streamexecutor/include/streamexecutor/platforms/host/HostPlatform.h b/parallel-libs/streamexecutor/include/streamexecutor/platforms/host/HostPlatform.h new file mode 100644 index 00000000000..52ad1ead5da --- /dev/null +++ b/parallel-libs/streamexecutor/include/streamexecutor/platforms/host/HostPlatform.h @@ -0,0 +1,56 @@ +//===-- HostPlatform.h - Host platform subclass -----------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Declaration of the HostPlatform class. +/// +//===----------------------------------------------------------------------===// + +#ifndef STREAMEXECUTOR_PLATFORMS_HOST_HOSTPLATFORM_H +#define STREAMEXECUTOR_PLATFORMS_HOST_HOSTPLATFORM_H + +#include "HostPlatformDevice.h" +#include "streamexecutor/Device.h" +#include "streamexecutor/Platform.h" + +#include "llvm/Support/Mutex.h" + +namespace streamexecutor { +namespace host { + +/// Platform that performs work on the host rather than offloading to an +/// accelerator. 
+class HostPlatform : public Platform { +public: + size_t getDeviceCount() const override { return 1; } + + Expected<Device *> getDevice(size_t DeviceIndex) override { + if (DeviceIndex != 0) { + return make_error( + "Requested device index " + llvm::Twine(DeviceIndex) + + " from host platform which only supports device index 0"); + } + llvm::sys::ScopedLock Lock(Mutex); + if (!TheDevice) { + ThePlatformDevice = llvm::make_unique<HostPlatformDevice>(); + TheDevice = llvm::make_unique<Device>(ThePlatformDevice.get()); + } + return TheDevice.get(); + } + +private: + llvm::sys::Mutex Mutex; + std::unique_ptr<HostPlatformDevice> ThePlatformDevice; + std::unique_ptr<Device> TheDevice; +}; + +} // namespace host +} // namespace streamexecutor + +#endif // STREAMEXECUTOR_PLATFORMS_HOST_HOSTPLATFORM_H diff --git a/parallel-libs/streamexecutor/include/streamexecutor/platforms/host/HostPlatformDevice.h b/parallel-libs/streamexecutor/include/streamexecutor/platforms/host/HostPlatformDevice.h new file mode 100644 index 00000000000..e51552d1d2d --- /dev/null +++ b/parallel-libs/streamexecutor/include/streamexecutor/platforms/host/HostPlatformDevice.h @@ -0,0 +1,151 @@ +//===-- HostPlatformDevice.h - HostPlatformDevice class ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Declaration of the HostPlatformDevice class. 
+/// +//===----------------------------------------------------------------------===// + +#ifndef STREAMEXECUTOR_PLATFORMS_HOST_HOSTPLATFORMDEVICE_H +#define STREAMEXECUTOR_PLATFORMS_HOST_HOSTPLATFORMDEVICE_H + +#include <cstdlib> +#include <cstring> + +#include "streamexecutor/PlatformDevice.h" + +namespace streamexecutor { +namespace host { + +/// A concrete PlatformDevice subclass that performs its work on the host rather +/// than offloading to an accelerator. +class HostPlatformDevice : public PlatformDevice { +public: + std::string getName() const override { return "host"; } + + Expected<const void *> + createKernel(const MultiKernelLoaderSpec &Spec) override { + if (!Spec.hasHostFunction()) { + return make_error("no host implementation available for kernel " + + Spec.getKernelName()); + } + return static_cast<const void *>(&Spec.getHostFunction()); + } + + Error destroyKernel(const void *Handle) override { return Error::success(); } + + Expected<const void *> createStream() override { + // TODO(jhen): Do something with threads to allow multiple streams. + return this; + } + + Error destroyStream(const void *Handle) override { return Error::success(); } + + Error launch(const void *PlatformStreamHandle, BlockDimensions BlockSize, + GridDimensions GridSize, const void *PKernelHandle, + const PackedKernelArgumentArrayBase &ArgumentArray) override { + // TODO(jhen): Can we do something with BlockSize and GridSize? 
+ if (!(BlockSize.X == 1 && BlockSize.Y == 1 && BlockSize.Z == 1)) { + return make_error( + "Block dimensions were (" + llvm::Twine(BlockSize.X) + "," + + llvm::Twine(BlockSize.Y) + "," + llvm::Twine(BlockSize.Z) + + "), but only size (1,1,1) is permitted for this platform"); + } + if (!(GridSize.X == 1 && GridSize.Y == 1 && GridSize.Z == 1)) { + return make_error( + "Grid dimensions were (" + llvm::Twine(GridSize.X) + "," + + llvm::Twine(GridSize.Y) + "," + llvm::Twine(GridSize.Z) + + "), but only size (1,1,1) is permitted for this platform"); + } + + (*static_cast<const std::function<void(const void *const *)> *>( + PKernelHandle))(ArgumentArray.getAddresses()); + return Error::success(); + } + + Error copyD2H(const void *PlatformStreamHandle, const void *DeviceSrcHandle, + size_t SrcByteOffset, void *HostDst, size_t DstByteOffset, + size_t ByteCount) override { + std::memcpy(offset(HostDst, DstByteOffset), + offset(DeviceSrcHandle, SrcByteOffset), ByteCount); + return Error::success(); + } + + Error copyH2D(const void *PlatformStreamHandle, const void *HostSrc, + size_t SrcByteOffset, const void *DeviceDstHandle, + size_t DstByteOffset, size_t ByteCount) override { + std::memcpy(offset(DeviceDstHandle, DstByteOffset), + offset(HostSrc, SrcByteOffset), ByteCount); + return Error::success(); + } + + Error copyD2D(const void *PlatformStreamHandle, const void *DeviceSrcHandle, + size_t SrcByteOffset, const void *DeviceDstHandle, + size_t DstByteOffset, size_t ByteCount) override { + std::memcpy(offset(DeviceDstHandle, DstByteOffset), + offset(DeviceSrcHandle, SrcByteOffset), ByteCount); + return Error::success(); + } + + Error blockHostUntilDone(const void *PlatformStreamHandle) override { + // All host operations are synchronous anyway. 
+ return Error::success(); + } + + Expected<void *> allocateDeviceMemory(size_t ByteCount) override { + return std::malloc(ByteCount); + } + + Error freeDeviceMemory(const void *Handle) override { + std::free(const_cast<void *>(Handle)); + return Error::success(); + } + + Error registerHostMemory(void *Memory, size_t ByteCount) override { + return Error::success(); + } + + Error unregisterHostMemory(const void *Memory) override { + return Error::success(); + } + + Error synchronousCopyD2H(const void *DeviceSrcHandle, size_t SrcByteOffset, + void *HostDst, size_t DstByteOffset, + size_t ByteCount) override { + std::memcpy(offset(HostDst, DstByteOffset), + offset(DeviceSrcHandle, SrcByteOffset), ByteCount); + return Error::success(); + } + + Error synchronousCopyH2D(const void *HostSrc, size_t SrcByteOffset, + const void *DeviceDstHandle, size_t DstByteOffset, + size_t ByteCount) override { + std::memcpy(offset(DeviceDstHandle, DstByteOffset), + offset(HostSrc, SrcByteOffset), ByteCount); + return Error::success(); + } + + Error synchronousCopyD2D(const void *DeviceSrcHandle, size_t SrcByteOffset, + const void *DeviceDstHandle, size_t DstByteOffset, + size_t ByteCount) override { + std::memcpy(offset(DeviceDstHandle, DstByteOffset), + offset(DeviceSrcHandle, SrcByteOffset), ByteCount); + return Error::success(); + } + +private: + static void *offset(const void *Base, size_t Offset) { + return const_cast<char *>(static_cast<const char *>(Base) + Offset); + } +}; + +} // namespace host +} // namespace streamexecutor + +#endif // STREAMEXECUTOR_PLATFORMS_HOST_HOSTPLATFORMDEVICE_H diff --git a/parallel-libs/streamexecutor/lib/PlatformManager.cpp b/parallel-libs/streamexecutor/lib/PlatformManager.cpp index 9cae5b1ea4b..7304cca755c 100644 --- a/parallel-libs/streamexecutor/lib/PlatformManager.cpp +++ b/parallel-libs/streamexecutor/lib/PlatformManager.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include 
"streamexecutor/PlatformManager.h" +#include "streamexecutor/platforms/host/HostPlatform.h" namespace streamexecutor { @@ -23,6 +24,8 @@ PlatformManager::PlatformManager() { // appropriate code to include here. // * Use static initialization tricks to have platform libraries register // themselves when they are loaded. + + PlatformsByName.emplace("host", llvm::make_unique<host::HostPlatform>()); } Expected<Platform *> PlatformManager::getPlatformByName(llvm::StringRef Name) { |