summaryrefslogtreecommitdiff
path: root/parallel-libs
diff options
context:
space:
mode:
authorJason Henline <jhen@google.com>2016-09-13 19:28:02 +0000
committerJason Henline <jhen@google.com>2016-09-13 19:28:02 +0000
commitcdab89469b6e971c63a7930f7ec668c1e981383e (patch)
treeda102ee4afbaaa9f5f53e926c08c39b11f73e715 /parallel-libs
parentc96da093c4d83771baea7df823858373ec1f7d62 (diff)
[SE] Host platform implementation
Summary: This implementation does not currently support multiple concurrent streams, and it won't allow kernels to be launched with grids larger than one block or blocks larger than one thread. These limitations could be removed in the future by launching new threads on the host, but that is not done in this implementation. Reviewers: jlebar Subscribers: beanz, mgorny, jprice, parallel_libs-commits Differential Revision: https://reviews.llvm.org/D24473
Diffstat (limited to 'parallel-libs')
-rw-r--r--parallel-libs/streamexecutor/examples/CMakeLists.txt3
-rw-r--r--parallel-libs/streamexecutor/examples/CUDASaxpy.cpp1
-rw-r--r--parallel-libs/streamexecutor/examples/HostSaxpy.cpp94
-rw-r--r--parallel-libs/streamexecutor/include/streamexecutor/KernelSpec.h18
-rw-r--r--parallel-libs/streamexecutor/include/streamexecutor/PlatformDevice.h8
-rw-r--r--parallel-libs/streamexecutor/include/streamexecutor/platforms/host/HostPlatform.h56
-rw-r--r--parallel-libs/streamexecutor/include/streamexecutor/platforms/host/HostPlatformDevice.h151
-rw-r--r--parallel-libs/streamexecutor/lib/PlatformManager.cpp3
8 files changed, 329 insertions, 5 deletions
diff --git a/parallel-libs/streamexecutor/examples/CMakeLists.txt b/parallel-libs/streamexecutor/examples/CMakeLists.txt
index 1d09a545429..cb061d5ca96 100644
--- a/parallel-libs/streamexecutor/examples/CMakeLists.txt
+++ b/parallel-libs/streamexecutor/examples/CMakeLists.txt
@@ -1,2 +1,5 @@
add_executable(cuda_saxpy_example CUDASaxpy.cpp)
target_link_libraries(cuda_saxpy_example streamexecutor)
+
+# Example program that runs saxpy through the StreamExecutor host platform.
+add_executable(host_saxpy_example HostSaxpy.cpp)
+target_link_libraries(host_saxpy_example streamexecutor)
diff --git a/parallel-libs/streamexecutor/examples/CUDASaxpy.cpp b/parallel-libs/streamexecutor/examples/CUDASaxpy.cpp
index 5fb3dba26a7..0fce5ed046b 100644
--- a/parallel-libs/streamexecutor/examples/CUDASaxpy.cpp
+++ b/parallel-libs/streamexecutor/examples/CUDASaxpy.cpp
@@ -17,7 +17,6 @@
#include <algorithm>
#include <cassert>
-#include <cstdio>
#include <cstdlib>
#include <vector>
diff --git a/parallel-libs/streamexecutor/examples/HostSaxpy.cpp b/parallel-libs/streamexecutor/examples/HostSaxpy.cpp
new file mode 100644
index 00000000000..525c4453b01
--- /dev/null
+++ b/parallel-libs/streamexecutor/examples/HostSaxpy.cpp
@@ -0,0 +1,94 @@
+//===-- HostSaxpy.cpp - Example of host saxpy with StreamExecutor API -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains example code demonstrating the usage of the
+/// StreamExecutor API for a host platform.
+///
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+#include <cassert>
+#include <cstdio>
+#include <cstdlib>
+#include <vector>
+
+#include "streamexecutor/StreamExecutor.h"
+
+/// Overwrites each of the first \p N elements of \p X with A * X[I] + Y[I].
+/// \p Y is only read.
+void Saxpy(float A, float *X, float *Y, size_t N) {
+  float *const XEnd = X + N;
+  for (; X != XEnd; ++X, ++Y)
+    *X = A * *X + *Y;
+}
+
+namespace __compilergen {
+/// Kernel type for saxpy: a float, two GlobalDeviceMemory<float> arguments and
+/// a size_t.
+using SaxpyKernel =
+    streamexecutor::Kernel<float, streamexecutor::GlobalDeviceMemory<float>,
+                           streamexecutor::GlobalDeviceMemory<float>, size_t>;
+
+// Unpacks the type-erased argument addresses and forwards them to Saxpy.
+void SaxpyWrapper(const void *const *ArgumentAddresses) {
+  const float Scale = *static_cast<const float *>(ArgumentAddresses[0]);
+  void *RawX = const_cast<void *>(ArgumentAddresses[1]);
+  void *RawY = const_cast<void *>(ArgumentAddresses[2]);
+  const size_t Count = *static_cast<const size_t *>(ArgumentAddresses[3]);
+  Saxpy(Scale, static_cast<float *>(RawX), static_cast<float *>(RawY), Count);
+}
+
+// Loader spec that registers SaxpyWrapper (not Saxpy itself) as the host
+// function under the kernel name "Saxpy".
+static streamexecutor::MultiKernelLoaderSpec SaxpyLoaderSpec = [] {
+  streamexecutor::MultiKernelLoaderSpec LoaderSpec;
+  LoaderSpec.addHostFunction("Saxpy", SaxpyWrapper);
+  return LoaderSpec;
+}();
+} // namespace __compilergen
+
+/// Runs saxpy on the "host" platform and verifies the result.
+///
+/// Returns EXIT_FAILURE when the platform reports no devices or when the
+/// values copied back differ from the expected ones, and EXIT_SUCCESS
+/// otherwise. Results of StreamExecutor calls are passed through
+/// getOrDie/dieIfError.
+int main() {
+  namespace se = ::streamexecutor;
+  namespace cg = ::__compilergen;
+
+  // Create some host data.
+  float A = 42.0f;
+  std::vector<float> HostX = {0, 1, 2, 3};
+  std::vector<float> HostY = {4, 5, 6, 7};
+  size_t ArraySize = HostX.size();
+
+  // Get a device object.
+  se::Platform *Platform =
+      getOrDie(se::PlatformManager::getPlatformByName("host"));
+  if (Platform->getDeviceCount() == 0) {
+    // Say why we are bailing out instead of exiting silently.
+    std::fprintf(stderr, "host platform reported no devices\n");
+    return EXIT_FAILURE;
+  }
+  se::Device *Device = getOrDie(Platform->getDevice(0));
+
+  // Load the kernel onto the device.
+  cg::SaxpyKernel Kernel =
+      getOrDie(Device->createKernel<cg::SaxpyKernel>(cg::SaxpyLoaderSpec));
+
+  // Register the host buffers that take part in the copies below.
+  se::RegisteredHostMemory<float> RegisteredX =
+      getOrDie(Device->registerHostMemory<float>(HostX));
+  se::RegisteredHostMemory<float> RegisteredY =
+      getOrDie(Device->registerHostMemory<float>(HostY));
+
+  // Allocate memory on the device.
+  se::GlobalDeviceMemory<float> X =
+      getOrDie(Device->allocateDeviceMemory<float>(ArraySize));
+  se::GlobalDeviceMemory<float> Y =
+      getOrDie(Device->allocateDeviceMemory<float>(ArraySize));
+
+  // Run operations on a stream. The host platform only accepts block and grid
+  // sizes of one, hence the (1, 1) launch dimensions.
+  se::Stream Stream = getOrDie(Device->createStream());
+  Stream.thenCopyH2D(RegisteredX, X)
+      .thenCopyH2D(RegisteredY, Y)
+      .thenLaunch(1, 1, Kernel, A, X, Y, ArraySize)
+      .thenCopyD2H(X, RegisteredX);
+  // Wait for the stream to complete.
+  se::dieIfError(Stream.blockHostUntilDone());
+
+  // Check the output data in HostX with an explicit comparison rather than
+  // assert, so the verification is not compiled out when NDEBUG is defined.
+  const std::vector<float> ExpectedX = {4, 47, 90, 133};
+  if (!std::equal(ExpectedX.begin(), ExpectedX.end(), HostX.begin())) {
+    std::fprintf(stderr, "host saxpy produced unexpected results\n");
+    return EXIT_FAILURE;
+  }
+  return EXIT_SUCCESS;
+}
diff --git a/parallel-libs/streamexecutor/include/streamexecutor/KernelSpec.h b/parallel-libs/streamexecutor/include/streamexecutor/KernelSpec.h
index c4b6722caf6..caf6f1bdc4f 100644
--- a/parallel-libs/streamexecutor/include/streamexecutor/KernelSpec.h
+++ b/parallel-libs/streamexecutor/include/streamexecutor/KernelSpec.h
@@ -65,11 +65,13 @@
#define STREAMEXECUTOR_KERNELSPEC_H
#include <cassert>
+#include <functional>
#include <map>
#include <memory>
#include <string>
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringRef.h"
namespace streamexecutor {
@@ -199,6 +201,9 @@ private:
/// than doing it by hand.
class MultiKernelLoaderSpec {
public:
+  /// Type of functions used as host platform kernels.
+  ///
+  /// The single parameter is an array holding the address of each kernel
+  /// argument. Both the array and the pointed-to arguments are const. This
+  /// must be exactly the std::function type that the host platform names when
+  /// it casts an opaque kernel handle back to a callable; calling through a
+  /// pointer to a different std::function specialization is undefined
+  /// behaviour.
+  using HostFunctionTy = std::function<void(const void *const *)>;
+
std::string getKernelName() const {
if (TheKernelName)
return *TheKernelName;
@@ -215,6 +220,7 @@ public:
bool hasOpenCLTextInMemory() const {
return TheOpenCLTextInMemorySpec != nullptr;
}
+  /// Returns true if a host function has been added to this spec.
+  bool hasHostFunction() const { return static_cast<bool>(HostFunction); }
// Accessors for platform variant kernel load specifications.
//
@@ -233,6 +239,11 @@ public:
return *TheOpenCLTextInMemorySpec;
}
+  /// Returns the host function held by this spec. A host function must be
+  /// present; this is checked with an assertion.
+  const HostFunctionTy &getHostFunction() const {
+    assert(HostFunction != nullptr && "getting spec that is not present");
+    const HostFunctionTy &Function = *HostFunction;
+    return Function;
+  }
+
// Builder-pattern-like methods for use in initializing a
// MultiKernelLoaderSpec.
//
@@ -256,6 +267,12 @@ public:
MultiKernelLoaderSpec &addOpenCLTextInMemory(llvm::StringRef KernelName,
const char *OpenCLText);
+  /// Stores \p Function as the host implementation for the kernel named
+  /// \p KernelName and returns this spec so that calls can be chained.
+  MultiKernelLoaderSpec &addHostFunction(llvm::StringRef KernelName,
+                                         HostFunctionTy Function) {
+    // Pass the name on to the spec instead of dropping it, so a spec built
+    // only from a host function still carries its kernel name.
+    // NOTE(review): setKernelName is defined outside this header; confirm it
+    // has no precondition that this call violates.
+    setKernelName(KernelName);
+    HostFunction = llvm::make_unique<HostFunctionTy>(std::move(Function));
+    return *this;
+  }
+
private:
void setKernelName(llvm::StringRef KernelName);
@@ -263,6 +280,7 @@ private:
std::unique_ptr<CUDAPTXInMemorySpec> TheCUDAPTXInMemorySpec;
std::unique_ptr<CUDAFatbinInMemorySpec> TheCUDAFatbinInMemorySpec;
std::unique_ptr<OpenCLTextInMemorySpec> TheOpenCLTextInMemorySpec;
+ /// Host function for this kernel; null when no host function has been added.
+ std::unique_ptr<HostFunctionTy> HostFunction;
};
} // namespace streamexecutor
diff --git a/parallel-libs/streamexecutor/include/streamexecutor/PlatformDevice.h b/parallel-libs/streamexecutor/include/streamexecutor/PlatformDevice.h
index cc1ae405bbb..d55680dd58e 100644
--- a/parallel-libs/streamexecutor/include/streamexecutor/PlatformDevice.h
+++ b/parallel-libs/streamexecutor/include/streamexecutor/PlatformDevice.h
@@ -149,10 +149,10 @@ public:
/// Similar to synchronousCopyD2H(const void *, size_t, void
/// *, size_t, size_t), but copies memory from one location in device memory
/// to another rather than from device to host.
- virtual Error synchronousCopyD2D(const void *DeviceDstHandle,
- size_t DstByteOffset,
- const void *DeviceSrcHandle,
- size_t SrcByteOffset, size_t ByteCount) {
+ ///
+ /// The source handle and byte offset come first, followed by the destination
+ /// handle and byte offset. This default implementation only returns an error
+ /// saying the operation is not implemented for the platform.
+ virtual Error synchronousCopyD2D(const void *DeviceSrcHandle,
+ size_t SrcByteOffset,
+ const void *DeviceDstHandle,
+ size_t DstByteOffset, size_t ByteCount) {
return make_error("synchronousCopyD2D not implemented for platform " +
getName());
}
diff --git a/parallel-libs/streamexecutor/include/streamexecutor/platforms/host/HostPlatform.h b/parallel-libs/streamexecutor/include/streamexecutor/platforms/host/HostPlatform.h
new file mode 100644
index 00000000000..52ad1ead5da
--- /dev/null
+++ b/parallel-libs/streamexecutor/include/streamexecutor/platforms/host/HostPlatform.h
@@ -0,0 +1,56 @@
+//===-- HostPlatform.h - Host platform subclass -----------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Declaration of the HostPlatform class.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef STREAMEXECUTOR_PLATFORMS_HOST_HOSTPLATFORM_H
+#define STREAMEXECUTOR_PLATFORMS_HOST_HOSTPLATFORM_H
+
+#include "HostPlatformDevice.h"
+#include "streamexecutor/Device.h"
+#include "streamexecutor/Platform.h"
+
+#include "llvm/Support/Mutex.h"
+
+namespace streamexecutor {
+namespace host {
+
+/// Platform whose single device performs work directly on the host instead of
+/// offloading it to an accelerator.
+class HostPlatform : public Platform {
+public:
+  /// The host platform always reports exactly one device.
+  size_t getDeviceCount() const override { return 1; }
+
+  /// Returns the device for index 0, creating it on the first request. Any
+  /// other index produces an error.
+  Expected<Device *> getDevice(size_t DeviceIndex) override {
+    if (DeviceIndex == 0) {
+      // Hold the mutex while checking for and creating the device.
+      llvm::sys::ScopedLock Guard(Mutex);
+      if (TheDevice == nullptr) {
+        ThePlatformDevice = llvm::make_unique<HostPlatformDevice>();
+        TheDevice = llvm::make_unique<Device>(ThePlatformDevice.get());
+      }
+      return TheDevice.get();
+    }
+    return make_error(
+        "Requested device index " + llvm::Twine(DeviceIndex) +
+        " from host platform which only supports device index 0");
+  }
+
+private:
+  llvm::sys::Mutex Mutex;
+  // TheDevice is constructed from a pointer to ThePlatformDevice, so it is
+  // declared second and therefore destroyed first.
+  std::unique_ptr<HostPlatformDevice> ThePlatformDevice;
+  std::unique_ptr<Device> TheDevice;
+};
+
+} // namespace host
+} // namespace streamexecutor
+
+#endif // STREAMEXECUTOR_PLATFORMS_HOST_HOSTPLATFORM_H
diff --git a/parallel-libs/streamexecutor/include/streamexecutor/platforms/host/HostPlatformDevice.h b/parallel-libs/streamexecutor/include/streamexecutor/platforms/host/HostPlatformDevice.h
new file mode 100644
index 00000000000..e51552d1d2d
--- /dev/null
+++ b/parallel-libs/streamexecutor/include/streamexecutor/platforms/host/HostPlatformDevice.h
@@ -0,0 +1,151 @@
+//===-- HostPlatformDevice.h - HostPlatformDevice class ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Declaration of the HostPlatformDevice class.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef STREAMEXECUTOR_PLATFORMS_HOST_HOSTPLATFORMDEVICE_H
+#define STREAMEXECUTOR_PLATFORMS_HOST_HOSTPLATFORMDEVICE_H
+
+#include <cstdlib>
+#include <cstring>
+
+#include "streamexecutor/PlatformDevice.h"
+
+namespace streamexecutor {
+namespace host {
+
+/// A concrete PlatformDevice subclass that performs its work on the host rather
+/// than offloading to an accelerator.
+///
+/// Device memory is allocated with std::malloc, so "device" handles are plain
+/// host pointers and every copy is an ordinary memory copy done before the
+/// call returns. Kernels can only be launched with block and grid dimensions
+/// of (1,1,1).
+class HostPlatformDevice : public PlatformDevice {
+public:
+  std::string getName() const override { return "host"; }
+
+  /// Returns a handle to the host function stored in \p Spec, or an error if
+  /// the spec has no host function. The handle is the address of the function
+  /// object held by \p Spec, so the spec has to outlive every use of the
+  /// handle.
+  Expected<const void *>
+  createKernel(const MultiKernelLoaderSpec &Spec) override {
+    if (!Spec.hasHostFunction()) {
+      return make_error("no host implementation available for kernel " +
+                        Spec.getKernelName());
+    }
+    return static_cast<const void *>(&Spec.getHostFunction());
+  }
+
+  /// Nothing to release: the kernel handle is not owned by this device.
+  Error destroyKernel(const void *Handle) override { return Error::success(); }
+
+  /// Returns this device as the stream handle.
+  Expected<const void *> createStream() override {
+    // TODO(jhen): Do something with threads to allow multiple streams.
+    return this;
+  }
+
+  /// Nothing to release: the stream handle is the device itself.
+  Error destroyStream(const void *Handle) override { return Error::success(); }
+
+  /// Calls the host function behind \p PKernelHandle with the argument
+  /// addresses from \p ArgumentArray. Returns an error, without calling the
+  /// function, unless both \p BlockSize and \p GridSize are (1,1,1).
+  Error launch(const void *PlatformStreamHandle, BlockDimensions BlockSize,
+               GridDimensions GridSize, const void *PKernelHandle,
+               const PackedKernelArgumentArrayBase &ArgumentArray) override {
+    // TODO(jhen): Can we do something with BlockSize and GridSize?
+    if (!(BlockSize.X == 1 && BlockSize.Y == 1 && BlockSize.Z == 1)) {
+      return make_error(
+          "Block dimensions were (" + llvm::Twine(BlockSize.X) + "," +
+          llvm::Twine(BlockSize.Y) + "," + llvm::Twine(BlockSize.Z) +
+          "), but only size (1,1,1) is permitted for this platform");
+    }
+    if (!(GridSize.X == 1 && GridSize.Y == 1 && GridSize.Z == 1)) {
+      return make_error(
+          "Grid dimensions were (" + llvm::Twine(GridSize.X) + "," +
+          llvm::Twine(GridSize.Y) + "," + llvm::Twine(GridSize.Z) +
+          "), but only size (1,1,1) is permitted for this platform");
+    }
+
+    // NOTE(review): the handle comes from createKernel(), i.e. it is the
+    // address of the object returned by Spec.getHostFunction(). The type named
+    // in this cast must be exactly MultiKernelLoaderSpec::HostFunctionTy;
+    // calling through any other std::function type is undefined behaviour.
+    // Keep the two declarations in sync.
+    (*static_cast<const std::function<void(const void *const *)> *>(
+        PKernelHandle))(ArgumentArray.getAddresses());
+    return Error::success();
+  }
+
+  /// Copies \p ByteCount bytes from device memory to host memory. The copy is
+  /// done before returning; \p PlatformStreamHandle is not used.
+  Error copyD2H(const void *PlatformStreamHandle, const void *DeviceSrcHandle,
+                size_t SrcByteOffset, void *HostDst, size_t DstByteOffset,
+                size_t ByteCount) override {
+    std::memcpy(offset(HostDst, DstByteOffset),
+                offset(DeviceSrcHandle, SrcByteOffset), ByteCount);
+    return Error::success();
+  }
+
+  /// Copies \p ByteCount bytes from host memory to device memory. The copy is
+  /// done before returning; \p PlatformStreamHandle is not used.
+  Error copyH2D(const void *PlatformStreamHandle, const void *HostSrc,
+                size_t SrcByteOffset, const void *DeviceDstHandle,
+                size_t DstByteOffset, size_t ByteCount) override {
+    std::memcpy(offset(DeviceDstHandle, DstByteOffset),
+                offset(HostSrc, SrcByteOffset), ByteCount);
+    return Error::success();
+  }
+
+  /// Copies \p ByteCount bytes between two device memory locations. The copy
+  /// is done before returning; \p PlatformStreamHandle is not used.
+  Error copyD2D(const void *PlatformStreamHandle, const void *DeviceSrcHandle,
+                size_t SrcByteOffset, const void *DeviceDstHandle,
+                size_t DstByteOffset, size_t ByteCount) override {
+    // Source and destination may lie in the same allocation and overlap, which
+    // memcpy does not permit; memmove handles that case.
+    std::memmove(offset(DeviceDstHandle, DstByteOffset),
+                 offset(DeviceSrcHandle, SrcByteOffset), ByteCount);
+    return Error::success();
+  }
+
+  Error blockHostUntilDone(const void *PlatformStreamHandle) override {
+    // All host operations are synchronous anyway.
+    return Error::success();
+  }
+
+  /// Allocates \p ByteCount bytes with std::malloc. Returns an error if a
+  /// non-empty allocation fails. A zero-byte request returns whatever
+  /// std::malloc returns, which may be null.
+  Expected<void *> allocateDeviceMemory(size_t ByteCount) override {
+    void *Memory = std::malloc(ByteCount);
+    if (Memory == nullptr && ByteCount != 0) {
+      return make_error("failed to allocate " + llvm::Twine(ByteCount) +
+                        " bytes of host memory");
+    }
+    return Memory;
+  }
+
+  /// Releases memory obtained from allocateDeviceMemory.
+  Error freeDeviceMemory(const void *Handle) override {
+    std::free(const_cast<void *>(Handle));
+    return Error::success();
+  }
+
+  /// No-op that always succeeds.
+  Error registerHostMemory(void *Memory, size_t ByteCount) override {
+    return Error::success();
+  }
+
+  /// No-op that always succeeds.
+  Error unregisterHostMemory(const void *Memory) override {
+    return Error::success();
+  }
+
+  /// Copies \p ByteCount bytes from device memory to host memory.
+  Error synchronousCopyD2H(const void *DeviceSrcHandle, size_t SrcByteOffset,
+                           void *HostDst, size_t DstByteOffset,
+                           size_t ByteCount) override {
+    std::memcpy(offset(HostDst, DstByteOffset),
+                offset(DeviceSrcHandle, SrcByteOffset), ByteCount);
+    return Error::success();
+  }
+
+  /// Copies \p ByteCount bytes from host memory to device memory.
+  Error synchronousCopyH2D(const void *HostSrc, size_t SrcByteOffset,
+                           const void *DeviceDstHandle, size_t DstByteOffset,
+                           size_t ByteCount) override {
+    std::memcpy(offset(DeviceDstHandle, DstByteOffset),
+                offset(HostSrc, SrcByteOffset), ByteCount);
+    return Error::success();
+  }
+
+  /// Copies \p ByteCount bytes between two device memory locations.
+  Error synchronousCopyD2D(const void *DeviceSrcHandle, size_t SrcByteOffset,
+                           const void *DeviceDstHandle, size_t DstByteOffset,
+                           size_t ByteCount) override {
+    // The ranges may overlap, so use memmove rather than memcpy.
+    std::memmove(offset(DeviceDstHandle, DstByteOffset),
+                 offset(DeviceSrcHandle, SrcByteOffset), ByteCount);
+    return Error::success();
+  }
+
+private:
+  /// Returns \p Base advanced by \p Offset bytes, with constness removed so
+  /// the result can be used as a copy destination.
+  static void *offset(const void *Base, size_t Offset) {
+    return const_cast<char *>(static_cast<const char *>(Base) + Offset);
+  }
+};
+
+} // namespace host
+} // namespace streamexecutor
+
+#endif // STREAMEXECUTOR_PLATFORMS_HOST_HOSTPLATFORMDEVICE_H
diff --git a/parallel-libs/streamexecutor/lib/PlatformManager.cpp b/parallel-libs/streamexecutor/lib/PlatformManager.cpp
index 9cae5b1ea4b..7304cca755c 100644
--- a/parallel-libs/streamexecutor/lib/PlatformManager.cpp
+++ b/parallel-libs/streamexecutor/lib/PlatformManager.cpp
@@ -13,6 +13,7 @@
//===----------------------------------------------------------------------===//
#include "streamexecutor/PlatformManager.h"
+#include "streamexecutor/platforms/host/HostPlatform.h"
namespace streamexecutor {
@@ -23,6 +24,8 @@ PlatformManager::PlatformManager() {
// appropriate code to include here.
// * Use static initialization tricks to have platform libraries register
// themselves when they are loaded.
+
+ PlatformsByName.emplace("host", llvm::make_unique<host::HostPlatform>());
}
Expected<Platform *> PlatformManager::getPlatformByName(llvm::StringRef Name) {