aboutsummaryrefslogtreecommitdiff
path: root/parallel-libs
diff options
context:
space:
mode:
authorJason Henline <jhen@google.com>2016-09-14 19:58:34 +0000
committerJason Henline <jhen@google.com>2016-09-14 19:58:34 +0000
commit6bfc863d741a9419017a11060417aa0b71664f76 (patch)
tree42416854aaaf3d1a30d33ef6af03493f05f90565 /parallel-libs
parentd56a27e242d756457fb26cedfe665dde38893312 (diff)
[SE] Add CUDA platform
Summary: Basic CUDA platform implementation and cmake infrastructure to control whether it's used. A few important TODOs will be handled in later patches: * Log some error messages that can't easily be returned as Errors. * Cache modules and kernels to prevent reloading them if someone tries to reload a kernel that's already loaded. * Tolerate shared memory arguments for kernel launches. Reviewers: jlebar Subscribers: beanz, mgorny, jprice, jlebar, parallel_libs-commits Differential Revision: https://reviews.llvm.org/D24538 llvm-svn: 281524
Diffstat (limited to 'parallel-libs')
-rw-r--r--parallel-libs/streamexecutor/CMakeLists.txt5
-rw-r--r--parallel-libs/streamexecutor/include/streamexecutor/PlatformDevice.h40
-rw-r--r--parallel-libs/streamexecutor/include/streamexecutor/PlatformOptions.h.in23
-rw-r--r--parallel-libs/streamexecutor/include/streamexecutor/platforms/cuda/CUDAPlatform.h42
-rw-r--r--parallel-libs/streamexecutor/include/streamexecutor/platforms/cuda/CUDAPlatformDevice.h93
-rw-r--r--parallel-libs/streamexecutor/include/streamexecutor/platforms/host/HostPlatformDevice.h2
-rw-r--r--parallel-libs/streamexecutor/lib/CMakeLists.txt24
-rw-r--r--parallel-libs/streamexecutor/lib/PlatformManager.cpp10
-rw-r--r--parallel-libs/streamexecutor/lib/platforms/CMakeLists.txt3
-rw-r--r--parallel-libs/streamexecutor/lib/platforms/cuda/CMakeLists.txt5
-rw-r--r--parallel-libs/streamexecutor/lib/platforms/cuda/CUDAPlatform.cpp65
-rw-r--r--parallel-libs/streamexecutor/lib/platforms/cuda/CUDAPlatformDevice.cpp280
-rw-r--r--parallel-libs/streamexecutor/lib/platforms/cuda/cmake/modules/FindLibcuda.cmake21
13 files changed, 596 insertions, 17 deletions
diff --git a/parallel-libs/streamexecutor/CMakeLists.txt b/parallel-libs/streamexecutor/CMakeLists.txt
index 73934159cbf8..e5a2950438b0 100644
--- a/parallel-libs/streamexecutor/CMakeLists.txt
+++ b/parallel-libs/streamexecutor/CMakeLists.txt
@@ -3,9 +3,14 @@ cmake_minimum_required(VERSION 3.1)
option(STREAM_EXECUTOR_UNIT_TESTS "enable unit tests" ON)
option(STREAM_EXECUTOR_ENABLE_DOXYGEN "enable StreamExecutor doxygen" ON)
option(STREAM_EXECUTOR_ENABLE_CONFIG_TOOL "enable building streamexecutor-config tool" ON)
+option(STREAM_EXECUTOR_ENABLE_CUDA_PLATFORM "enable building the CUDA StreamExecutor platform" OFF)
+
+configure_file("include/streamexecutor/PlatformOptions.h.in" "include/streamexecutor/PlatformOptions.h")
# First find includes relative to the streamexecutor top-level source path.
include_directories(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/include)
+# Also look for configured headers in the top-level binary directory.
+include_directories(BEFORE ${CMAKE_CURRENT_BINARY_DIR}/include)
# If we are not building as part of LLVM, build StreamExecutor as a standalone
# project using LLVM as an external library:
diff --git a/parallel-libs/streamexecutor/include/streamexecutor/PlatformDevice.h b/parallel-libs/streamexecutor/include/streamexecutor/PlatformDevice.h
index d55680dd58e4..5b10e7067b12 100644
--- a/parallel-libs/streamexecutor/include/streamexecutor/PlatformDevice.h
+++ b/parallel-libs/streamexecutor/include/streamexecutor/PlatformDevice.h
@@ -37,25 +37,29 @@ public:
virtual std::string getName() const = 0;
+ virtual std::string getPlatformName() const = 0;
+
/// Creates a platform-specific kernel.
virtual Expected<const void *>
createKernel(const MultiKernelLoaderSpec &Spec) {
- return make_error("createKernel not implemented for platform " + getName());
+ return make_error("createKernel not implemented for platform " +
+ getPlatformName());
}
virtual Error destroyKernel(const void *Handle) {
return make_error("destroyKernel not implemented for platform " +
- getName());
+ getPlatformName());
}
/// Creates a platform-specific stream.
virtual Expected<const void *> createStream() {
- return make_error("createStream not implemented for platform " + getName());
+ return make_error("createStream not implemented for platform " +
+ getPlatformName());
}
virtual Error destroyStream(const void *Handle) {
return make_error("destroyStream not implemented for platform " +
- getName());
+ getPlatformName());
}
/// Launches a kernel on the given stream.
@@ -63,7 +67,8 @@ public:
BlockDimensions BlockSize, GridDimensions GridSize,
const void *PKernelHandle,
const PackedKernelArgumentArrayBase &ArgumentArray) {
- return make_error("launch not implemented for platform " + getName());
+ return make_error("launch not implemented for platform " +
+ getPlatformName());
}
/// Copies data from the device to the host.
@@ -72,7 +77,8 @@ public:
virtual Error copyD2H(const void *PlatformStreamHandle,
const void *DeviceSrcHandle, size_t SrcByteOffset,
void *HostDst, size_t DstByteOffset, size_t ByteCount) {
- return make_error("copyD2H not implemented for platform " + getName());
+ return make_error("copyD2H not implemented for platform " +
+ getPlatformName());
}
/// Copies data from the host to the device.
@@ -81,7 +87,8 @@ public:
virtual Error copyH2D(const void *PlatformStreamHandle, const void *HostSrc,
size_t SrcByteOffset, const void *DeviceDstHandle,
size_t DstByteOffset, size_t ByteCount) {
- return make_error("copyH2D not implemented for platform " + getName());
+ return make_error("copyH2D not implemented for platform " +
+ getPlatformName());
}
/// Copies data from one device location to another.
@@ -89,39 +96,40 @@ public:
const void *DeviceSrcHandle, size_t SrcByteOffset,
const void *DeviceDstHandle, size_t DstByteOffset,
size_t ByteCount) {
- return make_error("copyD2D not implemented for platform " + getName());
+ return make_error("copyD2D not implemented for platform " +
+ getPlatformName());
}
/// Blocks the host until the given stream completes all the work enqueued up
/// to the point this function is called.
virtual Error blockHostUntilDone(const void *PlatformStreamHandle) {
return make_error("blockHostUntilDone not implemented for platform " +
- getName());
+ getPlatformName());
}
/// Allocates untyped device memory of a given size in bytes.
virtual Expected<void *> allocateDeviceMemory(size_t ByteCount) {
return make_error("allocateDeviceMemory not implemented for platform " +
- getName());
+ getPlatformName());
}
/// Frees device memory previously allocated by allocateDeviceMemory.
virtual Error freeDeviceMemory(const void *Handle) {
return make_error("freeDeviceMemory not implemented for platform " +
- getName());
+ getPlatformName());
}
/// Registers previously allocated host memory so it can be used with copyH2D
/// and copyD2H.
virtual Error registerHostMemory(void *Memory, size_t ByteCount) {
return make_error("registerHostMemory not implemented for platform " +
- getName());
+ getPlatformName());
}
/// Unregisters host memory previously registered with registerHostMemory.
virtual Error unregisterHostMemory(const void *Memory) {
return make_error("unregisterHostMemory not implemented for platform " +
- getName());
+ getPlatformName());
}
/// Copies the given number of bytes from device memory to host memory.
@@ -133,7 +141,7 @@ public:
size_t SrcByteOffset, void *HostDst,
size_t DstByteOffset, size_t ByteCount) {
return make_error("synchronousCopyD2H not implemented for platform " +
- getName());
+ getPlatformName());
}
/// Similar to synchronousCopyD2H(const void *, size_t, void
@@ -143,7 +151,7 @@ public:
const void *DeviceDstHandle,
size_t DstByteOffset, size_t ByteCount) {
return make_error("synchronousCopyH2D not implemented for platform " +
- getName());
+ getPlatformName());
}
/// Similar to synchronousCopyD2H(const void *, size_t, void
@@ -154,7 +162,7 @@ public:
const void *DeviceDstHandle,
size_t DstByteOffset, size_t ByteCount) {
return make_error("synchronousCopyD2D not implemented for platform " +
- getName());
+ getPlatformName());
}
};
diff --git a/parallel-libs/streamexecutor/include/streamexecutor/PlatformOptions.h.in b/parallel-libs/streamexecutor/include/streamexecutor/PlatformOptions.h.in
new file mode 100644
index 000000000000..2934dd428ea4
--- /dev/null
+++ b/parallel-libs/streamexecutor/include/streamexecutor/PlatformOptions.h.in
@@ -0,0 +1,23 @@
+//===-- PlatformOptions.h - Platform option macros --------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// The contents of this file are filled in at configuration time. This file
+/// defines macros that represent the platform configuration state of the build,
+/// e.g. which platforms are enabled.
+///
+//===----------------------------------------------------------------------===//
+
+
+#ifndef STREAMEXECUTOR_PLATFORMOPTIONS_H
+#define STREAMEXECUTOR_PLATFORMOPTIONS_H
+
+#cmakedefine STREAM_EXECUTOR_ENABLE_CUDA_PLATFORM
+
+#endif // STREAMEXECUTOR_PLATFORMOPTIONS_H
diff --git a/parallel-libs/streamexecutor/include/streamexecutor/platforms/cuda/CUDAPlatform.h b/parallel-libs/streamexecutor/include/streamexecutor/platforms/cuda/CUDAPlatform.h
new file mode 100644
index 000000000000..cbcd29af8194
--- /dev/null
+++ b/parallel-libs/streamexecutor/include/streamexecutor/platforms/cuda/CUDAPlatform.h
@@ -0,0 +1,42 @@
+//===-- CUDAPlatform.h - CUDA platform subclass -----------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Declaration of the CUDAPlatform class.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef STREAMEXECUTOR_PLATFORMS_CUDA_CUDAPLATFORM_H
+#define STREAMEXECUTOR_PLATFORMS_CUDA_CUDAPLATFORM_H
+
+#include "streamexecutor/Platform.h"
+#include "streamexecutor/platforms/cuda/CUDAPlatformDevice.h"
+
+#include "llvm/Support/Mutex.h"
+
+#include <map>
+
+namespace streamexecutor {
+namespace cuda {
+
+class CUDAPlatform : public Platform {
+public:
+ size_t getDeviceCount() const override;
+
+ Expected<Device> getDevice(size_t DeviceIndex) override;
+
+private:
+ llvm::sys::Mutex Mutex;
+ std::map<size_t, CUDAPlatformDevice> PlatformDevices;
+};
+
+} // namespace cuda
+} // namespace streamexecutor
+
+#endif // STREAMEXECUTOR_PLATFORMS_CUDA_CUDAPLATFORM_H
diff --git a/parallel-libs/streamexecutor/include/streamexecutor/platforms/cuda/CUDAPlatformDevice.h b/parallel-libs/streamexecutor/include/streamexecutor/platforms/cuda/CUDAPlatformDevice.h
new file mode 100644
index 000000000000..b7c329851369
--- /dev/null
+++ b/parallel-libs/streamexecutor/include/streamexecutor/platforms/cuda/CUDAPlatformDevice.h
@@ -0,0 +1,93 @@
+//===-- CUDAPlatformDevice.h - CUDAPlatformDevice class ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Declaration of the CUDAPlatformDevice class.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef STREAMEXECUTOR_PLATFORMS_CUDA_CUDAPLATFORMDEVICE_H
+#define STREAMEXECUTOR_PLATFORMS_CUDA_CUDAPLATFORMDEVICE_H
+
+#include "streamexecutor/PlatformDevice.h"
+
+namespace streamexecutor {
+namespace cuda {
+
+Error CUresultToError(int CUResult, const llvm::Twine &Message);
+
+class CUDAPlatformDevice : public PlatformDevice {
+public:
+ static Expected<CUDAPlatformDevice> create(size_t DeviceIndex);
+
+ CUDAPlatformDevice(const CUDAPlatformDevice &) = delete;
+ CUDAPlatformDevice &operator=(const CUDAPlatformDevice &) = delete;
+
+ CUDAPlatformDevice(CUDAPlatformDevice &&) noexcept;
+ CUDAPlatformDevice &operator=(CUDAPlatformDevice &&) noexcept;
+
+ ~CUDAPlatformDevice() override;
+
+ std::string getName() const override;
+
+ std::string getPlatformName() const override { return "CUDA"; }
+
+ Expected<const void *>
+ createKernel(const MultiKernelLoaderSpec &Spec) override;
+ Error destroyKernel(const void *Handle) override;
+
+ Expected<const void *> createStream() override;
+ Error destroyStream(const void *Handle) override;
+
+ Error launch(const void *PlatformStreamHandle, BlockDimensions BlockSize,
+ GridDimensions GridSize, const void *PKernelHandle,
+ const PackedKernelArgumentArrayBase &ArgumentArray) override;
+
+ Error copyD2H(const void *PlatformStreamHandle, const void *DeviceSrcHandle,
+ size_t SrcByteOffset, void *HostDst, size_t DstByteOffset,
+ size_t ByteCount) override;
+
+ Error copyH2D(const void *PlatformStreamHandle, const void *HostSrc,
+ size_t SrcByteOffset, const void *DeviceDstHandle,
+ size_t DstByteOffset, size_t ByteCount) override;
+
+ Error copyD2D(const void *PlatformStreamHandle, const void *DeviceSrcHandle,
+ size_t SrcByteOffset, const void *DeviceDstHandle,
+ size_t DstByteOffset, size_t ByteCount) override;
+
+ Error blockHostUntilDone(const void *PlatformStreamHandle) override;
+
+ Expected<void *> allocateDeviceMemory(size_t ByteCount) override;
+ Error freeDeviceMemory(const void *Handle) override;
+
+ Error registerHostMemory(void *Memory, size_t ByteCount) override;
+ Error unregisterHostMemory(const void *Memory) override;
+
+ Error synchronousCopyD2H(const void *DeviceSrcHandle, size_t SrcByteOffset,
+ void *HostDst, size_t DstByteOffset,
+ size_t ByteCount) override;
+
+ Error synchronousCopyH2D(const void *HostSrc, size_t SrcByteOffset,
+ const void *DeviceDstHandle, size_t DstByteOffset,
+ size_t ByteCount) override;
+
+ Error synchronousCopyD2D(const void *DeviceDstHandle, size_t DstByteOffset,
+ const void *DeviceSrcHandle, size_t SrcByteOffset,
+ size_t ByteCount) override;
+
+private:
+ CUDAPlatformDevice(size_t DeviceIndex) : DeviceIndex(DeviceIndex) {}
+
+ int DeviceIndex;
+};
+
+} // namespace cuda
+} // namespace streamexecutor
+
+#endif // STREAMEXECUTOR_PLATFORMS_CUDA_CUDAPLATFORMDEVICE_H
diff --git a/parallel-libs/streamexecutor/include/streamexecutor/platforms/host/HostPlatformDevice.h b/parallel-libs/streamexecutor/include/streamexecutor/platforms/host/HostPlatformDevice.h
index 9331fb0b88df..d66557564503 100644
--- a/parallel-libs/streamexecutor/include/streamexecutor/platforms/host/HostPlatformDevice.h
+++ b/parallel-libs/streamexecutor/include/streamexecutor/platforms/host/HostPlatformDevice.h
@@ -29,6 +29,8 @@ class HostPlatformDevice : public PlatformDevice {
public:
std::string getName() const override { return "host"; }
+ std::string getPlatformName() const override { return "host"; }
+
Expected<const void *>
createKernel(const MultiKernelLoaderSpec &Spec) override {
if (!Spec.hasHostFunction()) {
diff --git a/parallel-libs/streamexecutor/lib/CMakeLists.txt b/parallel-libs/streamexecutor/lib/CMakeLists.txt
index fb3c04827626..4209c78ab6ff 100644
--- a/parallel-libs/streamexecutor/lib/CMakeLists.txt
+++ b/parallel-libs/streamexecutor/lib/CMakeLists.txt
@@ -3,6 +3,26 @@ macro(add_se_library name)
set_target_properties(${name} PROPERTIES FOLDER "streamexecutor libraries")
endmacro(add_se_library)
+if(STREAM_EXECUTOR_ENABLE_CUDA_PLATFORM)
+ set(
+ CMAKE_MODULE_PATH
+ ${CMAKE_MODULE_PATH}
+ "${CMAKE_CURRENT_SOURCE_DIR}/platforms/cuda/cmake/modules/")
+
+ find_package(Libcuda REQUIRED)
+ include_directories(${LIBCUDA_INCLUDE_DIRS})
+
+ set(
+ STREAM_EXECUTOR_CUDA_PLATFORM_TARGET_OBJECT
+ $<TARGET_OBJECTS:streamexecutor_cuda_platform>)
+
+ set(
+ STREAM_EXECUTOR_LIBCUDA_LIBRARIES
+ ${LIBCUDA_LIBRARIES})
+endif(STREAM_EXECUTOR_ENABLE_CUDA_PLATFORM)
+
+add_subdirectory(platforms)
+
add_se_library(
streamexecutor
Device.cpp
@@ -16,6 +36,8 @@ add_se_library(
PlatformDevice.cpp
PlatformManager.cpp
Stream.cpp
- )
+ ${STREAM_EXECUTOR_CUDA_PLATFORM_TARGET_OBJECT}
+ LINK_LIBS
+ ${STREAM_EXECUTOR_LIBCUDA_LIBRARIES})
install(TARGETS streamexecutor DESTINATION lib)
diff --git a/parallel-libs/streamexecutor/lib/PlatformManager.cpp b/parallel-libs/streamexecutor/lib/PlatformManager.cpp
index 7304cca755c4..8f44befbd718 100644
--- a/parallel-libs/streamexecutor/lib/PlatformManager.cpp
+++ b/parallel-libs/streamexecutor/lib/PlatformManager.cpp
@@ -13,8 +13,14 @@
//===----------------------------------------------------------------------===//
#include "streamexecutor/PlatformManager.h"
+
+#include "streamexecutor/PlatformOptions.h"
#include "streamexecutor/platforms/host/HostPlatform.h"
+#ifdef STREAM_EXECUTOR_ENABLE_CUDA_PLATFORM
+#include "streamexecutor/platforms/cuda/CUDAPlatform.h"
+#endif
+
namespace streamexecutor {
PlatformManager::PlatformManager() {
@@ -26,6 +32,10 @@ PlatformManager::PlatformManager() {
// themselves when they are loaded.
PlatformsByName.emplace("host", llvm::make_unique<host::HostPlatform>());
+
+#ifdef STREAM_EXECUTOR_ENABLE_CUDA_PLATFORM
+ PlatformsByName.emplace("cuda", llvm::make_unique<cuda::CUDAPlatform>());
+#endif
}
Expected<Platform *> PlatformManager::getPlatformByName(llvm::StringRef Name) {
diff --git a/parallel-libs/streamexecutor/lib/platforms/CMakeLists.txt b/parallel-libs/streamexecutor/lib/platforms/CMakeLists.txt
new file mode 100644
index 000000000000..0802c059addc
--- /dev/null
+++ b/parallel-libs/streamexecutor/lib/platforms/CMakeLists.txt
@@ -0,0 +1,3 @@
+if(STREAM_EXECUTOR_ENABLE_CUDA_PLATFORM)
+ add_subdirectory(cuda)
+endif()
diff --git a/parallel-libs/streamexecutor/lib/platforms/cuda/CMakeLists.txt b/parallel-libs/streamexecutor/lib/platforms/cuda/CMakeLists.txt
new file mode 100644
index 000000000000..5be76d1c75aa
--- /dev/null
+++ b/parallel-libs/streamexecutor/lib/platforms/cuda/CMakeLists.txt
@@ -0,0 +1,5 @@
+add_library(
+ streamexecutor_cuda_platform
+ OBJECT
+ CUDAPlatform.cpp
+ CUDAPlatformDevice.cpp)
diff --git a/parallel-libs/streamexecutor/lib/platforms/cuda/CUDAPlatform.cpp b/parallel-libs/streamexecutor/lib/platforms/cuda/CUDAPlatform.cpp
new file mode 100644
index 000000000000..9f9e43886477
--- /dev/null
+++ b/parallel-libs/streamexecutor/lib/platforms/cuda/CUDAPlatform.cpp
@@ -0,0 +1,65 @@
+//===-- CUDAPlatform.cpp - CUDA platform implementation -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Implementation of CUDA platform internals.
+///
+//===----------------------------------------------------------------------===//
+
+#include "streamexecutor/platforms/cuda/CUDAPlatform.h"
+#include "streamexecutor/Device.h"
+#include "streamexecutor/Platform.h"
+#include "streamexecutor/platforms/cuda/CUDAPlatformDevice.h"
+
+#include "llvm/Support/Mutex.h"
+
+#include "cuda.h"
+
+#include <map>
+
+namespace streamexecutor {
+namespace cuda {
+
+static CUresult ensureCUDAInitialized() {
+ static CUresult InitResult = []() { return cuInit(0); }();
+ return InitResult;
+}
+
+size_t CUDAPlatform::getDeviceCount() const {
+ if (ensureCUDAInitialized())
+ // TODO(jhen): Log an error.
+ return 0;
+
+ int DeviceCount = 0;
+ CUresult Result = cuDeviceGetCount(&DeviceCount);
+ (void)Result;
+ // TODO(jhen): Log an error.
+
+ return DeviceCount;
+}
+
+Expected<Device> CUDAPlatform::getDevice(size_t DeviceIndex) {
+ if (CUresult InitResult = ensureCUDAInitialized())
+ return CUresultToError(InitResult, "cached cuInit return value");
+
+ llvm::sys::ScopedLock Lock(Mutex);
+ auto Iterator = PlatformDevices.find(DeviceIndex);
+ if (Iterator == PlatformDevices.end()) {
+ if (auto MaybePDevice = CUDAPlatformDevice::create(DeviceIndex)) {
+ Iterator =
+ PlatformDevices.emplace(DeviceIndex, std::move(*MaybePDevice)).first;
+ } else {
+ return MaybePDevice.takeError();
+ }
+ }
+ return Device(&Iterator->second);
+}
+
+} // namespace cuda
+} // namespace streamexecutor
diff --git a/parallel-libs/streamexecutor/lib/platforms/cuda/CUDAPlatformDevice.cpp b/parallel-libs/streamexecutor/lib/platforms/cuda/CUDAPlatformDevice.cpp
new file mode 100644
index 000000000000..96aeafa0d719
--- /dev/null
+++ b/parallel-libs/streamexecutor/lib/platforms/cuda/CUDAPlatformDevice.cpp
@@ -0,0 +1,280 @@
+//===-- CUDAPlatformDevice.cpp - CUDAPlatformDevice implementation --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Implementation of CUDAPlatformDevice.
+///
+//===----------------------------------------------------------------------===//
+
+#include "streamexecutor/platforms/cuda/CUDAPlatformDevice.h"
+#include "streamexecutor/PlatformDevice.h"
+
+#include "cuda.h"
+
+namespace streamexecutor {
+namespace cuda {
+
+static void *offset(const void *Base, size_t Offset) {
+ return const_cast<char *>(static_cast<const char *>(Base) + Offset);
+}
+
+Error CUresultToError(int CUResult, const llvm::Twine &Message) {
+ CUresult Result = static_cast<CUresult>(CUResult);
+ if (Result) {
+ const char *ErrorName;
+ if (cuGetErrorName(Result, &ErrorName))
+ ErrorName = "UNKNOWN ERROR NAME";
+ const char *ErrorString;
+ if (cuGetErrorString(Result, &ErrorString))
+ ErrorString = "UNKNOWN ERROR DESCRIPTION";
+ return make_error("CUDA driver error: '" + Message + "', error code = " +
+ llvm::Twine(static_cast<int>(Result)) + ", name = " +
+ ErrorName + ", description = '" + ErrorString + "'");
+ } else
+ return Error::success();
+}
+
+std::string CUDAPlatformDevice::getName() const {
+  // Builds a human-readable name like "CUDA device 0: <driver name>".
+  //
+  // Note: do NOT cache this in a function-local static. A static cache is
+  // shared by every CUDAPlatformDevice instance, so all devices would report
+  // the name of whichever device index happened to call getName() first.
+  static constexpr size_t MAX_DRIVER_NAME_BYTES = 1024;
+  std::string Name = "CUDA device " + std::to_string(DeviceIndex);
+  char NameFromDriver[MAX_DRIVER_NAME_BYTES];
+  // Reserve one byte so we can guarantee null-termination below.
+  if (!cuDeviceGetName(NameFromDriver, MAX_DRIVER_NAME_BYTES - 1,
+                       DeviceIndex)) {
+    NameFromDriver[MAX_DRIVER_NAME_BYTES - 1] = '\0';
+    Name.append(": ").append(NameFromDriver);
+  }
+  return Name;
+}
+
+Expected<CUDAPlatformDevice> CUDAPlatformDevice::create(size_t DeviceIndex) {
+ CUdevice DeviceHandle;
+ if (CUresult Result = cuDeviceGet(&DeviceHandle, DeviceIndex))
+ return CUresultToError(Result, "cuDeviceGet");
+
+ CUcontext ContextHandle;
+ if (CUresult Result = cuDevicePrimaryCtxRetain(&ContextHandle, DeviceHandle))
+ return CUresultToError(Result, "cuDevicePrimaryCtxRetain");
+
+ if (CUresult Result = cuCtxSetCurrent(ContextHandle))
+ return CUresultToError(Result, "cuCtxSetCurrent");
+
+ return CUDAPlatformDevice(DeviceIndex);
+}
+
+CUDAPlatformDevice::CUDAPlatformDevice(CUDAPlatformDevice &&Other) noexcept
+ : DeviceIndex(Other.DeviceIndex) {
+ Other.DeviceIndex = -1;
+}
+
+CUDAPlatformDevice &CUDAPlatformDevice::
+operator=(CUDAPlatformDevice &&Other) noexcept {
+ DeviceIndex = Other.DeviceIndex;
+ Other.DeviceIndex = -1;
+ return *this;
+}
+
+CUDAPlatformDevice::~CUDAPlatformDevice() {
+  // A moved-from device has DeviceIndex == -1 and no longer owns the primary
+  // context reference (see the move constructor/assignment), so skip the
+  // release to avoid a spurious cuDevicePrimaryCtxRelease failure.
+  if (DeviceIndex < 0)
+    return;
+  CUresult Result = cuDevicePrimaryCtxRelease(DeviceIndex);
+  (void)Result;
+  // TODO(jhen): Log error.
+}
+
+Expected<const void *>
+CUDAPlatformDevice::createKernel(const MultiKernelLoaderSpec &Spec) {
+  // Loads the PTX from Spec matching this device's compute capability and
+  // returns an opaque handle to the resulting CUfunction.
+  // TODO(jhen): Maybe first check loaded modules?
+  if (!Spec.hasCUDAPTXInMemory())
+    return make_error("no CUDA code available to create kernel");
+
+  CUdevice Device = static_cast<int>(DeviceIndex);
+  int ComputeCapabilityMajor = 0;
+  int ComputeCapabilityMinor = 0;
+  if (CUresult Result = cuDeviceGetAttribute(
+          &ComputeCapabilityMajor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
+          Device))
+    return CUresultToError(
+        Result,
+        "cuDeviceGetAttribute CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR");
+  // Query the MINOR attribute here. (The original patch queried MAJOR twice,
+  // leaving ComputeCapabilityMinor as the major version, which could select
+  // PTX for the wrong compute capability.)
+  if (CUresult Result = cuDeviceGetAttribute(
+          &ComputeCapabilityMinor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
+          Device))
+    return CUresultToError(
+        Result,
+        "cuDeviceGetAttribute CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR");
+  const char *Code = Spec.getCUDAPTXInMemory().getCode(ComputeCapabilityMajor,
+                                                       ComputeCapabilityMinor);
+
+  if (!Code)
+    return make_error("no suitable CUDA source found for compute capability " +
+                      llvm::Twine(ComputeCapabilityMajor) + "." +
+                      llvm::Twine(ComputeCapabilityMinor));
+
+  CUmodule Module;
+  if (CUresult Result = cuModuleLoadData(&Module, Code))
+    return CUresultToError(Result, "cuModuleLoadData");
+
+  CUfunction Function;
+  if (CUresult Result =
+          cuModuleGetFunction(&Function, Module, Spec.getKernelName().c_str()))
+    return CUresultToError(Result, "cuModuleGetFunction");
+
+  // TODO(jhen): Should I save this function pointer in case someone asks for
+  // it again?
+
+  // TODO(jhen): Should I save the module pointer so I can unload it when I
+  // destroy this device?
+
+  return static_cast<const void *>(Function);
+}
+
+Error CUDAPlatformDevice::destroyKernel(const void *Handle) {
+ // TODO(jhen): Maybe keep track of kernels for each module and unload the
+ // module after they are all destroyed.
+ return Error::success();
+}
+
+Expected<const void *> CUDAPlatformDevice::createStream() {
+ CUstream Stream;
+ if (CUresult Result = cuStreamCreate(&Stream, CU_STREAM_DEFAULT))
+ return CUresultToError(Result, "cuStreamCreate");
+ return Stream;
+}
+
+Error CUDAPlatformDevice::destroyStream(const void *Handle) {
+ return CUresultToError(
+ cuStreamDestroy(static_cast<CUstream>(const_cast<void *>(Handle))),
+ "cuStreamDestroy");
+}
+
+Error CUDAPlatformDevice::launch(
+ const void *PlatformStreamHandle, BlockDimensions BlockSize,
+ GridDimensions GridSize, const void *PKernelHandle,
+ const PackedKernelArgumentArrayBase &ArgumentArray) {
+ CUfunction Function =
+ reinterpret_cast<CUfunction>(const_cast<void *>(PKernelHandle));
+ CUstream Stream =
+ reinterpret_cast<CUstream>(const_cast<void *>(PlatformStreamHandle));
+ // TODO(jhen): Deal with shared memory arguments.
+ unsigned SharedMemoryBytes = 0;
+ void **ArgumentAddresses = const_cast<void **>(ArgumentArray.getAddresses());
+ return CUresultToError(cuLaunchKernel(Function, GridSize.X, GridSize.Y,
+ GridSize.Z, BlockSize.X, BlockSize.Y,
+ BlockSize.Z, SharedMemoryBytes, Stream,
+ ArgumentAddresses, nullptr),
+ "cuLaunchKernel");
+}
+
+Error CUDAPlatformDevice::copyD2H(const void *PlatformStreamHandle,
+ const void *DeviceSrcHandle,
+ size_t SrcByteOffset, void *HostDst,
+ size_t DstByteOffset, size_t ByteCount) {
+ return CUresultToError(
+ cuMemcpyDtoHAsync(
+ offset(HostDst, DstByteOffset),
+ reinterpret_cast<CUdeviceptr>(offset(DeviceSrcHandle, SrcByteOffset)),
+ ByteCount,
+ static_cast<CUstream>(const_cast<void *>(PlatformStreamHandle))),
+ "cuMemcpyDtoHAsync");
+}
+
+Error CUDAPlatformDevice::copyH2D(const void *PlatformStreamHandle,
+ const void *HostSrc, size_t SrcByteOffset,
+ const void *DeviceDstHandle,
+ size_t DstByteOffset, size_t ByteCount) {
+ return CUresultToError(
+ cuMemcpyHtoDAsync(
+ reinterpret_cast<CUdeviceptr>(offset(DeviceDstHandle, DstByteOffset)),
+ offset(HostSrc, SrcByteOffset), ByteCount,
+ static_cast<CUstream>(const_cast<void *>(PlatformStreamHandle))),
+ "cuMemcpyHtoDAsync");
+}
+
+Error CUDAPlatformDevice::copyD2D(const void *PlatformStreamHandle,
+ const void *DeviceSrcHandle,
+ size_t SrcByteOffset,
+ const void *DeviceDstHandle,
+ size_t DstByteOffset, size_t ByteCount) {
+ return CUresultToError(
+ cuMemcpyDtoDAsync(
+ reinterpret_cast<CUdeviceptr>(offset(DeviceDstHandle, DstByteOffset)),
+ reinterpret_cast<CUdeviceptr>(offset(DeviceSrcHandle, SrcByteOffset)),
+ ByteCount,
+ static_cast<CUstream>(const_cast<void *>(PlatformStreamHandle))),
+ "cuMemcpyDtoDAsync");
+}
+
+Error CUDAPlatformDevice::blockHostUntilDone(const void *PlatformStreamHandle) {
+ return CUresultToError(cuStreamSynchronize(static_cast<CUstream>(
+ const_cast<void *>(PlatformStreamHandle))),
+ "cuStreamSynchronize");
+}
+
+Expected<void *> CUDAPlatformDevice::allocateDeviceMemory(size_t ByteCount) {
+ CUdeviceptr Pointer;
+ if (CUresult Result = cuMemAlloc(&Pointer, ByteCount))
+ return CUresultToError(Result, "cuMemAlloc");
+ return reinterpret_cast<void *>(Pointer);
+}
+
+Error CUDAPlatformDevice::freeDeviceMemory(const void *Handle) {
+ return CUresultToError(cuMemFree(reinterpret_cast<CUdeviceptr>(Handle)),
+ "cuMemFree");
+}
+
+Error CUDAPlatformDevice::registerHostMemory(void *Memory, size_t ByteCount) {
+ return CUresultToError(cuMemHostRegister(Memory, ByteCount, 0u),
+ "cuMemHostRegister");
+}
+
+Error CUDAPlatformDevice::unregisterHostMemory(const void *Memory) {
+ return CUresultToError(cuMemHostUnregister(const_cast<void *>(Memory)),
+ "cuMemHostUnregister");
+}
+
+Error CUDAPlatformDevice::synchronousCopyD2H(const void *DeviceSrcHandle,
+ size_t SrcByteOffset,
+ void *HostDst,
+ size_t DstByteOffset,
+ size_t ByteCount) {
+ return CUresultToError(cuMemcpyDtoH(offset(HostDst, DstByteOffset),
+ reinterpret_cast<CUdeviceptr>(offset(
+ DeviceSrcHandle, SrcByteOffset)),
+ ByteCount),
+ "cuMemcpyDtoH");
+}
+
+Error CUDAPlatformDevice::synchronousCopyH2D(const void *HostSrc,
+ size_t SrcByteOffset,
+ const void *DeviceDstHandle,
+ size_t DstByteOffset,
+ size_t ByteCount) {
+ return CUresultToError(
+ cuMemcpyHtoD(
+ reinterpret_cast<CUdeviceptr>(offset(DeviceDstHandle, DstByteOffset)),
+ offset(HostSrc, SrcByteOffset), ByteCount),
+ "cuMemcpyHtoD");
+}
+
+Error CUDAPlatformDevice::synchronousCopyD2D(const void *DeviceDstHandle,
+ size_t DstByteOffset,
+ const void *DeviceSrcHandle,
+ size_t SrcByteOffset,
+ size_t ByteCount) {
+ return CUresultToError(
+ cuMemcpyDtoD(
+ reinterpret_cast<CUdeviceptr>(offset(DeviceDstHandle, DstByteOffset)),
+ reinterpret_cast<CUdeviceptr>(offset(DeviceSrcHandle, SrcByteOffset)),
+ ByteCount),
+ "cuMemcpyDtoD");
+}
+
+} // namespace cuda
+} // namespace streamexecutor
diff --git a/parallel-libs/streamexecutor/lib/platforms/cuda/cmake/modules/FindLibcuda.cmake b/parallel-libs/streamexecutor/lib/platforms/cuda/cmake/modules/FindLibcuda.cmake
new file mode 100644
index 000000000000..6572ac09caa2
--- /dev/null
+++ b/parallel-libs/streamexecutor/lib/platforms/cuda/cmake/modules/FindLibcuda.cmake
@@ -0,0 +1,21 @@
+# - Try to find the libcuda library
+# Once done this will define
+# LIBCUDA_FOUND - System has libcuda
+# LIBCUDA_INCLUDE_DIRS - The libcuda include directories
+# LIBCUDA_LIBRARIES - The libraries needed to use libcuda
+
+# TODO(jhen): Allow users to specify a search path.
+find_path(LIBCUDA_INCLUDE_DIR cuda.h /usr/local/cuda/include)
+# TODO(jhen): Use the library that goes with the headers.
+find_library(LIBCUDA_LIBRARY cuda)
+
+include(FindPackageHandleStandardArgs)
+# handle the QUIETLY and REQUIRED arguments and set LIBCUDA_FOUND to TRUE if
+# all listed variables are TRUE
+find_package_handle_standard_args(
+ LIBCUDA DEFAULT_MSG LIBCUDA_INCLUDE_DIR LIBCUDA_LIBRARY)
+
+mark_as_advanced(LIBCUDA_INCLUDE_DIR LIBCUDA_LIBRARY)
+
+set(LIBCUDA_LIBRARIES ${LIBCUDA_LIBRARY})
+set(LIBCUDA_INCLUDE_DIRS ${LIBCUDA_INCLUDE_DIR})