summary | refs | log | tree | commit | diff
path: root/parallel-libs
diff options
context:
space:
mode:
author: Jason Henline <jhen@google.com> 2016-08-31 19:02:44 +0000
committer: Jason Henline <jhen@google.com> 2016-08-31 19:02:44 +0000
commit: ca587b64bec220f2cd21470beea4a1d4d7160ea5 (patch)
tree: 93891f08b98af06af571ab17426f5426c9678c0a /parallel-libs
parent: 4eafaaeaf7d92e4236dd6ce71ea254d3fe54c69e (diff)
[StreamExecutor] Add Doxygen main page
Reviewers: jlebar
Subscribers: jprice, parallel_libs-commits
Differential Revision: https://reviews.llvm.org/D24066
Diffstat (limited to 'parallel-libs')
-rw-r--r-- parallel-libs/streamexecutor/CMakeLists.txt | 1
-rw-r--r-- parallel-libs/streamexecutor/Doxyfile.in | 4
-rw-r--r-- parallel-libs/streamexecutor/examples/CMakeLists.txt | 2
-rw-r--r-- parallel-libs/streamexecutor/examples/Example.cpp | 163
-rw-r--r-- parallel-libs/streamexecutor/include/streamexecutor/Kernel.h | 2
-rw-r--r-- parallel-libs/streamexecutor/include/streamexecutor/KernelSpec.h | 5
-rw-r--r-- parallel-libs/streamexecutor/include/streamexecutor/StreamExecutor.h | 71
7 files changed, 243 insertions, 5 deletions
diff --git a/parallel-libs/streamexecutor/CMakeLists.txt b/parallel-libs/streamexecutor/CMakeLists.txt
index bb440f00966..3fd9af404cc 100644
--- a/parallel-libs/streamexecutor/CMakeLists.txt
+++ b/parallel-libs/streamexecutor/CMakeLists.txt
@@ -62,6 +62,7 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wno-unused-parameter")
add_subdirectory(lib)
+add_subdirectory(examples)
if (STREAM_EXECUTOR_ENABLE_DOXYGEN)
find_package(Doxygen REQUIRED)
diff --git a/parallel-libs/streamexecutor/Doxyfile.in b/parallel-libs/streamexecutor/Doxyfile.in
index 6b5fe3c742e..931108de039 100644
--- a/parallel-libs/streamexecutor/Doxyfile.in
+++ b/parallel-libs/streamexecutor/Doxyfile.in
@@ -398,7 +398,7 @@ LOOKUP_CACHE_SIZE = 0
# normally produced when WARNINGS is set to YES.
# The default value is: NO.
-EXTRACT_ALL = NO
+EXTRACT_ALL = YES
# If the EXTRACT_PRIVATE tag is set to YES all private members of a class will
# be included in the documentation.
@@ -811,7 +811,7 @@ EXCLUDE_SYMBOLS =
# that contain example code fragments that are included (see the \include
# command).
-EXAMPLE_PATH =
+EXAMPLE_PATH = @CMAKE_CURRENT_SOURCE_DIR@
# If the value of the EXAMPLE_PATH tag contains directories, you can use the
# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and
diff --git a/parallel-libs/streamexecutor/examples/CMakeLists.txt b/parallel-libs/streamexecutor/examples/CMakeLists.txt
new file mode 100644
index 00000000000..a5220526a2d
--- /dev/null
+++ b/parallel-libs/streamexecutor/examples/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_executable(example Example.cpp)
+target_link_libraries(example streamexecutor)
diff --git a/parallel-libs/streamexecutor/examples/Example.cpp b/parallel-libs/streamexecutor/examples/Example.cpp
new file mode 100644
index 00000000000..5b0befc91be
--- /dev/null
+++ b/parallel-libs/streamexecutor/examples/Example.cpp
@@ -0,0 +1,163 @@
+//===-- Example.cpp - Example code for documentation ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains example code demonstrating the usage of the
+/// StreamExecutor API. Snippets of this file will be included as code examples
+/// in documentation. Taking these examples from a real source file guarantees
+/// that the examples will always compile.
+///
+//===----------------------------------------------------------------------===//
+
+#include <cassert>
+#include <cstdio>
+#include <cstdlib>
+#include <memory>
+#include <vector>
+
+#include "streamexecutor/StreamExecutor.h"
+
+/// [Example saxpy host helper functions]
+// Example handler for streamexecutor::Expected: yields the value or exits.
+template <typename T> T getOrDie(streamexecutor::Expected<T> &&E) {
+ if (!E) { // No value available: report the error on stderr and terminate.
+ std::fprintf(stderr, "Error extracting an expected value: %s.\n",
+ streamexecutor::consumeAndGetMessage(E.takeError()).c_str());
+ std::exit(EXIT_FAILURE);
+ }
+ return std::move(*E); // Move the contained value out to the caller.
+}
+
+// Example handler for streamexecutor::Error: exits if E reports a failure.
+void check(streamexecutor::Error &&E) {
+ if (E) { // Taken on failure; the message is printed before exiting.
+ std::fprintf(stderr, "Error encountered: %s.\n",
+ streamexecutor::consumeAndGetMessage(std::move(E)).c_str());
+ std::exit(EXIT_FAILURE);
+ }
+}
+/// [Example saxpy host helper functions]
+
+/// [Example saxpy compiler-generated]
+// Code in this namespace is generated by the compiler (e.g. clang).
+//
+// The name of this namespace may depend on the compiler that generated it, so
+// this is just an example name.
+namespace __compilergen {
+
+// Specialization of the streamexecutor::Kernel template class for the parameter
+// types of the saxpy(float A, float *X, float *Y) kernel.
+using SaxpyKernel =
+ streamexecutor::Kernel<float, streamexecutor::GlobalDeviceMemory<float>,
+ streamexecutor::GlobalDeviceMemory<float>>;
+
+// The PTX code generated by the device compiler for the saxpy kernel. It is
+// only declared here; the string itself is defined outside this namespace.
+extern const char *SaxpyPTX;
+
+// A global loader spec, built once by an immediately-invoked lambda, that
+// registers the SaxpyPTX string under the kernel name "saxpy".
+static streamexecutor::MultiKernelLoaderSpec SaxpyLoaderSpec = []() {
+ streamexecutor::MultiKernelLoaderSpec Spec;
+ Spec.addCUDAPTXInMemory("saxpy", {{{2, 0}, SaxpyPTX}});
+ return Spec;
+}();
+
+} // namespace __compilergen
+/// [Example saxpy compiler-generated]
+
+/// [Example saxpy host PTX]
+const char *__compilergen::SaxpyPTX = R"(
+ .version 4.3
+ .target sm_20
+ .address_size 64
+
+ .visible .entry saxpy(.param .f32 A, .param .u64 X, .param .u64 Y) {
+ .reg .f32 %AValue;
+ .reg .f32 %XValue;
+ .reg .f32 %YValue;
+ .reg .f32 %Result;
+
+ .reg .b64 %XBaseAddrGeneric;
+ .reg .b64 %YBaseAddrGeneric;
+ .reg .b64 %XBaseAddrGlobal;
+ .reg .b64 %YBaseAddrGlobal;
+ .reg .b64 %XAddr;
+ .reg .b64 %YAddr;
+ .reg .b64 %ThreadByteOffset;
+
+ .reg .b32 %TID;
+
+ ld.param.f32 %AValue, [A];
+ ld.param.u64 %XBaseAddrGeneric, [X];
+ ld.param.u64 %YBaseAddrGeneric, [Y];
+ cvta.to.global.u64 %XBaseAddrGlobal, %XBaseAddrGeneric;
+ cvta.to.global.u64 %YBaseAddrGlobal, %YBaseAddrGeneric;
+ mov.u32 %TID, %tid.x;
+ mul.wide.u32 %ThreadByteOffset, %TID, 4;
+ add.s64 %XAddr, %ThreadByteOffset, %XBaseAddrGlobal;
+ add.s64 %YAddr, %ThreadByteOffset, %YBaseAddrGlobal;
+ ld.global.f32 %XValue, [%XAddr];
+ ld.global.f32 %YValue, [%YAddr];
+ fma.rn.f32 %Result, %AValue, %XValue, %YValue;
+ st.global.f32 [%XAddr], %Result;
+ ret;
+ }
+)"; // Each thread computes X[tid] = A * X[tid] + Y[tid], writing into X.
+/// [Example saxpy host PTX]
+
+int main() {
+ /// [Example saxpy host main]
+ namespace se = ::streamexecutor;
+ namespace cg = ::__compilergen;
+
+ // Create some host data.
+ float A = 42.0f;
+ std::vector<float> HostX = {0, 1, 2, 3};
+ std::vector<float> HostY = {4, 5, 6, 7};
+ size_t ArraySize = HostX.size();
+
+ // Look up the CUDA platform and take its first device, if there is one.
+ se::Platform *Platform =
+ getOrDie(se::PlatformManager::getPlatformByName("CUDA"));
+ if (Platform->getDeviceCount() == 0) {
+ return EXIT_FAILURE; // The platform reports no devices.
+ }
+ se::Device *Device = getOrDie(Platform->getDevice(0));
+
+ // Load the kernel onto the device.
+ std::unique_ptr<cg::SaxpyKernel> Kernel =
+ getOrDie(Device->createKernel<cg::SaxpyKernel>(cg::SaxpyLoaderSpec));
+
+ // Allocate memory on the device.
+ se::GlobalDeviceMemory<float> X =
+ getOrDie(Device->allocateDeviceMemory<float>(ArraySize));
+ se::GlobalDeviceMemory<float> Y =
+ getOrDie(Device->allocateDeviceMemory<float>(ArraySize));
+
+ // Enqueue on a stream: copy inputs in, launch saxpy, copy X back to HostX.
+ std::unique_ptr<se::Stream> Stream = getOrDie(Device->createStream());
+ Stream->thenCopyH2D<float>(HostX, X)
+ .thenCopyH2D<float>(HostY, Y)
+ .thenLaunch(ArraySize, 1, *Kernel, A, X, Y)
+ .thenCopyD2H<float>(X, HostX);
+ // Wait for the stream to complete, exiting if it reports an error.
+ check(Stream->blockHostUntilDone());
+
+ // Verify the results in HostX (asserts are no-ops if NDEBUG is defined).
+ std::vector<float> ExpectedX = {4, 47, 90, 133}; // A * X[I] + Y[I]
+ for (size_t I = 0; I < ArraySize; ++I) {
+ assert(HostX[I] == ExpectedX[I]);
+ }
+
+ // Free device memory.
+ check(Device->freeDeviceMemory(X));
+ check(Device->freeDeviceMemory(Y));
+ /// [Example saxpy host main]
+}
diff --git a/parallel-libs/streamexecutor/include/streamexecutor/Kernel.h b/parallel-libs/streamexecutor/include/streamexecutor/Kernel.h
index 63d9c711425..eaf3db301a6 100644
--- a/parallel-libs/streamexecutor/include/streamexecutor/Kernel.h
+++ b/parallel-libs/streamexecutor/include/streamexecutor/Kernel.h
@@ -55,7 +55,7 @@
/// function as follows:
/// \code
/// namespace ccn = compiler_cuda_namespace;
-/// using KernelPtr = std::unique_ptr<cnn::SaxpyKernel>;
+/// using KernelPtr = std::unique_ptr<ccn::SaxpyKernel>;
/// // Assumes Device is a pointer to the Device on which to launch the
/// // kernel.
/// //
diff --git a/parallel-libs/streamexecutor/include/streamexecutor/KernelSpec.h b/parallel-libs/streamexecutor/include/streamexecutor/KernelSpec.h
index 14c0c105c21..e3d2beca04b 100644
--- a/parallel-libs/streamexecutor/include/streamexecutor/KernelSpec.h
+++ b/parallel-libs/streamexecutor/include/streamexecutor/KernelSpec.h
@@ -115,8 +115,9 @@ public:
/// Adds each item in SpecList to this object.
///
/// Does not take ownership of the PTXCode pointers in the SpecList elements.
- CUDAPTXInMemorySpec(llvm::StringRef KernelName,
- const llvm::ArrayRef<PTXSpec> SpecList);
+ CUDAPTXInMemorySpec(
+ llvm::StringRef KernelName,
+ const llvm::ArrayRef<CUDAPTXInMemorySpec::PTXSpec> SpecList);
/// Returns a pointer to the PTX code for the requested compute capability.
///
diff --git a/parallel-libs/streamexecutor/include/streamexecutor/StreamExecutor.h b/parallel-libs/streamexecutor/include/streamexecutor/StreamExecutor.h
new file mode 100644
index 00000000000..2d6a892f337
--- /dev/null
+++ b/parallel-libs/streamexecutor/include/streamexecutor/StreamExecutor.h
@@ -0,0 +1,71 @@
+//===-- StreamExecutor.h - Main include file for StreamExecutor -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+/// \mainpage Getting Started
+///
+/// \b StreamExecutor is a wrapper around CUDA and OpenCL host-side programming
+/// models (runtimes). This abstraction cleanly permits host code to target
+/// either CUDA or OpenCL devices with identically-functioning data parallel
+/// kernels. It manages the execution of concurrent work targeting the
+/// accelerator, similar to a host-side Executor.
+///
+/// This version of StreamExecutor can be built either as a sub-project of the
+/// LLVM project or as a standalone project depending on LLVM as an external
+/// package.
+///
+/// Below is an example of the use of the StreamExecutor API:
+///
+/// \snippet examples/Example.cpp Example saxpy host main
+///
+/// In the example, a couple of handler functions are used to handle error
+/// return values in the StreamExecutor API:
+///
+/// \snippet examples/Example.cpp Example saxpy host helper functions
+///
+/// These are just example handler functions. A real application will likely
+/// want to define similar handlers of its own that log errors in an
+/// application-specific way, convert errors to the application's own
+/// error-handling framework, or try to recover from errors as appropriate.
+///
+/// The example also references some symbols from a compiler-generated
+/// namespace:
+///
+/// \snippet examples/Example.cpp Example saxpy compiler-generated
+///
+/// Instead of depending on the compiler to generate this code, you can
+/// technically write the code yourself, but this is not recommended because
+/// writing it by hand is very error-prone. For example, the template parameters
+/// for the Kernel specialization have to match the parameter types for the
+/// device kernel, and the MultiKernelLoaderSpec has to be initialized with
+/// valid device code for the kernel. Errors in this code will not show up until
+/// runtime, and may only show up as garbage output rather than an explicit
+/// error, which can be very hard to debug. So again, it is strongly advised not
+/// to write this code yourself.
+///
+/// The example compiler-generated code uses a PTX string in the source code to
+/// store the device code, but the device code can also be stored in other
+/// formats such as CUDA cubin and CUDA fatbin. Furthermore, device code can be
+/// stored for other platforms such as OpenCL, and StreamExecutor will pick the
+/// right device code at runtime based on the user's platform selection. See
+/// streamexecutor::MultiKernelLoaderSpec for details of how device code can be
+/// stored for different platforms, but again, the code to set up the
+/// MultiKernelLoaderSpec instance should be generated by the compiler if
+/// possible, not by the user.
+
+#ifndef STREAMEXECUTOR_STREAMEXECUTOR_H
+#define STREAMEXECUTOR_STREAMEXECUTOR_H
+
+#include "Device.h"
+#include "Kernel.h"
+#include "KernelSpec.h"
+#include "Platform.h"
+#include "PlatformManager.h"
+#include "Stream.h"
+
+#endif // STREAMEXECUTOR_STREAMEXECUTOR_H