diff options
Diffstat (limited to 'parallel-libs/streamexecutor/examples/CUDASaxpy.cpp')
-rw-r--r-- | parallel-libs/streamexecutor/examples/CUDASaxpy.cpp | 137 |
1 files changed, 137 insertions, 0 deletions
diff --git a/parallel-libs/streamexecutor/examples/CUDASaxpy.cpp b/parallel-libs/streamexecutor/examples/CUDASaxpy.cpp new file mode 100644 index 00000000000..eab0cbe69d6 --- /dev/null +++ b/parallel-libs/streamexecutor/examples/CUDASaxpy.cpp @@ -0,0 +1,137 @@ +//===-- CUDASaxpy.cpp - Example of CUDA saxpy with StreamExecutor API -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains example code demonstrating the usage of the +/// StreamExecutor API. Snippets of this file will be included as code examples +/// in documentation. Taking these examples from a real source file guarantees +/// that the examples will always compile. +/// +//===----------------------------------------------------------------------===// + +#include <algorithm> +#include <cassert> +#include <cstdio> +#include <cstdlib> +#include <vector> + +#include "streamexecutor/StreamExecutor.h" + +/// [Example saxpy compiler-generated] +// Code in this namespace is generated by the compiler (e.g. clang). +// +// The name of this namespace may depend on the compiler that generated it, so +// this is just an example name. +namespace __compilergen { + +// Specialization of the streamexecutor::Kernel template class for the parameter +// types of the saxpy(float A, float *X, float *Y) kernel. +using SaxpyKernel = + streamexecutor::Kernel<float, streamexecutor::GlobalDeviceMemory<float>, + streamexecutor::GlobalDeviceMemory<float>>; + +// A string containing the PTX code generated by the device compiler for the +// saxpy kernel. String contents not shown here. +extern const char *SaxpyPTX; + +// A global instance of a loader spec that knows how to load the code in the +// SaxpyPTX string. +static streamexecutor::MultiKernelLoaderSpec SaxpyLoaderSpec = []() { + streamexecutor::MultiKernelLoaderSpec Spec; + Spec.addCUDAPTXInMemory("saxpy", {{{2, 0}, SaxpyPTX}}); + return Spec; +}(); + +} // namespace __compilergen +/// [Example saxpy compiler-generated] + +/// [Example saxpy host PTX] +// The PTX text for a saxpy kernel. +const char *__compilergen::SaxpyPTX = R"( + .version 4.3 + .target sm_20 + .address_size 64 + + .visible .entry saxpy(.param .f32 A, .param .u64 X, .param .u64 Y) { + .reg .f32 %AValue; + .reg .f32 %XValue; + .reg .f32 %YValue; + .reg .f32 %Result; + + .reg .b64 %XBaseAddrGeneric; + .reg .b64 %YBaseAddrGeneric; + .reg .b64 %XBaseAddrGlobal; + .reg .b64 %YBaseAddrGlobal; + .reg .b64 %XAddr; + .reg .b64 %YAddr; + .reg .b64 %ThreadByteOffset; + + .reg .b32 %TID; + + ld.param.f32 %AValue, [A]; + ld.param.u64 %XBaseAddrGeneric, [X]; + ld.param.u64 %YBaseAddrGeneric, [Y]; + cvta.to.global.u64 %XBaseAddrGlobal, %XBaseAddrGeneric; + cvta.to.global.u64 %YBaseAddrGlobal, %YBaseAddrGeneric; + mov.u32 %TID, %tid.x; + mul.wide.u32 %ThreadByteOffset, %TID, 4; + add.s64 %XAddr, %ThreadByteOffset, %XBaseAddrGlobal; + add.s64 %YAddr, %ThreadByteOffset, %YBaseAddrGlobal; + ld.global.f32 %XValue, [%XAddr]; + ld.global.f32 %YValue, [%YAddr]; + fma.rn.f32 %Result, %AValue, %XValue, %YValue; + st.global.f32 [%XAddr], %Result; + ret; + } +)"; +/// [Example saxpy host PTX] + +int main() { + /// [Example saxpy host main] + namespace se = ::streamexecutor; + namespace cg = ::__compilergen; + + // Create some host data. + float A = 42.0f; + std::vector<float> HostX = {0, 1, 2, 3}; + std::vector<float> HostY = {4, 5, 6, 7}; + size_t ArraySize = HostX.size(); + + // Get a device object. + se::Platform *Platform = + getOrDie(se::PlatformManager::getPlatformByName("CUDA")); + if (Platform->getDeviceCount() == 0) { + return EXIT_FAILURE; + } + se::Device *Device = getOrDie(Platform->getDevice(0)); + + // Load the kernel onto the device. + cg::SaxpyKernel Kernel = + getOrDie(Device->createKernel<cg::SaxpyKernel>(cg::SaxpyLoaderSpec)); + + // Allocate memory on the device. + se::GlobalDeviceMemory<float> X = + getOrDie(Device->allocateDeviceMemory<float>(ArraySize)); + se::GlobalDeviceMemory<float> Y = + getOrDie(Device->allocateDeviceMemory<float>(ArraySize)); + + // Run operations on a stream. + se::Stream Stream = getOrDie(Device->createStream()); + Stream.thenCopyH2D<float>(HostX, X) + .thenCopyH2D<float>(HostY, Y) + .thenLaunch(ArraySize, 1, Kernel, A, X, Y) + .thenCopyD2H<float>(X, HostX); + // Wait for the stream to complete. + se::dieIfError(Stream.blockHostUntilDone()); + + // Process output data in HostX. + std::vector<float> ExpectedX = {4, 47, 90, 133}; + assert(std::equal(ExpectedX.begin(), ExpectedX.end(), HostX.begin())); + /// [Example saxpy host main] +} |