parallel-libs/streamexecutor/include/streamexecutor/KernelSpec.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270

//===-- KernelSpec.h - Kernel loader spec types -----------------*- C++ -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
///
/// \file
/// KernelLoaderSpec is the base class for types that know where to find the
/// code for a data-parallel kernel in a particular format on a particular
/// platform. So, for example, there will be one subclass that deals with CUDA
/// PTX code, another subclass that deals with CUDA fatbin code, and yet another
/// subclass that deals with OpenCL text code.
///
/// A MultiKernelLoaderSpec is basically a collection of KernelLoaderSpec
/// instances. This is useful when code is available for the same kernel in
/// several different formats or targeted for several different platforms. All
/// the various KernelLoaderSpec instances for this kernel can be combined
/// together in one MultiKernelLoaderSpec and the specific platform consumer can
/// decide which instance of the code it wants to use.
///
/// MultiKernelLoaderSpec provides several helper functions to build and
/// register KernelLoaderSpec instances all in a single operation. For example,
/// MultiKernelLoaderSpec::addCUDAPTXInMemory can be used to construct and
/// register a CUDAPTXInMemorySpec KernelLoaderSpec.
///
/// The loader spec classes declared here are designed primarily to be
/// instantiated by the compiler, but they can also be instantiated directly by
/// the user. A simplified example workflow which a compiler might follow in the
/// case of a CUDA kernel that is compiled to CUDA fatbin code is as follows:
///
/// 1. The user defines a kernel function called \c UserKernel.
/// 2. The compiler compiles the kernel code into CUDA fatbin data and embeds
///    that data into the host code at address \c __UserKernelFatbinAddress.
/// 3. The compiler adds code at the beginning of the host code to instantiate a
///    MultiKernelLoaderSpec:
///    \code
///    namespace compiler_cuda_namespace {
///      MultiKernelLoaderSpec UserKernelLoaderSpec;
///    } // namespace compiler_cuda_namespace
///    \endcode
/// 4. The compiler then adds code to the host code to add the fatbin data to
///    the new MultiKernelLoaderSpec, and to associate that data with the kernel
///    name \c "UserKernel":
///    \code
///    namespace compiler_cuda_namespace {
///      UserKernelLoaderSpec.addCUDAFatbinInMemory(
///        "UserKernel", __UserKernelFatbinAddress);
///    } // namespace compiler_cuda_namespace
///    \endcode
/// 5. The host code, having known beforehand that the compiler would initialize
///    a MultiKernelLoaderSpec based on the name of the CUDA kernel, makes use
///    of the symbol \c compiler_cuda_namespace::UserKernelLoaderSpec without
///    defining it.
///
/// In the example above, the MultiKernelLoaderSpec instance created by the
/// compiler can be used by the host code to create StreamExecutor kernel
/// objects. In turn, those StreamExecutor kernel objects can be used by the
/// host code to launch the kernel on the device as desired.
///
//===----------------------------------------------------------------------===//

#ifndef STREAMEXECUTOR_KERNELSPEC_H
#define STREAMEXECUTOR_KERNELSPEC_H

#include <cassert>
#include <map>
#include <memory>
#include <string>
#include <utility>

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringRef.h"

namespace streamexecutor {

/// An object that knows how to find the code for a device kernel.
///
/// This is the base class for the hierarchy of loader specs. The different
/// subclasses know how to find code in different formats (e.g. CUDA PTX, OpenCL
/// binary).
///
/// This base class has functionality for storing and getting the name of the
/// kernel as a string.
class KernelLoaderSpec {
public:
  /// Returns the name of the kernel this spec loads.
  const std::string &getKernelName() const { return KernelName; }

protected:
  /// Constructs a loader spec for the kernel named \p KernelName.
  ///
  /// Protected so that only subclasses can create instances.
  explicit KernelLoaderSpec(llvm::StringRef KernelName);

  /// Protected and non-virtual on purpose.
  ///
  /// This class declares no virtual functions, so destroying a subclass object
  /// through a KernelLoaderSpec pointer would not run the subclass destructor.
  /// Keeping the destructor protected turns that mistake into a compile-time
  /// error while still letting subclass destructors call it.
  ~KernelLoaderSpec() = default;

private:
  /// Name of the kernel, as returned by getKernelName().
  std::string KernelName;

  // Loader specs are not copyable.
  KernelLoaderSpec(const KernelLoaderSpec &) = delete;
  KernelLoaderSpec &operator=(const KernelLoaderSpec &) = delete;
};

/// A KernelLoaderSpec for CUDA PTX code that resides in memory as a
/// null-terminated string.
class CUDAPTXInMemorySpec : public KernelLoaderSpec {
public:
  /// First component is major version, second component is minor version.
  using ComputeCapability = std::pair<int, int>;

  /// PTX code combined with its compute capability.
  struct PTXSpec {
    /// Compute capability associated with PTXCode.
    ComputeCapability TheComputeCapability;
    /// Pointer to the PTX code; not owned by this struct.
    const char *PTXCode;
  };

  /// Creates a CUDAPTXInMemorySpec from an array of PTXSpec objects.
  ///
  /// Adds each item in SpecList to this object.
  ///
  /// Does not take ownership of the PTXCode pointers in the SpecList elements.
  ///
  /// \param KernelName name of the kernel the PTX code belongs to.
  /// \param SpecList PTX code entries, one per compute capability.
  CUDAPTXInMemorySpec(llvm::StringRef KernelName,
                      llvm::ArrayRef<PTXSpec> SpecList);

  /// Returns a pointer to the PTX code for the requested compute capability.
  ///
  /// Returns nullptr on failed lookup (if the requested compute capability is
  /// not available). Matches exactly the specified compute capability. Doesn't
  /// try to do anything smart like finding the next best compute capability if
  /// the specified capability cannot be found.
  ///
  /// \param ComputeCapabilityMajor major version of the compute capability.
  /// \param ComputeCapabilityMinor minor version of the compute capability.
  const char *getCode(int ComputeCapabilityMajor,
                      int ComputeCapabilityMinor) const;

private:
  /// PTX code contents in memory.
  ///
  /// The key is a pair (cc_major, cc_minor), e.g., (2, 0), (3, 0), (3, 5).
  /// The mapped pointers are not owned by this object.
  std::map<ComputeCapability, const char *> PTXByComputeCapability;

  // Loader specs are not copyable.
  CUDAPTXInMemorySpec(const CUDAPTXInMemorySpec &) = delete;
  CUDAPTXInMemorySpec &operator=(const CUDAPTXInMemorySpec &) = delete;
};

/// A KernelLoaderSpec for CUDA fatbin code that resides in memory.
class CUDAFatbinInMemorySpec : public KernelLoaderSpec {
public:
  // Instances of this class cannot be copied.
  CUDAFatbinInMemorySpec(const CUDAFatbinInMemorySpec &) = delete;
  CUDAFatbinInMemorySpec &operator=(const CUDAFatbinInMemorySpec &) = delete;

  /// Builds a spec for the kernel \p KernelName that refers to the fatbin data
  /// at \p Bytes.
  ///
  /// Ownership of the memory behind \p Bytes is not transferred to the spec.
  CUDAFatbinInMemorySpec(llvm::StringRef KernelName, const void *Bytes);

  /// Returns the fatbin data pointer held by this spec.
  const void *getBytes() const { return Bytes; }

private:
  /// Non-owning pointer to the fatbin data.
  const void *Bytes;
};

/// A KernelLoaderSpec for OpenCL text that resides in memory as a
/// null-terminated string.
class OpenCLTextInMemorySpec : public KernelLoaderSpec {
public:
  // Instances of this class cannot be copied.
  OpenCLTextInMemorySpec(const OpenCLTextInMemorySpec &) = delete;
  OpenCLTextInMemorySpec &operator=(const OpenCLTextInMemorySpec &) = delete;

  /// Builds a spec for the kernel \p KernelName that refers to the OpenCL text
  /// code at \p Text.
  ///
  /// Ownership of the memory behind \p Text is not transferred to the spec.
  OpenCLTextInMemorySpec(llvm::StringRef KernelName, const char *Text);

  /// Returns the OpenCL text pointer held by this spec.
  const char *getText() const { return Text; }

private:
  /// Non-owning pointer to the OpenCL text code.
  const char *Text;
};

/// An object to store several different KernelLoaderSpecs for the same kernel.
///
/// This allows code in different formats and for different platforms to be
/// stored all together for a single kernel.
///
/// Various methods are available to add a new KernelLoaderSpec to a
/// MultiKernelLoaderSpec. There are also methods to query which formats and
/// platforms are supported by the currently added KernelLoaderSpec objects, and
/// methods to get the KernelLoaderSpec objects for each format and platform.
///
/// Since all stored KernelLoaderSpecs are supposed to reference the same
/// kernel, they are all assumed to take the same number and type of parameters,
/// but no checking is done to enforce this. In debug mode, all
/// KernelLoaderSpecs are checked to make sure they have the same kernel name,
/// so passing in specs with different kernel names can cause the program to
/// abort.
///
/// This interface is prone to errors, so it is better to leave
/// MultiKernelLoaderSpec creation and initialization to the compiler rather
/// than doing it by hand.
class MultiKernelLoaderSpec {
public:
  /// Returns the stored kernel name, or an empty string if no name is stored.
  std::string getKernelName() const {
    return TheKernelName ? *TheKernelName : std::string();
  }

  /// \name Availability queries
  ///
  /// Each query reports whether a loader spec of the corresponding platform
  /// variant is currently held by this object.
  /// @{
  bool hasCUDAPTXInMemory() const {
    return static_cast<bool>(TheCUDAPTXInMemorySpec);
  }
  bool hasCUDAFatbinInMemory() const {
    return static_cast<bool>(TheCUDAFatbinInMemorySpec);
  }
  bool hasOpenCLTextInMemory() const {
    return static_cast<bool>(TheOpenCLTextInMemorySpec);
  }
  /// @}

  /// \name Spec accessors
  ///
  /// Each accessor returns the held loader spec of one platform variant.
  ///
  /// Precondition: the matching has* query returns true. The precondition is
  /// checked with an assert.
  /// @{
  const CUDAPTXInMemorySpec &getCUDAPTXInMemory() const {
    assert(hasCUDAPTXInMemory() && "getting spec that is not present");
    return *TheCUDAPTXInMemorySpec;
  }
  const CUDAFatbinInMemorySpec &getCUDAFatbinInMemory() const {
    assert(hasCUDAFatbinInMemory() && "getting spec that is not present");
    return *TheCUDAFatbinInMemorySpec;
  }
  const OpenCLTextInMemorySpec &getOpenCLTextInMemory() const {
    assert(hasOpenCLTextInMemory() && "getting spec that is not present");
    return *TheOpenCLTextInMemorySpec;
  }
  /// @}

  /// \name Builder-style registration methods
  ///
  /// Used to initialize a MultiKernelLoaderSpec. Call each of them at most once
  /// per MultiKernelLoaderSpec object; the file comment shows example usage.
  ///
  /// The KernelName argument has to agree with the kernel in the PTX or OpenCL
  /// being loaded. Keep in mind that in CUDA C++ the compiler may mangle the
  /// kernel name unless the kernel is declared extern "C".
  /// @{

  /// Registers CUDA PTX code held in memory.
  ///
  /// The PTXCode pointers in the SpecList elements remain owned by the caller.
  MultiKernelLoaderSpec &
  addCUDAPTXInMemory(llvm::StringRef KernelName,
                     llvm::ArrayRef<CUDAPTXInMemorySpec::PTXSpec> SpecList);

  /// Registers CUDA fatbin data held in memory.
  ///
  /// The FatbinBytes pointer remains owned by the caller.
  MultiKernelLoaderSpec &addCUDAFatbinInMemory(llvm::StringRef KernelName,
                                               const void *FatbinBytes);

  /// Registers OpenCL text code held in memory.
  ///
  /// The OpenCLText pointer remains owned by the caller.
  MultiKernelLoaderSpec &addOpenCLTextInMemory(llvm::StringRef KernelName,
                                               const char *OpenCLText);
  /// @}

private:
  /// Helper taking the kernel name passed to one of the add* methods; defined
  /// out of line.
  void setKernelName(llvm::StringRef KernelName);

  /// Kernel name; null until a name has been stored.
  std::unique_ptr<std::string> TheKernelName;
  /// Held loader specs, one slot per platform variant; null when absent.
  std::unique_ptr<CUDAPTXInMemorySpec> TheCUDAPTXInMemorySpec;
  std::unique_ptr<CUDAFatbinInMemorySpec> TheCUDAFatbinInMemorySpec;
  std::unique_ptr<OpenCLTextInMemorySpec> TheOpenCLTextInMemorySpec;
};

} // namespace streamexecutor

#endif // STREAMEXECUTOR_KERNELSPEC_H