MLIR  20.0.0git
GPUHeuristics.h
Go to the documentation of this file.
1 //===- GPUHeuristics.h - GPU heuristics for Linalg transforms ---*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #ifndef MLIR_DIALECT_LINALG_TRANSFORMOPS_GPUHEURISTICS_H
10 #define MLIR_DIALECT_LINALG_TRANSFORMOPS_GPUHEURISTICS_H
11 
12 #include "mlir/IR/Attributes.h"
13 #include "mlir/IR/MLIRContext.h"
14 
15 namespace mlir {
16 namespace transform {
17 namespace gpu {
18 
19 /// Base struct to hold GPU mapping information for a given operation.
20 struct MappingInfo {
21  /// Number of threads to use for the mapping.
22  /// Note: When the number of threads used is smaller than the total number of
23  /// available threads, predication ensues. It is often useful to use more
24  /// threads and saturate memory bandwidth for some operations, even if others
25  /// end up being predicated.
27 
28  /// Thread mapping attributes, one per entry of `numThreads`.
30 };
31 
32 struct CopyMappingInfo : public MappingInfo {
33  /// Status of the mapping computation, invalid usually means too many threads
34  /// are required and we fail to map. This usually happens when the copy is too
35  /// large compared to the number of threads.
36  enum class Status { Success = 0, RequiresPredication, Invalid };
37 
38  /// Greedily compute the MappingInfo to use to perform a copy of `sizes`
39  /// elements of bitwidth `elementalBitwidth`.
40  /// The `desiredBitAlignment` is the number of elements by which the most
41  /// minor dimension of the copy is expected to be aligned.
42  /// This is an approximation of the final alignment, for each row of the copy.
43  /// This is used to restrict the size of copied vector so that they match
44  /// potential subsequent cp.async.
45  /// If the alignment does not match the required alignment for a cp.async down
46  /// the line, the conversion to cp.async will be eventually skipped, possibly
47  /// degrading performance.
48  /// When `favorPredication` is false, the mapping is computed to fill all
49  /// threads with an equal amount of data to copy, so as to avoid predication.
50  /// Predication ends up requiring a split epilogue in current pipelining
51  /// implementations and is better avoided when possible.
52  CopyMappingInfo(MLIRContext *ctx, int totalNumThreads,
53  int64_t desiredBitAlignment, ArrayRef<int64_t> sizes,
54  bool favorPredication = false,
55  int64_t elementalBitwidth = 32);
56 
57 private:
58  /// Determine the maximal vector size to use to copy a contiguous array of
59  /// `numContiguousElements`, each of bitwidth `elementalBitwidth`.
60  /// The `alignment` is the number of elements by which the most minor
61  /// dimension of the copy is aligned. This is an approximation of actual
62  /// memory alignment after bufferization, for each row of the copy. This is
63  /// used to restrict the of the copied vector so that it is properly aligned
64  /// with the requirements of cp.async. If the copy alignment does not match
65  /// the required aligned for a cp.async, thae conversion to cp.async will be
66  /// skipped.
67  /// Asserts that `elementalBitwidth` divides `numContiguousElements`.
68  static int64_t
69  maxContiguousElementsToTransfer(int64_t alignment,
70  int64_t numContiguousElements,
71  int64_t elementalBitwidth = 32);
72 
73  /// Compute the number of threads to use to perform a copy of `sizes`
74  /// elements of `elementalBitwidth`.
75  /// The `alignment` is the number of elements by which the most minor
76  /// dimension of the copy is aligned. This is an approximation of actual
77  /// memory alignment after bufferization, for each row of the copy. This is
78  /// used to restrict the of the copied vector so that it is properly aligned
79  /// with the requirements of cp.async. If the copy alignment does not match
80  /// the required aligned for a cp.async, the conversion to cp.async will be
81  /// skipped.
82  /// When `favorPredication` is false, the implementation avoids predication
83  /// in the copy, even if it means reducing the granularity of the transfer.
84  /// Otherwise, the implementation will come up with a maximal assignment of
85  /// the remaining threads to sizes of interest, using a DP implementation.
86  Status inferNumThreads(int64_t totalNumThreads, ArrayRef<int64_t> sizes,
87  int64_t desiredVectorSize, bool favorPredication);
88  Status inferNumThreadsImpl(int64_t totalNumThreads, ArrayRef<int64_t> sizes,
89  int64_t desiredVectorSize);
90 
91 public:
92  // Pretty-printing and diagnostic methods.
93  void print(llvm::raw_ostream &os) const;
94  LLVM_DUMP_METHOD void dump() const;
95 
96  /// Static quantity determining the number of bits to target in an individual
97  /// copy. Assumes that smaller increments of 64, 32, 16, 8 are also valid
98  /// transfer sizes. In the future we should have more hardware pluggability
99  /// here, especially when we want sub-byte granularity
100  static constexpr int64_t kMaxVectorLoadBitWidth = 128;
101 
102  /// Most minor vector size (i.e. 1-D), in number of elements, used in a copy.
103  int64_t vectorSize;
104 
105  /// Number of threads to use for the copy mapping, from most major to most
106  /// minor dims (i.e. numThreads.back() should be mapped to contiguous threads
107  /// for best coalescing).
109 
110  /// Explicit computation / injection of the smallest bounding tile sizes after
111  /// mapping to `numThreads`. This is useful in masked scenarios.
113 
114  /// Thread mapping attributes, one per entry of `numThreads`.
116 
117  /// The status of a particular copy mapping. Must be checked before applying
118  /// transformations.
120 };
121 
122 } // namespace gpu
123 } // namespace transform
124 } // namespace mlir
125 
126 #endif // MLIR_DIALECT_LINALG_TRANSFORMOPS_GPUHEURISTICS_H
MLIRContext is the top-level object for a collection of MLIR operations.
Definition: MLIRContext.h:60
Include the generated interface declarations.
Status
Status of the mapping computation, invalid usually means too many threads are required and we fail to...
Definition: GPUHeuristics.h:36
int64_t vectorSize
Most minor vector size (i.e. 1-D), in number of elements, used in a copy.
Status status
The status of a particular copy mapping.
LLVM_DUMP_METHOD void dump() const
CopyMappingInfo(MLIRContext *ctx, int totalNumThreads, int64_t desiredBitAlignment, ArrayRef< int64_t > sizes, bool favorPredication=false, int64_t elementalBitwidth=32)
Greedily compute the MappingInfo to use to perform a copy of sizes elements of bitwidth elementalBitw...
static constexpr int64_t kMaxVectorLoadBitWidth
Static quantity determining the number of bits to target in an individual copy.
void print(llvm::raw_ostream &os) const
SmallVector< int64_t > smallestBoundingTileSizes
Explicit computation / injection of the smallest bounding tile sizes after mapping to numThreads.
Base struct to hold GPU mapping information for a given operation.
Definition: GPUHeuristics.h:20
SmallVector< Attribute > threadMapping
Thread mapping attributes, one per entry of numThreads.
Definition: GPUHeuristics.h:29
SmallVector< int64_t > numThreads
Number of threads to use for the mapping.
Definition: GPUHeuristics.h:26