MLIR  19.0.0git
GPUHeuristics.h
Go to the documentation of this file.
1 //===- GPUHeuristics.h - GPU heuristics for Linalg transforms ---*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #ifndef MLIR_DIALECT_LINALG_TRANSFORMOPS_GPUHEURISTICS_H
10 #define MLIR_DIALECT_LINALG_TRANSFORMOPS_GPUHEURISTICS_H
11 
12 #include "mlir/IR/Attributes.h"
13 #include "mlir/IR/MLIRContext.h"
15 
16 namespace mlir {
17 namespace transform {
18 namespace gpu {
19 
20 /// Base struct to hold GPU mapping information for a given operation.
21 struct MappingInfo {
22  /// Number of threads to use for the mapping.
23  /// Note: When the number of threads used is smaller than the total number of
24  /// available threads, predication ensues. It is often useful to use more
25  /// threads and saturate memory bandwidth for some operations, even if others
26  /// end up being predicated.
28 
29  /// Thread mapping attributes, one per entry of `numThreads`.
31 };
32 
33 struct CopyMappingInfo : public MappingInfo {
34  /// Status of the mapping computation, invalid usually means too many threads
35  /// are required and we fail to map. This usually happens when the copy is too
36  /// large compared to the number of threads.
37  enum class Status { Success = 0, RequiresPredication, Invalid };
38 
39  /// Greedily compute the MappingInfo to use to perform a copy of `sizes`
40  /// elements of bitwidth `elementalBitwidth`.
41  /// The `desiredBitAlignment` is the number of elements by which the most
42  /// minor dimension of the copy is expected to be aligned.
43  /// This is an approximation of the final alignment, for each row of the copy.
44  /// This is used to restrict the size of copied vector so that they match
45  /// potential subsequent cp.async.
46  /// If the alignment does not match the required alignment for a cp.async down
47  /// the line, the conversion to cp.async will be eventually skipped, possibly
48  /// degrading performance.
49  /// When `favorPredication` is false, the mapping is computed to fill all
50  /// threads with an equal amount of data to copy, so as to avoid predication.
51  /// Predication ends up requiring a split epilogue in current pipelining
52  /// implementations and is better avoided when possible.
53  CopyMappingInfo(MLIRContext *ctx, int totalNumThreads,
54  int64_t desiredBitAlignment, ArrayRef<int64_t> sizes,
55  bool favorPredication = false,
56  int64_t elementalBitwidth = 32);
57 
58 private:
59  /// Determine the maximal vector size to use to copy a contiguous array of
60  /// `numContiguousElements`, each of bitwidth `elementalBitwidth`.
61  /// The `alignment` is the number of elements by which the most minor
62  /// dimension of the copy is aligned. This is an approximation of actual
63  /// memory alignment after bufferization, for each row of the copy. This is
64  /// used to restrict the of the copied vector so that it is properly aligned
65  /// with the requirements of cp.async. If the copy alignment does not match
66  /// the required aligned for a cp.async, thae conversion to cp.async will be
67  /// skipped.
68  /// Asserts that `elementalBitwidth` divides `numContiguousElements`.
69  static int64_t
70  maxContiguousElementsToTransfer(int64_t alignment,
71  int64_t numContiguousElements,
72  int64_t elementalBitwidth = 32);
73 
74  /// Compute the number of threads to use to perform a copy of `sizes`
75  /// elements of `elementalBitwidth`.
76  /// The `alignment` is the number of elements by which the most minor
77  /// dimension of the copy is aligned. This is an approximation of actual
78  /// memory alignment after bufferization, for each row of the copy. This is
79  /// used to restrict the of the copied vector so that it is properly aligned
80  /// with the requirements of cp.async. If the copy alignment does not match
81  /// the required aligned for a cp.async, the conversion to cp.async will be
82  /// skipped.
83  /// When `favorPredication` is false, the implementation avoids predication
84  /// in the copy, even if it means reducing the granularity of the transfer.
85  /// Otherwise, the implementation will come up with a maximal assignment of
86  /// the remaining threads to sizes of interest, using a DP implementation.
87  Status inferNumThreads(int64_t totalNumThreads, ArrayRef<int64_t> sizes,
88  int64_t desiredVectorSize, bool favorPredication);
89  Status inferNumThreadsImpl(int64_t totalNumThreads, ArrayRef<int64_t> sizes,
90  int64_t desiredVectorSize);
91 
92 public:
93  // Pretty-printing and diagnostic methods.
94  void print(llvm::raw_ostream &os) const;
95  LLVM_DUMP_METHOD void dump() const;
96 
97  /// Static quantity determining the number of bits to target in an individual
98  /// copy. Assumes that smaller increments of 64, 32, 16, 8 are also valid
99  /// transfer sizes. In the future we should have more hardware pluggability
100  /// here, especially when we want sub-byte granularity
101  static constexpr int64_t kMaxVectorLoadBitWidth = 128;
102 
103  /// Most minor vector size (i.e. 1-D), in number of elements, used in a copy.
104  int64_t vectorSize;
105 
106  /// Number of threads to use for the copy mapping, from most major to most
107  /// minor dims (i.e. numThreads.back() should be mapped to contiguous threads
108  /// for best coalescing).
110 
111  /// Explicit computation / injection of the smallest bounding tile sizes after
112  /// mapping to `numThreads`. This is useful in masked scenarios.
114 
115  /// Thread mapping attributes, one per entry of `numThreads`.
117 
118  /// The status of a particular copy mapping. Must be checked before applying
119  /// transformations.
121 };
122 
123 } // namespace gpu
124 } // namespace transform
125 } // namespace mlir
126 
127 #endif // MLIR_DIALECT_LINALG_TRANSFORMOPS_GPUHEURISTICS_H
MLIRContext is the top-level object for a collection of MLIR operations.
Definition: MLIRContext.h:60
Include the generated interface declarations.
Status
Status of the mapping computation, invalid usually means too many threads are required and we fail to...
Definition: GPUHeuristics.h:37
int64_t vectorSize
Most minor vector size (i.e. 1-D), in number of elements, used in a copy.
Status status
The status of a particular copy mapping.
LLVM_DUMP_METHOD void dump() const
CopyMappingInfo(MLIRContext *ctx, int totalNumThreads, int64_t desiredBitAlignment, ArrayRef< int64_t > sizes, bool favorPredication=false, int64_t elementalBitwidth=32)
Greedily compute the MappingInfo to use to perform a copy of sizes elements of bitwidth elementalBitw...
static constexpr int64_t kMaxVectorLoadBitWidth
Static quantity determining the number of bits to target in an individual copy.
void print(llvm::raw_ostream &os) const
SmallVector< int64_t > smallestBoundingTileSizes
Explicit computation / injection of the smallest bounding tile sizes after mapping to numThreads.
Base struct to hold GPU mapping information for a given operation.
Definition: GPUHeuristics.h:21
SmallVector< Attribute > threadMapping
Thread mapping attributes, one per entry of numThreads.
Definition: GPUHeuristics.h:30
SmallVector< int64_t > numThreads
Number of threads to use for the mapping.
Definition: GPUHeuristics.h:27