MLIR 22.0.0git
GPUHeuristics.h
Go to the documentation of this file.
1//===- GPUHeuristics.h - GPU heuristics for Linalg transforms ---*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#ifndef MLIR_DIALECT_LINALG_TRANSFORMOPS_GPUHEURISTICS_H
10#define MLIR_DIALECT_LINALG_TRANSFORMOPS_GPUHEURISTICS_H
11
12#include "mlir/IR/Attributes.h"
13#include "mlir/IR/MLIRContext.h"
14
15namespace mlir {
16namespace transform {
17namespace gpu {
18
19/// Base struct to hold GPU mapping information for a given operation.
21 /// Number of threads to use for the mapping.
22 /// Note: When the number of threads used is smaller than the total number of
23 /// available threads, predication ensues. It is often useful to use more
24 /// threads and saturate memory bandwidth for some operations, even if others
25 /// end up being predicated.
27
28 /// Thread mapping attributes, one per entry of `numThreads`.
30};
31
33 /// Status of the mapping computation, invalid usually means too many threads
34 /// are required and we fail to map. This usually happens when the copy is too
35 /// large compared to the number of threads.
37
38 /// Greedily compute the MappingInfo to use to perform a copy of `sizes`
39 /// elements of bitwidth `elementalBitwidth`.
40 /// The `desiredBitAlignment` is the number of elements by which the most
41 /// minor dimension of the copy is expected to be aligned.
42 /// This is an approximation of the final alignment, for each row of the copy.
43 /// This is used to restrict the size of copied vector so that they match
44 /// potential subsequent cp.async.
45 /// If the alignment does not match the required alignment for a cp.async down
46 /// the line, the conversion to cp.async will be eventually skipped, possibly
47 /// degrading performance.
48 /// When `favorPredication` is false, the mapping is computed to fill all
49 /// threads with an equal amount of data to copy, so as to avoid predication.
50 /// Predication ends up requiring a split epilogue in current pipelining
51 /// implementations and is better avoided when possible.
52 CopyMappingInfo(MLIRContext *ctx, int totalNumThreads,
53 int64_t desiredBitAlignment, ArrayRef<int64_t> sizes,
54 bool favorPredication = false,
55 int64_t elementalBitwidth = 32);
56
57private:
58 /// Determine the maximal vector size to use to copy a contiguous array of
59 /// `numContiguousElements`, each of bitwidth `elementalBitwidth`.
60 /// The `alignment` is the number of elements by which the most minor
61 /// dimension of the copy is aligned. This is an approximation of actual
62 /// memory alignment after bufferization, for each row of the copy. This is
63 /// used to restrict the of the copied vector so that it is properly aligned
64 /// with the requirements of cp.async. If the copy alignment does not match
65 /// the required aligned for a cp.async, thae conversion to cp.async will be
66 /// skipped.
67 /// Asserts that `elementalBitwidth` divides `numContiguousElements`.
68 static int64_t
69 maxContiguousElementsToTransfer(int64_t alignment,
70 int64_t numContiguousElements,
71 int64_t elementalBitwidth = 32);
72
73 /// Compute the number of threads to use to perform a copy of `sizes`
74 /// elements of `elementalBitwidth`.
75 /// The `alignment` is the number of elements by which the most minor
76 /// dimension of the copy is aligned. This is an approximation of actual
77 /// memory alignment after bufferization, for each row of the copy. This is
78 /// used to restrict the of the copied vector so that it is properly aligned
79 /// with the requirements of cp.async. If the copy alignment does not match
80 /// the required aligned for a cp.async, the conversion to cp.async will be
81 /// skipped.
82 /// When `favorPredication` is false, the implementation avoids predication
83 /// in the copy, even if it means reducing the granularity of the transfer.
84 /// Otherwise, the implementation will come up with a maximal assignment of
85 /// the remaining threads to sizes of interest, using a DP implementation.
86 Status inferNumThreads(int64_t totalNumThreads, ArrayRef<int64_t> sizes,
87 int64_t desiredVectorSize, bool favorPredication);
88 Status inferNumThreadsImpl(int64_t totalNumThreads, ArrayRef<int64_t> sizes,
89 int64_t desiredVectorSize);
90
91public:
92 // Pretty-printing and diagnostic methods.
93 void print(llvm::raw_ostream &os) const;
94 LLVM_DUMP_METHOD void dump() const;
95
96 /// Static quantity determining the number of bits to target in an individual
97 /// copy. Assumes that smaller increments of 64, 32, 16, 8 are also valid
98 /// transfer sizes. In the future we should have more hardware pluggability
99 /// here, especially when we want sub-byte granularity
100 static constexpr int64_t kMaxVectorLoadBitWidth = 128;
101
102 /// Most minor vector size (i.e. 1-D), in number of elements, used in a copy.
104
105 /// Number of threads to use for the copy mapping, from most major to most
106 /// minor dims (i.e. numThreads.back() should be mapped to contiguous threads
107 /// for best coalescing).
109
110 /// Explicit computation / injection of the smallest bounding tile sizes after
111 /// mapping to `numThreads`. This is useful in masked scenarios.
113
114 /// Thread mapping attributes, one per entry of `numThreads`.
116
117 /// The status of a particular copy mapping. Must be checked before applying
118 /// transformations.
120};
121
122inline llvm::raw_ostream &operator<<(llvm::raw_ostream &os,
123 const CopyMappingInfo &info) {
124 info.print(os);
125 return os;
126}
127
128} // namespace gpu
129} // namespace transform
130} // namespace mlir
131
132#endif // MLIR_DIALECT_LINALG_TRANSFORMOPS_GPUHEURISTICS_H
MLIRContext is the top-level object for a collection of MLIR operations.
Definition MLIRContext.h:63
raw_ostream & operator<<(raw_ostream &os, const IdBuilderResult &res)
Definition Utils.h:41
Include the generated interface declarations.
Status
Status of the mapping computation, invalid usually means too many threads are required and we fail to...
int64_t vectorSize
Most minor vector size (i.e. 1-D), in number of elements, used in a copy.
Status status
The status of a particular copy mapping.
LLVM_DUMP_METHOD void dump() const
CopyMappingInfo(MLIRContext *ctx, int totalNumThreads, int64_t desiredBitAlignment, ArrayRef< int64_t > sizes, bool favorPredication=false, int64_t elementalBitwidth=32)
Greedily compute the MappingInfo to use to perform a copy of sizes elements of bitwidth elementalBitw...
static constexpr int64_t kMaxVectorLoadBitWidth
Static quantity determining the number of bits to target in an individual copy.
void print(llvm::raw_ostream &os) const
SmallVector< int64_t > smallestBoundingTileSizes
Explicit computation / injection of the smallest bounding tile sizes after mapping to numThreads.
Base struct to hold GPU mapping information for a given operation.
SmallVector< Attribute > threadMapping
Thread mapping attributes, one per entry of numThreads.
SmallVector< int64_t > numThreads
Number of threads to use for the mapping.