MLIR 23.0.0git
GPUHeuristics.cpp
Go to the documentation of this file.
1//===- GPUHeuristics.cpp - Heuristics Implementation for Transforms -------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
10
12#include "llvm/ADT/ArrayRef.h"
13#include "llvm/ADT/STLExtras.h"
14#include "llvm/ADT/SmallVectorExtras.h"
15#include "llvm/Support/Debug.h"
16#include "llvm/Support/DebugLog.h"
17#include "llvm/Support/InterleavedRange.h"
18#include "llvm/Support/MathExtras.h"
19#include "llvm/Support/raw_ostream.h"
20#include <cmath>
21#include <numeric>
22
23using namespace mlir;
24
25#define DEBUG_TYPE "linalg-transforms"
26
28 return gpu::GPUThreadMappingAttr::get(ctx, gpu::MappingId::LinearDim0);
29}
31 return gpu::GPUThreadMappingAttr::get(ctx, gpu::MappingId::LinearDim1);
32}
34 return gpu::GPUThreadMappingAttr::get(ctx, gpu::MappingId::LinearDim2);
35}
36
38 int totalNumThreads,
39 int64_t desiredBitAlignment,
40 ArrayRef<int64_t> copySizes,
41 bool favorPredication,
42 int64_t elementalBitwidth) {
43 assert(!copySizes.empty() && copySizes.size() <= 3 &&
44 "only 1,2,3-D copies are supported for now");
45
46 LDBG() << "START CopyMappingInfo, favorPredication: " << favorPredication;
47 LDBG() << "--copy shape: " << llvm::interleaved(copySizes);
48
49 // Greedily find the largest vector size that can be used to copy the most
50 // minor dimension: we are in the business of filling kMaxVectorLoadBitWidth
51 // contiguous memory transactions with as few threads as possible.
52 int64_t desiredVectorSize = CopyMappingInfo::maxContiguousElementsToTransfer(
53 desiredBitAlignment, copySizes.back(), elementalBitwidth);
54
55 LDBG() << "--greedily determined vectorSize: " << desiredVectorSize
56 << " elements of " << elementalBitwidth << "b each -> "
57 << (desiredVectorSize * elementalBitwidth)
58 << "b total out of a max of " << kMaxVectorLoadBitWidth << "b";
59
60 status = inferNumThreads(totalNumThreads, copySizes, desiredVectorSize,
61 favorPredication);
63 return;
64
65 LDBG() << "--copy: " << llvm::interleaved(copySizes) << "\n"
66 << "--numThreads: " << llvm::interleaved(this->numThreads) << "\n"
67 << "--vectorSize: " << this->vectorSize;
68 assert(this->numThreads.size() == copySizes.size() &&
69 "compute copy mapping expected same number of threads and copy sizes");
70
71 // Compute the smallest bounding box.
72 this->smallestBoundingTileSizes = llvm::map_to_vector(
73 llvm::zip(copySizes, this->numThreads), [](auto &&pair) {
74 int64_t size, numThreads;
75 std::tie(size, numThreads) = pair;
76 return llvm::divideCeilSigned(size, numThreads);
77 });
78 SmallVector<Attribute> allThreadMappings{linearId2(ctx), linearId1(ctx),
79 linearId0(ctx)};
80
81 // Set the thread mapping.
82 this->threadMapping =
83 llvm::to_vector(ArrayRef(allThreadMappings)
84 .take_back(this->smallestBoundingTileSizes.size()));
85 LDBG() << *this;
86}
87
88int64_t transform::gpu::CopyMappingInfo::maxContiguousElementsToTransfer(
89 int64_t desiredBitAlignment, int64_t numContiguousElements,
90 int64_t elementalBitwidth) {
91 assert(kMaxVectorLoadBitWidth % elementalBitwidth == 0 &&
92 "elemental bitwidth does not divide kMaxVectorLoadBitWidth");
93 assert(desiredBitAlignment % elementalBitwidth == 0 &&
94 "elemental bitwidth does not divide desired bit alignment");
95 return std::gcd(
96 std::gcd(desiredBitAlignment / elementalBitwidth, numContiguousElements),
97 kMaxVectorLoadBitWidth / elementalBitwidth);
98}
99
100/// Get the list of all factors that divide `val`, not just the prime factors.
102 SmallVector<int64_t> factors;
103 factors.reserve(val);
104 for (int64_t factor = 1; factor <= val; ++factor) {
105 if (val % factor != 0)
106 continue;
107 factors.push_back(factor);
108 }
109 factors.push_back(val);
110 return factors;
111}
112
114 int64_t res = 1;
115 for (auto val : vals)
116 res *= val;
117 return res;
118}
119
120/// Extract `result` from `sizes` with the following constraints:
121/// 1. sizes[i] % result[i] for all i
122/// 2. product_of_threadsPerDim <= maxNumThreads
123/// 3. if `currentIndex` is sizes.size() - 1, then threadsPerDim[currentIndex]
124/// must be sizes[currentIndex].
125/// This is used to greedily extract the maximum number of threads usable for
126/// mapping a copy of size `sizes`, while being bounded by `totalNumThreads` and
127/// ensuring coalesced access along the most minor dimension.
128/// Return the number of threads used in the range:
129/// threadsPerDim[currentIndex .. sizes.end()]
130// The implementation uses a dynamic programming approach to greedily extract
131// the best combination under the constraints.
132// TODO: Implementation details can be improved but putting effort there is a
133// tradeoffs: `sizes` is expected to be of small rank and contain small values.
135 int64_t currentIndex,
136 int64_t maxNumThreads) {
137 assert(static_cast<size_t>(currentIndex) < sizes.size() &&
138 "currentIndex out of bounds");
139 std::string indent(2 * currentIndex, '-');
140 if (static_cast<size_t>(currentIndex) == sizes.size() - 1) {
141 LDBG() << indent << "mandated globalBest: " << sizes[currentIndex];
142 return SmallVector<int64_t>{sizes[currentIndex]};
143 }
144
145 int64_t best = 0;
146 int64_t s = sizes[currentIndex];
147 SmallVector<int64_t> factors = getFactors(s);
148 SmallVector<int64_t> localThreadsPerDim;
149 localThreadsPerDim.reserve(sizes.size());
150 LDBG() << indent << "maximizeNumThreads in " << s
151 << " with limit: " << maxNumThreads;
152 for (auto factor : factors) {
153 auto nestedThreadsPerDim =
154 maximizeNumThreads(sizes, currentIndex + 1, maxNumThreads / factor);
155 int64_t localBest = factor * product(nestedThreadsPerDim);
156 if (localBest > best && localBest <= maxNumThreads) {
157 LDBG() << indent << "new localBest: " << localBest;
158 LDBG() << indent << "nestedThreadsPerDim: "
159 << llvm::interleaved(nestedThreadsPerDim);
160 localThreadsPerDim.clear();
161 localThreadsPerDim.push_back(factor);
162 llvm::append_range(localThreadsPerDim, nestedThreadsPerDim);
163 best = localBest;
164 }
165 }
166
167 LDBG() << indent << "found globalBest: " << best;
168 LDBG() << indent << "numThreads: " << llvm::interleaved(localThreadsPerDim);
169 return localThreadsPerDim;
170}
171
173transform::gpu::CopyMappingInfo::inferNumThreads(int64_t totalNumThreads,
174 ArrayRef<int64_t> sizes,
175 int64_t desiredVectorSize,
176 bool favorPredication) {
177
178 if (!favorPredication) {
179 int64_t localVectorSize = desiredVectorSize;
180 for (; localVectorSize >= 1; localVectorSize /= 2) {
181 // Attempt to map the copy with predication and current fixed vector size:
182 // 1. if the status is Success, we are done.
183 // 2. if the status is Invalid, we fail immediately, no amount of
184 // vector size reduction can offset the bad tile size selection from the
185 // higher-level.
186 // 3. if the status is RequiresPredication, we try again with a smaller
187 // vector size.
188 Status status =
189 inferNumThreadsImpl(totalNumThreads, sizes, localVectorSize);
190 if (status == Status::Success || status == Status::Invalid)
191 return status;
192
193 LDBG() << "requires predication, try reducing vector size to "
194 << (localVectorSize / 2);
195 }
196 }
197
198 // If we have not yet returned, it means that we have tried all vector sizes
199 // and we still require predication. Restart from the original vector size and
200 // do not attempt to
201 return inferNumThreadsImpl(totalNumThreads, sizes, desiredVectorSize);
202}
203
205transform::gpu::CopyMappingInfo::inferNumThreadsImpl(
206 int64_t totalNumThreads, ArrayRef<int64_t> sizes,
207 int64_t desiredVectorSize) {
208 assert(sizes.back() % desiredVectorSize == 0 &&
209 "most-minor size not divisible by actualVectorSize");
210
211 LDBG() << "inferNumThreadsImpl with totalNumThreads: " << totalNumThreads
212 << " and vectorSize: " << desiredVectorSize;
213
214 // Scale the most minor size to account for the chosen vector size and
215 // maximize the number of threads without exceeding the total number of
216 // threads.
217 SmallVector<int64_t> scaledSizes(sizes);
218 scaledSizes.back() /= desiredVectorSize;
219 if (scaledSizes.back() > totalNumThreads) {
220 LDBG() << "--Too few threads given the required vector size -> FAIL";
221 return Status::Invalid;
222 }
223 SmallVector<int64_t> inferredNumThreads =
224 maximizeNumThreads(scaledSizes, 0, totalNumThreads);
225
226 LDBG() << "inferred numThreads: " << llvm::interleaved(inferredNumThreads);
227 LDBG() << "computed actualVectorSize: " << desiredVectorSize;
228
229 // Corner case: we cannot use more threads than available. If the dimension of
230 // the copy is so bad it is because higher-level tiling did not do its job, we
231 // do not try to recover from it here.
232 int64_t totalNumThreadsUsed = product(inferredNumThreads);
233 LDBG() << "--totalNumThreadsUsed: " << totalNumThreadsUsed;
234 if (totalNumThreadsUsed == 0 || totalNumThreadsUsed > totalNumThreads) {
235 LDBG() << "--Too few threads given the required vector size -> FAIL";
236 return Status::Invalid;
237 }
238
239 this->vectorSize = desiredVectorSize;
240 this->numThreads = inferredNumThreads;
241 if (totalNumThreadsUsed == totalNumThreads)
242 return Status::Success;
243
244 return Status::RequiresPredication;
245}
246
247void transform::gpu::CopyMappingInfo::print(llvm::raw_ostream &os) const {
248 os << "MappingInfo{"
249 << "CopyMappingInfo: " << "valid: " << (status != Status::Invalid) << ", "
250 << "vectorSize: " << vectorSize << ", numThreads: {"
251 << llvm::interleaved(numThreads) << "}, smallestBoundingTileSizes: {"
252 << llvm::interleaved(smallestBoundingTileSizes) << "}, threadMapping: {"
253 << llvm::interleaved(threadMapping) << "}}";
254}
static Attribute linearId1(MLIRContext *ctx)
static SmallVector< int64_t > getFactors(int64_t val)
Get the list of all factors that divide val, not just the prime factors.
static SmallVector< int64_t > maximizeNumThreads(ArrayRef< int64_t > sizes, int64_t currentIndex, int64_t maxNumThreads)
Extract result from sizes with the following constraints:
static int64_t product(ArrayRef< int64_t > vals)
static Attribute linearId0(MLIRContext *ctx)
static Attribute linearId2(MLIRContext *ctx)
Attributes are known-constant values of operations.
Definition Attributes.h:25
MLIRContext is the top-level object for a collection of MLIR operations.
Definition MLIRContext.h:63
Include the generated interface declarations.
SmallVector< Attribute > threadMapping
Thread mapping attributes, one per entry of numThreads.
Status
Status of the mapping computation, invalid usually means too many threads are required and we fail to...
int64_t vectorSize
Most minor vector size (i.e. 1-D), in number of elements, used in a copy.
Status status
The status of a particular copy mapping.
CopyMappingInfo(MLIRContext *ctx, int totalNumThreads, int64_t desiredBitAlignment, ArrayRef< int64_t > sizes, bool favorPredication=false, int64_t elementalBitwidth=32)
Greedily compute the MappingInfo to use to perform a copy of sizes elements of bitwidth elementalBitw...
static constexpr int64_t kMaxVectorLoadBitWidth
Static quantity determining the number of bits to target in an individual copy.
void print(llvm::raw_ostream &os) const
SmallVector< int64_t > smallestBoundingTileSizes
Explicit computation / injection of the smallest bounding tile sizes after mapping to numThreads.
SmallVector< int64_t > numThreads
Number of threads to use for the copy mapping, from most major to most minor dims (i....