MLIR  21.0.0git
GPUHeuristics.cpp
Go to the documentation of this file.
1 //===- GPUHeuristics.cpp - Heuristics Implementation for Transforms -------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
10 
12 #include "llvm/ADT/ArrayRef.h"
13 #include "llvm/ADT/STLExtras.h"
14 #include "llvm/Support/CommandLine.h"
15 #include "llvm/Support/Debug.h"
16 #include "llvm/Support/InterleavedRange.h"
17 #include "llvm/Support/MathExtras.h"
18 #include "llvm/Support/raw_ostream.h"
19 #include <cmath>
20 #include <numeric>
21 
22 using namespace mlir;
23 
24 #define DEBUG_TYPE "linalg-transforms"
25 #define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
26 #define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")
27 
29  return gpu::GPUThreadMappingAttr::get(ctx, gpu::MappingId::LinearDim0);
30 }
32  return gpu::GPUThreadMappingAttr::get(ctx, gpu::MappingId::LinearDim1);
33 }
35  return gpu::GPUThreadMappingAttr::get(ctx, gpu::MappingId::LinearDim2);
36 }
37 
39  int totalNumThreads,
40  int64_t desiredBitAlignment,
41  ArrayRef<int64_t> copySizes,
42  bool favorPredication,
43  int64_t elementalBitwidth) {
44  assert(!copySizes.empty() && copySizes.size() <= 3 &&
45  "only 1,2,3-D copies are supported for now");
46 
47  LDBG("START CopyMappingInfo, favorPredication: " << favorPredication);
48  LLVM_DEBUG(DBGS() << "--copy shape: " << llvm::interleaved(copySizes)
49  << "\n");
50 
51  // Greedily find the largest vector size that can be used to copy the most
52  // minor dimension: we are in the business of filling kMaxVectorLoadBitWidth
53  // contiguous memory transactions with as few threads as possible.
54  int64_t desiredVectorSize = CopyMappingInfo::maxContiguousElementsToTransfer(
55  desiredBitAlignment, copySizes.back(), elementalBitwidth);
56 
57  LDBG("--greedily determined vectorSize: "
58  << desiredVectorSize << " elements of " << elementalBitwidth
59  << "b each -> " << (desiredVectorSize * elementalBitwidth)
60  << "b total out of a max of " << kMaxVectorLoadBitWidth << "b");
61 
62  status = inferNumThreads(totalNumThreads, copySizes, desiredVectorSize,
63  favorPredication);
64  if (status == Status::Invalid)
65  return;
66 
67  LLVM_DEBUG(DBGS() << "--copy: " << llvm::interleaved(copySizes) << "\n"
68  << "--numThreads: " << llvm::interleaved(this->numThreads)
69  << "\n"
70  << "--vectorSize: " << this->vectorSize << "\n");
71  assert(this->numThreads.size() == copySizes.size() &&
72  "compute copy mapping expected same number of threads and copy sizes");
73 
74  // Compute the smallest bounding box.
75  this->smallestBoundingTileSizes = llvm::to_vector(
76  llvm::map_range(llvm::zip(copySizes, this->numThreads), [](auto &&pair) {
77  int64_t size, numThreads;
78  std::tie(size, numThreads) = pair;
79  return llvm::divideCeilSigned(size, numThreads);
80  }));
81  SmallVector<Attribute> allThreadMappings{linearId2(ctx), linearId1(ctx),
82  linearId0(ctx)};
83 
84  // Set the thread mapping.
85  this->threadMapping =
86  llvm::to_vector(ArrayRef(allThreadMappings)
87  .take_back(this->smallestBoundingTileSizes.size()));
88  LLVM_DEBUG(this->print(DBGS()); llvm::dbgs() << "\n");
89 }
90 
91 int64_t transform::gpu::CopyMappingInfo::maxContiguousElementsToTransfer(
92  int64_t desiredBitAlignment, int64_t numContiguousElements,
93  int64_t elementalBitwidth) {
94  assert(kMaxVectorLoadBitWidth % elementalBitwidth == 0 &&
95  "elemental bitwidth does not divide kMaxVectorLoadBitWidth");
96  assert(desiredBitAlignment % elementalBitwidth == 0 &&
97  "elemental bitwidth does not divide desired bit alignment");
98  return std::gcd(
99  std::gcd(desiredBitAlignment / elementalBitwidth, numContiguousElements),
100  kMaxVectorLoadBitWidth / elementalBitwidth);
101 }
102 
103 /// Get the list of all factors that divide `val`, not just the prime factors.
104 static SmallVector<int64_t> getFactors(int64_t val) {
105  SmallVector<int64_t> factors;
106  factors.reserve(val);
107  for (int64_t factor = 1; factor <= val; ++factor) {
108  if (val % factor != 0)
109  continue;
110  factors.push_back(factor);
111  }
112  factors.push_back(val);
113  return factors;
114 }
115 
116 static int64_t product(ArrayRef<int64_t> vals) {
117  int64_t res = 1;
118  for (auto val : vals)
119  res *= val;
120  return res;
121 }
122 
123 /// Extract `result` from `sizes` with the following constraints:
124 /// 1. sizes[i] % result[i] for all i
125 /// 2. product_of_threadsPerDim <= maxNumThreads
126 /// 3. if `currentIndex` is sizes.size() - 1, then threadsPerDim[currentIndex]
127 /// must be sizes[currentIndex].
128 /// This is used to greedily extract the maximum number of threads usable for
129 /// mapping a copy of size `sizes`, while being bounded by `totalNumThreads` and
130 /// ensuring coalesced access along the most minor dimension.
131 /// Return the number of threads used in the range:
132 /// threadsPerDim[currentIndex .. sizes.end()]
133 // The implementation uses a dynamic programming approach to greedily extract
134 // the best combination under the constraints.
135 // TODO: Implementation details can be improved but putting effort there is a
136 // tradeoffs: `sizes` is expected to be of small rank and contain small values.
138  int64_t currentIndex,
139  int64_t maxNumThreads) {
140  assert(static_cast<size_t>(currentIndex) < sizes.size() &&
141  "currentIndex out of bounds");
142  std::string indent(2 * currentIndex, '-');
143  if (static_cast<size_t>(currentIndex) == sizes.size() - 1) {
144  LDBG(indent << "mandated globalBest: " << sizes[currentIndex]);
145  return SmallVector<int64_t>{sizes[currentIndex]};
146  }
147 
148  int64_t best = 0;
149  int64_t s = sizes[currentIndex];
150  SmallVector<int64_t> factors = getFactors(s);
151  SmallVector<int64_t> localThreadsPerDim;
152  localThreadsPerDim.reserve(sizes.size());
153  LDBG(indent << "maximizeNumThreads in " << s
154  << " with limit: " << maxNumThreads);
155  for (auto factor : factors) {
156  auto nestedThreadsPerDim =
157  maximizeNumThreads(sizes, currentIndex + 1, maxNumThreads / factor);
158  int64_t localBest = factor * product(nestedThreadsPerDim);
159  if (localBest > best && localBest <= maxNumThreads) {
160  LDBG(indent << "new localBest: " << localBest);
161  LDBG(indent << "nestedThreadsPerDim: "
162  << llvm::interleaved(nestedThreadsPerDim));
163  localThreadsPerDim.clear();
164  localThreadsPerDim.push_back(factor);
165  llvm::append_range(localThreadsPerDim, nestedThreadsPerDim);
166  best = localBest;
167  }
168  }
169 
170  LDBG(indent << "found globalBest: " << best);
171  LDBG(indent << "numThreads: " << llvm::interleaved(localThreadsPerDim));
172  return localThreadsPerDim;
173 }
174 
176 transform::gpu::CopyMappingInfo::inferNumThreads(int64_t totalNumThreads,
177  ArrayRef<int64_t> sizes,
178  int64_t desiredVectorSize,
179  bool favorPredication) {
180 
181  if (!favorPredication) {
182  int64_t localVectorSize = desiredVectorSize;
183  for (; localVectorSize >= 1; localVectorSize /= 2) {
184  // Attempt to map the copy with predication and current fixed vector size:
185  // 1. if the status is Success, we are done.
186  // 2. if the status is Invalid, we fail immediately, no amount of
187  // vector size reduction can offset the bad tile size selection from the
188  // higher-level.
189  // 3. if the status is RequiresPredication, we try again with a smaller
190  // vector size.
191  Status status =
192  inferNumThreadsImpl(totalNumThreads, sizes, localVectorSize);
193  if (status == Status::Success || status == Status::Invalid)
194  return status;
195 
196  LDBG("requires predication, try reducing vector size to "
197  << (localVectorSize / 2));
198  }
199  }
200 
201  // If we have not yet returned, it means that we have tried all vector sizes
202  // and we still require predication. Restart from the original vector size and
203  // do not attempt to
204  return inferNumThreadsImpl(totalNumThreads, sizes, desiredVectorSize);
205 }
206 
208 transform::gpu::CopyMappingInfo::inferNumThreadsImpl(
209  int64_t totalNumThreads, ArrayRef<int64_t> sizes,
210  int64_t desiredVectorSize) {
211  assert(sizes.back() % desiredVectorSize == 0 &&
212  "most-minor size not divisible by actualVectorSize");
213 
214  LDBG("inferNumThreadsImpl with totalNumThreads: "
215  << totalNumThreads << " and vectorSize: " << desiredVectorSize);
216 
217  // Scale the most minor size to account for the chosen vector size and
218  // maximize the number of threads without exceeding the total number of
219  // threads.
220  SmallVector<int64_t> scaledSizes(sizes);
221  scaledSizes.back() /= desiredVectorSize;
222  if (scaledSizes.back() > totalNumThreads) {
223  LDBG("--Too few threads given the required vector size -> FAIL");
224  return Status::Invalid;
225  }
226  SmallVector<int64_t> inferredNumThreads =
227  maximizeNumThreads(scaledSizes, 0, totalNumThreads);
228 
229  LDBG("inferred numThreads: " << llvm::interleaved(inferredNumThreads));
230  LDBG("computed actualVectorSize: " << desiredVectorSize);
231 
232  // Corner case: we cannot use more threads than available. If the dimension of
233  // the copy is so bad it is because higher-level tiling did not do its job, we
234  // do not try to recover from it here.
235  int64_t totalNumThreadsUsed = product(inferredNumThreads);
236  LDBG("--totalNumThreadsUsed: " << totalNumThreadsUsed);
237  if (totalNumThreadsUsed == 0 || totalNumThreadsUsed > totalNumThreads) {
238  LDBG("--Too few threads given the required vector size -> FAIL");
239  return Status::Invalid;
240  }
241 
242  this->vectorSize = desiredVectorSize;
243  this->numThreads = inferredNumThreads;
244  if (totalNumThreadsUsed == totalNumThreads)
245  return Status::Success;
246 
247  return Status::RequiresPredication;
248 }
249 
250 void transform::gpu::CopyMappingInfo::print(llvm::raw_ostream &os) const {
251  os << "MappingInfo{"
252  << "CopyMappingInfo: " << "valid: " << (status != Status::Invalid) << ", "
253  << "vectorSize: " << vectorSize << ", numThreads: {"
254  << llvm::interleaved(numThreads) << "}, smallestBoundingTileSizes: {"
255  << llvm::interleaved(smallestBoundingTileSizes) << "}, threadMapping: {"
256  << llvm::interleaved(threadMapping) << "}}";
257 }
static SmallVector< int64_t > maximizeNumThreads(ArrayRef< int64_t > sizes, int64_t currentIndex, int64_t maxNumThreads)
Extract result from sizes with the following constraints:
static Attribute linearId1(MLIRContext *ctx)
static int64_t product(ArrayRef< int64_t > vals)
static SmallVector< int64_t > getFactors(int64_t val)
Get the list of all factors that divide val, not just the prime factors.
static Attribute linearId0(MLIRContext *ctx)
static Attribute linearId2(MLIRContext *ctx)
#define DBGS()
#define LDBG(X)
Attributes are known-constant values of operations.
Definition: Attributes.h:25
MLIRContext is the top-level object for a collection of MLIR operations.
Definition: MLIRContext.h:60
Include the generated interface declarations.
auto get(MLIRContext *context, Ts &&...params)
Helper method that injects context only if needed, this helps unify some of the attribute constructio...
SmallVector< Attribute > threadMapping
Thread mapping attributes, one per entry of numThreads.
Definition: GPUHeuristics.h:29
Status
Status of the mapping computation, invalid usually means too many threads are required and we fail to...
Definition: GPUHeuristics.h:36
int64_t vectorSize
Most minor vector size (i.e. 1-D), in number of elements, used in a copy.
Status status
The status of a particular copy mapping.
CopyMappingInfo(MLIRContext *ctx, int totalNumThreads, int64_t desiredBitAlignment, ArrayRef< int64_t > sizes, bool favorPredication=false, int64_t elementalBitwidth=32)
Greedily compute the MappingInfo to use to perform a copy of sizes elements of bitwidth elementalBitw...
static constexpr int64_t kMaxVectorLoadBitWidth
Static quantity determining the number of bits to target in an individual copy.
void print(llvm::raw_ostream &os) const
SmallVector< int64_t > smallestBoundingTileSizes
Explicit computation / injection of the smallest bounding tile sizes after mapping to numThreads.
SmallVector< int64_t > numThreads
Number of threads to use for the copy mapping, from most major to most minor dims (i....
Definition: GPUHeuristics.h:26