12#include "llvm/ADT/ArrayRef.h"
13#include "llvm/ADT/STLExtras.h"
14#include "llvm/ADT/SmallVectorExtras.h"
15#include "llvm/Support/Debug.h"
16#include "llvm/Support/DebugLog.h"
17#include "llvm/Support/InterleavedRange.h"
18#include "llvm/Support/MathExtras.h"
19#include "llvm/Support/raw_ostream.h"
25#define DEBUG_TYPE "linalg-transforms"
28 return gpu::GPUThreadMappingAttr::get(ctx, gpu::MappingId::LinearDim0);
31 return gpu::GPUThreadMappingAttr::get(ctx, gpu::MappingId::LinearDim1);
34 return gpu::GPUThreadMappingAttr::get(ctx, gpu::MappingId::LinearDim2);
41 bool favorPredication,
43 assert(!copySizes.empty() && copySizes.size() <= 3 &&
44 "only 1,2,3-D copies are supported for now");
46 LDBG() <<
"START CopyMappingInfo, favorPredication: " << favorPredication;
47 LDBG() <<
"--copy shape: " << llvm::interleaved(copySizes);
52 int64_t desiredVectorSize = CopyMappingInfo::maxContiguousElementsToTransfer(
53 desiredBitAlignment, copySizes.back(), elementalBitwidth);
55 LDBG() <<
"--greedily determined vectorSize: " << desiredVectorSize
56 <<
" elements of " << elementalBitwidth <<
"b each -> "
57 << (desiredVectorSize * elementalBitwidth)
60 status = inferNumThreads(totalNumThreads, copySizes, desiredVectorSize,
65 LDBG() <<
"--copy: " << llvm::interleaved(copySizes) <<
"\n"
66 <<
"--numThreads: " << llvm::interleaved(this->
numThreads) <<
"\n"
68 assert(this->
numThreads.size() == copySizes.size() &&
69 "compute copy mapping expected same number of threads and copy sizes");
73 llvm::zip(copySizes, this->
numThreads), [](
auto &&pair) {
76 return llvm::divideCeilSigned(size,
numThreads);
83 llvm::to_vector(
ArrayRef(allThreadMappings)
88int64_t transform::gpu::CopyMappingInfo::maxContiguousElementsToTransfer(
91 assert(kMaxVectorLoadBitWidth % elementalBitwidth == 0 &&
92 "elemental bitwidth does not divide kMaxVectorLoadBitWidth");
93 assert(desiredBitAlignment % elementalBitwidth == 0 &&
94 "elemental bitwidth does not divide desired bit alignment");
96 std::gcd(desiredBitAlignment / elementalBitwidth, numContiguousElements),
97 kMaxVectorLoadBitWidth / elementalBitwidth);
103 factors.reserve(val);
104 for (
int64_t factor = 1; factor <= val; ++factor) {
105 if (val % factor != 0)
107 factors.push_back(factor);
109 factors.push_back(val);
115 for (
auto val : vals)
137 assert(
static_cast<size_t>(currentIndex) < sizes.size() &&
138 "currentIndex out of bounds");
139 std::string indent(2 * currentIndex,
'-');
140 if (
static_cast<size_t>(currentIndex) == sizes.size() - 1) {
141 LDBG() << indent <<
"mandated globalBest: " << sizes[currentIndex];
146 int64_t s = sizes[currentIndex];
149 localThreadsPerDim.reserve(sizes.size());
150 LDBG() << indent <<
"maximizeNumThreads in " << s
151 <<
" with limit: " << maxNumThreads;
152 for (
auto factor : factors) {
153 auto nestedThreadsPerDim =
156 if (localBest > best && localBest <= maxNumThreads) {
157 LDBG() << indent <<
"new localBest: " << localBest;
158 LDBG() << indent <<
"nestedThreadsPerDim: "
159 << llvm::interleaved(nestedThreadsPerDim);
160 localThreadsPerDim.clear();
161 localThreadsPerDim.push_back(factor);
162 llvm::append_range(localThreadsPerDim, nestedThreadsPerDim);
167 LDBG() << indent <<
"found globalBest: " << best;
168 LDBG() << indent <<
"numThreads: " << llvm::interleaved(localThreadsPerDim);
169 return localThreadsPerDim;
173transform::gpu::CopyMappingInfo::inferNumThreads(int64_t totalNumThreads,
174 ArrayRef<int64_t> sizes,
175 int64_t desiredVectorSize,
176 bool favorPredication) {
178 if (!favorPredication) {
179 int64_t localVectorSize = desiredVectorSize;
180 for (; localVectorSize >= 1; localVectorSize /= 2) {
189 inferNumThreadsImpl(totalNumThreads, sizes, localVectorSize);
190 if (status == Status::Success || status == Status::Invalid)
193 LDBG() <<
"requires predication, try reducing vector size to "
194 << (localVectorSize / 2);
201 return inferNumThreadsImpl(totalNumThreads, sizes, desiredVectorSize);
205transform::gpu::CopyMappingInfo::inferNumThreadsImpl(
206 int64_t totalNumThreads, ArrayRef<int64_t> sizes,
207 int64_t desiredVectorSize) {
208 assert(sizes.back() % desiredVectorSize == 0 &&
209 "most-minor size not divisible by actualVectorSize");
211 LDBG() <<
"inferNumThreadsImpl with totalNumThreads: " << totalNumThreads
212 <<
" and vectorSize: " << desiredVectorSize;
217 SmallVector<int64_t> scaledSizes(sizes);
218 scaledSizes.back() /= desiredVectorSize;
219 if (scaledSizes.back() > totalNumThreads) {
220 LDBG() <<
"--Too few threads given the required vector size -> FAIL";
221 return Status::Invalid;
223 SmallVector<int64_t> inferredNumThreads =
226 LDBG() <<
"inferred numThreads: " << llvm::interleaved(inferredNumThreads);
227 LDBG() <<
"computed actualVectorSize: " << desiredVectorSize;
232 int64_t totalNumThreadsUsed =
product(inferredNumThreads);
233 LDBG() <<
"--totalNumThreadsUsed: " << totalNumThreadsUsed;
234 if (totalNumThreadsUsed == 0 || totalNumThreadsUsed > totalNumThreads) {
235 LDBG() <<
"--Too few threads given the required vector size -> FAIL";
236 return Status::Invalid;
239 this->vectorSize = desiredVectorSize;
240 this->numThreads = inferredNumThreads;
241 if (totalNumThreadsUsed == totalNumThreads)
242 return Status::Success;
244 return Status::RequiresPredication;
250 <<
"vectorSize: " <<
vectorSize <<
", numThreads: {"
251 << llvm::interleaved(
numThreads) <<
"}, smallestBoundingTileSizes: {"
static Attribute linearId1(MLIRContext *ctx)
static SmallVector< int64_t > getFactors(int64_t val)
Get the list of all factors that divide val, not just the prime factors.
static SmallVector< int64_t > maximizeNumThreads(ArrayRef< int64_t > sizes, int64_t currentIndex, int64_t maxNumThreads)
Extract result from sizes with the following constraints:
static int64_t product(ArrayRef< int64_t > vals)
static Attribute linearId0(MLIRContext *ctx)
static Attribute linearId2(MLIRContext *ctx)
Attributes are known-constant values of operations.
MLIRContext is the top-level object for a collection of MLIR operations.
Include the generated interface declarations.