doxygen/GPUHeuristics_8cpp_source.html

 //===- GPUHeuristics.cpp - Heuristics Implementation for Transforms -------===//

 //

 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

 // See https://llvm.org/LICENSE.txt for license information.

 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

 //

 //===----------------------------------------------------------------------===//


 #include "mlir/Dialect/Linalg/TransformOps/GPUHeuristics.h"


 #include "mlir/Dialect/GPU/IR/GPUDialect.h"

 #include "llvm/ADT/ArrayRef.h"

 #include "llvm/ADT/STLExtras.h"

 #include "llvm/Support/CommandLine.h"

 #include "llvm/Support/Debug.h"

 #include "llvm/Support/InterleavedRange.h"

 #include "llvm/Support/MathExtras.h"

 #include "llvm/Support/raw_ostream.h"

 #include <cmath>

 #include <numeric>


 using namespace mlir;


 #define DEBUG_TYPE "linalg-transforms"

 #define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")

 #define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")


 static Attribute linearId0(MLIRContext *ctx) {

   return gpu::GPUThreadMappingAttr::get(ctx, gpu::MappingId::LinearDim0);

 }

 static Attribute linearId1(MLIRContext *ctx) {

   return gpu::GPUThreadMappingAttr::get(ctx, gpu::MappingId::LinearDim1);

 }

 static Attribute linearId2(MLIRContext *ctx) {

   return gpu::GPUThreadMappingAttr::get(ctx, gpu::MappingId::LinearDim2);

 }


 transform::gpu::CopyMappingInfo::CopyMappingInfo(MLIRContext *ctx,

                                                  int totalNumThreads,

                                                  int64_t desiredBitAlignment,

                                                  ArrayRef<int64_t> copySizes,

                                                  bool favorPredication,

                                                  int64_t elementalBitwidth) {

   assert(!copySizes.empty() && copySizes.size() <= 3 &&

          "only 1,2,3-D copies are supported for now");


   LDBG("START CopyMappingInfo, favorPredication: " << favorPredication);

   LLVM_DEBUG(DBGS() << "--copy shape: " << llvm::interleaved(copySizes)

                     << "\n");


   // Greedily find the largest vector size that can be used to copy the most

   // minor dimension: we are in the business of filling kMaxVectorLoadBitWidth

   // contiguous memory transactions with as few threads as possible.

   int64_t desiredVectorSize = CopyMappingInfo::maxContiguousElementsToTransfer(

       desiredBitAlignment, copySizes.back(), elementalBitwidth);


   LDBG("--greedily determined vectorSize: "

        << desiredVectorSize << " elements of " << elementalBitwidth

        << "b each -> " << (desiredVectorSize * elementalBitwidth)

        << "b total out of a max of " << kMaxVectorLoadBitWidth << "b");


   status = inferNumThreads(totalNumThreads, copySizes, desiredVectorSize,

                            favorPredication);

   if (status == Status::Invalid)

     return;


   LLVM_DEBUG(DBGS() << "--copy: " << llvm::interleaved(copySizes) << "\n"

                     << "--numThreads: " << llvm::interleaved(this->numThreads)

                     << "\n"

                     << "--vectorSize: " << this->vectorSize << "\n");

   assert(this->numThreads.size() == copySizes.size() &&

          "compute copy mapping expected same number of threads and copy sizes");


   // Compute the smallest bounding box.

   this->smallestBoundingTileSizes = llvm::to_vector(

       llvm::map_range(llvm::zip(copySizes, this->numThreads), [](auto &&pair) {

         int64_t size, numThreads;

         std::tie(size, numThreads) = pair;

         return llvm::divideCeilSigned(size, numThreads);

       }));

   SmallVector<Attribute> allThreadMappings{linearId2(ctx), linearId1(ctx),

                                            linearId0(ctx)};


   // Set the thread mapping.

   this->threadMapping =

       llvm::to_vector(ArrayRef(allThreadMappings)

                           .take_back(this->smallestBoundingTileSizes.size()));

   LLVM_DEBUG(this->print(DBGS()); llvm::dbgs() << "\n");

 }


 int64_t transform::gpu::CopyMappingInfo::maxContiguousElementsToTransfer(

     int64_t desiredBitAlignment, int64_t numContiguousElements,

     int64_t elementalBitwidth) {

   assert(kMaxVectorLoadBitWidth % elementalBitwidth == 0 &&

          "elemental bitwidth does not divide kMaxVectorLoadBitWidth");

   assert(desiredBitAlignment % elementalBitwidth == 0 &&

          "elemental bitwidth does not divide desired bit alignment");

   return std::gcd(

       std::gcd(desiredBitAlignment / elementalBitwidth, numContiguousElements),

       kMaxVectorLoadBitWidth / elementalBitwidth);

 }


 /// Get the list of all factors that divide `val`, not just the prime factors.

 static SmallVector<int64_t> getFactors(int64_t val) {

   SmallVector<int64_t> factors;

   factors.reserve(val);

   for (int64_t factor = 1; factor <= val; ++factor) {

     if (val % factor != 0)

       continue;

     factors.push_back(factor);

   }

   factors.push_back(val);

   return factors;

 }


 static int64_t product(ArrayRef<int64_t> vals) {

   int64_t res = 1;

   for (auto val : vals)

     res *= val;

   return res;

 }


 /// Extract `result` from `sizes` with the following constraints:

 ///   1. sizes[i] % result[i] for all i

 ///   2. product_of_threadsPerDim <= maxNumThreads

 ///   3. if `currentIndex` is sizes.size() - 1, then threadsPerDim[currentIndex]

 ///      must be sizes[currentIndex].

 /// This is used to greedily extract the maximum number of threads usable for

 /// mapping a copy of size `sizes`, while being bounded by `totalNumThreads` and

 /// ensuring coalesced access along the most minor dimension.

 /// Return the number of threads used in the range:

 ///   threadsPerDim[currentIndex .. sizes.end()]

 // The implementation uses a dynamic programming approach to greedily extract

 // the best combination under the constraints.

 // TODO: Implementation details can be improved but putting effort there is a

 // tradeoffs: `sizes` is expected to be of small rank and contain small values.

 static SmallVector<int64_t> maximizeNumThreads(ArrayRef<int64_t> sizes,

                                                int64_t currentIndex,

                                                int64_t maxNumThreads) {

   assert(static_cast<size_t>(currentIndex) < sizes.size() &&

          "currentIndex out of bounds");

   std::string indent(2 * currentIndex, '-');

   if (static_cast<size_t>(currentIndex) == sizes.size() - 1) {

     LDBG(indent << "mandated globalBest: " << sizes[currentIndex]);

     return SmallVector<int64_t>{sizes[currentIndex]};

   }


   int64_t best = 0;

   int64_t s = sizes[currentIndex];

   SmallVector<int64_t> factors = getFactors(s);

   SmallVector<int64_t> localThreadsPerDim;

   localThreadsPerDim.reserve(sizes.size());

   LDBG(indent << "maximizeNumThreads in " << s

               << " with limit: " << maxNumThreads);

   for (auto factor : factors) {

     auto nestedThreadsPerDim =

         maximizeNumThreads(sizes, currentIndex + 1, maxNumThreads / factor);

     int64_t localBest = factor * product(nestedThreadsPerDim);

     if (localBest > best && localBest <= maxNumThreads) {

       LDBG(indent << "new localBest: " << localBest);

       LDBG(indent << "nestedThreadsPerDim: "

                   << llvm::interleaved(nestedThreadsPerDim));

       localThreadsPerDim.clear();

       localThreadsPerDim.push_back(factor);

       llvm::append_range(localThreadsPerDim, nestedThreadsPerDim);

       best = localBest;

     }

   }


   LDBG(indent << "found globalBest: " << best);

   LDBG(indent << "numThreads: " << llvm::interleaved(localThreadsPerDim));

   return localThreadsPerDim;

 }


 transform::gpu::CopyMappingInfo::Status

 transform::gpu::CopyMappingInfo::inferNumThreads(int64_t totalNumThreads,

                                                  ArrayRef<int64_t> sizes,

                                                  int64_t desiredVectorSize,

                                                  bool favorPredication) {


   if (!favorPredication) {

     int64_t localVectorSize = desiredVectorSize;

     for (; localVectorSize >= 1; localVectorSize /= 2) {

       // Attempt to map the copy with predication and current fixed vector size:

       //   1. if the status is Success, we are done.

       //   2. if the status is Invalid, we fail immediately, no amount of

       //   vector size reduction can offset the bad tile size selection from the

       //   higher-level.

       //   3. if the status is RequiresPredication, we try again with a smaller

       //   vector size.

       Status status =

           inferNumThreadsImpl(totalNumThreads, sizes, localVectorSize);

       if (status == Status::Success || status == Status::Invalid)

         return status;


       LDBG("requires predication, try reducing vector size to "

            << (localVectorSize / 2));

     }

   }


   // If we have not yet returned, it means that we have tried all vector sizes

   // and we still require predication. Restart from the original vector size and

   // do not attempt to

   return inferNumThreadsImpl(totalNumThreads, sizes, desiredVectorSize);

 }


 transform::gpu::CopyMappingInfo::Status

 transform::gpu::CopyMappingInfo::inferNumThreadsImpl(

     int64_t totalNumThreads, ArrayRef<int64_t> sizes,

     int64_t desiredVectorSize) {

   assert(sizes.back() % desiredVectorSize == 0 &&

          "most-minor size not divisible by actualVectorSize");


   LDBG("inferNumThreadsImpl with totalNumThreads: "

        << totalNumThreads << " and vectorSize: " << desiredVectorSize);


   // Scale the most minor size to account for the chosen vector size and

   // maximize the number of threads without exceeding the total number of

   // threads.

   SmallVector<int64_t> scaledSizes(sizes);

   scaledSizes.back() /= desiredVectorSize;

   if (scaledSizes.back() > totalNumThreads) {

     LDBG("--Too few threads given the required vector size -> FAIL");

     return Status::Invalid;

   }

   SmallVector<int64_t> inferredNumThreads =

       maximizeNumThreads(scaledSizes, 0, totalNumThreads);


   LDBG("inferred numThreads: " << llvm::interleaved(inferredNumThreads));

   LDBG("computed actualVectorSize: " << desiredVectorSize);


   // Corner case: we cannot use more threads than available. If the dimension of

   // the copy is so bad it is because higher-level tiling did not do its job, we

   // do not try to recover from it here.

   int64_t totalNumThreadsUsed = product(inferredNumThreads);

   LDBG("--totalNumThreadsUsed: " << totalNumThreadsUsed);

   if (totalNumThreadsUsed == 0 || totalNumThreadsUsed > totalNumThreads) {

     LDBG("--Too few threads given the required vector size -> FAIL");

     return Status::Invalid;

   }


   this->vectorSize = desiredVectorSize;

   this->numThreads = inferredNumThreads;

   if (totalNumThreadsUsed == totalNumThreads)

     return Status::Success;


   return Status::RequiresPredication;

 }


 void transform::gpu::CopyMappingInfo::print(llvm::raw_ostream &os) const {

   os << "MappingInfo{"

      << "CopyMappingInfo: " << "valid: " << (status != Status::Invalid) << ", "

      << "vectorSize: " << vectorSize << ", numThreads: {"

      << llvm::interleaved(numThreads) << "}, smallestBoundingTileSizes: {"

      << llvm::interleaved(smallestBoundingTileSizes) << "}, threadMapping: {"

      << llvm::interleaved(threadMapping) << "}}";

 }

GPUDialect.h

maximizeNumThreads
static SmallVector< int64_t > maximizeNumThreads(ArrayRef< int64_t > sizes, int64_t currentIndex, int64_t maxNumThreads)
Extract result from sizes with the following constraints:
Definition: GPUHeuristics.cpp:137

linearId1
static Attribute linearId1(MLIRContext *ctx)
Definition: GPUHeuristics.cpp:31

product
static int64_t product(ArrayRef< int64_t > vals)
Definition: GPUHeuristics.cpp:116

getFactors
static SmallVector< int64_t > getFactors(int64_t val)
Get the list of all factors that divide val, not just the prime factors.
Definition: GPUHeuristics.cpp:104

linearId0
static Attribute linearId0(MLIRContext *ctx)
Definition: GPUHeuristics.cpp:28

linearId2
static Attribute linearId2(MLIRContext *ctx)
Definition: GPUHeuristics.cpp:34

DBGS
#define DBGS()
Definition: GPUHeuristics.cpp:25

LDBG
#define LDBG(X)
Definition: GPUHeuristics.cpp:26

GPUHeuristics.h

llvm::ArrayRef
Definition: LLVM.h:48

llvm::SmallVector
Definition: LLVM.h:72

mlir::Attribute
Attributes are known-constant values of operations.
Definition: Attributes.h:25

mlir::MLIRContext
MLIRContext is the top-level object for a collection of MLIR operations.
Definition: MLIRContext.h:60

mlir
Include the generated interface declarations.
Definition: LocalAliasAnalysis.h:20

mlir::get
auto get(MLIRContext *context, Ts &&...params)
Helper method that injects context only if needed, this helps unify some of the attribute constructio...
Definition: BytecodeImplementation.h:509

mlir::transform::gpu::CopyMappingInfo::threadMapping
SmallVector< Attribute > threadMapping
Thread mapping attributes, one per entry of numThreads.
Definition: GPUHeuristics.h:29

mlir::transform::gpu::CopyMappingInfo::Status
Status
Status of the mapping computation, invalid usually means too many threads are required and we fail to...
Definition: GPUHeuristics.h:36

mlir::transform::gpu::CopyMappingInfo::Status::Invalid
@ Invalid

mlir::transform::gpu::CopyMappingInfo::vectorSize
int64_t vectorSize
Most minor vector size (i.e. 1-D), in number of elements, used in a copy.
Definition: GPUHeuristics.h:103

mlir::transform::gpu::CopyMappingInfo::status
Status status
The status of a particular copy mapping.
Definition: GPUHeuristics.h:119

mlir::transform::gpu::CopyMappingInfo::CopyMappingInfo
CopyMappingInfo(MLIRContext *ctx, int totalNumThreads, int64_t desiredBitAlignment, ArrayRef< int64_t > sizes, bool favorPredication=false, int64_t elementalBitwidth=32)
Greedily compute the MappingInfo to use to perform a copy of sizes elements of bitwidth elementalBitw...
Definition: GPUHeuristics.cpp:38

mlir::transform::gpu::CopyMappingInfo::kMaxVectorLoadBitWidth
static constexpr int64_t kMaxVectorLoadBitWidth
Static quantity determining the number of bits to target in an individual copy.
Definition: GPUHeuristics.h:100

mlir::transform::gpu::CopyMappingInfo::print
void print(llvm::raw_ostream &os) const
Definition: GPUHeuristics.cpp:250

mlir::transform::gpu::CopyMappingInfo::smallestBoundingTileSizes
SmallVector< int64_t > smallestBoundingTileSizes
Explicit computation / injection of the smallest bounding tile sizes after mapping to numThreads.
Definition: GPUHeuristics.h:112

mlir::transform::gpu::CopyMappingInfo::numThreads
SmallVector< int64_t > numThreads
Number of threads to use for the copy mapping, from most major to most minor dims (i....
Definition: GPUHeuristics.h:26