doxygen/GPUHeuristics_8h_source.html

 //===- GPUHeuristics.h - GPU heuristics for Linalg transforms ---*- C++ -*-===//

 //

 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

 // See https://llvm.org/LICENSE.txt for license information.

 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

 //

 //===----------------------------------------------------------------------===//


 #ifndef MLIR_DIALECT_LINALG_TRANSFORMOPS_GPUHEURISTICS_H

 #define MLIR_DIALECT_LINALG_TRANSFORMOPS_GPUHEURISTICS_H


 #include "mlir/IR/Attributes.h"

 #include "mlir/IR/MLIRContext.h"


 namespace mlir {

 namespace transform {

 namespace gpu {


 /// Base struct to hold GPU mapping information for a given operation.

 /// It pairs a list of thread counts with one thread mapping attribute per

 /// entry of that list. `CopyMappingInfo` derives from this struct.

 struct MappingInfo {

   /// Number of threads to use for the mapping.

   /// Note: When the number of threads used is smaller than the total number of

   /// available threads, predication ensues. It is often useful to use more

   /// threads and saturate memory bandwidth for some operations, even if others

   /// end up being predicated.

   SmallVector<int64_t> numThreads;


   /// Thread mapping attributes, one per entry of `numThreads`.

   SmallVector<Attribute> threadMapping;

 };


 /// Mapping information for a copy operation. In addition to the inherited

 /// `numThreads` and `threadMapping`, it records the vector size used for the

 /// copy, the smallest bounding tile sizes and the `status` of the mapping

 /// computation. `status` must be checked before applying transformations.

 struct CopyMappingInfo : public MappingInfo {

   /// Status of the mapping computation. `Invalid` usually means that too many

   /// threads are required and we fail to map. This usually happens when the

   /// copy is too large compared to the number of threads.

   enum class Status { Success = 0, RequiresPredication, Invalid };


   /// Greedily compute the MappingInfo to use to perform a copy of `sizes`

   /// elements of bitwidth `elementalBitwidth` (32 by default).

   /// The `desiredBitAlignment` is the number of elements by which the most

   /// minor dimension of the copy is expected to be aligned.

   /// NOTE(review): the parameter name suggests a quantity in bits while this

   /// description speaks of elements; confirm the unit against the definition.

   /// This is an approximation of the final alignment, for each row of the copy.

   /// This is used to restrict the size of the copied vectors so that they match

   /// potential subsequent cp.async.

   /// If the alignment does not match the required alignment for a cp.async down

   /// the line, the conversion to cp.async will be eventually skipped, possibly

   /// degrading performance.

   /// When `favorPredication` is false (the default), the mapping is computed to

   /// fill all threads with an equal amount of data to copy, so as to avoid

   /// predication. Predication ends up requiring a split epilogue in current

   /// pipelining implementations and is better avoided when possible.

   /// `totalNumThreads` is presumably the number of threads available to the

   /// copy, and `ctx` presumably the context used to create the thread mapping

   /// attributes; confirm both against the definition.

   CopyMappingInfo(MLIRContext *ctx, int totalNumThreads,

                   int64_t desiredBitAlignment, ArrayRef<int64_t> sizes,

                   bool favorPredication = false,

                   int64_t elementalBitwidth = 32);


 private:

   /// Determine the maximal vector size to use to copy a contiguous array of

   /// `numContiguousElements`, each of bitwidth `elementalBitwidth`.

   /// The `alignment` is the number of elements by which the most minor

   /// dimension of the copy is aligned. This is an approximation of actual

   /// memory alignment after bufferization, for each row of the copy. This is

   /// used to restrict the size of the copied vector so that it is properly

   /// aligned with the requirements of cp.async. If the copy alignment does not

   /// match the required alignment for a cp.async, the conversion to cp.async

   /// will be skipped.

   /// Asserts that `elementalBitwidth` divides `numContiguousElements`.

   static int64_t

   maxContiguousElementsToTransfer(int64_t alignment,

                                   int64_t numContiguousElements,

                                   int64_t elementalBitwidth = 32);


   /// Compute the number of threads to use to perform a copy of `sizes`

   /// elements of `elementalBitwidth`.

   /// The `alignment` is the number of elements by which the most minor

   /// dimension of the copy is aligned. This is an approximation of actual

   /// memory alignment after bufferization, for each row of the copy. This is

   /// used to restrict the size of the copied vector so that it is properly

   /// aligned with the requirements of cp.async. If the copy alignment does not

   /// match the required alignment for a cp.async, the conversion to cp.async

   /// will be skipped.

   /// NOTE(review): this declaration takes `desiredVectorSize` and has neither

   /// an `alignment` nor an `elementalBitwidth` parameter; the paragraph above

   /// appears to describe a different signature. Confirm and update.

   /// When `favorPredication` is false, the implementation avoids predication

   /// in the copy, even if it means reducing the granularity of the transfer.

   /// Otherwise, the implementation will come up with a maximal assignment of

   /// the remaining threads to sizes of interest, using a DP implementation.

   Status inferNumThreads(int64_t totalNumThreads, ArrayRef<int64_t> sizes,

                          int64_t desiredVectorSize, bool favorPredication);

   /// Helper taking the same arguments as `inferNumThreads` except

   /// `favorPredication`. Presumably the core computation that

   /// `inferNumThreads` delegates to; confirm against the definition.

   Status inferNumThreadsImpl(int64_t totalNumThreads, ArrayRef<int64_t> sizes,

                              int64_t desiredVectorSize);


 public:

   // Pretty-printing and diagnostic methods.

   void print(llvm::raw_ostream &os) const;

   LLVM_DUMP_METHOD void dump() const;


   /// Static quantity determining the number of bits to target in an individual

   /// copy. Assumes that smaller increments of 64, 32, 16, 8 are also valid

   /// transfer sizes. In the future we should have more hardware pluggability

   /// here, especially when we want sub-byte granularity.

   static constexpr int64_t kMaxVectorLoadBitWidth = 128;


   /// Most minor vector size (i.e. 1-D), in number of elements, used in a copy.

   int64_t vectorSize;


   /// Number of threads to use for the copy mapping, from most major to most

   /// minor dims (i.e. numThreads.back() should be mapped to contiguous threads

   /// for best coalescing).

   using MappingInfo::numThreads;


   /// Explicit computation / injection of the smallest bounding tile sizes after

   /// mapping to `numThreads`. This is useful in masked scenarios.

   SmallVector<int64_t> smallestBoundingTileSizes;


   /// Thread mapping attributes, one per entry of `numThreads`.

   using MappingInfo::threadMapping;


   /// The status of a particular copy mapping. Must be checked before applying

   /// transformations.

   Status status;

 };


 } // namespace gpu

 } // namespace transform

 } // namespace mlir


 #endif // MLIR_DIALECT_LINALG_TRANSFORMOPS_GPUHEURISTICS_H

Attributes.h

MLIRContext.h

llvm::ArrayRef
Definition: LLVM.h:48

llvm::SmallVector
Definition: LLVM.h:72

mlir::MLIRContext
MLIRContext is the top-level object for a collection of MLIR operations.
Definition: MLIRContext.h:60

mlir
Include the generated interface declarations.
Definition: LocalAliasAnalysis.h:20

mlir::transform::gpu::CopyMappingInfo
Definition: GPUHeuristics.h:32

mlir::transform::gpu::CopyMappingInfo::Status
Status
Status of the mapping computation, invalid usually means too many threads are required and we fail to...
Definition: GPUHeuristics.h:36

mlir::transform::gpu::CopyMappingInfo::Status::Invalid
@ Invalid

mlir::transform::gpu::CopyMappingInfo::Status::Success
@ Success

mlir::transform::gpu::CopyMappingInfo::Status::RequiresPredication
@ RequiresPredication

mlir::transform::gpu::CopyMappingInfo::vectorSize
int64_t vectorSize
Most minor vector size (i.e. 1-D), in number of elements, used in a copy.
Definition: GPUHeuristics.h:103

mlir::transform::gpu::CopyMappingInfo::status
Status status
The status of a particular copy mapping.
Definition: GPUHeuristics.h:119

mlir::transform::gpu::CopyMappingInfo::dump
LLVM_DUMP_METHOD void dump() const

mlir::transform::gpu::CopyMappingInfo::CopyMappingInfo
CopyMappingInfo(MLIRContext *ctx, int totalNumThreads, int64_t desiredBitAlignment, ArrayRef< int64_t > sizes, bool favorPredication=false, int64_t elementalBitwidth=32)
Greedily compute the MappingInfo to use to perform a copy of sizes elements of bitwidth elementalBitw...
Definition: GPUHeuristics.cpp:38

mlir::transform::gpu::CopyMappingInfo::kMaxVectorLoadBitWidth
static constexpr int64_t kMaxVectorLoadBitWidth
Static quantity determining the number of bits to target in an individual copy.
Definition: GPUHeuristics.h:100

mlir::transform::gpu::CopyMappingInfo::print
void print(llvm::raw_ostream &os) const
Definition: GPUHeuristics.cpp:250

mlir::transform::gpu::CopyMappingInfo::smallestBoundingTileSizes
SmallVector< int64_t > smallestBoundingTileSizes
Explicit computation / injection of the smallest bounding tile sizes after mapping to numThreads.
Definition: GPUHeuristics.h:112

mlir::transform::gpu::MappingInfo
Base struct to hold GPU mapping information for a given operation.
Definition: GPUHeuristics.h:20

mlir::transform::gpu::MappingInfo::threadMapping
SmallVector< Attribute > threadMapping
Thread mapping attributes, one per entry of numThreads.
Definition: GPUHeuristics.h:29

mlir::transform::gpu::MappingInfo::numThreads
SmallVector< int64_t > numThreads
Number of threads to use for the mapping.
Definition: GPUHeuristics.h:26