MLIR
22.0.0git
include
mlir
Dialect
Linalg
TransformOps
GPUHeuristics.h
Go to the documentation of this file.
1
//===- GPUHeuristics.h - GPU heuristics for Linalg transforms ---*- C++ -*-===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
9
#ifndef MLIR_DIALECT_LINALG_TRANSFORMOPS_GPUHEURISTICS_H
10
#define MLIR_DIALECT_LINALG_TRANSFORMOPS_GPUHEURISTICS_H
11
12
#include "
mlir/IR/Attributes.h
"
13
#include "
mlir/IR/MLIRContext.h
"
14
15
namespace
mlir
{
16
namespace
transform
{
17
namespace
gpu
{
18
19
/// Base struct to hold GPU mapping information for a given operation.
20
struct
MappingInfo
{
21
/// Number of threads to use for the mapping.
22
/// Note: When the number of threads used is smaller than the total number of
23
/// available threads, predication ensues. It is often useful to use more
24
/// threads and saturate memory bandwidth for some operations, even if others
25
/// end up being predicated.
26
SmallVector<int64_t>
numThreads
;
27
28
/// Thread mapping attributes, one per entry of `numThreads`.
29
SmallVector<Attribute>
threadMapping
;
30
};
31
32
struct
CopyMappingInfo
:
public
MappingInfo
{
33
/// Status of the mapping computation, invalid usually means too many threads
34
/// are required and we fail to map. This usually happens when the copy is too
35
/// large compared to the number of threads.
36
enum class
Status
{
Success
= 0,
RequiresPredication
,
Invalid
};
37
38
/// Greedily compute the MappingInfo to use to perform a copy of `sizes`
39
/// elements of bitwidth `elementalBitwidth`.
40
/// The `desiredBitAlignment` is the number of elements by which the most
41
/// minor dimension of the copy is expected to be aligned.
42
/// This is an approximation of the final alignment, for each row of the copy.
43
/// This is used to restrict the size of copied vector so that they match
44
/// potential subsequent cp.async.
45
/// If the alignment does not match the required alignment for a cp.async down
46
/// the line, the conversion to cp.async will be eventually skipped, possibly
47
/// degrading performance.
48
/// When `favorPredication` is false, the mapping is computed to fill all
49
/// threads with an equal amount of data to copy, so as to avoid predication.
50
/// Predication ends up requiring a split epilogue in current pipelining
51
/// implementations and is better avoided when possible.
52
CopyMappingInfo
(
MLIRContext
*ctx,
int
totalNumThreads,
53
int64_t
desiredBitAlignment,
ArrayRef<int64_t>
sizes,
54
bool
favorPredication =
false
,
55
int64_t
elementalBitwidth = 32);
56
57
private
:
58
/// Determine the maximal vector size to use to copy a contiguous array of
59
/// `numContiguousElements`, each of bitwidth `elementalBitwidth`.
60
/// The `alignment` is the number of elements by which the most minor
61
/// dimension of the copy is aligned. This is an approximation of actual
62
/// memory alignment after bufferization, for each row of the copy. This is
63
/// used to restrict the of the copied vector so that it is properly aligned
64
/// with the requirements of cp.async. If the copy alignment does not match
65
/// the required aligned for a cp.async, thae conversion to cp.async will be
66
/// skipped.
67
/// Asserts that `elementalBitwidth` divides `numContiguousElements`.
68
static
int64_t
69
maxContiguousElementsToTransfer(
int64_t
alignment,
70
int64_t
numContiguousElements,
71
int64_t
elementalBitwidth = 32);
72
73
/// Compute the number of threads to use to perform a copy of `sizes`
74
/// elements of `elementalBitwidth`.
75
/// The `alignment` is the number of elements by which the most minor
76
/// dimension of the copy is aligned. This is an approximation of actual
77
/// memory alignment after bufferization, for each row of the copy. This is
78
/// used to restrict the of the copied vector so that it is properly aligned
79
/// with the requirements of cp.async. If the copy alignment does not match
80
/// the required aligned for a cp.async, the conversion to cp.async will be
81
/// skipped.
82
/// When `favorPredication` is false, the implementation avoids predication
83
/// in the copy, even if it means reducing the granularity of the transfer.
84
/// Otherwise, the implementation will come up with a maximal assignment of
85
/// the remaining threads to sizes of interest, using a DP implementation.
86
Status inferNumThreads(
int64_t
totalNumThreads,
ArrayRef<int64_t>
sizes,
87
int64_t
desiredVectorSize,
bool
favorPredication);
88
Status inferNumThreadsImpl(
int64_t
totalNumThreads,
ArrayRef<int64_t>
sizes,
89
int64_t
desiredVectorSize);
90
91
public
:
92
// Pretty-printing and diagnostic methods.
93
void
print
(llvm::raw_ostream &os)
const
;
94
LLVM_DUMP_METHOD
void
dump
()
const
;
95
96
/// Static quantity determining the number of bits to target in an individual
97
/// copy. Assumes that smaller increments of 64, 32, 16, 8 are also valid
98
/// transfer sizes. In the future we should have more hardware pluggability
99
/// here, especially when we want sub-byte granularity
100
static
constexpr
int64_t
kMaxVectorLoadBitWidth
= 128;
101
102
/// Most minor vector size (i.e. 1-D), in number of elements, used in a copy.
103
int64_t
vectorSize
;
104
105
/// Number of threads to use for the copy mapping, from most major to most
106
/// minor dims (i.e. numThreads.back() should be mapped to contiguous threads
107
/// for best coalescing).
108
using
MappingInfo::numThreads
;
109
110
/// Explicit computation / injection of the smallest bounding tile sizes after
111
/// mapping to `numThreads`. This is useful in masked scenarios.
112
SmallVector<int64_t>
smallestBoundingTileSizes
;
113
114
/// Thread mapping attributes, one per entry of `numThreads`.
115
using
MappingInfo::threadMapping
;
116
117
/// The status of a particular copy mapping. Must be checked before applying
118
/// transformations.
119
Status
status
;
120
};
121
122
inline
llvm::raw_ostream &
operator<<
(llvm::raw_ostream &os,
123
const
CopyMappingInfo
&info) {
124
info.
print
(os);
125
return
os;
126
}
127
128
}
// namespace gpu
129
}
// namespace transform
130
}
// namespace mlir
131
132
#endif
// MLIR_DIALECT_LINALG_TRANSFORMOPS_GPUHEURISTICS_H
Attributes.h
MLIRContext.h
int64_t
llvm::ArrayRef
Definition
LLVM.h:48
llvm::SmallVector
Definition
LLVM.h:72
mlir::MLIRContext
MLIRContext is the top-level object for a collection of MLIR operations.
Definition
MLIRContext.h:63
mlir::transform::gpu
Definition
GPUTransformOps.h:32
mlir::transform::gpu::operator<<
raw_ostream & operator<<(raw_ostream &os, const IdBuilderResult &res)
Definition
Utils.h:41
mlir::transform
Definition
DLTITransformOps.h:18
mlir
Include the generated interface declarations.
Definition
AliasAnalysis.h:19
mlir::transform::gpu::CopyMappingInfo
Definition
GPUHeuristics.h:32
mlir::transform::gpu::CopyMappingInfo::Status
Status
Status of the mapping computation, invalid usually means too many threads are required and we fail to...
Definition
GPUHeuristics.h:36
mlir::transform::gpu::CopyMappingInfo::Status::Invalid
@ Invalid
Definition
GPUHeuristics.h:36
mlir::transform::gpu::CopyMappingInfo::Status::Success
@ Success
Definition
GPUHeuristics.h:36
mlir::transform::gpu::CopyMappingInfo::Status::RequiresPredication
@ RequiresPredication
Definition
GPUHeuristics.h:36
mlir::transform::gpu::CopyMappingInfo::vectorSize
int64_t vectorSize
Most minor vector size (i.e. 1-D), in number of elements, used in a copy.
Definition
GPUHeuristics.h:103
mlir::transform::gpu::CopyMappingInfo::status
Status status
The status of a particular copy mapping.
Definition
GPUHeuristics.h:119
mlir::transform::gpu::CopyMappingInfo::dump
LLVM_DUMP_METHOD void dump() const
mlir::transform::gpu::CopyMappingInfo::CopyMappingInfo
CopyMappingInfo(MLIRContext *ctx, int totalNumThreads, int64_t desiredBitAlignment, ArrayRef< int64_t > sizes, bool favorPredication=false, int64_t elementalBitwidth=32)
Greedily compute the MappingInfo to use to perform a copy of sizes elements of bitwidth elementalBitw...
Definition
GPUHeuristics.cpp:36
mlir::transform::gpu::CopyMappingInfo::kMaxVectorLoadBitWidth
static constexpr int64_t kMaxVectorLoadBitWidth
Static quantity determining the number of bits to target in an individual copy.
Definition
GPUHeuristics.h:100
mlir::transform::gpu::CopyMappingInfo::print
void print(llvm::raw_ostream &os) const
Definition
GPUHeuristics.cpp:246
mlir::transform::gpu::CopyMappingInfo::smallestBoundingTileSizes
SmallVector< int64_t > smallestBoundingTileSizes
Explicit computation / injection of the smallest bounding tile sizes after mapping to numThreads.
Definition
GPUHeuristics.h:112
mlir::transform::gpu::MappingInfo
Base struct to hold GPU mapping information for a given operation.
Definition
GPUHeuristics.h:20
mlir::transform::gpu::MappingInfo::threadMapping
SmallVector< Attribute > threadMapping
Thread mapping attributes, one per entry of numThreads.
Definition
GPUHeuristics.h:29
mlir::transform::gpu::MappingInfo::numThreads
SmallVector< int64_t > numThreads
Number of threads to use for the mapping.
Definition
GPUHeuristics.h:26
Generated on
for MLIR by
1.14.0