33 #include "llvm/ADT/STLExtras.h"
34 #include "llvm/ADT/SmallVector.h"
35 #include "llvm/ADT/TypeSwitch.h"
36 #include "llvm/Support/Debug.h"
37 #include "llvm/Support/InterleavedRange.h"
44 #define DEBUG_TYPE "gpu-transforms"
46 #define DBGS() (llvm::dbgs() << '[' << DEBUG_TYPE << "] ")
47 #define LDBG(X) LLVM_DEBUG(DBGS() << (X) << "\n")
48 #define DBGS_ALIAS() (llvm::dbgs() << '[' << DEBUG_TYPE_ALIAS << "] ")
51 template <
typename ThreadOrBlockIdOp>
54 LLVM_DEBUG(
DBGS() <<
"----buildLinearId with originalBasisOfr: "
55 << llvm::interleaved(originalBasisOfr) <<
"\n");
56 assert(originalBasisOfr.size() == 3 &&
"expected 3 sizes");
62 rewriter.
create<ThreadOrBlockIdOp>(loc, indexType, Dimension::x)
64 rewriter.
create<ThreadOrBlockIdOp>(loc, indexType, Dimension::y)
66 rewriter.
create<ThreadOrBlockIdOp>(loc, indexType, Dimension::z)
68 originalBasisOfr[0], originalBasisOfr[1]};
70 rewriter, loc, tx + ty * bdx + tz * bdx * bdy, vals);
77 template <
typename ThreadOrBlockIdOp>
85 buildLinearId<ThreadOrBlockIdOp>(rewriter, loc, originalBasisOfr);
92 rewriter, loc, d0.
floorDiv(multiplicity), {linearId});
96 for (
AffineExpr e : llvm::reverse(delinearizingExprs)) {
101 LLVM_DEBUG(
DBGS() <<
"--delinearization basis: "
102 << llvm::interleaved(reverseBasisSizes) <<
"\n";
103 DBGS() <<
"--delinearization strides: "
104 << llvm::interleaved(strides) <<
"\n";
105 DBGS() <<
"--delinearization exprs: "
106 << llvm::interleaved(delinearizingExprs) <<
"\n";
107 DBGS() <<
"--ids: " << llvm::interleaved(ids) <<
"\n");
128 template <
typename ThreadOrBlockIdOp>
135 rewriter.
create<ThreadOrBlockIdOp>(loc, indexType, Dimension::x),
136 rewriter.
create<ThreadOrBlockIdOp>(loc, indexType, Dimension::y),
137 rewriter.
create<ThreadOrBlockIdOp>(loc, indexType, Dimension::z)};
142 rewriter, loc, d0.
floorDiv(multiplicity), {scaledIds[0]}));
145 forallMappingSizeInOriginalBasis[0] *= multiplicity;
160 namespace transform {
163 GpuIdBuilder::GpuIdBuilder(
MLIRContext *ctx,
bool useLinearMapping,
165 : mappingAttributes(), idBuilder() {
166 if (useLinearMapping) {
167 for (uint64_t d =
static_cast<uint64_t
>(MappingId::LinearDim0),
168 e = getMaxEnumValForMappingId();
172 for (uint64_t d =
static_cast<uint64_t
>(MappingId::DimX),
173 e =
static_cast<uint64_t
>(MappingId::DimZ);
183 idBuilder = useLinearMapping
184 ? commonLinearIdBuilderFn<BlockIdOp>(1)
189 bool useLinearMapping)
196 ? commonLinearIdBuilderFn<ThreadIdOp>(
203 bool useLinearMapping)
211 ? commonLinearIdBuilderFn<ThreadIdOp>(
warpSize)
219 idBuilder = useLinearMapping
220 ? commonLinearIdBuilderFn<ThreadIdOp>(1)
225 std::optional<int64_t> gridDimX,
226 std::optional<int64_t> gridDimY,
227 std::optional<int64_t> gridDimZ,
228 std::optional<int64_t> blockDimX,
229 std::optional<int64_t> blockDimY,
230 std::optional<int64_t> blockDimZ) {
234 if ((blockDimX.value_or(1) * blockDimY.value_or(1) * blockDimZ.value_or(1)) >
236 (gridDimX.value_or(1) * gridDimY.value_or(1) * gridDimZ.value_or(1)) >
244 return transformOp.emitSilenceableError()
245 <<
"Trying to launch a GPU kernel with grid_dims = ("
246 << gridDimX.value_or(1) <<
", " << gridDimY.value_or(1) <<
", "
247 << gridDimZ.value_or(1) <<
") block_dims = ("
248 << blockDimX.value_or(1) <<
", " << blockDimY.value_or(1) <<
", "
249 << blockDimZ.value_or(1) <<
"). It is larger than the limits.";
256 LaunchOp &launchOp, std::optional<int64_t> gridDimX,
257 std::optional<int64_t> gridDimY, std::optional<int64_t> gridDimZ,
258 std::optional<int64_t> blockDimX, std::optional<int64_t> blockDimY,
259 std::optional<int64_t> blockDimZ) {
261 checkGpuLimits(transformOp, gridDimX, gridDimY, gridDimZ, blockDimX,
262 blockDimY, blockDimZ);
263 if (!
diag.succeeded())
271 Value gridSizeX = gridDimX.has_value() ?
createConst(gridDimX.value()) : one;
272 Value gridSizeY = gridDimY.has_value() ?
createConst(gridDimY.value()) : one;
273 Value gridSizeZ = gridDimZ.has_value() ?
createConst(gridDimZ.value()) : one;
274 Value blkSizeX = blockDimX.has_value() ?
createConst(blockDimX.value()) : one;
275 Value blkSizeY = blockDimY.has_value() ?
createConst(blockDimY.value()) : one;
276 Value blkSizeZ = blockDimZ.has_value() ?
createConst(blockDimZ.value()) : one;
277 launchOp = rewriter.
create<LaunchOp>(loc, gridSizeX, gridSizeY, gridSizeZ,
278 blkSizeX, blkSizeY, blkSizeZ);
280 rewriter.
create<TerminatorOp>(loc);
287 TransformOpInterface transformOp, std::optional<int64_t> gridDimX,
288 std::optional<int64_t> gridDimY, std::optional<int64_t> gridDimZ,
289 std::optional<int64_t> blockDimX, std::optional<int64_t> blockDimY,
290 std::optional<int64_t> blockDimZ) {
292 checkGpuLimits(transformOp, gridDimX, gridDimY, gridDimZ, blockDimX,
293 blockDimY, blockDimZ);
294 if (!
diag.succeeded())
297 KernelDim3 currentBlockdim = gpuLaunch.getBlockSizeOperandValues();
300 auto createConstValue = [&](
int dim) {
305 if (gridDimX.has_value())
306 gpuLaunch.getGridSizeXMutable().assign(createConstValue(gridDimX.value()));
307 if (gridDimY.has_value())
308 gpuLaunch.getGridSizeYMutable().assign(createConstValue(gridDimY.value()));
309 if (gridDimZ.has_value())
310 gpuLaunch.getGridSizeZMutable().assign(createConstValue(gridDimZ.value()));
311 if (blockDimX.has_value())
312 gpuLaunch.getBlockSizeXMutable().assign(
313 createConstValue(blockDimX.value()));
314 if (blockDimY.has_value())
315 gpuLaunch.getBlockSizeYMutable().assign(
316 createConstValue(blockDimY.value()));
317 if (blockDimZ.has_value())
318 gpuLaunch.getBlockSizeZMutable().assign(
319 createConstValue(blockDimZ.value()));
static std::string diag(const llvm::Value &value)
constexpr int kMaxGriddimz
constexpr int kMaxTotalBlockdim
constexpr int kMaxGriddimy
constexpr int kMaxBlockdimx
constexpr int kMaxBlockdimz
constexpr int kMaxGriddimx
constexpr int kMaxBlockdimy
constexpr int kMaxTotalGriddim
Base type for affine expression.
AffineExpr floorDiv(uint64_t v) const
MLIRContext * getContext() const
The result of a transform IR operation application.
static DiagnosedSilenceableFailure success()
Constructs a DiagnosedSilenceableFailure in the success state.
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around a LocationAttr.
MLIRContext is the top-level object for a collection of MLIR operations.
RAII guard to reset the insertion point of the builder when destroyed.
void setInsertionPointToEnd(Block *block)
Sets the insertion point to the end of the specified block.
void setInsertionPointAfterValue(Value val)
Sets the insertion point to the node after the specified value.
Operation * create(const OperationState &state)
Creates an operation given the fields represented as an OperationState.
This class represents a single result from folding an operation.
This class coordinates the application of a rewrite on a set of IR, providing a way for clients to tr...
This class represents an instance of an SSA value in the MLIR system, representing a computable value that has a type and a set of users.
Location getLoc() const
Return the location of this value.
Specialization of arith.constant op that returns an integer of index type.
AffineApplyOp makeComposedAffineApply(OpBuilder &b, Location loc, AffineMap map, ArrayRef< OpFoldResult > operands)
Returns a composed AffineApplyOp by composing map and operands with other AffineApplyOps supplying those operands.
OpFoldResult makeComposedFoldedAffineApply(OpBuilder &b, Location loc, AffineMap map, ArrayRef< OpFoldResult > operands)
Constructs an AffineApplyOp that applies map to operands after composing the map with the maps of any...
Include the generated interface declarations.
OpFoldResult getAsIndexOpFoldResult(MLIRContext *ctx, int64_t val)
Convert int64_t to integer attributes of index type and return them as OpFoldResult.
void bindDims(MLIRContext *ctx, AffineExprTy &...exprs)
Bind a list of AffineExpr references to DimExpr at positions: [0 .. sizeof...(exprs)].
SmallVector< int64_t > computeStrides(ArrayRef< int64_t > sizes)
SmallVector< int64_t > delinearize(int64_t linearIndex, ArrayRef< int64_t > strides)
Given the strides together with a linear index in the dimension space, return the vector-space offsets in each dimension for a de-linearized index.
int64_t computeProduct(ArrayRef< int64_t > basis)
Self-explanatory.
void bindSymbols(MLIRContext *ctx, AffineExprTy &...exprs)
Bind a list of AffineExpr references to SymbolExpr at positions: [0 .. sizeof...(exprs)].
Value getValueOrCreateConstantIndexOp(OpBuilder &b, Location loc, OpFoldResult ofr)
Converts an OpFoldResult to a Value.
auto get(MLIRContext *context, Ts &&...params)
Helper method that injects context only if needed, this helps unify some of the attribute constructio...
AffineExpr getAffineDimExpr(unsigned position, MLIRContext *context)
These free functions allow clients of the API to not use classes in detail.
Utility class for the GPU dialect to represent triples of Values accessible through `.x`, `.y`, and `.z`, similarly to CUDA notation.