#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "gpu-transforms"

#define DBGS() (llvm::dbgs() << '[' << DEBUG_TYPE << "] ")
#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")
#define DBGS_ALIAS() (llvm::dbgs() << '[' << DEBUG_TYPE_ALIAS << "] ")
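// Illustration (not part of the upstream file): with the macros above,
// LDBG("remapping thread ids") prints
//   [gpu-transforms] remapping thread ids
// to llvm::dbgs(), and only in debug builds when running with
// -debug-only=gpu-transforms.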
/// Build a linear thread (or block) id from the hardware 3-D ids, given the
/// 3-D `originalBasisOfr` sizes.
template <typename ThreadOrBlockIdOp>
static Value buildLinearId(RewriterBase &rewriter, Location loc,
                           ArrayRef<OpFoldResult> originalBasisOfr) {
  LLVM_DEBUG(llvm::interleaveComma(
                 originalBasisOfr,
                 DBGS() << "----buildLinearId with originalBasisOfr: ");
             llvm::dbgs() << "\n");
  assert(originalBasisOfr.size() == 3 && "expected 3 sizes");

  IndexType indexType = rewriter.getIndexType();
  AffineExpr tx, ty, tz, BDX, BDY;
  bindDims(rewriter.getContext(), tx, ty, tz);
  bindSymbols(rewriter.getContext(), BDX, BDY);
  SmallVector<OpFoldResult> vals{
      rewriter.create<ThreadOrBlockIdOp>(loc, indexType, Dimension::x)
          .getResult(),
      rewriter.create<ThreadOrBlockIdOp>(loc, indexType, Dimension::y)
          .getResult(),
      rewriter.create<ThreadOrBlockIdOp>(loc, indexType, Dimension::z)
          .getResult(),
      originalBasisOfr[0], originalBasisOfr[1]};
  OpFoldResult ofr = affine::makeComposedFoldedAffineApply(
      rewriter, loc, tx + ty * BDX + tz * BDX * BDY, vals);
  return getValueOrCreateConstantIndexOp(rewriter, loc, ofr);
}
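// Worked instance (assumed sizes, illustrative only): with block dimensions
// (BDX, BDY) = (4, 8), the affine map above computes
//   linearId = tx + ty * 4 + tz * 4 * 8,
// so the 3-D id (3, 7, 1) folds to 3 + 28 + 32 = 63.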
/// Create a linear id builder that decomposes the linear id obtained from
/// `originalBasis` into the basis of `forallMappingSizes`. The builder
/// returns an n-D vector of ids for indexing and a 1-D size + id for
/// predicate generation.
template <typename ThreadOrBlockIdOp>
static GpuIdBuilderFnType commonLinearIdBuilderFn(int64_t multiplicity = 1) {
  auto res = [multiplicity](RewriterBase &rewriter, Location loc,
                            ArrayRef<int64_t> forallMappingSizes,
                            ArrayRef<int64_t> originalBasis) {
    SmallVector<OpFoldResult> originalBasisOfr =
        getAsIndexOpFoldResult(rewriter.getContext(), originalBasis);
    OpFoldResult linearId =
        buildLinearId<ThreadOrBlockIdOp>(rewriter, loc, originalBasisOfr);
    // Reverse the sizes to compute strides in "row-major" order, with the
    // trailing dimension varying fastest.
    SmallVector<int64_t> reverseBasisSizes(llvm::reverse(forallMappingSizes));
    SmallVector<int64_t> strides = computeStrides(reverseBasisSizes);
    AffineExpr d0;
    bindDims(rewriter.getContext(), d0);
    OpFoldResult scaledLinearId = affine::makeComposedFoldedAffineApply(
        rewriter, loc, d0.floorDiv(multiplicity), {linearId});
    SmallVector<AffineExpr> delinearizingExprs = delinearize(d0, strides);
    SmallVector<Value> ids;
    // Reverse back to be in [0 .. n] order.
    for (AffineExpr e : llvm::reverse(delinearizingExprs)) {
      ids.push_back(
          affine::makeComposedAffineApply(rewriter, loc, e, {scaledLinearId}));
    }

    LLVM_DEBUG(llvm::interleaveComma(reverseBasisSizes,
                                     DBGS() << "--delinearization basis: ");
               llvm::dbgs() << "\n";
               llvm::interleaveComma(strides,
                                     DBGS() << "--delinearization strides: ");
               llvm::dbgs() << "\n";
               llvm::interleaveComma(delinearizingExprs,
                                     DBGS() << "--delinearization exprs: ");
               llvm::dbgs() << "\n";
               llvm::interleaveComma(ids, DBGS() << "--ids: ");
               llvm::dbgs() << "\n";);

    // Return n-D ids for indexing and 1-D size + id for predicate generation.
    return IdBuilderResult{
        /*mappingIdOps=*/ids,
        /*availableMappingSizes=*/
        SmallVector<int64_t>{computeProduct(originalBasis)},
        /*activeMappingSizes=*/
        SmallVector<int64_t>{computeProduct(forallMappingSizes) * multiplicity},
        /*activeIdOps=*/SmallVector<Value>{linearId.get<Value>()}};
  };
  return res;
}
/// Create a simple 3-D id builder that returns a 3-D vector of ids for
/// indexing and 3-D sizes + ids for predicate generation.
template <typename ThreadOrBlockIdOp>
static GpuIdBuilderFnType common3DIdBuilderFn(int64_t multiplicity = 1) {
  auto res = [multiplicity](RewriterBase &rewriter, Location loc,
                            ArrayRef<int64_t> forallMappingSizes,
                            ArrayRef<int64_t> originalBasis) {
    IndexType indexType = rewriter.getIndexType();
    SmallVector<Value> ids{
        rewriter.create<ThreadOrBlockIdOp>(loc, indexType, Dimension::x),
        rewriter.create<ThreadOrBlockIdOp>(loc, indexType, Dimension::y),
        rewriter.create<ThreadOrBlockIdOp>(loc, indexType, Dimension::z)};
    // In the 3-D mapping case, scale the first dimension by the multiplicity.
    SmallVector<Value> scaledIds = ids;
    AffineExpr d0 = getAffineDimExpr(0, rewriter.getContext());
    scaledIds[0] = affine::makeComposedFoldedAffineApply(
                       rewriter, loc, d0.floorDiv(multiplicity), {scaledIds[0]})
                       .get<Value>();
    // Conversely, scale the first mapping size back into the original basis
    // so predication sees the true number of active ids.
    SmallVector<int64_t> forallMappingSizeInOriginalBasis(
        forallMappingSizes.begin(), forallMappingSizes.end());
    forallMappingSizeInOriginalBasis[0] *= multiplicity;
    return IdBuilderResult{
        /*mappingIdOps=*/scaledIds,
        /*availableMappingSizes=*/
        SmallVector<int64_t>{originalBasis.begin(), originalBasis.end()},
        /*activeMappingSizes=*/forallMappingSizeInOriginalBasis,
        /*activeIdOps=*/ids};
  };
  return res;
}
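// Worked examples for the two builders above (assumed sizes, illustrative
// only). For commonLinearIdBuilderFn with forallMappingSizes = {4, 8} and
// multiplicity = 1:
//   reverseBasisSizes = {8, 4}, strides = {4, 1},
//   delinearizingExprs = {d0 floordiv 4, d0 mod 4},
//   ids (reversed back) = {d0 mod 4, d0 floordiv 4},
// so linear id 13 maps to the multi-index (1, 3). For common3DIdBuilderFn
// with multiplicity = warpSize = 32, a thread with threadIdx.x = 70 yields
// scaledIds[0] = 70 floordiv 32 = 2, i.e. its warp id along x, and the first
// active mapping size is scaled by 32 so predication covers every thread of
// the mapped warps.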
namespace mlir {
namespace transform {
namespace gpu {

GpuIdBuilder::GpuIdBuilder(MLIRContext *ctx, bool useLinearMapping,
                           const MappingIdBuilderFnType &fn)
    : mappingAttributes(), idBuilder() {
  if (useLinearMapping) {
    for (uint64_t d = static_cast<uint64_t>(MappingId::LinearDim0),
                  e = getMaxEnumValForMappingId();
         d <= e; ++d)
      mappingAttributes.push_back(fn(ctx, symbolizeMappingId(d).value()));
  } else {
    for (uint64_t d = static_cast<uint64_t>(MappingId::DimX),
                  e = static_cast<uint64_t>(MappingId::DimZ);
         d <= e; ++d)
      mappingAttributes.push_back(fn(ctx, symbolizeMappingId(d).value()));
  }
}
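// Illustration of the two loops above: with useLinearMapping = true the
// mapping attributes enumerate LinearDim0, LinearDim1, ... up to the maximum
// MappingId enum value; otherwise they are exactly DimX, DimY, DimZ.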
GpuBlockIdBuilder::GpuBlockIdBuilder(MLIRContext *ctx, bool useLinearMapping)
    : GpuIdBuilder(ctx, useLinearMapping, [](MLIRContext *ctx, MappingId id) {
        return GPUBlockMappingAttr::get(ctx, id);
      }) {
  idBuilder = useLinearMapping ? commonLinearIdBuilderFn<BlockIdOp>(1)
                               : common3DIdBuilderFn<BlockIdOp>(1);
}

GpuWarpgroupIdBuilder::GpuWarpgroupIdBuilder(MLIRContext *ctx, int64_t warpSize,
                                             bool useLinearMapping)
    : GpuIdBuilder(ctx, useLinearMapping, [](MLIRContext *ctx, MappingId id) {
        return GPUWarpgroupMappingAttr::get(ctx, id);
      }),
      warpSize(warpSize) {
  idBuilder = useLinearMapping
                  ? commonLinearIdBuilderFn<ThreadIdOp>(
                        /*multiplicity=*/kNumWarpsPerGroup * warpSize)
                  : common3DIdBuilderFn<ThreadIdOp>(
                        /*multiplicity=*/kNumWarpsPerGroup * warpSize);
}

GpuWarpIdBuilder::GpuWarpIdBuilder(MLIRContext *ctx, int64_t warpSize,
                                   bool useLinearMapping)
    : GpuIdBuilder(ctx, useLinearMapping, [](MLIRContext *ctx, MappingId id) {
        return GPUWarpMappingAttr::get(ctx, id);
      }),
      warpSize(warpSize) {
  idBuilder =
      useLinearMapping
          ? commonLinearIdBuilderFn<ThreadIdOp>(/*multiplicity=*/warpSize)
          : common3DIdBuilderFn<ThreadIdOp>(/*multiplicity=*/warpSize);
}

GpuThreadIdBuilder::GpuThreadIdBuilder(MLIRContext *ctx, bool useLinearMapping)
    : GpuIdBuilder(ctx, useLinearMapping, [](MLIRContext *ctx, MappingId id) {
        return GPUThreadMappingAttr::get(ctx, id);
      }) {
  idBuilder = useLinearMapping ? commonLinearIdBuilderFn<ThreadIdOp>(1)
                               : common3DIdBuilderFn<ThreadIdOp>(1);
}
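// Minimal usage sketch (hypothetical call site, assuming the declarations in
// the corresponding Utils.h): build warp-level ids over a linearized basis.
//   GpuWarpIdBuilder builder(ctx, /*warpSize=*/32, /*useLinearMapping=*/true);
//   IdBuilderResult r =
//       builder.idBuilder(rewriter, loc, /*forallMappingSizes=*/{4, 8},
//                         /*originalBasis=*/{256, 1, 1});
//   // r.mappingIdOps holds the delinearized ids; r.activeIdOps carries the
//   // linear id used for predication.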
DiagnosedSilenceableFailure checkGpuLimits(TransformOpInterface transformOp,
                                           std::optional<int64_t> gridDimX,
                                           std::optional<int64_t> gridDimY,
                                           std::optional<int64_t> gridDimZ,
                                           std::optional<int64_t> blockDimX,
                                           std::optional<int64_t> blockDimY,
                                           std::optional<int64_t> blockDimZ) {
  // Per-dimension and total launch limits checked below.
  static constexpr int maxTotalBlockdim = 1024;
  static constexpr int maxBlockdimx = 1024;
  static constexpr int maxBlockdimy = 1024;
  static constexpr int maxBlockdimz = 64;
  static constexpr int maxTotalGriddim = 2147483647;
  static constexpr int maxGriddimx = 2147483647;
  static constexpr int maxGriddimy = 65535;
  static constexpr int maxGriddimz = 65535;

  if ((blockDimX.value_or(1) * blockDimY.value_or(1) * blockDimZ.value_or(1)) >
          maxTotalBlockdim ||
      (gridDimX.value_or(1) * gridDimY.value_or(1) * gridDimZ.value_or(1)) >
          maxTotalGriddim ||
      blockDimX.value_or(1) > maxBlockdimx ||
      blockDimY.value_or(1) > maxBlockdimy ||
      blockDimZ.value_or(1) > maxBlockdimz ||
      gridDimY.value_or(1) > maxGriddimy ||
      gridDimZ.value_or(1) > maxGriddimz ||
      gridDimX.value_or(1) > maxGriddimx) {
    return transformOp.emitSilenceableError()
           << "Trying to launch a GPU kernel with grid_dims = ("
           << gridDimX.value_or(1) << ", " << gridDimY.value_or(1) << ", "
           << gridDimZ.value_or(1) << ") block_dims = ("
           << blockDimX.value_or(1) << ", " << blockDimY.value_or(1) << ", "
           << blockDimZ.value_or(1) << "). It is larger than the limits.";
  }
  return DiagnosedSilenceableFailure::success();
}
/// Create an empty gpu.launch op with the given grid/block sizes, defaulting
/// each absent dimension to 1.
DiagnosedSilenceableFailure createGpuLaunch(
    RewriterBase &rewriter, Location loc, TransformOpInterface transformOp,
    LaunchOp &launchOp, std::optional<int64_t> gridDimX,
    std::optional<int64_t> gridDimY, std::optional<int64_t> gridDimZ,
    std::optional<int64_t> blockDimX, std::optional<int64_t> blockDimY,
    std::optional<int64_t> blockDimZ) {
  DiagnosedSilenceableFailure diag =
      checkGpuLimits(transformOp, gridDimX, gridDimY, gridDimZ, blockDimX,
                     blockDimY, blockDimZ);
  if (!diag.succeeded())
    return diag;

  auto createConst = [&](int dim) {
    return rewriter.create<arith::ConstantIndexOp>(loc, dim);
  };
  OpBuilder::InsertionGuard guard(rewriter);
  Value one = createConst(1);
  Value gridSizeX = gridDimX.has_value() ? createConst(gridDimX.value()) : one;
  Value gridSizeY = gridDimY.has_value() ? createConst(gridDimY.value()) : one;
  Value gridSizeZ = gridDimZ.has_value() ? createConst(gridDimZ.value()) : one;
  Value blkSizeX = blockDimX.has_value() ? createConst(blockDimX.value()) : one;
  Value blkSizeY = blockDimY.has_value() ? createConst(blockDimY.value()) : one;
  Value blkSizeZ = blockDimZ.has_value() ? createConst(blockDimZ.value()) : one;
  launchOp = rewriter.create<LaunchOp>(loc, gridSizeX, gridSizeY, gridSizeZ,
                                       blkSizeX, blkSizeY, blkSizeZ);
  rewriter.setInsertionPointToEnd(&launchOp.getBody().front());
  rewriter.create<TerminatorOp>(loc);
  return DiagnosedSilenceableFailure::success();
}
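// Illustrations for the two functions above (assumed values). checkGpuLimits:
// block_dims = (32, 32, 2) passes every per-dimension check but yields
// 32 * 32 * 2 = 2048 > maxTotalBlockdim = 1024, so a silenceable error is
// emitted. createGpuLaunch: with all dimensions defaulted, the op built above
// prints roughly as (sketch; SSA names are illustrative):
//   %c1 = arith.constant 1 : index
//   gpu.launch blocks(%bx, %by, %bz) in (%g0 = %c1, %g1 = %c1, %g2 = %c1)
//              threads(%tx, %ty, %tz) in (%b0 = %c1, %b1 = %c1, %b2 = %c1) {
//     gpu.terminator
//   }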
/// Alter the grid/block dimensions of an existing kernel launch, leaving any
/// dimension that is std::nullopt untouched.
DiagnosedSilenceableFailure alterGpuLaunch(
    RewriterBase &rewriter, LaunchOp gpuLaunch,
    TransformOpInterface transformOp, std::optional<int64_t> gridDimX,
    std::optional<int64_t> gridDimY, std::optional<int64_t> gridDimZ,
    std::optional<int64_t> blockDimX, std::optional<int64_t> blockDimY,
    std::optional<int64_t> blockDimZ) {
  DiagnosedSilenceableFailure diag =
      checkGpuLimits(transformOp, gridDimX, gridDimY, gridDimZ, blockDimX,
                     blockDimY, blockDimZ);
  if (!diag.succeeded())
    return diag;

  KernelDim3 currentBlockdim = gpuLaunch.getBlockSizeOperandValues();
  OpBuilder::InsertionGuard guard(rewriter);
  rewriter.setInsertionPointAfterValue(currentBlockdim.x);
  auto createConstValue = [&](int dim) {
    return rewriter.create<arith::ConstantIndexOp>(currentBlockdim.x.getLoc(),
                                                   dim);
  };

  if (gridDimX.has_value())
    gpuLaunch.getGridSizeXMutable().assign(createConstValue(gridDimX.value()));
  if (gridDimY.has_value())
    gpuLaunch.getGridSizeYMutable().assign(createConstValue(gridDimY.value()));
  if (gridDimZ.has_value())
    gpuLaunch.getGridSizeZMutable().assign(createConstValue(gridDimZ.value()));
  if (blockDimX.has_value())
    gpuLaunch.getBlockSizeXMutable().assign(
        createConstValue(blockDimX.value()));
  if (blockDimY.has_value())
    gpuLaunch.getBlockSizeYMutable().assign(
        createConstValue(blockDimY.value()));
  if (blockDimZ.has_value())
    gpuLaunch.getBlockSizeZMutable().assign(
        createConstValue(blockDimZ.value()));
  return DiagnosedSilenceableFailure::success();
}
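// Usage sketch (hypothetical values, not from the upstream file): shrink an
// existing launch to a single 128x1x1 block, leaving the grid untouched:
//   DiagnosedSilenceableFailure d = alterGpuLaunch(
//       rewriter, gpuLaunch, transformOp,
//       /*gridDimX=*/std::nullopt, /*gridDimY=*/std::nullopt,
//       /*gridDimZ=*/std::nullopt,
//       /*blockDimX=*/128, /*blockDimY=*/1, /*blockDimZ=*/1);

} // namespace gpu
} // namespace transform
} // namespace mlir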