#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Support/Debug.h"
#define DEBUG_TYPE "gpu-transforms"

#define DBGS() (llvm::dbgs() << '[' << DEBUG_TYPE << "] ")
#define LDBG(X) LLVM_DEBUG(DBGS() << (X) << "\n")
#define DBGS_ALIAS() (llvm::dbgs() << '[' << DEBUG_TYPE_ALIAS << "] ")
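
// Worked example for the linearization built below (illustrative values, not
// from the original source): with basis sizes bdx = 4 and bdy = 2, the id
// triple (tx, ty, tz) = (1, 1, 1) maps to 1 + 1 * 4 + 1 * 4 * 2 = 13.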
/// Build the linearized thread/block id from the 3-D x/y/z ids, using the
/// first two entries of `originalBasisOfr` as the linearization strides.
template <typename ThreadOrBlockIdOp>
static Value buildLinearId(RewriterBase &rewriter, Location loc,
                           ArrayRef<OpFoldResult> originalBasisOfr) {
  LLVM_DEBUG(llvm::interleaveComma(
                 originalBasisOfr,
                 DBGS() << "----buildLinearId with originalBasisOfr: ");
             llvm::dbgs() << "\n");
  assert(originalBasisOfr.size() == 3 && "expected 3 sizes");
  IndexType indexType = rewriter.getIndexType();
  AffineExpr tx, ty, tz, bdx, bdy;
  bindDims(rewriter.getContext(), tx, ty, tz);
  bindSymbols(rewriter.getContext(), bdx, bdy);
  SmallVector<OpFoldResult> vals{
      rewriter.create<ThreadOrBlockIdOp>(loc, indexType, Dimension::x)
          .getResult(),
      rewriter.create<ThreadOrBlockIdOp>(loc, indexType, Dimension::y)
          .getResult(),
      rewriter.create<ThreadOrBlockIdOp>(loc, indexType, Dimension::z)
          .getResult(),
      originalBasisOfr[0], originalBasisOfr[1]};
  OpFoldResult ofr = affine::makeComposedFoldedAffineApply(
      rewriter, loc, tx + ty * bdx + tz * bdx * bdy, vals);
  return getValueOrCreateConstantIndexOp(rewriter, loc, ofr);
}
/// Create a linear id builder that scales the linear id by `multiplicity` and
/// delinearizes it in the basis of `forallMappingSizes`.
template <typename ThreadOrBlockIdOp>
static GpuIdBuilderFnType commonLinearIdBuilderFn(int64_t multiplicity = 1) {
  // ...
    Value linearId =
        buildLinearId<ThreadOrBlockIdOp>(rewriter, loc, originalBasisOfr);
    // ...
    OpFoldResult scaledLinearId = affine::makeComposedFoldedAffineApply(
        rewriter, loc, d0.floorDiv(multiplicity), {linearId});
    // ...delinearize, then reverse back to be in [0 .. n] order.
    for (AffineExpr e : llvm::reverse(delinearizingExprs)) {
      ids.push_back(
          affine::makeComposedAffineApply(rewriter, loc, e, {scaledLinearId}));
    }
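
    // Worked example (illustrative, not from the original source): for
    // forallMappingSizes = {3, 5}, reverseBasisSizes = {5, 3} and
    // strides = {3, 1}; a linear id L < 15 thus yields the coordinates
    // {L floordiv 3, L mod 3}, which reverse back to ids = {L mod 3,
    // L floordiv 3}, so dimension 0 varies fastest.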
    LLVM_DEBUG(llvm::interleaveComma(reverseBasisSizes,
                                     DBGS() << "--delinearization basis: ");
               llvm::dbgs() << "\n";
               llvm::interleaveComma(strides,
                                     DBGS() << "--delinearization strides: ");
               llvm::dbgs() << "\n";
               llvm::interleaveComma(delinearizingExprs,
                                     DBGS() << "--delinearization exprs: ");
               llvm::dbgs() << "\n";
               llvm::interleaveComma(ids, DBGS() << "--ids: ");
               llvm::dbgs() << "\n";);
    // ...
/// Create a simple 3-D id builder that takes the x/y/z ids directly, scaling
/// the first dimension by `multiplicity`.
template <typename ThreadOrBlockIdOp>
static GpuIdBuilderFnType common3DIdBuilderFn(int64_t multiplicity = 1) {
  // ...
    SmallVector<Value> ids{
        rewriter.create<ThreadOrBlockIdOp>(loc, indexType, Dimension::x),
        rewriter.create<ThreadOrBlockIdOp>(loc, indexType, Dimension::y),
        rewriter.create<ThreadOrBlockIdOp>(loc, indexType, Dimension::z)};
    // Scale the first dimension by the multiplicity.
    SmallVector<Value> scaledIds = ids;
    AffineExpr d0 = getAffineDimExpr(0, rewriter.getContext());
    scaledIds[0] = affine::makeComposedFoldedAffineApply(
                       rewriter, loc, d0.floorDiv(multiplicity), {scaledIds[0]})
                       .get<Value>();
    // ...
    // Un-scale the first dimension back into the original basis.
    forallMappingSizeInOriginalBasis[0] *= multiplicity;
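
    // Illustration (not from the original source): with multiplicity =
    // warpSize = 32, the x id is divided by 32 so each warp gets a single id
    // along x, and the forall mapping size along x is scaled back up by 32
    // to express the size in the original thread basis.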
namespace mlir {
namespace transform {
namespace gpu {

GpuIdBuilder::GpuIdBuilder(MLIRContext *ctx, bool useLinearMapping,
                           const MappingIdBuilderFnType &fn)
    : mappingAttributes(), idBuilder() {
  if (useLinearMapping) {
    for (uint64_t d = static_cast<uint64_t>(MappingId::LinearDim0),
                  e = getMaxEnumValForMappingId();
         d <= e; ++d)
      mappingAttributes.push_back(fn(ctx, symbolizeMappingId(d).value()));
  } else {
    for (uint64_t d = static_cast<uint64_t>(MappingId::DimX),
                  e = static_cast<uint64_t>(MappingId::DimZ);
         d <= e; ++d)
      mappingAttributes.push_back(fn(ctx, symbolizeMappingId(d).value()));
  }
}
GpuBlockIdBuilder::GpuBlockIdBuilder(MLIRContext *ctx, bool useLinearMapping)
    : GpuIdBuilder(/* ... */) {
  idBuilder = useLinearMapping ? commonLinearIdBuilderFn<BlockIdOp>(1)
                               : common3DIdBuilderFn<BlockIdOp>(1);
}

GpuWarpgroupIdBuilder::GpuWarpgroupIdBuilder(MLIRContext *ctx, int64_t warpSize,
                                             bool useLinearMapping)
    : GpuIdBuilder(/* ... */) {
  idBuilder = useLinearMapping
                  ? commonLinearIdBuilderFn<ThreadIdOp>(kNumWarpsPerGroup *
                                                        warpSize)
                  : common3DIdBuilderFn<ThreadIdOp>(kNumWarpsPerGroup *
                                                    warpSize);
}

GpuWarpIdBuilder::GpuWarpIdBuilder(MLIRContext *ctx, int64_t warpSize,
                                   bool useLinearMapping)
    : GpuIdBuilder(/* ... */) {
  idBuilder = useLinearMapping ? commonLinearIdBuilderFn<ThreadIdOp>(warpSize)
                               : common3DIdBuilderFn<ThreadIdOp>(warpSize);
}

GpuThreadIdBuilder::GpuThreadIdBuilder(MLIRContext *ctx, bool useLinearMapping)
    : GpuIdBuilder(/* ... */) {
  idBuilder = useLinearMapping ? commonLinearIdBuilderFn<ThreadIdOp>(1)
                               : common3DIdBuilderFn<ThreadIdOp>(1);
}
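
// In all four builders above, a linear mapping delinearizes a single linear
// id in the basis of the forall mapping sizes, while the 3-D mapping uses
// the x/y/z ids directly, scaled by the builder's multiplicity.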
DiagnosedSilenceableFailure checkGpuLimits(TransformOpInterface transformOp,
                                           std::optional<int64_t> gridDimX,
                                           std::optional<int64_t> gridDimY,
                                           std::optional<int64_t> gridDimZ,
                                           std::optional<int64_t> blockDimX,
                                           std::optional<int64_t> blockDimY,
                                           std::optional<int64_t> blockDimZ) {
  static constexpr int maxTotalBlockdim = 1024;
  static constexpr int maxBlockdimx = 1024;
  static constexpr int maxBlockdimy = 1024;
  static constexpr int maxBlockdimz = 64;
  static constexpr int maxTotalGriddim = 2147483647;
  static constexpr int maxGriddimx = 2147483647;
  static constexpr int maxGriddimy = 65535;
  static constexpr int maxGriddimz = 65535;
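
  // These constants mirror the documented CUDA launch limits: at most 1024
  // threads per block (with the z dimension capped at 64), grid x up to
  // 2^31 - 1, and grid y/z up to 65535.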
  if ((blockDimX.value_or(1) * blockDimY.value_or(1) * blockDimZ.value_or(1)) >
          maxTotalBlockdim ||
      (gridDimX.value_or(1) * gridDimY.value_or(1) * gridDimZ.value_or(1)) >
          maxTotalGriddim ||
      blockDimX.value_or(1) > maxBlockdimx ||
      blockDimY.value_or(1) > maxBlockdimy ||
      blockDimZ.value_or(1) > maxBlockdimz ||
      gridDimY.value_or(1) > maxGriddimy ||
      gridDimZ.value_or(1) > maxGriddimz ||
      gridDimX.value_or(1) > maxGriddimx) {
    return transformOp.emitSilenceableError()
           << "Trying to launch a GPU kernel with grid_dims = ("
           << gridDimX.value_or(1) << ", " << gridDimY.value_or(1) << ", "
           << gridDimZ.value_or(1) << ") block_dims = ("
           << blockDimX.value_or(1) << ", " << blockDimY.value_or(1) << ", "
           << blockDimZ.value_or(1) << "). It is larger than the limits.";
  }
  return DiagnosedSilenceableFailure::success();
}
DiagnosedSilenceableFailure createGpuLaunch(
    RewriterBase &rewriter, Location loc, TransformOpInterface transformOp,
    LaunchOp &launchOp, std::optional<int64_t> gridDimX,
    std::optional<int64_t> gridDimY, std::optional<int64_t> gridDimZ,
    std::optional<int64_t> blockDimX, std::optional<int64_t> blockDimY,
    std::optional<int64_t> blockDimZ) {
  DiagnosedSilenceableFailure diag =
      checkGpuLimits(transformOp, gridDimX, gridDimY, gridDimZ, blockDimX,
                     blockDimY, blockDimZ);
  if (!diag.succeeded())
    return diag;

  auto createConst = [&](int dim) {
    return rewriter.create<arith::ConstantIndexOp>(loc, dim);
  };
  OpBuilder::InsertionGuard guard(rewriter);
  Value one = createConst(1);
  Value gridSizeX = gridDimX.has_value() ? createConst(gridDimX.value()) : one;
  Value gridSizeY = gridDimY.has_value() ? createConst(gridDimY.value()) : one;
  Value gridSizeZ = gridDimZ.has_value() ? createConst(gridDimZ.value()) : one;
  Value blkSizeX = blockDimX.has_value() ? createConst(blockDimX.value()) : one;
  Value blkSizeY = blockDimY.has_value() ? createConst(blockDimY.value()) : one;
  Value blkSizeZ = blockDimZ.has_value() ? createConst(blockDimZ.value()) : one;
  launchOp = rewriter.create<LaunchOp>(loc, gridSizeX, gridSizeY, gridSizeZ,
                                       blkSizeX, blkSizeY, blkSizeZ);
  rewriter.setInsertionPointToEnd(&launchOp.getBody().front());
  rewriter.create<TerminatorOp>(loc);
  return DiagnosedSilenceableFailure::success();
}
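
// The `gpu.launch` op created above starts with an empty body holding only a
// `gpu.terminator`; the kernel body is expected to be filled in afterwards.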
DiagnosedSilenceableFailure alterGpuLaunch(
    RewriterBase &rewriter, LaunchOp gpuLaunch,
    TransformOpInterface transformOp, std::optional<int64_t> gridDimX,
    std::optional<int64_t> gridDimY, std::optional<int64_t> gridDimZ,
    std::optional<int64_t> blockDimX, std::optional<int64_t> blockDimY,
    std::optional<int64_t> blockDimZ) {
  DiagnosedSilenceableFailure diag =
      checkGpuLimits(transformOp, gridDimX, gridDimY, gridDimZ, blockDimX,
                     blockDimY, blockDimZ);
  if (!diag.succeeded())
    return diag;

  KernelDim3 currentBlockdim = gpuLaunch.getBlockSizeOperandValues();
  OpBuilder::InsertionGuard guard(rewriter);
  rewriter.setInsertionPointAfterValue(currentBlockdim.x);
  auto createConstValue = [&](int dim) {
    return rewriter.create<arith::ConstantIndexOp>(currentBlockdim.x.getLoc(),
                                                   dim);
  };

  if (gridDimX.has_value())
    gpuLaunch.getGridSizeXMutable().assign(createConstValue(gridDimX.value()));
  if (gridDimY.has_value())
    gpuLaunch.getGridSizeYMutable().assign(createConstValue(gridDimY.value()));
  if (gridDimZ.has_value())
    gpuLaunch.getGridSizeZMutable().assign(createConstValue(gridDimZ.value()));
  if (blockDimX.has_value())
    gpuLaunch.getBlockSizeXMutable().assign(
        createConstValue(blockDimX.value()));
  if (blockDimY.has_value())
    gpuLaunch.getBlockSizeYMutable().assign(
        createConstValue(blockDimY.value()));
  if (blockDimZ.has_value())
    gpuLaunch.getBlockSizeZMutable().assign(
        createConstValue(blockDimZ.value()));
  return DiagnosedSilenceableFailure::success();
}