33 #include "llvm/ADT/STLExtras.h"
34 #include "llvm/ADT/SmallVector.h"
35 #include "llvm/ADT/TypeSwitch.h"
36 #include "llvm/Support/Debug.h"
// Tag string embedded in the prefix emitted by DBGS().
43 #define DEBUG_TYPE "gpu-transforms"
// Writes the "[<DEBUG_TYPE>] " prefix to llvm::dbgs() and yields the stream so
// callers can keep appending with operator<<.
45 #define DBGS() (llvm::dbgs() << '[' << DEBUG_TYPE << "] ")
// Emits the DBGS() prefix, then (X), then a newline; the whole statement is
// wrapped in LLVM_DEBUG.
46 #define LDBG(X) LLVM_DEBUG(DBGS() << (X) << "\n")
// Same shape as DBGS(), but the prefix uses DEBUG_TYPE_ALIAS instead.
// NOTE(review): DEBUG_TYPE_ALIAS is not defined in the lines visible here —
// confirm it is defined elsewhere in the file before relying on this macro.
47 #define DBGS_ALIAS() (llvm::dbgs() << '[' << DEBUG_TYPE_ALIAS << "] ")
50 template <
typename ThreadOrBlockIdOp>
53 LLVM_DEBUG(llvm::interleaveComma(
55 DBGS() <<
"----buildLinearId with originalBasisOfr: ");
56 llvm::dbgs() <<
"\n");
57 assert(originalBasisOfr.size() == 3 &&
"expected 3 sizes");
63 rewriter.
create<ThreadOrBlockIdOp>(loc, indexType, Dimension::x)
65 rewriter.
create<ThreadOrBlockIdOp>(loc, indexType, Dimension::y)
67 rewriter.
create<ThreadOrBlockIdOp>(loc, indexType, Dimension::z)
69 originalBasisOfr[0], originalBasisOfr[1]};
71 rewriter, loc, tx + ty * bdx + tz * bdx * bdy, vals);
78 template <
typename ThreadOrBlockIdOp>
86 buildLinearId<ThreadOrBlockIdOp>(rewriter, loc, originalBasisOfr);
93 rewriter, loc, d0.
floorDiv(multiplicity), {linearId});
97 for (
AffineExpr e : llvm::reverse(delinearizingExprs)) {
103 LLVM_DEBUG(llvm::interleaveComma(reverseBasisSizes,
104 DBGS() <<
"--delinearization basis: ");
105 llvm::dbgs() <<
"\n";
106 llvm::interleaveComma(strides,
107 DBGS() <<
"--delinearization strides: ");
108 llvm::dbgs() <<
"\n";
109 llvm::interleaveComma(delinearizingExprs,
110 DBGS() <<
"--delinearization exprs: ");
111 llvm::dbgs() <<
"\n";
112 llvm::interleaveComma(ids,
DBGS() <<
"--ids: ");
113 llvm::dbgs() <<
"\n";);
136 template <
typename ThreadOrBlockIdOp>
143 rewriter.
create<ThreadOrBlockIdOp>(loc, indexType, Dimension::x),
144 rewriter.
create<ThreadOrBlockIdOp>(loc, indexType, Dimension::y),
145 rewriter.
create<ThreadOrBlockIdOp>(loc, indexType, Dimension::z)};
150 rewriter, loc, d0.
floorDiv(multiplicity), {scaledIds[0]}));
153 forallMappingSizeInOriginalBasis[0] *= multiplicity;
168 namespace transform {
171 GpuIdBuilder::GpuIdBuilder(
MLIRContext *ctx,
bool useLinearMapping,
173 : mappingAttributes(), idBuilder() {
174 if (useLinearMapping) {
175 for (uint64_t d =
static_cast<uint64_t
>(MappingId::LinearDim0),
176 e = getMaxEnumValForMappingId();
180 for (uint64_t d =
static_cast<uint64_t
>(MappingId::DimX),
181 e =
static_cast<uint64_t
>(MappingId::DimZ);
191 idBuilder = useLinearMapping
192 ? commonLinearIdBuilderFn<BlockIdOp>(1)
197 bool useLinearMapping)
204 ? commonLinearIdBuilderFn<ThreadIdOp>(
211 bool useLinearMapping)
219 ? commonLinearIdBuilderFn<ThreadIdOp>(
warpSize)
227 idBuilder = useLinearMapping
228 ? commonLinearIdBuilderFn<ThreadIdOp>(1)
233 std::optional<int64_t> gridDimX,
234 std::optional<int64_t> gridDimY,
235 std::optional<int64_t> gridDimZ,
236 std::optional<int64_t> blockDimX,
237 std::optional<int64_t> blockDimY,
238 std::optional<int64_t> blockDimZ) {
242 if ((blockDimX.value_or(1) * blockDimY.value_or(1) * blockDimZ.value_or(1)) >
244 (gridDimX.value_or(1) * gridDimY.value_or(1) * gridDimZ.value_or(1)) >
252 return transformOp.emitSilenceableError()
253 <<
"Trying to launch a GPU kernel with grid_dims = ("
254 << gridDimX.value_or(1) <<
", " << gridDimY.value_or(1) <<
", "
255 << gridDimZ.value_or(1) <<
") block_dims = ("
256 << blockDimX.value_or(1) <<
", " << blockDimY.value_or(1) <<
", "
257 << blockDimZ.value_or(1) <<
"). It is larger than the limits.";
264 LaunchOp &launchOp, std::optional<int64_t> gridDimX,
265 std::optional<int64_t> gridDimY, std::optional<int64_t> gridDimZ,
266 std::optional<int64_t> blockDimX, std::optional<int64_t> blockDimY,
267 std::optional<int64_t> blockDimZ) {
269 checkGpuLimits(transformOp, gridDimX, gridDimY, gridDimZ, blockDimX,
270 blockDimY, blockDimZ);
271 if (!
diag.succeeded())
279 Value gridSizeX = gridDimX.has_value() ?
createConst(gridDimX.value()) : one;
280 Value gridSizeY = gridDimY.has_value() ?
createConst(gridDimY.value()) : one;
281 Value gridSizeZ = gridDimZ.has_value() ?
createConst(gridDimZ.value()) : one;
282 Value blkSizeX = blockDimX.has_value() ?
createConst(blockDimX.value()) : one;
283 Value blkSizeY = blockDimY.has_value() ?
createConst(blockDimY.value()) : one;
284 Value blkSizeZ = blockDimZ.has_value() ?
createConst(blockDimZ.value()) : one;
285 launchOp = rewriter.
create<LaunchOp>(loc, gridSizeX, gridSizeY, gridSizeZ,
286 blkSizeX, blkSizeY, blkSizeZ);
288 rewriter.
create<TerminatorOp>(loc);
295 TransformOpInterface transformOp, std::optional<int64_t> gridDimX,
296 std::optional<int64_t> gridDimY, std::optional<int64_t> gridDimZ,
297 std::optional<int64_t> blockDimX, std::optional<int64_t> blockDimY,
298 std::optional<int64_t> blockDimZ) {
300 checkGpuLimits(transformOp, gridDimX, gridDimY, gridDimZ, blockDimX,
301 blockDimY, blockDimZ);
302 if (!
diag.succeeded())
305 KernelDim3 currentBlockdim = gpuLaunch.getBlockSizeOperandValues();
308 auto createConstValue = [&](
int dim) {
313 if (gridDimX.has_value())
314 gpuLaunch.getGridSizeXMutable().assign(createConstValue(gridDimX.value()));
315 if (gridDimY.has_value())
316 gpuLaunch.getGridSizeYMutable().assign(createConstValue(gridDimY.value()));
317 if (gridDimZ.has_value())
318 gpuLaunch.getGridSizeZMutable().assign(createConstValue(gridDimZ.value()));
319 if (blockDimX.has_value())
320 gpuLaunch.getBlockSizeXMutable().assign(
321 createConstValue(blockDimX.value()));
322 if (blockDimY.has_value())
323 gpuLaunch.getBlockSizeYMutable().assign(
324 createConstValue(blockDimY.value()));
325 if (blockDimZ.has_value())
326 gpuLaunch.getBlockSizeZMutable().assign(
327 createConstValue(blockDimZ.value()));
static std::string diag(const llvm::Value &value)
constexpr int kMaxGriddimz
constexpr int kMaxTotalBlockdim
constexpr int kMaxGriddimy
constexpr int kMaxBlockdimx
constexpr int kMaxBlockdimz
constexpr int kMaxGriddimx
constexpr int kMaxBlockdimy
constexpr int kMaxTotalGriddim
Base type for affine expression.
AffineExpr floorDiv(uint64_t v) const
MLIRContext * getContext() const
The result of a transform IR operation application.
static DiagnosedSilenceableFailure success()
Constructs a DiagnosedSilenceableFailure in the success state.
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
MLIRContext is the top-level object for a collection of MLIR operations.
RAII guard to reset the insertion point of the builder when destroyed.
void setInsertionPointToEnd(Block *block)
Sets the insertion point to the end of the specified block.
void setInsertionPointAfterValue(Value val)
Sets the insertion point to the node after the specified value.
Operation * create(const OperationState &state)
Creates an operation given the fields represented as an OperationState.
This class represents a single result from folding an operation.
This class coordinates the application of a rewrite on a set of IR, providing a way for clients to tr...
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Location getLoc() const
Return the location of this value.
Specialization of arith.constant op that returns an integer of index type.
AffineApplyOp makeComposedAffineApply(OpBuilder &b, Location loc, AffineMap map, ArrayRef< OpFoldResult > operands)
Returns a composed AffineApplyOp by composing map and operands with other AffineApplyOps supplying th...
OpFoldResult makeComposedFoldedAffineApply(OpBuilder &b, Location loc, AffineMap map, ArrayRef< OpFoldResult > operands)
Constructs an AffineApplyOp that applies map to operands after composing the map with the maps of any...
Include the generated interface declarations.
OpFoldResult getAsIndexOpFoldResult(MLIRContext *ctx, int64_t val)
Convert int64_t to integer attributes of index type and return them as OpFoldResult.
void bindDims(MLIRContext *ctx, AffineExprTy &...exprs)
Bind a list of AffineExpr references to DimExpr at positions: [0 .. sizeof...(exprs)].
SmallVector< int64_t > computeStrides(ArrayRef< int64_t > sizes)
SmallVector< int64_t > delinearize(int64_t linearIndex, ArrayRef< int64_t > strides)
Given the strides together with a linear index in the dimension space, return the vector-space offset...
int64_t computeProduct(ArrayRef< int64_t > basis)
Self-explicit.
void bindSymbols(MLIRContext *ctx, AffineExprTy &...exprs)
Bind a list of AffineExpr references to SymbolExpr at positions: [0 .. sizeof...(exprs)].
Value getValueOrCreateConstantIndexOp(OpBuilder &b, Location loc, OpFoldResult ofr)
Converts an OpFoldResult to a Value.
auto get(MLIRContext *context, Ts &&...params)
Helper method that injects context only if needed, this helps unify some of the attribute constructio...
AffineExpr getAffineDimExpr(unsigned position, MLIRContext *context)
These free functions allow clients of the API to not use classes in detail.
Utility class for the GPU dialect to represent triples of Values accessible through ....