#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/DebugLog.h"
#include "llvm/Support/InterleavedRange.h"

#define DEBUG_TYPE "gpu-transforms"
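
/// The helper below walks the active ids together with the active and
/// available mapping sizes along each dimension and materializes an
/// `arith.cmpi ult` predicate whenever only a strict subset of the available
/// ids is active; if more ids are requested than are available it records an
/// error message instead.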
static FailureOr<SmallVector<Value>>
buildPredicates(RewriterBase &rewriter, Location loc,
                ArrayRef<Value> activeIds,
                ArrayRef<int64_t> activeMappingSizes,
                ArrayRef<int64_t> availableMappingSizes,
                std::string &errorMsg) {
  LDBG() << "----activeMappingSizes: "
         << llvm::interleaved(activeMappingSizes);
  LDBG() << "----availableMappingSizes: "
         << llvm::interleaved(availableMappingSizes);

  SmallVector<Value> predicateOps;
  for (auto [activeId, activeMappingSize, availableMappingSize] :
       llvm::zip_equal(activeIds, activeMappingSizes, availableMappingSizes)) {
    if (activeMappingSize > availableMappingSize) {
      errorMsg = "Trying to map to fewer GPU threads than loop iterations but "
                 "overprovisioning is not yet supported. Try additional tiling "
                 "before mapping or map to more threads.";
      return failure();
    }
    if (activeMappingSize == availableMappingSize)
      continue;
    Value idx =
        arith::ConstantIndexOp::create(rewriter, loc, activeMappingSize);
    Value pred = arith::CmpIOp::create(rewriter, loc, arith::CmpIPredicate::ult,
                                       activeId, idx);
    predicateOps.push_back(pred);
  }
  return predicateOps;
}
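
/// Build a physical linear id from the 3-D hardware ids and the first two
/// entries of the original basis:
///   linearId = idX + idY * dimX + idZ * dimX * dimY.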
template <typename ThreadOrBlockIdOp>
static Value buildLinearId(RewriterBase &rewriter, Location loc,
                           ArrayRef<OpFoldResult> originalBasisOfr) {
  LDBG() << "----buildLinearId with originalBasisOfr: "
         << llvm::interleaved(originalBasisOfr);
  assert(originalBasisOfr.size() == 3 && "expected 3 sizes");
  IndexType indexType = rewriter.getIndexType();
  AffineExpr tx, ty, tz, bdx, bdy;
  bindDims(rewriter.getContext(), tx, ty, tz);
  bindSymbols(rewriter.getContext(), bdx, bdy);
  SmallVector<OpFoldResult> vals{
      ThreadOrBlockIdOp::create(rewriter, loc, indexType, Dimension::x)
          .getResult(),
      ThreadOrBlockIdOp::create(rewriter, loc, indexType, Dimension::y)
          .getResult(),
      ThreadOrBlockIdOp::create(rewriter, loc, indexType, Dimension::z)
          .getResult(),
      originalBasisOfr[0], originalBasisOfr[1]};
  OpFoldResult ofr = affine::makeComposedFoldedAffineApply(
      rewriter, loc, tx + ty * bdx + tz * bdx * bdy, vals);
  return getValueOrCreateConstantIndexOp(rewriter, loc, ofr);
}
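
/// The linear id builder below linearizes the 3-D hardware ids, divides the
/// result by `multiplicity`, optionally remaps and filters it through a
/// DeviceMaskingAttrInterface, and finally delinearizes it into the basis of
/// the forall mapping sizes, producing both the mapping ids and the
/// predicates that gate execution.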
template <typename ThreadOrBlockIdOp>
static GpuIdBuilderFnType commonLinearIdBuilderFn(
    int64_t multiplicity = 1, DeviceMaskingAttrInterface mask = nullptr) {
  auto res = [multiplicity, mask](RewriterBase &rewriter, Location loc,
                                  ArrayRef<int64_t> forallMappingSizes,
                                  ArrayRef<int64_t> originalBasis) {
    // A mask, when provided, must be able to represent every physical id.
    if (mask && computeProduct(originalBasis) >
                    mask.getMaxNumPhysicalIds() * multiplicity) {
      return IdBuilderResult{
          /*errorMsg=*/std::string(
              "mask representation too short to capture all physical ids: ") +
              std::to_string(mask.getMaxNumPhysicalIds()),
          /*...*/};
    }
    SmallVector<OpFoldResult> originalBasisOfr =
        getAsIndexOpFoldResult(rewriter.getContext(), originalBasis);
    Value physicalLinearId =
        buildLinearId<ThreadOrBlockIdOp>(rewriter, loc, originalBasisOfr);

    // Scale the linear id down by the multiplicity.
    AffineExpr d0 = getAffineDimExpr(0, rewriter.getContext());
    OpFoldResult scaledLinearIdOfr = affine::makeComposedFoldedAffineApply(
        rewriter, loc, d0.floorDiv(multiplicity), {physicalLinearId});
    Value scaledLinearIdI64;
    Value scaledLinearId =
        getValueOrCreateConstantIndexOp(rewriter, loc, scaledLinearIdOfr);
    if (mask) {
      scaledLinearIdI64 = arith::IndexCastUIOp::create(
          rewriter, loc, rewriter.getI64Type(), scaledLinearId);
      Value logicalLinearIdI64 =
          mask.createLogicalLinearMappingId(rewriter, scaledLinearIdI64);
      scaledLinearId = arith::IndexCastUIOp::create(
          rewriter, loc, rewriter.getIndexType(), logicalLinearIdI64);
      LDBG() << "------adjusting linearId with mask: " << scaledLinearId;
    }

    // Delinearize into the basis of forallMappingSizes, reversed to compute
    // strides in row-major order.
    SmallVector<int64_t> reverseBasisSizes(llvm::reverse(forallMappingSizes));
    SmallVector<int64_t> strides = computeStrides(reverseBasisSizes);
    SmallVector<AffineExpr> delinearizingExprs = delinearize(d0, strides);
    SmallVector<Value> ids;
    for (AffineExpr e : llvm::reverse(delinearizingExprs)) {
      ids.push_back(
          affine::makeComposedAffineApply(rewriter, loc, e, {scaledLinearId}));
    }

    std::string errorMsg;
    SmallVector<Value> predicateOps;
    if (mask) {
      Value isActiveIdPredicate =
          mask.createIsActiveIdPredicate(rewriter, scaledLinearIdI64);
      LDBG() << "------adjusting predicate with mask: " << isActiveIdPredicate;
      predicateOps.push_back(isActiveIdPredicate);
    } else {
      FailureOr<SmallVector<Value>> maybePredicateOps = buildPredicates(
          rewriter, loc, physicalLinearId,
          computeProduct(forallMappingSizes) * multiplicity,
          computeProduct(originalBasis), errorMsg);
      if (succeeded(maybePredicateOps))
        predicateOps = *maybePredicateOps;
    }
    return IdBuilderResult{/*errorMsg=*/errorMsg, /*mappingIdOps=*/ids,
                           /*predicateOps=*/predicateOps};
  };
  return res;
}
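
/// The 3-D id builder below keeps the x/y/z hardware ids separate: it scales
/// the leading id down by `multiplicity`, scales the leading forall mapping
/// size back up by the same factor, and predicates against the original basis
/// dimension by dimension.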
template <typename ThreadOrBlockIdOp>
static GpuIdBuilderFnType common3DIdBuilderFn(int64_t multiplicity = 1) {
  auto res = [multiplicity](RewriterBase &rewriter, Location loc,
                            ArrayRef<int64_t> forallMappingSizes,
                            ArrayRef<int64_t> originalBasis) {
    IndexType indexType = rewriter.getIndexType();
    SmallVector<Value> scaledIds{
        ThreadOrBlockIdOp::create(rewriter, loc, indexType, Dimension::x),
        ThreadOrBlockIdOp::create(rewriter, loc, indexType, Dimension::y),
        ThreadOrBlockIdOp::create(rewriter, loc, indexType, Dimension::z)};
    AffineExpr d0 = getAffineDimExpr(0, rewriter.getContext());
    scaledIds[0] = cast<Value>(affine::makeComposedFoldedAffineApply(
        rewriter, loc, d0.floorDiv(multiplicity), {scaledIds[0]}));
    SmallVector<int64_t> forallMappingSizeInOriginalBasis =
        llvm::to_vector(forallMappingSizes);
    forallMappingSizeInOriginalBasis[0] *= multiplicity;
    std::string errorMsg;
    SmallVector<Value> predicateOps;
    FailureOr<SmallVector<Value>> maybePredicateOps = buildPredicates(
        rewriter, loc, scaledIds, forallMappingSizeInOriginalBasis,
        originalBasis, errorMsg);
    if (succeeded(maybePredicateOps))
      predicateOps = *maybePredicateOps;
    // ...
  };
  return res;
}
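
/// The lane id builder below computes laneId = linearThreadId % warpSize and
/// delinearizes it into the basis of the forall mapping sizes; the predicate
/// compares the lane id against the number of active lanes.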
static GpuIdBuilderFnType laneIdBuilderFn(int64_t warpSize) {
  // ...
    Value physicalLinearId =
        buildLinearId<ThreadIdOp>(rewriter, loc, originalBasisOfr);
    OpFoldResult laneId = affine::makeComposedFoldedAffineApply(
        rewriter, loc, d0 % warpSize, {physicalLinearId});
    // ... delinearize the lane id into the basis of forallMappingSizes.
    for (AffineExpr e : llvm::reverse(delinearizingExprs)) {
      ids.push_back(affine::makeComposedAffineApply(rewriter, loc, e,
                                                    {cast<Value>(laneId)}));
    }
    std::string errorMsg;
    SmallVector<Value> predicateOps;
    FailureOr<SmallVector<Value>> maybePredicateOps = buildPredicates(
        rewriter, loc, cast<Value>(laneId), computeProduct(forallMappingSizes),
        warpSize, errorMsg);
    if (succeeded(maybePredicateOps))
      predicateOps = *maybePredicateOps;
    // ...
}
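
// The classes and helpers below are the public entry points used by the GPU
// transform ops when mapping scf.forall constructs to GPU ids.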
namespace transform {

GpuIdBuilder::GpuIdBuilder(MLIRContext *ctx, bool useLinearMapping,
                           const MappingIdBuilderFnType &fn)
    : mappingAttributes(), idBuilder() {
  if (useLinearMapping) {
    for (uint64_t d = static_cast<uint64_t>(MappingId::LinearDim0),
                  e = getMaxEnumValForMappingId();
         d <= e; ++d)
      mappingAttributes.push_back(fn(ctx, symbolizeMappingId(d).value()));
  } else {
    for (uint64_t d = static_cast<uint64_t>(MappingId::DimX),
                  e = static_cast<uint64_t>(MappingId::DimZ);
         d <= e; ++d)
      mappingAttributes.push_back(fn(ctx, symbolizeMappingId(d).value()));
  }
}
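
// Each mapping level (block, warpgroup, warp, thread, lane) picks either the
// linear or the 3-D id builder above; the multiplicity expresses how many
// physical ids a single logical id spans at that level.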
GpuBlockIdBuilder::GpuBlockIdBuilder(MLIRContext *ctx, bool useLinearMapping,
                                     DeviceMaskingAttrInterface mask)
    : GpuIdBuilder(ctx, useLinearMapping, [](MLIRContext *ctx, MappingId id) {
        return GPUBlockMappingAttr::get(ctx, id);
      }) {
  assert((!mask || useLinearMapping) && "mask requires linear mapping");
  idBuilder = useLinearMapping
                  ? commonLinearIdBuilderFn<BlockIdOp>(1, mask)
                  : common3DIdBuilderFn<BlockIdOp>(1);
}

GpuWarpgroupIdBuilder::GpuWarpgroupIdBuilder(MLIRContext *ctx,
                                             int64_t warpSize,
                                             bool useLinearMapping,
                                             DeviceMaskingAttrInterface mask)
    : GpuIdBuilder(ctx, useLinearMapping, /*...*/), warpSize(warpSize) {
  assert((!mask || useLinearMapping) && "mask requires linear mapping");
  idBuilder = useLinearMapping
                  ? commonLinearIdBuilderFn<ThreadIdOp>(
                        kNumWarpsPerGroup * warpSize, mask)
                  : common3DIdBuilderFn<ThreadIdOp>(kNumWarpsPerGroup *
                                                    warpSize);
}

GpuWarpIdBuilder::GpuWarpIdBuilder(MLIRContext *ctx, int64_t warpSize,
                                   bool useLinearMapping,
                                   DeviceMaskingAttrInterface mask)
    : GpuIdBuilder(ctx, useLinearMapping, /*...*/), warpSize(warpSize) {
  assert((!mask || useLinearMapping) && "mask requires linear mapping");
  idBuilder = useLinearMapping
                  ? commonLinearIdBuilderFn<ThreadIdOp>(
                        /*multiplicity=*/warpSize, mask)
                  : common3DIdBuilderFn<ThreadIdOp>(/*multiplicity=*/warpSize);
}

GpuThreadIdBuilder::GpuThreadIdBuilder(MLIRContext *ctx, bool useLinearMapping,
                                       DeviceMaskingAttrInterface mask)
    : GpuIdBuilder(ctx, useLinearMapping, /*...*/) {
  assert((!mask || useLinearMapping) && "mask requires linear mapping");
  idBuilder = useLinearMapping
                  ? commonLinearIdBuilderFn<ThreadIdOp>(1, mask)
                  : common3DIdBuilderFn<ThreadIdOp>(1);
}

GpuLaneIdBuilder::GpuLaneIdBuilder(MLIRContext *ctx, int64_t warpSize,
                                   bool unused, DeviceMaskingAttrInterface mask)
    : GpuIdBuilder(ctx, /*useLinearMapping=*/true, /*...*/),
      warpSize(warpSize) {
  assert(!mask && "mask NYI for lanes, unclear it should be at all");
  idBuilder = laneIdBuilderFn(warpSize);
}
DiagnosedSilenceableFailure checkGpuLimits(TransformOpInterface transformOp,
                                           std::optional<int64_t> gridDimX,
                                           std::optional<int64_t> gridDimY,
                                           std::optional<int64_t> gridDimZ,
                                           std::optional<int64_t> blockDimX,
                                           std::optional<int64_t> blockDimY,
                                           std::optional<int64_t> blockDimZ) {
  if (blockDimX.value_or(1) > kMaxBlockdimx ||
      blockDimY.value_or(1) > kMaxBlockdimy ||
      blockDimZ.value_or(1) > kMaxBlockdimz ||
      (blockDimX.value_or(1) * blockDimY.value_or(1) * blockDimZ.value_or(1)) >
          kMaxTotalBlockdim ||
      gridDimX.value_or(1) > kMaxGriddimx ||
      gridDimY.value_or(1) > kMaxGriddimy ||
      gridDimZ.value_or(1) > kMaxGriddimz ||
      (gridDimX.value_or(1) * gridDimY.value_or(1) * gridDimZ.value_or(1)) >
          kMaxTotalGriddim) {
    return transformOp.emitSilenceableError()
           << "Trying to launch a GPU kernel with grid_dims = ("
           << gridDimX.value_or(1) << ", " << gridDimY.value_or(1) << ", "
           << gridDimZ.value_or(1) << ") block_dims = ("
           << blockDimX.value_or(1) << ", " << blockDimY.value_or(1) << ", "
           << blockDimZ.value_or(1) << "). It is larger than the limits.";
  }
  return DiagnosedSilenceableFailure::success();
}
DiagnosedSilenceableFailure createGpuLaunch(
    RewriterBase &rewriter, Location loc, TransformOpInterface transformOp,
    LaunchOp &launchOp, std::optional<int64_t> gridDimX,
    std::optional<int64_t> gridDimY, std::optional<int64_t> gridDimZ,
    std::optional<int64_t> blockDimX, std::optional<int64_t> blockDimY,
    std::optional<int64_t> blockDimZ) {
  DiagnosedSilenceableFailure diag =
      checkGpuLimits(transformOp, gridDimX, gridDimY, gridDimZ, blockDimX,
                     blockDimY, blockDimZ);
  if (!diag.succeeded())
    return diag;

  auto createConst = [&](int dim) {
    return arith::ConstantIndexOp::create(rewriter, loc, dim);
  };
  OpBuilder::InsertionGuard guard(rewriter);
  Value one = createConst(1);
  Value gridSizeX = gridDimX.has_value() ? createConst(gridDimX.value()) : one;
  Value gridSizeY = gridDimY.has_value() ? createConst(gridDimY.value()) : one;
  Value gridSizeZ = gridDimZ.has_value() ? createConst(gridDimZ.value()) : one;
  Value blkSizeX = blockDimX.has_value() ? createConst(blockDimX.value()) : one;
  Value blkSizeY = blockDimY.has_value() ? createConst(blockDimY.value()) : one;
  Value blkSizeZ = blockDimZ.has_value() ? createConst(blockDimZ.value()) : one;
  launchOp = LaunchOp::create(rewriter, loc, gridSizeX, gridSizeY, gridSizeZ,
                              blkSizeX, blkSizeY, blkSizeZ);
  rewriter.setInsertionPointToEnd(&launchOp.getBody().front());
  TerminatorOp::create(rewriter, loc);
  return DiagnosedSilenceableFailure::success();
}
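
/// Alter the grid and block dimensions of an existing gpu.launch op in place,
/// replacing only the operands for which a new value is provided.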
DiagnosedSilenceableFailure alterGpuLaunch(
    RewriterBase &rewriter, LaunchOp gpuLaunch,
    TransformOpInterface transformOp, std::optional<int64_t> gridDimX,
    std::optional<int64_t> gridDimY, std::optional<int64_t> gridDimZ,
    std::optional<int64_t> blockDimX, std::optional<int64_t> blockDimY,
    std::optional<int64_t> blockDimZ) {
  DiagnosedSilenceableFailure diag =
      checkGpuLimits(transformOp, gridDimX, gridDimY, gridDimZ, blockDimX,
                     blockDimY, blockDimZ);
  if (!diag.succeeded())
    return diag;

  KernelDim3 currentBlockdim = gpuLaunch.getBlockSizeOperandValues();
  OpBuilder::InsertionGuard guard(rewriter);
  rewriter.setInsertionPointAfterValue(currentBlockdim.x);
  auto createConstValue = [&](int dim) {
    return arith::ConstantIndexOp::create(rewriter, currentBlockdim.x.getLoc(),
                                          dim);
  };

  if (gridDimX.has_value())
    gpuLaunch.getGridSizeXMutable().assign(createConstValue(gridDimX.value()));
  if (gridDimY.has_value())
    gpuLaunch.getGridSizeYMutable().assign(createConstValue(gridDimY.value()));
  if (gridDimZ.has_value())
    gpuLaunch.getGridSizeZMutable().assign(createConstValue(gridDimZ.value()));
  if (blockDimX.has_value())
    gpuLaunch.getBlockSizeXMutable().assign(
        createConstValue(blockDimX.value()));
  if (blockDimY.has_value())
    gpuLaunch.getBlockSizeYMutable().assign(
        createConstValue(blockDimY.value()));
  if (blockDimZ.has_value())
    gpuLaunch.getBlockSizeZMutable().assign(
        createConstValue(blockDimZ.value()));
  return DiagnosedSilenceableFailure::success();
}