#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InterleavedRange.h"
#include <type_traits>
#define DEBUG_TYPE "gpu-transforms"
#define DEBUG_TYPE_ALIAS "gpu-transforms-alias"

#define DBGS() (llvm::dbgs() << '[' << DEBUG_TYPE << "] ")
#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")
#define DBGS_ALIAS() (llvm::dbgs() << '[' << DEBUG_TYPE_ALIAS << "] ")
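// The conversion-pattern ops below are intended to be driven from transform
// IR inside a transform.apply_conversion_patterns region that also provides
// an LLVM type converter. A rough usage sketch (the op mnemonics are
// assumptions and may differ across MLIR versions):
//
//   transform.apply_conversion_patterns to %gpu_module {
//     transform.apply_conversion_patterns.gpu.gpu_to_nvvm
//     transform.apply_conversion_patterns.gpu.gpu_wmma_to_nvvm
//     transform.apply_conversion_patterns.gpu.gpu_subgroup_reduce_to_nvvm
//   } with type_converter {
//     transform.apply_conversion_patterns.memref.memref_to_llvm_type_converter
//   } : !transform.any_op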
void transform::ApplyGPUToNVVMConversionPatternsOp::populatePatterns(
    TypeConverter &typeConverter, RewritePatternSet &patterns) {
  auto &llvmTypeConverter = static_cast<LLVMTypeConverter &>(typeConverter);
  // Map gpu.address_space attributes to the NVVM address-space integers.
  populateGpuMemorySpaceAttributeConversions(
      llvmTypeConverter, [](AddressSpace space) -> unsigned {
        switch (space) {
        case AddressSpace::Global:
          return static_cast<unsigned>(
              NVVM::NVVMMemorySpace::kGlobalMemorySpace);
        case AddressSpace::Workgroup:
          return static_cast<unsigned>(
              NVVM::NVVMMemorySpace::kSharedMemorySpace);
        case AddressSpace::Private:
          return 0;
        }
        llvm_unreachable("unknown address space enum value");
      });
  llvmTypeConverter.addConversion(
      [&](MMAMatrixType type) -> Type { return convertMMAToLLVMType(type); });
  populateGpuToNVVMConversionPatterns(llvmTypeConverter, patterns);
}
LogicalResult
transform::ApplyGPUToNVVMConversionPatternsOp::verifyTypeConverter(
    transform::TypeConverterBuilderOpInterface builder) {
  if (builder.getTypeConverterType() != "LLVMTypeConverter")
    return emitOpError("expected LLVMTypeConverter");
  return success();
}
void transform::ApplyGPUWwmaToNVVMConversionPatternsOp::populatePatterns(
    TypeConverter &typeConverter, RewritePatternSet &patterns) {
  auto &llvmTypeConverter = static_cast<LLVMTypeConverter &>(typeConverter);
  populateGpuWMMAToNVVMConversionPatterns(llvmTypeConverter, patterns);
}

LogicalResult
transform::ApplyGPUWwmaToNVVMConversionPatternsOp::verifyTypeConverter(
    transform::TypeConverterBuilderOpInterface builder) {
  if (builder.getTypeConverterType() != "LLVMTypeConverter")
    return emitOpError("expected LLVMTypeConverter");
  return success();
}
void transform::ApplyGPUSubgroupReduceToNVVMConversionPatternsOp::
    populatePatterns(TypeConverter &typeConverter,
                     RewritePatternSet &patterns) {
  auto &llvmTypeConverter = static_cast<LLVMTypeConverter &>(typeConverter);
  populateGpuSubgroupReduceOpLoweringPattern(llvmTypeConverter, patterns);
}

LogicalResult transform::ApplyGPUSubgroupReduceToNVVMConversionPatternsOp::
    verifyTypeConverter(transform::TypeConverterBuilderOpInterface builder) {
  if (builder.getTypeConverterType() != "LLVMTypeConverter")
    return emitOpError("expected LLVMTypeConverter");
  return success();
}
/// Pick an unrolling order that lets subgroup MMA contractions reuse the LHS
/// operand: reduction dimensions first, then parallel dimensions that appear
/// in the LHS indexing map, then the remaining parallel dimensions.
static std::optional<SmallVector<int64_t>>
gpuMmaUnrollOrder(vector::ContractionOp contract) {
  SmallVector<int64_t> order;
  for (auto [index, iter] : llvm::enumerate(contract.getIteratorTypes()))
    if (vector::isReductionIterator(iter))
      order.push_back(index);

  llvm::SmallDenseSet<int64_t> dims;
  for (AffineExpr expr : contract.getIndexingMapsArray()[0].getResults())
    dims.insert(cast<AffineDimExpr>(expr).getPosition());
  for (auto [index, iter] : llvm::enumerate(contract.getIteratorTypes()))
    if (vector::isParallelIterator(iter) && dims.count(index))
      order.push_back(index);
  for (auto [index, iter] : llvm::enumerate(contract.getIteratorTypes()))
    if (vector::isParallelIterator(iter) && !dims.count(index))
      order.push_back(index);
  return order;
}
/// Returns the target vector size for the target operation based on the
/// native vector size specified with `m`, `n` and `k`.
static std::optional<SmallVector<int64_t>>
getSubgroupMmaNativeVectorSize(Operation *op, int64_t m, int64_t n,
                               int64_t k) {
  if (auto contract = dyn_cast<vector::ContractionOp>(op)) {
    int64_t contractRank = contract.getIteratorTypes().size();
    if (contractRank < 3)
      return std::nullopt;
    SmallVector<int64_t> nativeSize(contractRank - 3, 1);
    nativeSize.append({m, n, k});
    return nativeSize;
  }
  if (auto writeOp = dyn_cast<vector::TransferWriteOp>(op)) {
    int64_t writeRank = writeOp.getVectorType().getRank();
    if (writeRank < 2)
      return std::nullopt;
    SmallVector<int64_t> nativeSize(writeRank - 2, 1);
    nativeSize.append({m, n});
    return nativeSize;
  }
  if (auto readOp = dyn_cast<vector::TransferReadOp>(op)) {
    // Transfer reads may need different shapes depending on their users; for
    // simplicity, match the shape used by extract_strided_slice users.
    VectorType sliceType;
    for (Operation *users : op->getUsers()) {
      auto extract = dyn_cast<vector::ExtractStridedSliceOp>(users);
      if (!extract)
        return std::nullopt;
      auto vecType = cast<VectorType>(extract.getResult().getType());
      if (sliceType && sliceType != vecType)
        return std::nullopt;
      sliceType = vecType;
    }
    return llvm::to_vector(sliceType.getShape());
  }
  if (OpTrait::hasElementwiseMappableTraits(op) && op->getNumResults() == 1) {
    if (auto vecType = dyn_cast<VectorType>(op->getResultTypes()[0])) {
      if (vecType.getRank() < 2)
        return std::nullopt;
      // Prefer the shape of an extract_strided_slice user if there is one.
      VectorType sliceType;
      for (Operation *users : op->getUsers()) {
        auto extract = dyn_cast<vector::ExtractStridedSliceOp>(users);
        if (!extract)
          return std::nullopt;
        auto vecType = cast<VectorType>(extract.getResult().getType());
        if (sliceType && sliceType != vecType)
          return std::nullopt;
        sliceType = vecType;
      }
      if (sliceType)
        return llvm::to_vector(sliceType.getShape());
      // Otherwise unroll the elementwise op to the output {m, n} shape.
      SmallVector<int64_t> nativeSize(vecType.getRank() - 2, 1);
      nativeSize.append({m, n});
      return nativeSize;
    }
  }
  return std::nullopt;
}
void transform::ApplyUnrollVectorsSubgroupMmaOp::populatePatterns(
    RewritePatternSet &patterns) {
  auto unrollOrder = [](Operation *op) -> std::optional<SmallVector<int64_t>> {
    auto contract = dyn_cast<vector::ContractionOp>(op);
    if (!contract)
      return std::nullopt;
    return gpuMmaUnrollOrder(contract);
  };

  int64_t m = getM();
  int64_t n = getN();
  int64_t k = getK();
  auto nativeShapeFn =
      [m, n, k](Operation *op) -> std::optional<SmallVector<int64_t>> {
    return getSubgroupMmaNativeVectorSize(op, m, n, k);
  };
  vector::populateVectorUnrollPatterns(
      patterns, vector::UnrollVectorOptions()
                    .setNativeShapeFn(nativeShapeFn)
                    .setUnrollTraversalOrderFn(unrollOrder));
}
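// Hedged usage sketch from transform IR; the exact mnemonic and the spelling
// of the m/n/k operands are assumptions and may differ across MLIR versions:
//
//   transform.apply_patterns to %func {
//     transform.apply_patterns.gpu.unroll_vectors_subgroup_mma [16, 16, 8]
//   } : !transform.any_op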
/// Tag types used to dispatch the scf.forall mapping verification below on
/// the kind of GPU resource being mapped to.
struct MappingKind {};
struct BlockMappingKind : MappingKind {};
struct ThreadMappingKind : MappingKind {};

static DiagnosedSilenceableFailure
definiteFailureHelper(std::optional<TransformOpInterface> transformOp,
                      Operation *target, const Twine &message) {
  if (transformOp.has_value())
    return transformOp->emitDefiniteFailure() << message;
  return emitDefiniteFailure(target, message);
}
/// Check that the mapping attributes on `forallOp` are of the kind requested
/// by `MappingKindType` and are otherwise well-formed.
template <typename MappingKindType>
static DiagnosedSilenceableFailure
checkMappingAttributeTypes(std::optional<TransformOpInterface> transformOp,
                           scf::ForallOp forallOp) {
  if (!forallOp.getMapping().has_value()) {
    return definiteFailureHelper(transformOp, forallOp,
                                 "scf.forall op requires a mapping attribute");
  }

  bool hasBlockMapping = llvm::any_of(forallOp.getMapping().value(),
                                      llvm::IsaPred<GPUBlockMappingAttr>);
  bool hasWarpgroupMapping = llvm::any_of(
      forallOp.getMapping().value(), llvm::IsaPred<GPUWarpgroupMappingAttr>);
  bool hasWarpMapping = llvm::any_of(forallOp.getMapping().value(),
                                     llvm::IsaPred<GPUWarpMappingAttr>);
  bool hasThreadMapping = llvm::any_of(forallOp.getMapping().value(),
                                       llvm::IsaPred<GPUThreadMappingAttr>);
  int64_t countMappingTypes = 0;
  countMappingTypes += hasBlockMapping ? 1 : 0;
  countMappingTypes += hasWarpgroupMapping ? 1 : 0;
  countMappingTypes += hasWarpMapping ? 1 : 0;
  countMappingTypes += hasThreadMapping ? 1 : 0;
  if (countMappingTypes > 1) {
    return definiteFailureHelper(
        transformOp, forallOp,
        "cannot mix different mapping types, use nesting");
  }
  if (std::is_same<MappingKindType, BlockMappingKind>::value &&
      !hasBlockMapping) {
    return definiteFailureHelper(
        transformOp, forallOp,
        "scf.forall op requires a mapping attribute of kind 'block'");
  }
  if (std::is_same<MappingKindType, ThreadMappingKind>::value &&
      !hasThreadMapping && !hasWarpMapping && !hasWarpgroupMapping) {
    return definiteFailureHelper(transformOp, forallOp,
                                 "scf.forall op requires a mapping attribute "
                                 "of kind 'thread' or 'warp'");
  }

  llvm::DenseSet<Attribute> seen;
  for (Attribute map : forallOp.getMapping()->getValue()) {
    if (seen.contains(map)) {
      return definiteFailureHelper(
          transformOp, forallOp,
          "duplicate attribute, cannot map different loops "
          "to the same mapping id");
    }
    seen.insert(map);
  }

  auto isLinear = [](Attribute a) {
    return cast<DeviceMappingAttrInterface>(a).isLinearMapping();
  };
  if (llvm::any_of(forallOp.getMapping()->getValue(), isLinear) &&
      !llvm::all_of(forallOp.getMapping()->getValue(), isLinear)) {
    return definiteFailureHelper(
        transformOp, forallOp,
        "cannot mix linear and non-linear mapping modes");
  }

  return DiagnosedSilenceableFailure::success();
}
template <typename MappingKindType>
static DiagnosedSilenceableFailure
verifyGpuMapping(std::optional<TransformOpInterface> transformOp,
                 scf::ForallOp forallOp) {
  // Check that the kinds of the mapping attributes match.
  DiagnosedSilenceableFailure typeRes =
      checkMappingAttributeTypes<MappingKindType>(transformOp, forallOp);
  if (!typeRes.succeeded())
    return typeRes;

  // Perform other non-type verifications.
  if (!forallOp.isNormalized())
    return definiteFailureHelper(transformOp, forallOp,
                                 "unsupported non-normalized loops");
  if (forallOp.getNumResults() > 0)
    return definiteFailureHelper(transformOp, forallOp,
                                 "only bufferized scf.forall can be mapped");
  bool useLinearMapping = cast<DeviceMappingAttrInterface>(
                              forallOp.getMapping()->getValue().front())
                              .isLinearMapping();
  int64_t maxNumMappingsSupported =
      useLinearMapping ? (getMaxEnumValForMappingId() -
                          static_cast<uint64_t>(MappingId::DimZ))
                       : 3;
  if (forallOp.getRank() > maxNumMappingsSupported) {
    return definiteFailureHelper(transformOp, forallOp,
                                 "scf.forall with rank > ")
           << maxNumMappingsSupported
           << " does not lower for the specified mapping attribute type";
  }
  auto numParallelIterations =
      getConstantIntValues(forallOp.getMixedUpperBound());
  if (!forallOp.isNormalized() || !numParallelIterations.has_value()) {
    return definiteFailureHelper(
        transformOp, forallOp,
        "requires statically sized, normalized forall op");
  }
  return DiagnosedSilenceableFailure::success();
}
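// Outline of the rewrite driven further below (a summary, not normative):
// rewriteOneForallCommonImpl extracts the mapping attributes and static trip
// counts from the scf.forall, asks a GpuIdBuilder for the corresponding
// hardware id values, optionally guards the body with an scf.if when more ids
// are available than loop iterations, and finally inlines the forall body
// while remapping its induction variables to the generated ids.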
/// Struct to return the result of the rewrite of a forall operation.
struct ForallRewriteResult {
  SmallVector<int64_t> mappingSizes;
  SmallVector<Value> mappingIds;
};

/// Replace all uses of an id op whose mapping dimension is statically known
/// to be of size 1 with `replacement` (typically a constant 0).
template <typename OpTy, typename OperationOrBlock>
static void
replaceUnitMappingIdsHelper(RewriterBase &rewriter, Location loc,
                            OperationOrBlock *parent, Value replacement,
                            ArrayRef<int64_t> availableMappingSizes) {
  parent->walk([&](OpTy idOp) {
    if (availableMappingSizes[static_cast<int64_t>(idOp.getDimension())] == 1)
      rewriter.replaceAllUsesWith(idOp.getResult(), replacement);
  });
}
    RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
  LDBG("--start rewriteOneForallCommonImpl");
  auto numParallelIterations =
  assert(forallOp.isNormalized() && numParallelIterations.has_value() &&
         "requires statically sized, normalized forall op");
  forallMappingAttrs.insert_range(forallOp.getMapping()->getValue());
    return cast<DeviceMappingAttrInterface>(a).getMappingId() <
           cast<DeviceMappingAttrInterface>(b).getMappingId();
  DeviceMappingAttrInterface maxMapping = cast<DeviceMappingAttrInterface>(
      *llvm::max_element(forallMappingAttrs, comparator));
  DeviceMappingAttrInterface maxLinearMapping;
  if (maxMapping.isLinearMapping())
    maxLinearMapping = maxMapping;
    if (maxLinearMapping && comparator(maxLinearMapping, attr))
    if (!forallMappingAttrs.insert(attr))
    tmpMappingSizes.push_back(1);
  LDBG("----tmpMappingSizes extracted from scf.forall op: "
       << llvm::interleaved(tmpMappingSizes));
      forallMappingAttrs.getArrayRef(), tmpMappingSizes, comparator);
  LDBG("----forallMappingSizes: " << llvm::interleaved(forallMappingSizes));
  LDBG("----forallMappingAttrs: " << llvm::interleaved(forallMappingAttrs));
  bool originalBasisWasProvided = !originalBasis.empty();
  if (!originalBasisWasProvided) {
    originalBasis = forallMappingSizes;
    while (originalBasis.size() < 3)
      originalBasis.push_back(1);
      gpuIdBuilder.idBuilder(rewriter, loc, forallMappingSizes, originalBasis);
  for (auto [iv, dim] : llvm::zip_equal(
           forallOp.getInductionVars(),
           forallMappingAttrs.getArrayRef().take_front(forallOp.getRank()))) {
    auto mappingAttr = cast<DeviceMappingAttrInterface>(dim);
    Value peIdOp = mappingIdOps[mappingAttr.getRelativeIndex()];
  if (originalBasisWasProvided) {
    LDBG("----activeMappingSizes: " << llvm::interleaved(activeMappingSizes));
    LDBG("----availableMappingSizes: "
         << llvm::interleaved(availableMappingSizes));
    LDBG("----activeIdOps: " << llvm::interleaved(activeIdOps));
    for (auto [activeId, activeMappingSize, availableMappingSize] :
         llvm::zip_equal(activeIdOps, activeMappingSizes,
                         availableMappingSizes)) {
      if (activeMappingSize > availableMappingSize) {
            transformOp, forallOp,
            "Trying to map to fewer GPU threads than loop iterations but "
            "overprovisioning is not yet supported. "
            "Try additional tiling of the before mapping or map to more "
      if (activeMappingSize == availableMappingSize)
          rewriter.create<arith::ConstantIndexOp>(loc, activeMappingSize);
      Value tmpPredicate = rewriter.create<arith::CmpIOp>(
          loc, arith::CmpIPredicate::ult, activeId, idx);
      LDBG("----predicate: " << tmpPredicate);
      predicate = predicate ? rewriter.create<arith::AndIOp>(loc, predicate,
  rewriter.eraseOp(forallOp.getTerminator());
    auto ifOp = rewriter.create<scf::IfOp>(loc, predicate,
    targetBlock = ifOp.thenBlock();
    insertionPoint = ifOp.thenBlock()->begin();
    targetBlock = forallOp->getBlock();
  Block &sourceBlock = forallOp.getRegion().front();
  for (Value loopIndex : forallOp.getInductionVars()) {
  LDBG("----result forallMappingSizes: "
       << llvm::interleaved(forallMappingSizes));
  LDBG("----result mappingIdOps: " << llvm::interleaved(mappingIdOps));
    RewriterBase &rewriter, TransformOpInterface transformOp,
  LDBG("Start mapForallToBlocksImpl");
      verifyGpuMapping<BlockMappingKind>(transformOp, forallOp);
  if (!diag.succeeded())
  Block *parentBlock = forallOp->getBlock();
      rewriter, transformOp, forallOp, gridDims, rewriteResult, gpuIdBuilder);
  if (!diag.succeeded())
  if (gridDims.empty()) {
    while (gridDims.size() < 3)
      gridDims.push_back(1);
  assert(gridDims.size() == 3 && "Need 3-D gridDims");
  replaceUnitMappingIdsHelper<BlockDimOp>(rewriter, loc, parentBlock, zero,
/// Find the unique top-level scf.forall op nested under `target`.
static DiagnosedSilenceableFailure
findTopLevelForallOp(Operation *target, scf::ForallOp &topLevelForallOp,
                     TransformOpInterface transformOp) {
  auto walkResult = target->walk([&](scf::ForallOp forallOp) {
    if (forallOp->getParentOfType<scf::ForallOp>())
      return WalkResult::advance();
    if (topLevelForallOp)
      return WalkResult::interrupt();
    topLevelForallOp = forallOp;
    return WalkResult::advance();
  });

  if (walkResult.wasInterrupted() || !topLevelForallOp)
    return transformOp.emitSilenceableError()
           << "could not find a unique topLevel scf.forall";
  return DiagnosedSilenceableFailure::success();
}
  LaunchOp gpuLaunch = dyn_cast<LaunchOp>(target);
  auto transformOp = cast<TransformOpInterface>(getOperation());

  if (!getGenerateGpuLaunch() && !gpuLaunch) {
        emitSilenceableError()
        << "Given target is not gpu.launch, set `generate_gpu_launch` "
    diag.attachNote(target->getLoc()) << "when applied to this payload op";

  scf::ForallOp topLevelForallOp;
      target, topLevelForallOp, transformOp);
  if (!diag.succeeded()) {
    diag.attachNote(target->getLoc()) << "when applied to this payload op";
  assert(topLevelForallOp && "expect an scf.forall");

  if (!getGenerateGpuLaunch() && gridDims.size() != 3)
    return transformOp.emitDefiniteFailure("transform require size-3 mapping");

  if (getGenerateGpuLaunch()) {
    if (!diag.succeeded())
    rewriter.eraseOp(topLevelForallOp);
    topLevelForallOp = cast<scf::ForallOp>(newForallOp);

  bool useLinearMapping = false;
  if (topLevelForallOp.getMapping()) {
    auto mappingAttr = cast<DeviceMappingAttrInterface>(
        topLevelForallOp.getMapping()->getValue().front());
    useLinearMapping = mappingAttr.isLinearMapping();

      rewriter, transformOp, topLevelForallOp, gridDims, gpuBlockIdBuilder);
  if (!diag.succeeded())
      cast<TransformOpInterface>(getOperation()), gridDims[0],
      gridDims[1], gridDims[2]);
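// Hedged usage sketch from transform IR (the attribute spelling is an
// assumption and may differ across MLIR versions):
//
//   %launch = transform.gpu.map_forall_to_blocks %target
//       generate_gpu_launch grid_dims = [128, 1, 1]
//       : (!transform.any_op) -> !transform.any_op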
LogicalResult transform::MapForallToBlocksOp::verify() {
  if (!getGridDims().empty() && getGridDims().size() != 3) {
    return emitOpError() << "transform requires empty or size-3 grid_dims";
  }
  return success();
}
    std::optional<TransformOpInterface> transformOp, scf::ForallOp forallOp,
    int factor, bool useLinearMapping = false) {
  if (!useLinearMapping && blockOrGridSizes.front() % factor != 0) {
        transformOp, forallOp,
        Twine("3-D mapping: size of threadIdx.x must be a multiple of ") +
        transformOp, forallOp,
        Twine("the number of required parallel resources (blocks or "
        " overflows the number of available resources " +
  auto mappingAttr = cast<DeviceMappingAttrInterface>(
      forallOp.getMapping()->getValue().front());
  bool useLinearMapping = mappingAttr.isLinearMapping();
  auto numParallelIterations =
  if (!forallOp.isNormalized() || !numParallelIterations.has_value()) {
        transformOp, forallOp,
        "requires statically sized, normalized forall op");
  if (isa<GPUWarpgroupMappingAttr>(mappingAttr)) {
    factor = GpuWarpgroupIdBuilder::kNumWarpsPerGroup * warpSize;
  } else if (isa<GPUWarpMappingAttr>(mappingAttr)) {
                          blockSizes, factor, useLinearMapping);
  if (!diag.succeeded())
      .Case([&](GPUWarpgroupMappingAttr) {
      .Case([&](GPUWarpMappingAttr) {
      .Case([&](GPUThreadMappingAttr) {
      .Default([&](DeviceMappingAttrInterface) -> GpuIdBuilder {
        llvm_unreachable("unknown mapping attribute");
    RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
    bool syncAfterDistribute) {
      verifyGpuMapping<ThreadMappingKind>(transformOp, forallOp);
  if (!diag.succeeded())
      transformOp, forallOp, blockSizes, warpSize, gpuIdBuilder);
  if (!diag.succeeded())
      rewriter, transformOp, forallOp, blockSizes, rewriteResult, gpuIdBuilder);
  if (!diag.succeeded())
  if (syncAfterDistribute)
    rewriter.create<BarrierOp>(loc);
    RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
    bool syncAfterDistribute) {
  LDBG("Start mapNestedForallToThreadsImpl");
  if (blockDims.size() != 3) {
                                 "requires size-3 thread mapping");
  WalkResult walkResult = target->walk([&](scf::ForallOp forallOp) {
        rewriter, transformOp, forallOp, blockDims, warpSize,
        syncAfterDistribute);
    if (diag.isDefiniteFailure())
    if (diag.succeeded())
  replaceUnitMappingIdsHelper<ThreadIdOp>(rewriter, loc, target, zero,

  LaunchOp gpuLaunch = dyn_cast<LaunchOp>(target);
  auto transformOp = cast<TransformOpInterface>(getOperation());

    return emitSilenceableError() << "Given target is not a gpu.launch";

      checkGpuLimits(transformOp, std::nullopt, std::nullopt, std::nullopt,
                     blockDims[0], blockDims[1], blockDims[2]);
  if (diag.isSilenceableFailure()) {
    diag.attachNote(getLoc()) << getBlockDimsAttrName() << " is too large";
                       std::nullopt, std::nullopt, blockDims[0], blockDims[1],
                             getWarpSize(), getSyncAfterDistribute());
  results.push_back(gpuLaunch.getOperation());
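// Hedged usage sketch from transform IR (the attribute spelling is an
// assumption and may differ across MLIR versions):
//
//   transform.gpu.map_nested_forall_to_threads %launch
//       block_dims = [32, 4, 1] sync_after_distribute = true
//       : (!transform.any_op) -> !transform.any_op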
class GPUTransformDialectExtension
    : public transform::TransformDialectExtension<
          GPUTransformDialectExtension> {
public:
  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(GPUTransformDialectExtension)

  GPUTransformDialectExtension() {
    declareGeneratedDialect<scf::SCFDialect>();
    declareGeneratedDialect<arith::ArithDialect>();
    declareGeneratedDialect<GPUDialect>();
    registerTransformOps<
#define GET_OP_LIST
#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.cpp.inc"
        >();
  }
};

#define GET_OP_CLASSES
#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.cpp.inc"