#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InterleavedRange.h"
#include <type_traits>
#define DEBUG_TYPE "gpu-transforms"
#define DEBUG_TYPE_ALIAS "gpu-transforms-alias"

#define DBGS() (llvm::dbgs() << '[' << DEBUG_TYPE << "] ")
#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")
#define DBGS_ALIAS() (llvm::dbgs() << '[' << DEBUG_TYPE_ALIAS << "] ")
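
// GPU-to-NVVM conversion patterns: the body below (partially elided) maps
// gpu.address_space attributes to NVVM numeric address spaces; the elided
// remainder appears to register a gpu.mma_matrix-to-LLVM-struct type
// conversion (convertMMAToLLVMType) and to call
// populateGpuToNVVMConversionPatterns.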
void transform::ApplyGPUToNVVMConversionPatternsOp::populatePatterns(
      llvmTypeConverter, [](AddressSpace space) -> unsigned {
        case AddressSpace::Global:
          return static_cast<unsigned>(
        case AddressSpace::Workgroup:
          return static_cast<unsigned>(
        case AddressSpace::Private:
        llvm_unreachable("unknown address space enum value");
  llvmTypeConverter.addConversion(
transform::ApplyGPUToNVVMConversionPatternsOp::verifyTypeConverter(
    transform::TypeConverterBuilderOpInterface builder) {
  if (builder.getTypeConverterType() != "LLVMTypeConverter")
    return emitOpError("expected LLVMTypeConverter");
void transform::ApplyGPUWwmaToNVVMConversionPatternsOp::populatePatterns(

transform::ApplyGPUWwmaToNVVMConversionPatternsOp::verifyTypeConverter(
    transform::TypeConverterBuilderOpInterface builder) {
  if (builder.getTypeConverterType() != "LLVMTypeConverter")
    return emitOpError("expected LLVMTypeConverter");
void transform::ApplyGPUSubgroupReduceToNVVMConversionPatternsOp::

LogicalResult transform::ApplyGPUSubgroupReduceToNVVMConversionPatternsOp::
    verifyTypeConverter(transform::TypeConverterBuilderOpInterface builder) {
  if (builder.getTypeConverterType() != "LLVMTypeConverter")
    return emitOpError("expected LLVMTypeConverter");
void transform::ApplyGPUPromoteShuffleToAMDGPUPatternsOp::populatePatterns(
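
// Static helper (its name is in the elided lines) computing an unroll
// traversal order for a vector.contract: reduction dimensions first, then the
// parallel dimensions whose positions appear in one of the contract's indexing
// maps (collected into `dims`), then the remaining parallel dimensions.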
static std::optional<SmallVector<int64_t>>
      order.push_back(index);
  llvm::SmallDenseSet<int64_t> dims;
    dims.insert(cast<AffineDimExpr>(expr).getPosition());
      order.push_back(index);
      order.push_back(index);
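
// Static helper returning the native subgroup-MMA tile shape to unroll an op
// to: {m, n, k} for vector.contract, {m, n} for vector.transfer_write, and,
// for vector.transfer_read and other ops with a 2-D vector result, the common
// shape of any vector.extract_strided_slice users (which must all agree),
// falling back to {m, n}.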
static std::optional<SmallVector<int64_t>>
  if (auto contract = dyn_cast<vector::ContractionOp>(op)) {
    int64_t contractRank = contract.getIteratorTypes().size();
    if (contractRank < 3)
    nativeSize.append({m, n, k});

  if (auto writeOp = dyn_cast<vector::TransferWriteOp>(op)) {
    int64_t writeRank = writeOp.getVectorType().getRank();
    nativeSize.append({m, n});

  if (auto readOp = dyn_cast<vector::TransferReadOp>(op)) {
    VectorType sliceType;
      auto extract = dyn_cast<vector::ExtractStridedSliceOp>(users);
      auto vecType = cast<VectorType>(extract.getResult().getType());
      if (sliceType && sliceType != vecType)
    return llvm::to_vector(sliceType.getShape());

  if (auto vecType = dyn_cast<VectorType>(op->getResultTypes()[0])) {
    if (vecType.getRank() < 2)
    VectorType sliceType;
      auto extract = dyn_cast<vector::ExtractStridedSliceOp>(users);
      auto vecType = cast<VectorType>(extract.getResult().getType());
      if (sliceType && sliceType != vecType)
    return llvm::to_vector(sliceType.getShape());
    nativeSize.append({m, n});
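
// Wire the two helpers above into the generic vector unrolling driver: the
// native-shape callback picks the per-op tile and the traversal-order callback
// orders the unrolled iteration space for vector.contract.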
void transform::ApplyUnrollVectorsSubgroupMmaOp::populatePatterns(
    auto contract = dyn_cast<vector::ContractionOp>(op);
  vector::populateVectorUnrollPatterns(
          .setNativeShapeFn(nativeShapeFn)
          .setUnrollTraversalOrderFn(unrollOrder));
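
// Empty tag types used to select at compile time whether an scf.forall is
// being checked for mapping to blocks or to threads/warps/warpgroups.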
struct MappingKind {};
struct BlockMappingKind : MappingKind {};
struct ThreadMappingKind : MappingKind {};
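
// Diagnostic helper: report a definite failure through the transform op when
// one is provided; the elided branch presumably reports it on the target op.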
    Operation *target, const Twine &message) {
  if (transformOp.has_value())
    return transformOp->emitDefiniteFailure() << message;
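
// Check that the mapping attributes on an scf.forall are well formed for the
// requested mapping kind: a mapping must be present, block/warpgroup/warp/
// thread mappings must not be mixed, each mapping id may appear only once,
// and linear and non-linear mapping modes cannot be combined.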
template <typename MappingKindType>
                                   scf::ForallOp forallOp) {
  if (!forallOp.getMapping().has_value()) {
        "scf.forall op requires a mapping attribute");

  bool hasBlockMapping = llvm::any_of(forallOp.getMapping().value(),
                                      llvm::IsaPred<GPUBlockMappingAttr>);
  bool hasWarpgroupMapping = llvm::any_of(
      forallOp.getMapping().value(), llvm::IsaPred<GPUWarpgroupMappingAttr>);
  bool hasWarpMapping = llvm::any_of(forallOp.getMapping().value(),
                                     llvm::IsaPred<GPUWarpMappingAttr>);
  bool hasThreadMapping = llvm::any_of(forallOp.getMapping().value(),
                                       llvm::IsaPred<GPUThreadMappingAttr>);
  int64_t countMappingTypes = 0;
  countMappingTypes += hasBlockMapping ? 1 : 0;
  countMappingTypes += hasWarpgroupMapping ? 1 : 0;
  countMappingTypes += hasWarpMapping ? 1 : 0;
  countMappingTypes += hasThreadMapping ? 1 : 0;
  if (countMappingTypes > 1) {
        transformOp, forallOp,
        "cannot mix different mapping types, use nesting");
  if (std::is_same<MappingKindType, BlockMappingKind>::value &&
        transformOp, forallOp,
        "scf.forall op requires a mapping attribute of kind 'block'");
  if (std::is_same<MappingKindType, ThreadMappingKind>::value &&
      !hasThreadMapping && !hasWarpMapping && !hasWarpgroupMapping) {
        "scf.forall op requires a mapping attribute "
        "of kind 'thread' or 'warp'");

  for (Attribute map : forallOp.getMapping()->getValue()) {
    if (seen.contains(map)) {
          transformOp, forallOp,
          "duplicate attribute, cannot map different loops "
          "to the same mapping id");

    return cast<DeviceMappingAttrInterface>(a).isLinearMapping();
  if (llvm::any_of(forallOp.getMapping()->getValue(), isLinear) &&
      !llvm::all_of(forallOp.getMapping()->getValue(), isLinear)) {
        transformOp, forallOp,
        "cannot mix linear and non-linear mapping modes");
template <typename MappingKindType>
                 scf::ForallOp forallOp) {
      checkMappingAttributeTypes<MappingKindType>(transformOp, forallOp);
  if (!forallOp.isNormalized())
        "unsupported non-normalized loops");
  if (forallOp.getNumResults() > 0)
        "only bufferized scf.forall can be mapped");
  bool useLinearMapping = cast<DeviceMappingAttrInterface>(
                              forallOp.getMapping()->getValue().front())

  int64_t maxNumMappingsSupported =
      useLinearMapping ? (getMaxEnumValForMappingId() -
                          static_cast<uint64_t>(MappingId::DimZ))
  if (forallOp.getRank() > maxNumMappingsSupported) {
                                     "scf.forall with rank > ")
           << maxNumMappingsSupported
           << " does not lower for the specified mapping attribute type";

  auto numParallelIterations =
  if (!forallOp.isNormalized() || !numParallelIterations.has_value()) {
        transformOp, forallOp,
        "requires statically sized, normalized forall op");
template <typename OpTy, typename OperationOrBlock>
                            OperationOrBlock *parent, Value replacement,
  parent->walk([&](OpTy idOp) {
    if (availableMappingSizes[static_cast<int64_t>(idOp.getDimension())] == 1)
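
// Common rewrite shared by the block and thread mapping paths: sort the
// mapping attributes by id, materialize hardware ids through the provided
// GpuIdBuilder, predicate the body when fewer loop iterations than available
// resources are mapped, then move the scf.forall body in place and replace
// its induction variables with the ids.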
    RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
  LDBG("--start rewriteOneForallCommonImpl");

  auto numParallelIterations =
  assert(forallOp.isNormalized() && numParallelIterations.has_value() &&
         "requires statically sized, normalized forall op");
  forallMappingAttrs.insert_range(forallOp.getMapping()->getValue());
    return cast<DeviceMappingAttrInterface>(a).getMappingId() <
           cast<DeviceMappingAttrInterface>(b).getMappingId();

  DeviceMappingAttrInterface maxMapping = cast<DeviceMappingAttrInterface>(
      *llvm::max_element(forallMappingAttrs, comparator));
  DeviceMappingAttrInterface maxLinearMapping;
  if (maxMapping.isLinearMapping())
    maxLinearMapping = maxMapping;
    if (maxLinearMapping && comparator(maxLinearMapping, attr))
    if (!forallMappingAttrs.insert(attr))
    tmpMappingSizes.push_back(1);
  LDBG("----tmpMappingSizes extracted from scf.forall op: "
       << llvm::interleaved(tmpMappingSizes));

      forallMappingAttrs.getArrayRef(), tmpMappingSizes, comparator);
  LDBG("----forallMappingSizes: " << llvm::interleaved(forallMappingSizes));
  LDBG("----forallMappingAttrs: " << llvm::interleaved(forallMappingAttrs));

  bool originalBasisWasProvided = !originalBasis.empty();
  if (!originalBasisWasProvided) {
    originalBasis = forallMappingSizes;
    while (originalBasis.size() < 3)
      originalBasis.push_back(1);

      gpuIdBuilder.idBuilder(rewriter, loc, forallMappingSizes, originalBasis);

  for (auto [iv, dim] : llvm::zip_equal(
           forallOp.getInductionVars(),
           forallMappingAttrs.getArrayRef().take_front(forallOp.getRank()))) {
    auto mappingAttr = cast<DeviceMappingAttrInterface>(dim);
    Value peIdOp = mappingIdOps[mappingAttr.getRelativeIndex()];

  if (originalBasisWasProvided) {
    LDBG("----activeMappingSizes: " << llvm::interleaved(activeMappingSizes));
    LDBG("----availableMappingSizes: "
         << llvm::interleaved(availableMappingSizes));
    LDBG("----activeIdOps: " << llvm::interleaved(activeIdOps));
    for (auto [activeId, activeMappingSize, availableMappingSize] :
         llvm::zip_equal(activeIdOps, activeMappingSizes,
                         availableMappingSizes)) {
      if (activeMappingSize > availableMappingSize) {
            transformOp, forallOp,
            "Trying to map to fewer GPU threads than loop iterations but "
            "overprovisioning is not yet supported. "
            "Try additional tiling before mapping or map to more "
      if (activeMappingSize == availableMappingSize)
          rewriter.create<arith::ConstantIndexOp>(loc, activeMappingSize);
      Value tmpPredicate = rewriter.create<arith::CmpIOp>(
          loc, arith::CmpIPredicate::ult, activeId, idx);
      LDBG("----predicate: " << tmpPredicate);
      predicate = predicate ? rewriter.create<arith::AndIOp>(loc, predicate,

  rewriter.eraseOp(forallOp.getTerminator());

    auto ifOp = rewriter.create<scf::IfOp>(loc, predicate,
    targetBlock = ifOp.thenBlock();
    insertionPoint = ifOp.thenBlock()->begin();

    targetBlock = forallOp->getBlock();
  Block &sourceBlock = forallOp.getRegion().front();

  for (Value loopIndex : forallOp.getInductionVars()) {

  LDBG("----result forallMappingSizes: "
       << llvm::interleaved(forallMappingSizes));
  LDBG("----result mappingIdOps: " << llvm::interleaved(mappingIdOps));
    RewriterBase &rewriter, TransformOpInterface transformOp,
  LDBG("Start mapForallToBlocksImpl");

      verifyGpuMapping<BlockMappingKind>(transformOp, forallOp);
  if (!diag.succeeded())

  Block *parentBlock = forallOp->getBlock();

      rewriter, transformOp, forallOp,
      gridDims, rewriteResult, gpuIdBuilder);
  if (!diag.succeeded())

  if (gridDims.empty()) {
    while (gridDims.size() < 3)
      gridDims.push_back(1);
  assert(gridDims.size() == 3 && "Need 3-D gridDims");

  replaceUnitMappingIdsHelper<BlockDimOp>(rewriter, loc, parentBlock, zero,
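
// Walk the payload and find the unique top-level scf.forall (ignoring nested
// ones); fail with a silenceable error if none or more than one is found.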
                                  scf::ForallOp &topLevelForallOp,
                                  TransformOpInterface transformOp) {
  auto walkResult = target->walk([&](scf::ForallOp forallOp) {
    if (forallOp->getParentOfType<scf::ForallOp>())
    if (topLevelForallOp)
    topLevelForallOp = forallOp;

  if (walkResult.wasInterrupted() || !topLevelForallOp)
    return transformOp.emitSilenceableError()
           << "could not find a unique topLevel scf.forall";
  LaunchOp gpuLaunch = dyn_cast<LaunchOp>(target);
  auto transformOp = cast<TransformOpInterface>(getOperation());

  if (!getGenerateGpuLaunch() && !gpuLaunch) {
        emitSilenceableError()
        << "Given target is not gpu.launch, set `generate_gpu_launch` "
    diag.attachNote(target->getLoc()) << "when applied to this payload op";

  scf::ForallOp topLevelForallOp;
      target, topLevelForallOp, transformOp);
  if (!diag.succeeded()) {
    diag.attachNote(target->getLoc()) << "when applied to this payload op";
  assert(topLevelForallOp && "expect an scf.forall");
  if (!getGenerateGpuLaunch() && gridDims.size() != 3)
    return transformOp.emitDefiniteFailure("transform requires size-3 mapping");
  if (getGenerateGpuLaunch()) {
    if (!diag.succeeded())
    rewriter.eraseOp(topLevelForallOp);
    topLevelForallOp = cast<scf::ForallOp>(newForallOp);

  bool useLinearMapping = false;
  if (topLevelForallOp.getMapping()) {
    auto mappingAttr = cast<DeviceMappingAttrInterface>(
        topLevelForallOp.getMapping()->getValue().front());
    useLinearMapping = mappingAttr.isLinearMapping();

      rewriter, transformOp, topLevelForallOp, gridDims, gpuBlockIdBuilder);
  if (!diag.succeeded())

      cast<TransformOpInterface>(getOperation()), gridDims[0],
      gridDims[1], gridDims[2]);

  if (!getGridDims().empty() && getGridDims().size() != 3) {
    return emitOpError() << "transform requires empty or size-3 grid_dims";
    std::optional<TransformOpInterface> transformOp, scf::ForallOp forallOp,
    int factor, bool useLinearMapping = false) {
  if (!useLinearMapping && blockOrGridSizes.front() % factor != 0) {
        transformOp, forallOp,
        Twine("3-D mapping: size of threadIdx.x must be a multiple of ") +

        transformOp, forallOp,
        Twine("the number of required parallel resources (blocks or "
        " overflows the number of available resources " +

  auto mappingAttr = cast<DeviceMappingAttrInterface>(
      forallOp.getMapping()->getValue().front());
  bool useLinearMapping = mappingAttr.isLinearMapping();

  auto numParallelIterations =
  if (!forallOp.isNormalized() || !numParallelIterations.has_value()) {
        transformOp, forallOp,
        "requires statically sized, normalized forall op");

  if (isa<GPUWarpgroupMappingAttr>(mappingAttr)) {
    factor = GpuWarpgroupIdBuilder::kNumWarpsPerGroup * warpSize;
  } else if (isa<GPUWarpMappingAttr>(mappingAttr)) {
                                  blockSizes, factor, useLinearMapping);
  if (!diag.succeeded())

      .Case([&](GPUWarpgroupMappingAttr) {
      .Case([&](GPUWarpMappingAttr) {
      .Case([&](GPUThreadMappingAttr) {
      .Default([&](DeviceMappingAttrInterface) -> GpuIdBuilder {
        llvm_unreachable("unknown mapping attribute");
    RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
    bool syncAfterDistribute) {
      verifyGpuMapping<ThreadMappingKind>(transformOp, forallOp);
  if (!diag.succeeded())

      transformOp, forallOp, blockSizes, warpSize, gpuIdBuilder);
  if (!diag.succeeded())

      rewriter, transformOp, forallOp, blockSizes, rewriteResult, gpuIdBuilder);
  if (!diag.succeeded())

  if (syncAfterDistribute)
    rewriter.create<BarrierOp>(loc);
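
// Walk all scf.forall ops nested under the target and map each one to threads
// with the given 3-D block dimensions, then replace thread ids along unit
// dimensions with zero.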
    RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
    bool syncAfterDistribute) {
  LDBG("Start mapNestedForallToThreadsImpl");
  if (blockDims.size() != 3) {
        "requires size-3 thread mapping");

  WalkResult walkResult = target->walk([&](scf::ForallOp forallOp) {
        rewriter, transformOp, forallOp, blockDims, warpSize,
        syncAfterDistribute);
    if (diag.isDefiniteFailure())
    if (diag.succeeded())

  replaceUnitMappingIdsHelper<ThreadIdOp>(rewriter, loc, target, zero,
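
// applyToOne of the map-nested-forall-to-threads transform op: requires a
// gpu.launch target, validates the requested block dimensions against the
// hardware limits, appears to update the launch's block sizes, and maps every
// nested scf.forall to threads.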
  LaunchOp gpuLaunch = dyn_cast<LaunchOp>(target);
  auto transformOp = cast<TransformOpInterface>(getOperation());

    return emitSilenceableError() << "Given target is not a gpu.launch";

      checkGpuLimits(transformOp, std::nullopt, std::nullopt, std::nullopt,
                     blockDims[0], blockDims[1], blockDims[2]);
  if (diag.isSilenceableFailure()) {
    diag.attachNote(getLoc()) << getBlockDimsAttrName() << " is too large";

                 std::nullopt, std::nullopt, blockDims[0], blockDims[1],

                                     getWarpSize(), getSyncAfterDistribute());

  results.push_back(gpuLaunch.getOperation());
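
// Transform dialect extension that registers the GPU transform ops and
// declares the dialects they may generate (gpu, amdgpu, arith, scf).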
class GPUTransformDialectExtension
                                          GPUTransformDialectExtension> {
  GPUTransformDialectExtension() {
    declareGeneratedDialect<GPUDialect>();
    declareGeneratedDialect<amdgpu::AMDGPUDialect>();
    declareGeneratedDialect<arith::ArithDialect>();
    declareGeneratedDialect<scf::SCFDialect>();
    registerTransformOps<
#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.cpp.inc"

#define GET_OP_CLASSES
#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.cpp.inc"
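
// Typical client setup (a sketch; the exact namespace of the registration
// helper is assumed here): attach the extension to a DialectRegistry so the
// GPU transform ops become available to the transform interpreter.
//
//   mlir::DialectRegistry registry;
//   mlir::gpu::registerTransformDialectExtension(registry);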