#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Support/DebugLog.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InterleavedRange.h"
#include "llvm/Support/LogicalResult.h"

#define DEBUG_TYPE "gpu-transforms"
void transform::ApplyGPUToNVVMConversionPatternsOp::populatePatterns(
    TypeConverter &typeConverter, RewritePatternSet &patterns) {
  auto &llvmTypeConverter = static_cast<LLVMTypeConverter &>(typeConverter);
  populateCommonGPUTypeAndAttributeConversions(llvmTypeConverter);
  llvmTypeConverter.addConversion(
      [&](gpu::MMAMatrixType type) -> Type { return convertMMAToLLVMType(type); });
  populateGpuToNVVMConversionPatterns(llvmTypeConverter, patterns);
}

LogicalResult
transform::ApplyGPUToNVVMConversionPatternsOp::verifyTypeConverter(
    transform::TypeConverterBuilderOpInterface builder) {
  if (builder.getTypeConverterType() != "LLVMTypeConverter")
    return emitOpError("expected LLVMTypeConverter");
  return success();
}
void transform::ApplyGPUWwmaToNVVMConversionPatternsOp::populatePatterns(
    TypeConverter &typeConverter, RewritePatternSet &patterns) {
  auto &llvmTypeConverter = static_cast<LLVMTypeConverter &>(typeConverter);
  populateGpuWMMAToNVVMConversionPatterns(llvmTypeConverter, patterns);
}

LogicalResult
transform::ApplyGPUWwmaToNVVMConversionPatternsOp::verifyTypeConverter(
    transform::TypeConverterBuilderOpInterface builder) {
  if (builder.getTypeConverterType() != "LLVMTypeConverter")
    return emitOpError("expected LLVMTypeConverter");
  return success();
}
void transform::ApplyGPUSubgroupReduceToNVVMConversionPatternsOp::
    populatePatterns(TypeConverter &typeConverter,
                     RewritePatternSet &patterns) {
  auto &llvmTypeConverter = static_cast<LLVMTypeConverter &>(typeConverter);
  populateGpuSubgroupReduceOpLoweringPattern(llvmTypeConverter, patterns);
}

LogicalResult transform::ApplyGPUSubgroupReduceToNVVMConversionPatternsOp::
    verifyTypeConverter(transform::TypeConverterBuilderOpInterface builder) {
  if (builder.getTypeConverterType() != "LLVMTypeConverter")
    return emitOpError("expected LLVMTypeConverter");
  return success();
}
void transform::ApplyGPUToROCDLConversionPatternsOp::populatePatterns(
    TypeConverter &typeConverter, RewritePatternSet &patterns) {
  auto &llvmTypeConverter = static_cast<LLVMTypeConverter &>(typeConverter);
  populateCommonGPUTypeAndAttributeConversions(llvmTypeConverter);
  FailureOr<amdgpu::Chipset> maybeChipset =
      amdgpu::Chipset::parse(getChipset());
  assert(llvm::succeeded(maybeChipset) && "expected valid chipset");
  populateGpuToROCDLConversionPatterns(
      llvmTypeConverter, patterns, gpu::amd::Runtime::HIP, *maybeChipset);
}

LogicalResult
transform::ApplyGPUToROCDLConversionPatternsOp::verifyTypeConverter(
    transform::TypeConverterBuilderOpInterface builder) {
  FailureOr<amdgpu::Chipset> maybeChipset =
      amdgpu::Chipset::parse(getChipset());
  if (failed(maybeChipset))
    return emitOpError("Invalid chipset name: " + getChipset());
  if (builder.getTypeConverterType() != "LLVMTypeConverter")
    return emitOpError("expected LLVMTypeConverter");
  return success();
}
void transform::ApplyGPUPromoteShuffleToAMDGPUPatternsOp::populatePatterns(
    RewritePatternSet &patterns) {
  std::optional<StringRef> chipsetName = getChipset();
  std::optional<amdgpu::Chipset> maybeChipset;
  if (chipsetName) {
    FailureOr<amdgpu::Chipset> parsedChipset =
        amdgpu::Chipset::parse(*chipsetName);
    assert(llvm::succeeded(parsedChipset) && "expected valid chipset");
    maybeChipset = parsedChipset;
  }
  populateGpuPromoteShuffleToAMDGPUPatterns(patterns, maybeChipset);
}
/// Pick an unrolling order that allows the tensor-core operation to reuse the
/// LHS register.
static std::optional<SmallVector<int64_t>>
gpuMmaUnrollOrder(vector::ContractionOp contract) {
  SmallVector<int64_t> order;
  // First make reduction the outer dimensions.
  for (auto [index, iter] : llvm::enumerate(contract.getIteratorTypes())) {
    if (vector::isReductionIterator(iter))
      order.push_back(index);
  }

  llvm::SmallDenseSet<int64_t> dims;
  for (AffineExpr expr : contract.getIndexingMapsArray()[0].getResults())
    dims.insert(cast<AffineDimExpr>(expr).getPosition());
  // Then parallel dimensions that are part of the LHS, to re-use the LHS.
  for (auto [index, iter] : llvm::enumerate(contract.getIteratorTypes())) {
    if (vector::isParallelIterator(iter) && dims.count(index))
      order.push_back(index);
  }
  // Then the remaining parallel loops.
  for (auto [index, iter] : llvm::enumerate(contract.getIteratorTypes())) {
    if (vector::isParallelIterator(iter) && !dims.count(index))
      order.push_back(index);
  }
  return order;
}
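// For illustration (not exercised by the code above): on a standard matmul
// contraction with iterator_types = [parallel, parallel, reduction] for
// (m, n, k) and an LHS indexing map over (m, k), the traversal order produced
// is [2, 0, 1]: the k reduction first, then m (present in the LHS map), then n.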
/// Returns the target vector size for the target operation based on the
/// native vector size specified with `m`, `n`, `k`.
static std::optional<SmallVector<int64_t>>
getSubgroupMmaNativeVectorSize(Operation *op, int64_t m, int64_t n,
                               int64_t k) {
  if (auto contract = dyn_cast<vector::ContractionOp>(op)) {
    int64_t contractRank = contract.getIteratorTypes().size();
    if (contractRank < 3)
      return std::nullopt;
    SmallVector<int64_t> nativeSize(contractRank - 3, 1);
    nativeSize.append({m, n, k});
    return nativeSize;
  }
  if (auto writeOp = dyn_cast<vector::TransferWriteOp>(op)) {
    int64_t writeRank = writeOp.getVectorType().getRank();
    if (writeRank < 2)
      return std::nullopt;
    SmallVector<int64_t> nativeSize(writeRank - 2, 1);
    nativeSize.append({m, n});
    return nativeSize;
  }
  if (auto readOp = dyn_cast<vector::TransferReadOp>(op)) {
    // Match the shape used by any extract_strided_slice users.
    VectorType sliceType;
    for (Operation *users : op->getUsers()) {
      auto extract = dyn_cast<vector::ExtractStridedSliceOp>(users);
      if (!extract)
        return std::nullopt;
      auto vecType = cast<VectorType>(extract.getResult().getType());
      if (sliceType && sliceType != vecType)
        return std::nullopt;
      sliceType = vecType;
    }
    return llvm::to_vector(sliceType.getShape());
  }
  if (OpTrait::hasElementwiseMappableTraits(op) && op->getNumResults() == 1) {
    if (auto vecType = dyn_cast<VectorType>(op->getResultTypes()[0])) {
      if (vecType.getRank() < 2)
        return std::nullopt;
      // Prefer the shape of an extract_strided_slice user if there is one.
      VectorType sliceType;
      for (Operation *users : op->getUsers()) {
        auto extract = dyn_cast<vector::ExtractStridedSliceOp>(users);
        if (!extract)
          return std::nullopt;
        auto vecType = cast<VectorType>(extract.getResult().getType());
        if (sliceType && sliceType != vecType)
          return std::nullopt;
        sliceType = vecType;
      }
      if (sliceType)
        return llvm::to_vector(sliceType.getShape());
      // Otherwise map elementwise ops to the output shape.
      SmallVector<int64_t> nativeSize(vecType.getRank() - 2, 1);
      nativeSize.append({m, n});
      return nativeSize;
    }
  }
  return std::nullopt;
}
void transform::ApplyUnrollVectorsSubgroupMmaOp::populatePatterns(
    RewritePatternSet &patterns) {
  auto unrollOrder = [](Operation *op) -> std::optional<SmallVector<int64_t>> {
    auto contract = dyn_cast<vector::ContractionOp>(op);
    if (!contract)
      return std::nullopt;
    return gpuMmaUnrollOrder(contract);
  };

  int64_t m = getM(), n = getN(), k = getK();
  auto nativeShapeFn =
      [m, n, k](Operation *op) -> std::optional<SmallVector<int64_t>> {
    return getSubgroupMmaNativeVectorSize(op, m, n, k);
  };
  vector::populateVectorUnrollPatterns(
      patterns, vector::UnrollVectorOptions()
                    .setNativeShapeFn(nativeShapeFn)
                    .setUnrollTraversalOrderFn(unrollOrder));
}
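// A minimal standalone sketch of applying the unrolling configured above
// outside the transform dialect (assumptions: `funcOp` is any func-like op,
// and the greedy driver from "mlir/Transforms/GreedyPatternRewriteDriver.h"
// is available). Unlike the transform op, it fixes the native tile to
// 16x16x16 rather than reading it from op attributes.
static LogicalResult unrollMmaTo16x16x16(Operation *funcOp) {
  RewritePatternSet patterns(funcOp->getContext());
  vector::populateVectorUnrollPatterns(
      patterns,
      vector::UnrollVectorOptions().setNativeShapeFn(
          [](Operation *op) -> std::optional<SmallVector<int64_t>> {
            return getSubgroupMmaNativeVectorSize(op, /*m=*/16, /*n=*/16,
                                                  /*k=*/16);
          }));
  return applyPatternsGreedily(funcOp, std::move(patterns));
}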
/// Helper kinds used to select the verification behavior below.
struct MappingKind {};
struct BlockMappingKind : MappingKind {};
struct ThreadMappingKind : MappingKind {};

static DiagnosedSilenceableFailure
definiteFailureHelper(std::optional<TransformOpInterface> transformOp,
                      Operation *target, const Twine &message) {
  if (transformOp.has_value())
    return transformOp->emitDefiniteFailure() << message;
  return emitDefiniteFailure(target, message);
}
/// Check if the given mapping attributes are of the desired kind.
template <typename MappingKindType>
static DiagnosedSilenceableFailure
checkMappingAttributeTypes(std::optional<TransformOpInterface> transformOp,
                           scf::ForallOp forallOp) {
  if (!forallOp.getMapping().has_value()) {
    return definiteFailureHelper(transformOp, forallOp,
                                 "scf.forall op requires a mapping attribute");
  }

  bool hasBlockMapping = llvm::any_of(forallOp.getMapping().value(),
                                      llvm::IsaPred<GPUBlockMappingAttr>);
  bool hasWarpgroupMapping = llvm::any_of(
      forallOp.getMapping().value(), llvm::IsaPred<GPUWarpgroupMappingAttr>);
  bool hasWarpMapping = llvm::any_of(forallOp.getMapping().value(),
                                     llvm::IsaPred<GPUWarpMappingAttr>);
  bool hasThreadMapping = llvm::any_of(forallOp.getMapping().value(),
                                       llvm::IsaPred<GPUThreadMappingAttr>);
  bool hasLaneMapping = llvm::any_of(forallOp.getMapping().value(),
                                     llvm::IsaPred<GPULaneMappingAttr>);
  int64_t countMappingTypes = 0;
  countMappingTypes += hasBlockMapping ? 1 : 0;
  countMappingTypes += hasWarpgroupMapping ? 1 : 0;
  countMappingTypes += hasWarpMapping ? 1 : 0;
  countMappingTypes += hasThreadMapping ? 1 : 0;
  countMappingTypes += hasLaneMapping ? 1 : 0;
  if (countMappingTypes > 1) {
    return definiteFailureHelper(
        transformOp, forallOp,
        "cannot mix different mapping types, use nesting");
  }
  if (std::is_same<MappingKindType, BlockMappingKind>::value &&
      !hasBlockMapping) {
    return definiteFailureHelper(
        transformOp, forallOp,
        "scf.forall op requires a mapping attribute of kind 'block'");
  }
  if (std::is_same<MappingKindType, ThreadMappingKind>::value &&
      !hasLaneMapping && !hasThreadMapping && !hasWarpMapping &&
      !hasWarpgroupMapping) {
    return definiteFailureHelper(transformOp, forallOp,
                                 "scf.forall op requires a mapping attribute "
                                 "of kind 'thread' or 'warp'");
  }

  DenseSet<Attribute> seen;
  for (Attribute map : forallOp.getMapping()->getValue()) {
    if (seen.contains(map)) {
      return definiteFailureHelper(
          transformOp, forallOp,
          "duplicate attribute, cannot map different loops "
          "to the same mapping id");
    }
    seen.insert(map);
  }

  auto isLinear = [](DeviceMappingAttrInterface attr) {
    return attr.isLinearMapping();
  };
  if (llvm::any_of(forallOp.getDeviceMappingAttrs(), isLinear) &&
      !llvm::all_of(forallOp.getDeviceMappingAttrs(), isLinear)) {
    return definiteFailureHelper(
        transformOp, forallOp,
        "cannot mix linear and non-linear mapping modes");
  }

  FailureOr<DeviceMaskingAttrInterface> maybeMaskingAttr =
      forallOp.getDeviceMaskingAttr();
  if (succeeded(maybeMaskingAttr) && *maybeMaskingAttr &&
      !forallOp.usesLinearMapping()) {
    return definiteFailureHelper(
        transformOp, forallOp,
        "device masking is only available in linear mapping mode");
  }

  return DiagnosedSilenceableFailure::success();
}
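// For illustration, a mapping that the check above accepts and one it rejects
// (attribute syntax as used by the GPU dialect):
//   accepted: scf.forall ... { mapping = [#gpu.thread<y>, #gpu.thread<x>] }
//   rejected: scf.forall ... { mapping = [#gpu.block<x>, #gpu.thread<y>] }
//             ("cannot mix different mapping types, use nesting")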
template <typename MappingKindType>
static DiagnosedSilenceableFailure
verifyGpuMapping(std::optional<TransformOpInterface> transformOp,
                 scf::ForallOp forallOp) {
  // Check the types of the mapping attributes match.
  DiagnosedSilenceableFailure typeRes =
      checkMappingAttributeTypes<MappingKindType>(transformOp, forallOp);
  if (!typeRes.succeeded())
    return typeRes;

  // Perform other non-type verifications.
  if (!forallOp.isNormalized())
    return definiteFailureHelper(transformOp, forallOp,
                                 "unsupported non-normalized loops");
  if (forallOp.getNumResults() > 0)
    return definiteFailureHelper(transformOp, forallOp,
                                 "only bufferized scf.forall can be mapped");
  bool useLinearMapping = forallOp.usesLinearMapping();
  // Linear mapping supports more mapping ids than the 3-D x/y/z mapping.
  int64_t maxNumMappingsSupported =
      useLinearMapping ? (getMaxEnumValForMappingId() -
                          static_cast<uint64_t>(MappingId::DimZ))
                       : 3;
  if (forallOp.getRank() > maxNumMappingsSupported) {
    return definiteFailureHelper(transformOp, forallOp,
                                 "scf.forall with rank > ")
           << maxNumMappingsSupported
           << " does not lower for the specified mapping attribute type";
  }
  auto numParallelIterations =
      getConstantIntValues(forallOp.getMixedUpperBound());
  if (!forallOp.isNormalized() || !numParallelIterations.has_value()) {
    return definiteFailureHelper(
        transformOp, forallOp,
        "requires statically sized, normalized forall op");
  }
  return DiagnosedSilenceableFailure::success();
}
/// Replace ids of dimensions known to be 1 by 0 to simplify the IR.
template <typename OpTy, typename OperationOrBlock>
static void
replaceUnitMappingIdsHelper(RewriterBase &rewriter, Location loc,
                            OperationOrBlock *parent, Value replacement,
                            ArrayRef<int64_t> availableMappingSizes) {
  parent->walk([&](OpTy idOp) {
    if (availableMappingSizes[static_cast<int64_t>(idOp.getDimension())] == 1)
      rewriter.replaceAllUsesWith(idOp.getResult(), replacement);
  });
}
/// Struct to return the result of the rewrite of a forall operation.
struct ForallRewriteResult {
  SmallVector<int64_t> mappingSizes;
  SmallVector<Value> mappingIds;
};

static DiagnosedSilenceableFailure rewriteOneForallCommonImpl(
    RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
    scf::ForallOp forallOp, ArrayRef<int64_t> availableMappingSizes,
    ForallRewriteResult &result, const GpuIdBuilder &gpuIdBuilder) {
  LDBG() << "--start rewriteOneForallCommonImpl";

  // Step 0. GPU-specific verifications.
  auto numParallelIterations =
      getConstantIntValues(forallOp.getMixedUpperBound());
  assert(forallOp.isNormalized() && numParallelIterations.has_value() &&
         "requires statically sized, normalized forall op");
  SmallVector<int64_t> tmpMappingSizes = numParallelIterations.value();
  SmallVector<DeviceMappingAttrInterface> forallMappingAttrsVec =
      forallOp.getDeviceMappingAttrs();
  SetVector<Attribute> forallMappingAttrs;
  forallMappingAttrs.insert_range(forallMappingAttrsVec);
  auto comparator = [](Attribute a, Attribute b) -> bool {
    return cast<DeviceMappingAttrInterface>(a).getMappingId() <
           cast<DeviceMappingAttrInterface>(b).getMappingId();
  };

  // Step 1. Complete the mapping to a full mapping (with 1s) if necessary.
  DeviceMappingAttrInterface maxMapping = cast<DeviceMappingAttrInterface>(
      *llvm::max_element(forallMappingAttrs, comparator));
  DeviceMappingAttrInterface maxLinearMapping;
  if (maxMapping.isLinearMapping())
    maxLinearMapping = maxMapping;
  for (auto attr : gpuIdBuilder.mappingAttributes) {
    // If attr overflows, just skip.
    if (maxLinearMapping && comparator(maxLinearMapping, attr))
      continue;
    // If attr is already present, skip; otherwise append a size-1 entry.
    if (!forallMappingAttrs.insert(attr))
      continue;
    tmpMappingSizes.push_back(1);
  }
  LDBG() << "----tmpMappingSizes extracted from scf.forall op: "
         << llvm::interleaved(tmpMappingSizes);

  // Step 2. Sort the values by the corresponding DeviceMappingAttrInterface.
  SmallVector<int64_t> forallMappingSizes = getValuesSortedByKey(
      forallMappingAttrs.getArrayRef(), tmpMappingSizes, comparator);
  LDBG() << "----forallMappingSizes: " << llvm::interleaved(forallMappingSizes);
  LDBG() << "----forallMappingAttrs: " << llvm::interleaved(forallMappingAttrs);

  // Step 3. Generate the mappingIdOps using the provided generator.
  Location loc = forallOp.getLoc();
  OpBuilder::InsertionGuard guard(rewriter);
  rewriter.setInsertionPoint(forallOp);
  SmallVector<int64_t> originalBasis(availableMappingSizes);
  bool originalBasisWasProvided = !originalBasis.empty();
  if (!originalBasisWasProvided) {
    LDBG() << "----originalBasis was not provided, deriving it and there will "
              "be no predication";
    originalBasis = forallMappingSizes;
    while (originalBasis.size() < 3)
      originalBasis.push_back(1);
  } else {
    LDBG() << "----originalBasis was provided, using it, there will be "
              "predication";
  }
  LDBG() << "------originalBasis: " << llvm::interleaved(originalBasis);

  IdBuilderResult builderResult =
      gpuIdBuilder.idBuilder(rewriter, loc, forallMappingSizes, originalBasis);
  if (!builderResult.errorMsg.empty())
    return definiteFailureHelper(transformOp, forallOp,
                                 builderResult.errorMsg);
  LDBG() << builderResult;

  // Step 4. Map the induction variables to the mappingIdOps, this may involve
  // a permutation.
  SmallVector<Value> mappingIdOps = builderResult.mappingIdOps;
  IRMapping bvm;
  for (auto [iv, dim] : llvm::zip_equal(
           forallOp.getInductionVars(),
           forallMappingAttrs.getArrayRef().take_front(forallOp.getRank()))) {
    auto mappingAttr = cast<DeviceMappingAttrInterface>(dim);
    Value peIdOp = mappingIdOps[mappingAttr.getRelativeIndex()];
    LDBG() << "----map: " << iv << " to " << peIdOp;
    bvm.map(iv, peIdOp);
  }

  // Step 5. If the originalBasis is already known, create conditionals to
  // predicate the region. Otherwise, the current forall determines the
  // originalBasis and no predication occurs.
  Value predicate;
  if (originalBasisWasProvided) {
    for (Value tmpPredicate : builderResult.predicateOps) {
      predicate = predicate ? arith::AndIOp::create(rewriter, loc, predicate,
                                                    tmpPredicate)
                            : tmpPredicate;
    }
  }

  // Step 6. Move the body of forallOp.
  // Erase the terminator first, it will not be used.
  rewriter.eraseOp(forallOp.getTerminator());
  Block *targetBlock;
  Block::iterator insertionPoint;
  if (predicate) {
    // Step 6.a. If predicated, move at the beginning of the scf.if body.
    auto ifOp = scf::IfOp::create(rewriter, loc, predicate,
                                  /*withElseRegion=*/false);
    targetBlock = ifOp.thenBlock();
    insertionPoint = ifOp.thenBlock()->begin();
  } else {
    // Step 6.b. Otherwise, move inline just at the rewriter insertion point.
    targetBlock = forallOp->getBlock();
    insertionPoint = rewriter.getInsertionPoint();
  }
  Block &sourceBlock = forallOp.getRegion().front();
  targetBlock->getOperations().splice(insertionPoint,
                                      sourceBlock.getOperations());

  // Step 7. RAUW induction variables with the mapping ids.
  for (Value loopIndex : forallOp.getInductionVars()) {
    Value threadIdx = bvm.lookup(loopIndex);
    rewriter.replaceAllUsesWith(loopIndex, threadIdx);
  }

  // Step 8. Erase the old op.
  rewriter.eraseOp(forallOp);

  LDBG() << "----result forallMappingSizes: "
         << llvm::interleaved(forallMappingSizes);
  LDBG() << "----result mappingIdOps: " << llvm::interleaved(mappingIdOps);

  result = ForallRewriteResult{forallMappingSizes, mappingIdOps};
  return DiagnosedSilenceableFailure::success();
}
DiagnosedSilenceableFailure mlir::transform::gpu::mapForallToBlocksImpl(
    RewriterBase &rewriter, TransformOpInterface transformOp,
    scf::ForallOp forallOp, SmallVectorImpl<int64_t> &gridDims,
    const GpuIdBuilder &gpuIdBuilder) {
  LDBG() << "Start mapForallToBlocksImpl";

  DiagnosedSilenceableFailure verifyDiag =
      verifyGpuMapping<BlockMappingKind>(transformOp, forallOp);
  if (!verifyDiag.succeeded())
    return verifyDiag;

  Location loc = forallOp.getLoc();
  Block *parentBlock = forallOp->getBlock();
  Value zero;
  {
    // Create an early zero index value for replacements and immediately reset
    // the insertion point.
    OpBuilder::InsertionGuard guard(rewriter);
    rewriter.setInsertionPointToStart(parentBlock);
    zero = arith::ConstantIndexOp::create(rewriter, loc, 0);
  }

  ForallRewriteResult rewriteResult;
  DiagnosedSilenceableFailure diag = rewriteOneForallCommonImpl(
      rewriter, transformOp, forallOp,
      /*availableMappingSizes=*/gridDims, rewriteResult, gpuIdBuilder);
  if (!diag.succeeded())
    return diag;

  // If gridDims was not provided already, set it from the rewrite result.
  if (gridDims.empty()) {
    gridDims = rewriteResult.mappingSizes;
    while (gridDims.size() < 3)
      gridDims.push_back(1);
  }
  assert(gridDims.size() == 3 && "Need 3-D gridDims");

  // Replace ids of dimensions known to be 1 by 0 to simplify the IR.
  replaceUnitMappingIdsHelper<BlockIdOp>(rewriter, loc, parentBlock, zero,
                                         rewriteResult.mappingSizes);
  return DiagnosedSilenceableFailure::success();
}
static DiagnosedSilenceableFailure
findTopLevelForallOp(Operation *target, scf::ForallOp &topLevelForallOp,
                     TransformOpInterface transformOp) {
  auto walkResult = target->walk([&](scf::ForallOp forallOp) {
    if (forallOp->getParentOfType<scf::ForallOp>())
      return WalkResult::advance();
    if (topLevelForallOp)
      return WalkResult::interrupt();
    topLevelForallOp = forallOp;
    return WalkResult::advance();
  });

  if (walkResult.wasInterrupted() || !topLevelForallOp)
    return transformOp.emitSilenceableError()
           << "could not find a unique topLevel scf.forall";
  return DiagnosedSilenceableFailure::success();
}
DiagnosedSilenceableFailure transform::MapForallToBlocks::applyToOne(
    transform::TransformRewriter &rewriter, Operation *target,
    ApplyToEachResultList &results, transform::TransformState &state) {
  LaunchOp gpuLaunch = dyn_cast<LaunchOp>(target);
  auto transformOp = cast<TransformOpInterface>(getOperation());

  if (!getGenerateGpuLaunch() && !gpuLaunch) {
    DiagnosedSilenceableFailure diag =
        emitSilenceableError()
        << "Given target is not gpu.launch, set `generate_gpu_launch` "
           "attribute";
    diag.attachNote(target->getLoc()) << "when applied to this payload op";
    return diag;
  }

  scf::ForallOp topLevelForallOp;
  DiagnosedSilenceableFailure diag =
      findTopLevelForallOp(target, topLevelForallOp, transformOp);
  if (!diag.succeeded()) {
    diag.attachNote(target->getLoc()) << "when applied to this payload op";
    return diag;
  }
  assert(topLevelForallOp && "expect an scf.forall");

  SmallVector<int64_t> gridDims{getGridDims()};
  if (!getGenerateGpuLaunch() && gridDims.size() != 3)
    return transformOp.emitDefiniteFailure("transform require size-3 mapping");

  OpBuilder::InsertionGuard guard(rewriter);
  rewriter.setInsertionPoint(topLevelForallOp);

  // Generate the gpu.launch here and move the scf.forall inside it.
  if (getGenerateGpuLaunch()) {
    DiagnosedSilenceableFailure diag =
        createGpuLaunch(rewriter, target->getLoc(), transformOp, gpuLaunch);
    if (!diag.succeeded())
      return diag;

    rewriter.setInsertionPointToStart(&gpuLaunch.getBody().front());
    Operation *newForallOp = rewriter.clone(*topLevelForallOp);
    rewriter.eraseOp(topLevelForallOp);
    topLevelForallOp = cast<scf::ForallOp>(newForallOp);
  }

  // The BlockIdBuilder adapts to whatever is thrown at it.
  bool useLinearMapping = false;
  if (topLevelForallOp.getMapping())
    useLinearMapping = topLevelForallOp.usesLinearMapping();

  FailureOr<DeviceMaskingAttrInterface> maybeMaskingAttr =
      topLevelForallOp.getDeviceMaskingAttr();
  assert(succeeded(maybeMaskingAttr) && "unexpected failed maybeMaskingAttr");
  assert((!*maybeMaskingAttr || useLinearMapping) &&
         "masking requires linear mapping");

  GpuBlockIdBuilder gpuBlockIdBuilder(getContext(), useLinearMapping,
                                      *maybeMaskingAttr);

  diag = mlir::transform::gpu::mapForallToBlocksImpl(
      rewriter, transformOp, topLevelForallOp, gridDims, gpuBlockIdBuilder);
  if (!diag.succeeded())
    return diag;

  // Set the GPU launch configuration for the grid dims late, this is
  // subject to IR inspection.
  diag = alterGpuLaunch(rewriter, gpuLaunch,
                        cast<TransformOpInterface>(getOperation()), gridDims[0],
                        gridDims[1], gridDims[2]);

  results.push_back(gpuLaunch.getOperation());
  return diag;
}
LogicalResult transform::MapForallToBlocks::verify() {
  if (!getGridDims().empty() && getGridDims().size() != 3) {
    return emitOpError() << "transform requires empty or size-3 grid_dims";
  }
  return success();
}
static DiagnosedSilenceableFailure checkMappingSpec(
    std::optional<TransformOpInterface> transformOp, scf::ForallOp forallOp,
    ArrayRef<int64_t> numParallelIterations, ArrayRef<int64_t> blockOrGridSizes,
    int factor, bool useLinearMapping = false) {
  if (!useLinearMapping && blockOrGridSizes.front() % factor != 0) {
    return definiteFailureHelper(
        transformOp, forallOp,
        Twine("3-D mapping: size of threadIdx.x must be a multiple of ") +
            Twine(factor));
  }
  if (computeProduct(numParallelIterations) * factor >
      computeProduct(blockOrGridSizes)) {
    return definiteFailureHelper(
        transformOp, forallOp,
        Twine("the number of required parallel resources (blocks or "
              "threads) ") +
            Twine(computeProduct(numParallelIterations) * factor) +
            " overflows the number of available resources " +
            Twine(computeProduct(blockOrGridSizes)));
  }
  return DiagnosedSilenceableFailure::success();
}
static DiagnosedSilenceableFailure
getThreadIdBuilder(std::optional<TransformOpInterface> transformOp,
                   scf::ForallOp forallOp, ArrayRef<int64_t> blockSizes,
                   int64_t warpSize, GpuIdBuilder &gpuIdBuilder) {
  DeviceMappingAttrInterface mappingAttr =
      forallOp.getDeviceMappingAttrs().front();
  bool useLinearMapping = mappingAttr.isLinearMapping();

  // Sanity checks that may result in runtime verification errors.
  auto numParallelIterations =
      getConstantIntValues(forallOp.getMixedUpperBound());
  if (!forallOp.isNormalized() || !numParallelIterations.has_value()) {
    return definiteFailureHelper(
        transformOp, forallOp,
        "requires statically sized, normalized forall op");
  }
  int64_t factor = 1;
  if (isa<GPUWarpgroupMappingAttr>(mappingAttr)) {
    factor = GpuWarpgroupIdBuilder::kNumWarpsPerGroup * warpSize;
  } else if (isa<GPUWarpMappingAttr>(mappingAttr)) {
    factor = warpSize;
  }
  DiagnosedSilenceableFailure diag =
      checkMappingSpec(transformOp, forallOp, numParallelIterations.value(),
                       blockSizes, factor, useLinearMapping);
  if (!diag.succeeded())
    return diag;

  FailureOr<DeviceMaskingAttrInterface> maybeMaskingAttr =
      forallOp.getDeviceMaskingAttr();
  assert(succeeded(maybeMaskingAttr) && "unexpected failed maybeMaskingAttr");
  assert((!*maybeMaskingAttr || useLinearMapping) &&
         "masking requires linear mapping");

  // Create the builder matching the mapping attribute kind.
  gpuIdBuilder =
      TypeSwitch<DeviceMappingAttrInterface, GpuIdBuilder>(mappingAttr)
          .Case([&](GPUWarpgroupMappingAttr) {
            return GpuWarpgroupIdBuilder(forallOp->getContext(), warpSize,
                                         useLinearMapping, *maybeMaskingAttr);
          })
          .Case([&](GPUWarpMappingAttr) {
            return GpuWarpIdBuilder(forallOp->getContext(), warpSize,
                                    useLinearMapping, *maybeMaskingAttr);
          })
          .Case([&](GPUThreadMappingAttr) {
            return GpuThreadIdBuilder(forallOp->getContext(), useLinearMapping,
                                      *maybeMaskingAttr);
          })
          .Case([&](GPULaneMappingAttr) {
            return GpuLaneIdBuilder(forallOp->getContext(), warpSize,
                                    useLinearMapping, *maybeMaskingAttr);
          })
          .DefaultUnreachable("unknown mapping attribute");
  return DiagnosedSilenceableFailure::success();
}
static DiagnosedSilenceableFailure mapOneForallToThreadsImpl(
    RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
    scf::ForallOp forallOp, ArrayRef<int64_t> blockSizes, int64_t warpSize,
    bool syncAfterDistribute) {
  DiagnosedSilenceableFailure diag =
      verifyGpuMapping<ThreadMappingKind>(transformOp, forallOp);
  if (!diag.succeeded())
    return diag;

  GpuIdBuilder gpuIdBuilder;
  diag = getThreadIdBuilder(transformOp, forallOp, blockSizes, warpSize,
                            gpuIdBuilder);
  if (!diag.succeeded())
    return diag;

  Location loc = forallOp.getLoc();
  OpBuilder::InsertionGuard g(rewriter);
  // Insert after the forall to allow for a gpu.barrier after it is erased.
  rewriter.setInsertionPointAfter(forallOp);
  ForallRewriteResult rewriteResult;
  diag = rewriteOneForallCommonImpl(rewriter, transformOp, forallOp,
                                    blockSizes, rewriteResult, gpuIdBuilder);
  if (!diag.succeeded())
    return diag;
  if (syncAfterDistribute)
    BarrierOp::create(rewriter, loc);
  return DiagnosedSilenceableFailure::success();
}
DiagnosedSilenceableFailure mlir::transform::gpu::mapNestedForallToThreadsImpl(
    RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
    Operation *target, ArrayRef<int64_t> blockDims, int64_t warpSize,
    bool syncAfterDistribute) {
  LDBG() << "Start mapNestedForallToThreadsImpl";
  if (blockDims.size() != 3) {
    return definiteFailureHelper(transformOp, target,
                                 "requires size-3 thread mapping");
  }

  // Create an early zero index value for replacements.
  Location loc = target->getLoc();
  Value zero = arith::ConstantIndexOp::create(rewriter, loc, 0);
  DiagnosedSilenceableFailure diag = DiagnosedSilenceableFailure::success();
  WalkResult walkResult = target->walk([&](scf::ForallOp forallOp) {
    diag = mapOneForallToThreadsImpl(rewriter, transformOp, forallOp,
                                     blockDims, warpSize, syncAfterDistribute);
    if (diag.isDefiniteFailure())
      return WalkResult::interrupt();
    if (diag.succeeded())
      return WalkResult::skip();
    return WalkResult::advance();
  });
  if (walkResult.wasInterrupted())
    return diag;

  // Replace ids of dimensions known to be 1 by 0 to simplify the IR.
  replaceUnitMappingIdsHelper<ThreadIdOp>(rewriter, loc, target, zero,
                                          blockDims);
  return DiagnosedSilenceableFailure::success();
}
DiagnosedSilenceableFailure transform::MapNestedForallToThreads::applyToOne(
    transform::TransformRewriter &rewriter, Operation *target,
    ApplyToEachResultList &results, transform::TransformState &state) {
  LaunchOp gpuLaunch = dyn_cast<LaunchOp>(target);
  auto transformOp = cast<TransformOpInterface>(getOperation());

  // Basic high-level verifications.
  if (!gpuLaunch)
    return emitSilenceableError() << "Given target is not a gpu.launch";

  // Mapping to block ids.
  SmallVector<int64_t> blockDims{getBlockDims()};
  DiagnosedSilenceableFailure diag =
      checkGpuLimits(transformOp, std::nullopt, std::nullopt, std::nullopt,
                     blockDims[0], blockDims[1], blockDims[2]);
  if (diag.isSilenceableFailure()) {
    diag.attachNote(getLoc()) << getBlockDimsAttrName() << " is too large";
    return diag;
  }

  // Set the GPU launch configuration for the block dims early, this is not
  // subject to IR inspection.
  diag = alterGpuLaunch(rewriter, gpuLaunch, transformOp, std::nullopt,
                        std::nullopt, std::nullopt, blockDims[0], blockDims[1],
                        blockDims[2]);

  rewriter.setInsertionPointToStart(&gpuLaunch.getBody().front());
  diag = mapNestedForallToThreadsImpl(rewriter, transformOp, gpuLaunch,
                                      blockDims, getWarpSize(),
                                      getSyncAfterDistribute());

  results.push_back(gpuLaunch.getOperation());
  return diag;
}
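// Typical transform-IR usage (schematic; see the op's TableGen documentation
// for the authoritative assembly format):
//   transform.gpu.map_nested_forall_to_threads %launch
//     block_dims = [32, 4, 1]
//     : (!transform.any_op) -> !transform.any_op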
namespace {
class GPUTransformDialectExtension
    : public transform::TransformDialectExtension<
          GPUTransformDialectExtension> {
public:
  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(GPUTransformDialectExtension)

  GPUTransformDialectExtension() {
    declareGeneratedDialect<GPUDialect>();
    declareGeneratedDialect<amdgpu::AMDGPUDialect>();
    declareGeneratedDialect<arith::ArithDialect>();
    declareGeneratedDialect<scf::SCFDialect>();
    registerTransformOps<
#define GET_OP_LIST
#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.cpp.inc"
        >();
  }
};
} // namespace

void mlir::gpu::registerTransformDialectExtension(DialectRegistry &registry) {
  registry.addExtensions<GPUTransformDialectExtension>();
}

#define GET_OP_CLASSES
#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.cpp.inc"
p<< " : "<< getMemRefType()<< ", "<< getType();}static LogicalResult verifyVectorMemoryOp(Operation *op, MemRefType memrefType, VectorType vectorType) { if(memrefType.getElementType() !=vectorType.getElementType()) return op-> emitOpError("requires memref and vector types of the same elemental type")
Given a list of lists of parsed operands, populates uniqueOperands with unique operands.
*if copies could not be generated due to yet unimplemented cases *copyInPlacementStart and copyOutPlacementStart in copyPlacementBlock *specify the insertion points where the incoming copies and outgoing should be the output argument nBegin is set to its * replacement(set to `begin` if no invalidation happens). Since outgoing *copies could have been inserted at `end`
static std::string diag(const llvm::Value &value)
static void contract(RootOrderingGraph &graph, ArrayRef< Value > cycle, const DenseMap< Value, unsigned > &parentDepths, DenseMap< Value, Value > &actualSource, DenseMap< Value, Value > &actualTarget)
Contracts the specified cycle in the given graph in-place.
#define MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(CLASS_NAME)
Base type for affine expression.
Attributes are known-constant values of operations.
Block represents an ordered list of Operations.
OpListType::iterator iterator
OpListType & getOperations()
The result of a transform IR operation application.
static DiagnosedSilenceableFailure success()
Constructs a DiagnosedSilenceableFailure in the success state.
bool succeeded() const
Returns true if this is a success.
The DialectRegistry maps a dialect namespace to a constructor for the matching dialect.
void addExtensions()
Add the given extensions to the registry.
This is a utility class for mapping one set of IR entities to another.
auto lookup(T from) const
Lookup a mapped value within the map.
void map(Value from, Value to)
Inserts a new mapping for 'from' to 'to'.
Conversion from types to the LLVM IR dialect.
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
MLIRContext is the top-level object for a collection of MLIR operations.
RAII guard to reset the insertion point of the builder when destroyed.
Block::iterator getInsertionPoint() const
Returns the current insertion point of the builder.
Operation * clone(Operation &op, IRMapping &mapper)
Creates a deep copy of the specified operation, remapping any operands that use values outside of the...
void setInsertionPointToStart(Block *block)
Sets the insertion point to the start of the specified block.
void setInsertionPoint(Block *block, Block::iterator insertPoint)
Set the insertion point to the specified location.
void setInsertionPointAfter(Operation *op)
Sets the insertion point to the node after the specified operation, which will cause subsequent inser...
Operation is the basic unit of execution within MLIR.
result_type_range getResultTypes()
user_range getUsers()
Returns a range of all users.
unsigned getNumResults()
Return the number of results held by this operation.
This class coordinates the application of a rewrite on a set of IR, providing a way for clients to tr...
virtual void eraseOp(Operation *op)
This method erases an operation that is known to have no uses.
virtual void replaceAllUsesWith(Value from, Value to)
Find uses of from and replace them with to.
Instances of the Type class are uniqued, have an immutable identifier and an optional mutable compone...
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
A utility result that is used to signal how to proceed with an ongoing walk:
static WalkResult advance()
bool wasInterrupted() const
Returns true if the walk was interrupted.
static WalkResult interrupt()
static ConstantIndexOp create(OpBuilder &builder, Location location, int64_t value)
MMAMatrix represents a matrix held by a subgroup for matrix-matrix multiply accumulate operations.
Base class for extensions of the Transform dialect that supports injecting operations into the Transf...
bool hasElementwiseMappableTraits(Operation *op)
Together, Elementwise, Scalarizable, Vectorizable, and Tensorizable provide an easy way for scalar op...
void populateCommonGPUTypeAndAttributeConversions(TypeConverter &typeConverter)
Remap common GPU memory spaces (Workgroup, Private, etc) to LLVM address spaces.
void registerTransformDialectExtension(DialectRegistry ®istry)
void populateCommonGPUTypeAndAttributeConversions(TypeConverter &typeConverter)
Remap common GPU memory spaces (Workgroup, Private, etc) to LLVM address spaces.
uint64_t getN(LevelType lt)
uint64_t getM(LevelType lt)
bool isReductionIterator(Attribute attr)
Returns true if attr has "reduction" iterator type semantics.
bool isParallelIterator(Attribute attr)
Returns true if attr has "parallel" iterator type semantics.
Include the generated interface declarations.
void populateGpuToROCDLConversionPatterns(const LLVMTypeConverter &converter, RewritePatternSet &patterns, gpu::amd::Runtime runtime, amdgpu::Chipset chipset)
Collect a set of patterns to convert from the GPU dialect to ROCDL.
llvm::DenseSet< ValueT, ValueInfoT > DenseSet
void populateGpuRewritePatterns(RewritePatternSet &patterns)
Collect all patterns to rewrite ops within the GPU dialect.
Type convertMMAToLLVMType(gpu::MMAMatrixType type)
Return the LLVMStructureType corresponding to the MMAMatrixType type.
int64_t computeProduct(ArrayRef< int64_t > basis)
Self-explicit.
llvm::SetVector< T, Vector, Set, N > SetVector
DiagnosedDefiniteFailure emitDefiniteFailure(Location loc, const Twine &message={})
Emits a definite failure with the given message.
const FrozenRewritePatternSet & patterns
void populateGpuSubgroupReduceOpLoweringPattern(const LLVMTypeConverter &converter, RewritePatternSet &patterns, PatternBenefit benefit=1)
Populate GpuSubgroupReduce pattern to NVVM.
void populateGpuToNVVMConversionPatterns(const LLVMTypeConverter &converter, RewritePatternSet &patterns, PatternBenefit benefit=1)
Collect a set of patterns to convert from the GPU dialect to NVVM.
llvm::TypeSwitch< T, ResultT > TypeSwitch
void populateGpuPromoteShuffleToAMDGPUPatterns(RewritePatternSet &patterns, std::optional< amdgpu::Chipset > maybeChipset)
Tries to promote gpu.shuffles to specialized AMDGPU intrinsics.
std::optional< SmallVector< int64_t > > getConstantIntValues(ArrayRef< OpFoldResult > ofrs)
If all ofrs are constant integers or IntegerAttrs, return the integers.
SmallVector< Value > getValuesSortedByKey(ArrayRef< Attribute > keys, ArrayRef< Value > values, llvm::function_ref< bool(Attribute, Attribute)> compare)
Helper to sort values according to matching keys.
void populateGpuEliminateBarriersPatterns(RewritePatternSet &patterns)
Erase barriers that do not enforce conflicting memory side effects.
void populateGpuWMMAToNVVMConversionPatterns(const LLVMTypeConverter &converter, RewritePatternSet &patterns, PatternBenefit benefit=1)
Collect a set of patterns to convert WMMA ops from GPU dialect to NVVM.
Struct to return the result of the rewrite of a forall operation.
SmallVector< Value > mappingIds
SmallVector< int64_t > mappingSizes
static FailureOr< Chipset > parse(StringRef name)
Parses the chipset version string and returns the chipset on success, and failure otherwise.
Options that control the vector unrolling.