#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Support/DebugLog.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InterleavedRange.h"
#include "llvm/Support/LogicalResult.h"

#include <type_traits>

#define DEBUG_TYPE "gpu-transforms"
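// GPU extension of the transform dialect: the ops below populate GPU-to-NVVM
// and GPU-to-ROCDL conversion patterns, unroll vector ops to subgroup MMA
// native sizes, and map scf.forall loops onto GPU blocks, warps, and threads.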
void transform::ApplyGPUToNVVMConversionPatternsOp::populatePatterns(
    TypeConverter &typeConverter, RewritePatternSet &patterns) {
  auto &llvmTypeConverter = static_cast<LLVMTypeConverter &>(typeConverter);
  // Map gpu.address_space attributes to NVVM address spaces; private memory
  // lowers to the default (generic) address space.
  populateGpuMemorySpaceAttributeConversions(
      llvmTypeConverter, [](AddressSpace space) -> unsigned {
        switch (space) {
        case AddressSpace::Global:
          return static_cast<unsigned>(NVVM::NVVMMemorySpace::Global);
        case AddressSpace::Workgroup:
          return static_cast<unsigned>(NVVM::NVVMMemorySpace::Shared);
        case AddressSpace::Private:
          return static_cast<unsigned>(NVVM::NVVMMemorySpace::Generic);
        }
        llvm_unreachable("unknown address space enum value");
        return static_cast<unsigned>(NVVM::NVVMMemorySpace::Generic);
      });
  // gpu.mma_matrix types lower to LLVM struct types.
  llvmTypeConverter.addConversion(
      [&](MMAMatrixType type) -> Type { return convertMMAToLLVMType(type); });
  populateGpuToNVVMConversionPatterns(llvmTypeConverter, patterns);
}
LogicalResult
transform::ApplyGPUToNVVMConversionPatternsOp::verifyTypeConverter(
    transform::TypeConverterBuilderOpInterface builder) {
  if (builder.getTypeConverterType() != "LLVMTypeConverter")
    return emitOpError("expected LLVMTypeConverter");
  return success();
}
void transform::ApplyGPUWwmaToNVVMConversionPatternsOp::populatePatterns(
    TypeConverter &typeConverter, RewritePatternSet &patterns) {
  auto &llvmTypeConverter = static_cast<LLVMTypeConverter &>(typeConverter);
  populateGpuWMMAToNVVMConversionPatterns(llvmTypeConverter, patterns);
}

LogicalResult
transform::ApplyGPUWwmaToNVVMConversionPatternsOp::verifyTypeConverter(
    transform::TypeConverterBuilderOpInterface builder) {
  if (builder.getTypeConverterType() != "LLVMTypeConverter")
    return emitOpError("expected LLVMTypeConverter");
  return success();
}
void transform::ApplyGPUSubgroupReduceToNVVMConversionPatternsOp::
    populatePatterns(TypeConverter &typeConverter,
                     RewritePatternSet &patterns) {
  auto &llvmTypeConverter = static_cast<LLVMTypeConverter &>(typeConverter);
  populateGpuSubgroupReduceOpLoweringPattern(llvmTypeConverter, patterns);
}

LogicalResult transform::ApplyGPUSubgroupReduceToNVVMConversionPatternsOp::
    verifyTypeConverter(transform::TypeConverterBuilderOpInterface builder) {
  if (builder.getTypeConverterType() != "LLVMTypeConverter")
    return emitOpError("expected LLVMTypeConverter");
  return success();
}
void transform::ApplyGPUToROCDLConversionPatternsOp::populatePatterns(
    TypeConverter &typeConverter, RewritePatternSet &patterns) {
  auto &llvmTypeConverter = static_cast<LLVMTypeConverter &>(typeConverter);
  // Map gpu.address_space attributes to the ROCDL address spaces.
  populateGpuMemorySpaceAttributeConversions(
      llvmTypeConverter, [](AddressSpace space) {
        switch (space) {
        case AddressSpace::Global:
          return ROCDL::ROCDLDialect::kGlobalMemoryAddressSpace;
        case AddressSpace::Workgroup:
          return ROCDL::ROCDLDialect::kSharedMemoryAddressSpace;
        case AddressSpace::Private:
          return ROCDL::ROCDLDialect::kPrivateMemoryAddressSpace;
        }
        llvm_unreachable("unknown address space enum value");
      });
  FailureOr<amdgpu::Chipset> maybeChipset =
      amdgpu::Chipset::parse(getChipset());
  assert(llvm::succeeded(maybeChipset) && "expected valid chipset");
  populateGpuToROCDLConversionPatterns(llvmTypeConverter, patterns,
                                       gpu::amd::Runtime::HIP, *maybeChipset);
}
LogicalResult
transform::ApplyGPUToROCDLConversionPatternsOp::verifyTypeConverter(
    transform::TypeConverterBuilderOpInterface builder) {
  FailureOr<amdgpu::Chipset> maybeChipset =
      amdgpu::Chipset::parse(getChipset());
  if (failed(maybeChipset)) {
    return emitOpError("Invalid chipset name: " + getChipset());
  }
  if (builder.getTypeConverterType() != "LLVMTypeConverter")
    return emitOpError("expected LLVMTypeConverter");
  return success();
}
void transform::ApplyGPUPromoteShuffleToAMDGPUPatternsOp::populatePatterns(
    RewritePatternSet &patterns) {
  std::optional<StringRef> chipsetName = getChipset();
  std::optional<amdgpu::Chipset> maybeChipset;
  if (chipsetName) {
    FailureOr<amdgpu::Chipset> parsedChipset =
        amdgpu::Chipset::parse(*chipsetName);
    assert(llvm::succeeded(parsedChipset) && "expected valid chipset");
    maybeChipset = parsedChipset;
  }
  populateGpuPromoteShuffleToAMDGPUPatterns(patterns, maybeChipset);
}
/// Unrolling order that lets the contraction reuse its LHS register:
/// reduction dims first, then the parallel dims used by the LHS, then the rest.
static std::optional<SmallVector<int64_t>>
gpuMmaUnrollOrder(vector::ContractionOp contract) {
  SmallVector<int64_t> order;
  // For each reduction iterator:
      order.push_back(index);
  llvm::SmallDenseSet<int64_t> dims;
      dims.insert(cast<AffineDimExpr>(expr).getPosition());
  // For each LHS parallel iterator, then each remaining parallel iterator:
      order.push_back(index);
      order.push_back(index);
/// Returns the target vector size for the target operation based on the
/// native vector size specified with `m`, `n`, `k`.
static std::optional<SmallVector<int64_t>>
getSubgroupMmaNativeVectorSize(Operation *op, int64_t m, int64_t n, int64_t k) {
  if (auto contract = dyn_cast<vector::ContractionOp>(op)) {
    int64_t contractRank = contract.getIteratorTypes().size();
    if (contractRank < 3)
      return std::nullopt;
    SmallVector<int64_t> nativeSize(contractRank - 3, 1);
    nativeSize.append({m, n, k});
    return nativeSize;
  }
  if (auto writeOp = dyn_cast<vector::TransferWriteOp>(op)) {
    int64_t writeRank = writeOp.getVectorType().getRank();
    if (writeRank < 2)
      return std::nullopt;
    SmallVector<int64_t> nativeSize(writeRank - 2, 1);
    nativeSize.append({m, n});
    return nativeSize;
  }
  if (auto readOp = dyn_cast<vector::TransferReadOp>(op)) {
    // Transfer reads may need different shapes depending on their users;
    // match the shape consumed by extract_strided_slice users.
    VectorType sliceType;
    for (Operation *users : op->getUsers()) {
      auto extract = dyn_cast<vector::ExtractStridedSliceOp>(users);
      if (!extract)
        return std::nullopt;
      auto vecType = cast<VectorType>(extract.getResult().getType());
      if (sliceType && sliceType != vecType)
        return std::nullopt;
      sliceType = vecType;
    }
    return llvm::to_vector(sliceType.getShape());
  }
  if (OpTrait::hasElementwiseMappableTraits(op) && op->getNumResults() == 1) {
    if (auto vecType = dyn_cast<VectorType>(op->getResultTypes()[0])) {
      if (vecType.getRank() < 2)
        return std::nullopt;
      // Prefer the shape consumed by extract_strided_slice users, if any and
      // consistent; otherwise unroll the elementwise op to the {m, n} shape.
      VectorType sliceType;
      for (Operation *users : op->getUsers()) {
        auto extract = dyn_cast<vector::ExtractStridedSliceOp>(users);
        if (!extract)
          return std::nullopt;
        auto vecType = cast<VectorType>(extract.getResult().getType());
        if (sliceType && sliceType != vecType)
          return std::nullopt;
        sliceType = vecType;
      }
      if (sliceType)
        return llvm::to_vector(sliceType.getShape());
      SmallVector<int64_t> nativeSize(vecType.getRank() - 2, 1);
      nativeSize.append({m, n});
      return nativeSize;
    }
  }
  return std::nullopt;
}
void transform::ApplyUnrollVectorsSubgroupMmaOp::populatePatterns(
    RewritePatternSet &patterns) {
  // unrollOrder applies gpuMmaUnrollOrder to vector.contract ops only:
  auto contract = dyn_cast<vector::ContractionOp>(op);
  // nativeShapeFn returns the subgroup MMA {m, n, k} native size.
  vector::populateVectorUnrollPatterns(
      patterns, vector::UnrollVectorOptions()
                    .setNativeShapeFn(nativeShapeFn)
                    .setUnrollTraversalOrderFn(unrollOrder));
}
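// For example, with this op's native size set to m = n = k = 16, a
// vector.contract over 32x32x32 operands unrolls into eight 16x16x16 tiles,
// and the surrounding transfer/elementwise ops unroll to matching 16x16 tiles
// (see getSubgroupMmaNativeVectorSize above). Illustrative numbers only.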
/// Tag types used to select which mapping kinds a verifier accepts.
struct MappingKind {};
struct BlockMappingKind : MappingKind {};
struct ThreadMappingKind : MappingKind {};
static DiagnosedSilenceableFailure
definiteFailureHelper(std::optional<TransformOpInterface> transformOp,
                      Operation *target, const Twine &message) {
  if (transformOp.has_value())
    return transformOp->emitDefiniteFailure() << message;
  return emitDefiniteFailure(target, message);
}
/// Check that the mapping attributes on `forallOp` are of the desired kind.
template <typename MappingKindType>
static DiagnosedSilenceableFailure
checkMappingAttributeTypes(std::optional<TransformOpInterface> transformOp,
                           scf::ForallOp forallOp) {
  if (!forallOp.getMapping().has_value()) {
    return definiteFailureHelper(transformOp, forallOp,
                                 "scf.forall op requires a mapping attribute");
  }

  bool hasBlockMapping = llvm::any_of(forallOp.getMapping().value(),
                                      llvm::IsaPred<GPUBlockMappingAttr>);
  bool hasWarpgroupMapping = llvm::any_of(
      forallOp.getMapping().value(), llvm::IsaPred<GPUWarpgroupMappingAttr>);
  bool hasWarpMapping = llvm::any_of(forallOp.getMapping().value(),
                                     llvm::IsaPred<GPUWarpMappingAttr>);
  bool hasThreadMapping = llvm::any_of(forallOp.getMapping().value(),
                                       llvm::IsaPred<GPUThreadMappingAttr>);
  bool hasLaneMapping = llvm::any_of(forallOp.getMapping().value(),
                                     llvm::IsaPred<GPULaneMappingAttr>);
  int64_t countMappingTypes = 0;
  countMappingTypes += hasBlockMapping ? 1 : 0;
  countMappingTypes += hasWarpgroupMapping ? 1 : 0;
  countMappingTypes += hasWarpMapping ? 1 : 0;
  countMappingTypes += hasThreadMapping ? 1 : 0;
  countMappingTypes += hasLaneMapping ? 1 : 0;
  if (countMappingTypes > 1) {
    return definiteFailureHelper(
        transformOp, forallOp,
        "cannot mix different mapping types, use nesting");
  }
  if (std::is_same<MappingKindType, BlockMappingKind>::value &&
      !hasBlockMapping) {
    return definiteFailureHelper(
        transformOp, forallOp,
        "scf.forall op requires a mapping attribute of kind 'block'");
  }
  if (std::is_same<MappingKindType, ThreadMappingKind>::value &&
      !hasLaneMapping && !hasThreadMapping && !hasWarpMapping &&
      !hasWarpgroupMapping) {
    return definiteFailureHelper(transformOp, forallOp,
                                 "scf.forall op requires a mapping attribute "
                                 "of kind 'thread' or 'warp'");
  }

  llvm::DenseSet<Attribute> seen;
  for (Attribute map : forallOp.getMapping()->getValue()) {
    if (seen.contains(map)) {
      return definiteFailureHelper(
          transformOp, forallOp,
          "duplicate attribute, cannot map different loops "
          "to the same mapping id");
    }
    seen.insert(map);
  }

  auto isLinear = [](DeviceMappingAttrInterface attr) {
    return attr.isLinearMapping();
  };
  if (llvm::any_of(forallOp.getDeviceMappingAttrs(), isLinear) &&
      !llvm::all_of(forallOp.getDeviceMappingAttrs(), isLinear)) {
    return definiteFailureHelper(
        transformOp, forallOp,
        "cannot mix linear and non-linear mapping modes");
  }

  FailureOr<DeviceMaskingAttrInterface> maybeMaskingAttr =
      forallOp.getDeviceMaskingAttr();
  if (succeeded(maybeMaskingAttr) && *maybeMaskingAttr &&
      !forallOp.usesLinearMapping()) {
    return definiteFailureHelper(
        transformOp, forallOp,
        "device masking is only available in linear mapping mode");
  }

  return DiagnosedSilenceableFailure::success();
}
template <typename MappingKindType>
static DiagnosedSilenceableFailure
verifyGpuMapping(std::optional<TransformOpInterface> transformOp,
                 scf::ForallOp forallOp) {
  // Check the types of the mapping attributes.
  DiagnosedSilenceableFailure diag =
      checkMappingAttributeTypes<MappingKindType>(transformOp, forallOp);
  if (!diag.succeeded())
    return diag;

  // Perform the remaining, non-type verifications.
  if (!forallOp.isNormalized())
    return definiteFailureHelper(transformOp, forallOp,
                                 "unsupported non-normalized loops");
  if (forallOp.getNumResults() > 0)
    return definiteFailureHelper(transformOp, forallOp,
                                 "only bufferized scf.forall can be mapped");
  bool useLinearMapping = forallOp.usesLinearMapping();
  int64_t maxNumMappingsSupported =
      useLinearMapping ? (getMaxEnumValForMappingId() -
                          static_cast<uint64_t>(MappingId::DimZ))
                       : 3;
  if (forallOp.getRank() > maxNumMappingsSupported) {
    return definiteFailureHelper(transformOp, forallOp,
                                 "scf.forall with rank > ")
           << maxNumMappingsSupported
           << " does not lower for the specified mapping attribute type";
  }
  auto numParallelIterations =
      getConstantIntValues(forallOp.getMixedUpperBound());
  if (!forallOp.isNormalized() || !numParallelIterations.has_value()) {
    return definiteFailureHelper(
        transformOp, forallOp,
        "requires statically sized, normalized forall op");
  }
  return DiagnosedSilenceableFailure::success();
}
/// Replace ids of dimensions known to be of size 1 by `replacement` to
/// simplify the IR.
template <typename OpTy, typename OperationOrBlock>
static void
replaceUnitMappingIdsHelper(RewriterBase &rewriter, Location loc,
                            OperationOrBlock *parent, Value replacement,
                            ArrayRef<int64_t> availableMappingSizes) {
  parent->walk([&](OpTy idOp) {
    if (availableMappingSizes[static_cast<int64_t>(idOp.getDimension())] == 1)
      rewriter.replaceAllUsesWith(idOp.getResult(), replacement);
  });
}
static DiagnosedSilenceableFailure rewriteOneForallCommonImpl(
    RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
  LDBG() << "--start rewriteOneForallCommonImpl";
  auto numParallelIterations =
  assert(forallOp.isNormalized() && numParallelIterations.has_value() &&
         "requires statically sized, normalized forall op");
      forallOp.getDeviceMappingAttrs();
  forallMappingAttrs.insert_range(forallMappingAttrsVec);
    return cast<DeviceMappingAttrInterface>(a).getMappingId() <
           cast<DeviceMappingAttrInterface>(b).getMappingId();
  DeviceMappingAttrInterface maxMapping = cast<DeviceMappingAttrInterface>(
      *llvm::max_element(forallMappingAttrs, comparator));
  DeviceMappingAttrInterface maxLinearMapping;
  if (maxMapping.isLinearMapping())
    maxLinearMapping = maxMapping;
    if (maxLinearMapping && comparator(maxLinearMapping, attr))
    if (!forallMappingAttrs.insert(attr))
    tmpMappingSizes.push_back(1);
  LDBG() << "----tmpMappingSizes extracted from scf.forall op: "
         << llvm::interleaved(tmpMappingSizes);
      forallMappingAttrs.getArrayRef(), tmpMappingSizes, comparator);
  LDBG() << "----forallMappingSizes: " << llvm::interleaved(forallMappingSizes);
  LDBG() << "----forallMappingAttrs: " << llvm::interleaved(forallMappingAttrs);
  bool originalBasisWasProvided = !originalBasis.empty();
  if (!originalBasisWasProvided) {
    LDBG() << "----originalBasis was not provided, deriving it and there will "
    originalBasis = forallMappingSizes;
    while (originalBasis.size() < 3)
      originalBasis.push_back(1);
    LDBG() << "----originalBasis was provided, using it, there will be "
  LDBG() << "------originalBasis: " << llvm::interleaved(originalBasis);
      gpuIdBuilder.idBuilder(rewriter, loc, forallMappingSizes, originalBasis);
  if (!builderResult.errorMsg.empty())
  LDBG() << builderResult;
  for (auto [iv, dim] : llvm::zip_equal(
           forallOp.getInductionVars(),
           forallMappingAttrs.getArrayRef().take_front(forallOp.getRank()))) {
    auto mappingAttr = cast<DeviceMappingAttrInterface>(dim);
    Value peIdOp = mappingIdOps[mappingAttr.getRelativeIndex()];
    LDBG() << "----map: " << iv << " to " << peIdOp;
  if (originalBasisWasProvided) {
    predicate = predicate ? arith::AndIOp::create(rewriter, loc, predicate,
  rewriter.eraseOp(forallOp.getTerminator());
    auto ifOp = scf::IfOp::create(rewriter, loc, predicate,
    targetBlock = ifOp.thenBlock();
    insertionPoint = ifOp.thenBlock()->begin();
    targetBlock = forallOp->getBlock();
  Block &sourceBlock = forallOp.getRegion().front();
  for (Value loopIndex : forallOp.getInductionVars()) {
  LDBG() << "----result forallMappingSizes: "
         << llvm::interleaved(forallMappingSizes);
  LDBG() << "----result mappingIdOps: " << llvm::interleaved(mappingIdOps);
DiagnosedSilenceableFailure mlir::transform::gpu::mapForallToBlocksImpl(
    RewriterBase &rewriter, TransformOpInterface transformOp,
  LDBG() << "Start mapForallToBlocksImpl";
      verifyGpuMapping<BlockMappingKind>(transformOp, forallOp);
  if (!diag.succeeded())
  Block *parentBlock = forallOp->getBlock();
      rewriter, transformOp, forallOp, gridDims, rewriteResult, gpuIdBuilder);
  if (!diag.succeeded())
  if (gridDims.empty()) {
    while (gridDims.size() < 3)
      gridDims.push_back(1);
  assert(gridDims.size() == 3 && "Need 3-D gridDims");
  replaceUnitMappingIdsHelper<BlockDimOp>(rewriter, loc, parentBlock, zero,
static DiagnosedSilenceableFailure
findTopLevelForallOp(Operation *target, scf::ForallOp &topLevelForallOp,
                     TransformOpInterface transformOp) {
  auto walkResult = target->walk([&](scf::ForallOp forallOp) {
    if (forallOp->getParentOfType<scf::ForallOp>())
      return WalkResult::advance();
    if (topLevelForallOp)
      return WalkResult::interrupt();
    topLevelForallOp = forallOp;
    return WalkResult::advance();
  });

  if (walkResult.wasInterrupted() || !topLevelForallOp)
    return transformOp.emitSilenceableError()
           << "could not find a unique topLevel scf.forall";
  return DiagnosedSilenceableFailure::success();
}
DiagnosedSilenceableFailure transform::MapForallToBlocks::applyToOne(
  LaunchOp gpuLaunch = dyn_cast<LaunchOp>(target);
  auto transformOp = cast<TransformOpInterface>(getOperation());
  if (!getGenerateGpuLaunch() && !gpuLaunch) {
        emitSilenceableError()
        << "Given target is not gpu.launch, set `generate_gpu_launch` "
    diag.attachNote(target->getLoc()) << "when applied to this payload op";
  scf::ForallOp topLevelForallOp;
      target, topLevelForallOp, transformOp);
  if (!diag.succeeded()) {
    diag.attachNote(target->getLoc()) << "when applied to this payload op";
  assert(topLevelForallOp && "expect an scf.forall");
  if (!getGenerateGpuLaunch() && gridDims.size() != 3)
    return transformOp.emitDefiniteFailure("transform require size-3 mapping");
  // Generate the gpu.launch here and move the scf.forall inside it.
  if (getGenerateGpuLaunch()) {
    if (!diag.succeeded())
    rewriter.eraseOp(topLevelForallOp);
    topLevelForallOp = cast<scf::ForallOp>(newForallOp);
  bool useLinearMapping = false;
  if (topLevelForallOp.getMapping())
    useLinearMapping = topLevelForallOp.usesLinearMapping();
  FailureOr<DeviceMaskingAttrInterface> maybeMaskingAttr =
      topLevelForallOp.getDeviceMaskingAttr();
  assert(succeeded(maybeMaskingAttr) && "unexpected failed maybeMaskingAttr");
  assert((!*maybeMaskingAttr || useLinearMapping) &&
         "masking requires linear mapping");
      rewriter, transformOp, topLevelForallOp, gridDims, gpuBlockIdBuilder);
  if (!diag.succeeded())
      cast<TransformOpInterface>(getOperation()), gridDims[0],
      gridDims[1], gridDims[2]);
LogicalResult transform::MapForallToBlocks::verify() {
  if (!getGridDims().empty() && getGridDims().size() != 3) {
    return emitOpError() << "transform requires empty or size-3 grid_dims";
  }
  return success();
}
/// Check that the block or grid sizes can accommodate the forall iterations.
static DiagnosedSilenceableFailure checkMappingSpec(
    std::optional<TransformOpInterface> transformOp, scf::ForallOp forallOp,
    ArrayRef<int64_t> numParallelIterations, ArrayRef<int64_t> blockOrGridSizes,
    int factor, bool useLinearMapping = false) {
  if (!useLinearMapping && blockOrGridSizes.front() % factor != 0) {
    auto diag = definiteFailureHelper(
        transformOp, forallOp,
        Twine("3-D mapping: size of threadIdx.x must be a multiple of ") +
            Twine(factor));
    return diag;
  }
  if (computeProduct(numParallelIterations) * factor >
      computeProduct(blockOrGridSizes)) {
    auto diag = definiteFailureHelper(
        transformOp, forallOp,
        Twine("the number of required parallel resources (blocks or "
              "threads) ") +
            Twine(computeProduct(numParallelIterations) * factor) +
            " overflows the number of available resources " +
            Twine(computeProduct(blockOrGridSizes)));
    return diag;
  }
  return DiagnosedSilenceableFailure::success();
}
/// Build the GpuIdBuilder to use for the given thread-level mapping attribute.
static DiagnosedSilenceableFailure getThreadIdBuilder(
    std::optional<TransformOpInterface> transformOp, scf::ForallOp forallOp,
    ArrayRef<int64_t> blockSizes, int64_t warpSize, GpuIdBuilder &gpuIdBuilder) {
  DeviceMappingAttrInterface mappingAttr =
      forallOp.getDeviceMappingAttrs().front();
  bool useLinearMapping = mappingAttr.isLinearMapping();

  auto numParallelIterations =
  if (!forallOp.isNormalized() || !numParallelIterations.has_value()) {
        transformOp, forallOp,
        "requires statically sized, normalized forall op");
  // Determine how many hardware threads each mapped id consumes.
  if (isa<GPUWarpgroupMappingAttr>(mappingAttr)) {
    factor = GpuWarpgroupIdBuilder::kNumWarpsPerGroup * warpSize;
  } else if (isa<GPUWarpMappingAttr>(mappingAttr)) {
                       blockSizes, factor, useLinearMapping);
  if (!diag.succeeded())
  FailureOr<DeviceMaskingAttrInterface> maybeMaskingAttr =
      forallOp.getDeviceMaskingAttr();
  assert(succeeded(maybeMaskingAttr) && "unexpected failed maybeMaskingAttr");
  assert((!*maybeMaskingAttr || useLinearMapping) &&
         "masking requires linear mapping");
          .Case([&](GPUWarpgroupMappingAttr) {
          .Case([&](GPUWarpMappingAttr) {
          .Case([&](GPUThreadMappingAttr) {
          .Case([&](GPULaneMappingAttr) {
          .Default([&](DeviceMappingAttrInterface) -> GpuIdBuilder {
            llvm_unreachable("unknown mapping attribute");
DiagnosedSilenceableFailure mlir::transform::gpu::mapOneForallToThreadsImpl(
    RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
    bool syncAfterDistribute) {
      verifyGpuMapping<ThreadMappingKind>(transformOp, forallOp);
  if (!diag.succeeded())
      transformOp, forallOp, blockSizes, warpSize, gpuIdBuilder);
  if (!diag.succeeded())
      rewriter, transformOp, forallOp, blockSizes, rewriteResult, gpuIdBuilder);
  if (!diag.succeeded())
  if (syncAfterDistribute)
    BarrierOp::create(rewriter, loc);
DiagnosedSilenceableFailure mlir::transform::gpu::mapNestedForallToThreadsImpl(
    RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
    bool syncAfterDistribute) {
  LDBG() << "Start mapNestedForallToThreadsImpl";
  if (blockDims.size() != 3) {
        "requires size-3 thread mapping");
  // Map each nested scf.forall to threads; bail out on definite failures.
  WalkResult walkResult = target->walk([&](scf::ForallOp forallOp) {
        rewriter, transformOp, forallOp, blockDims, warpSize,
        syncAfterDistribute);
    if (diag.isDefiniteFailure())
    if (diag.succeeded())
  // Replace ids of dimensions known to be of size 1 by 0 to simplify the IR.
  replaceUnitMappingIdsHelper<ThreadIdOp>(rewriter, loc, target, zero,
DiagnosedSilenceableFailure transform::MapNestedForallToThreads::applyToOne(
  LaunchOp gpuLaunch = dyn_cast<LaunchOp>(target);
  auto transformOp = cast<TransformOpInterface>(getOperation());
    return emitSilenceableError() << "Given target is not a gpu.launch";
  // Check the block dimensions against the device limits before rewriting.
      checkGpuLimits(transformOp, std::nullopt, std::nullopt, std::nullopt,
                     blockDims[0], blockDims[1], blockDims[2]);
  if (diag.isSilenceableFailure()) {
    diag.attachNote(getLoc()) << getBlockDimsAttrName() << " is too large";
                std::nullopt, std::nullopt, blockDims[0], blockDims[1],
      getWarpSize(), getSyncAfterDistribute());
  results.push_back(gpuLaunch.getOperation());
  return diag;
}
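// Example use from the transform dialect (a sketch; exact syntax may differ
// across MLIR versions):
//
//   transform.gpu.map_nested_forall_to_threads %gpu_launch
//       block_dims = [32, 4, 1]
//       : (!transform.any_op) -> !transform.any_op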
class GPUTransformDialectExtension
    : public transform::TransformDialectExtension<
          GPUTransformDialectExtension> {
public:
  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(GPUTransformDialectExtension)
  GPUTransformDialectExtension() {
    declareGeneratedDialect<GPUDialect>();
    declareGeneratedDialect<amdgpu::AMDGPUDialect>();
    declareGeneratedDialect<arith::ArithDialect>();
    declareGeneratedDialect<scf::SCFDialect>();
    registerTransformOps<
#define GET_OP_LIST
#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.cpp.inc"
        >();
  }
};

void mlir::gpu::registerTransformDialectExtension(DialectRegistry &registry) {
  registry.addExtensions<GPUTransformDialectExtension>();
}
#define GET_OP_CLASSES
#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.cpp.inc"
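// Usage sketch (illustrative only): clients typically add this extension to a
// DialectRegistry before building the MLIRContext, e.g.
//
//   DialectRegistry registry;
//   mlir::gpu::registerTransformDialectExtension(registry);
//   MLIRContext context(registry);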