#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Support/DebugLog.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InterleavedRange.h"
#include "llvm/Support/LogicalResult.h"

#include <type_traits>

#define DEBUG_TYPE "gpu-transforms"
void transform::ApplyGPUToNVVMConversionPatternsOp::populatePatterns(
    TypeConverter &typeConverter, RewritePatternSet &patterns) {
  auto &llvmTypeConverter = static_cast<LLVMTypeConverter &>(typeConverter);
  populateGpuMemorySpaceAttributeConversions(
      llvmTypeConverter, [](AddressSpace space) -> unsigned {
        switch (space) {
        case AddressSpace::Global:
          return static_cast<unsigned>(
              NVVM::NVVMMemorySpace::kGlobalMemorySpace);
        case AddressSpace::Workgroup:
          return static_cast<unsigned>(
              NVVM::NVVMMemorySpace::kSharedMemorySpace);
        case AddressSpace::Private:
          return 0;
        }
        llvm_unreachable("unknown address space enum value");
      });
  llvmTypeConverter.addConversion(
      [&](MMAMatrixType type) -> Type { return convertMMAToLLVMType(type); });
  populateGpuToNVVMConversionPatterns(llvmTypeConverter, patterns);
}
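// Illustrative note (editorial addition, not part of the original source):
// with the address-space mapping above, workgroup memory lowers to NVVM's
// shared memory address space and private memory keeps the default address
// space used by plain alloca, e.g. (assuming the standard NVVM numbering
// where shared memory is address space 3):
//
//   memref<16xf32, #gpu.address_space<workgroup>>  -->  pointer in addrspace(3)
//   memref<16xf32, #gpu.address_space<private>>    -->  pointer in addrspace(0)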
LogicalResult
transform::ApplyGPUToNVVMConversionPatternsOp::verifyTypeConverter(
    transform::TypeConverterBuilderOpInterface builder) {
  if (builder.getTypeConverterType() != "LLVMTypeConverter")
    return emitOpError("expected LLVMTypeConverter");
  return success();
}
void transform::ApplyGPUWwmaToNVVMConversionPatternsOp::populatePatterns(
    TypeConverter &typeConverter, RewritePatternSet &patterns) {
  auto &llvmTypeConverter = static_cast<LLVMTypeConverter &>(typeConverter);
  populateGpuWMMAToNVVMConversionPatterns(llvmTypeConverter, patterns);
}

LogicalResult
transform::ApplyGPUWwmaToNVVMConversionPatternsOp::verifyTypeConverter(
    transform::TypeConverterBuilderOpInterface builder) {
  if (builder.getTypeConverterType() != "LLVMTypeConverter")
    return emitOpError("expected LLVMTypeConverter");
  return success();
}

void transform::ApplyGPUSubgroupReduceToNVVMConversionPatternsOp::
    populatePatterns(TypeConverter &typeConverter,
                     RewritePatternSet &patterns) {
  auto &llvmTypeConverter = static_cast<LLVMTypeConverter &>(typeConverter);
  populateGpuSubgroupReduceOpLoweringPattern(llvmTypeConverter, patterns);
}

LogicalResult transform::ApplyGPUSubgroupReduceToNVVMConversionPatternsOp::
    verifyTypeConverter(transform::TypeConverterBuilderOpInterface builder) {
  if (builder.getTypeConverterType() != "LLVMTypeConverter")
    return emitOpError("expected LLVMTypeConverter");
  return success();
}
void transform::ApplyGPUToROCDLConversionPatternsOp::populatePatterns(
    TypeConverter &typeConverter, RewritePatternSet &patterns) {
  auto &llvmTypeConverter = static_cast<LLVMTypeConverter &>(typeConverter);
  populateGpuMemorySpaceAttributeConversions(
      llvmTypeConverter, [](AddressSpace space) {
        switch (space) {
        case AddressSpace::Global:
          return ROCDL::ROCDLDialect::kGlobalMemoryAddressSpace;
        case AddressSpace::Workgroup:
          return ROCDL::ROCDLDialect::kSharedMemoryAddressSpace;
        case AddressSpace::Private:
          return ROCDL::ROCDLDialect::kPrivateMemoryAddressSpace;
        }
        llvm_unreachable("unknown address space enum value");
      });
  FailureOr<amdgpu::Chipset> maybeChipset =
      amdgpu::Chipset::parse(getChipset());
  assert(llvm::succeeded(maybeChipset) && "expected valid chipset");
  populateGpuToROCDLConversionPatterns(llvmTypeConverter, patterns,
                                       gpu::amd::Runtime::HIP, *maybeChipset);
}
LogicalResult
transform::ApplyGPUToROCDLConversionPatternsOp::verifyTypeConverter(
    transform::TypeConverterBuilderOpInterface builder) {
  FailureOr<amdgpu::Chipset> maybeChipset =
      amdgpu::Chipset::parse(getChipset());
  if (failed(maybeChipset)) {
    return emitOpError("Invalid chipset name: " + getChipset());
  }
  if (builder.getTypeConverterType() != "LLVMTypeConverter")
    return emitOpError("expected LLVMTypeConverter");
  return success();
}
void transform::ApplyGPUPromoteShuffleToAMDGPUPatternsOp::populatePatterns(
    RewritePatternSet &patterns) {
  std::optional<StringRef> chipsetName = getChipset();
  std::optional<amdgpu::Chipset> maybeChipset;
  if (chipsetName) {
    FailureOr<amdgpu::Chipset> parsedChipset =
        amdgpu::Chipset::parse(*chipsetName);
    assert(llvm::succeeded(parsedChipset) && "expected valid chipset");
    maybeChipset = parsedChipset;
  }
  populateGpuPromoteShuffleToAMDGPUPatterns(patterns, maybeChipset);
}
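// Illustrative note on the two AMDGPU-related ops above (editorial addition,
// not part of the original source): ROCDL follows the AMDGPU address-space
// numbering, so the mapping in the ROCDL conversion lowers roughly as
//
//   #gpu.address_space<global>    -->  addrspace(1)
//   #gpu.address_space<workgroup> -->  addrspace(3)  (LDS)
//   #gpu.address_space<private>   -->  addrspace(5)  (scratch)
//
// and chipset strings such as "gfx90a" must parse via amdgpu::Chipset::parse;
// unknown names are rejected by verifyTypeConverter before any pattern runs.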
/// Pick an unrolling order that lets subgroup MMA ops reuse the LHS register:
/// reduction dims first, then parallel dims used by the LHS indexing map,
/// then the remaining parallel dims.
static std::optional<SmallVector<int64_t>>
gpuMmaUnrollOrder(vector::ContractionOp contract) {
  SmallVector<int64_t> order;
  for (auto [index, iter] : llvm::enumerate(contract.getIteratorTypes()))
    if (vector::isReductionIterator(iter))
      order.push_back(index);

  llvm::SmallDenseSet<int64_t> dims;
  for (AffineExpr expr : contract.getIndexingMapsArray()[0].getResults())
    dims.insert(cast<AffineDimExpr>(expr).getPosition());

  for (auto [index, iter] : llvm::enumerate(contract.getIteratorTypes()))
    if (vector::isParallelIterator(iter) && dims.count(index))
      order.push_back(index);
  for (auto [index, iter] : llvm::enumerate(contract.getIteratorTypes()))
    if (vector::isParallelIterator(iter) && !dims.count(index))
      order.push_back(index);
  return order;
}

/// Return the native vector shape to unroll `op` to, given the subgroup MMA
/// sizes `m`, `n` and `k`, or std::nullopt if `op` is not handled.
static std::optional<SmallVector<int64_t>>
getSubgroupMmaNativeVectorSize(Operation *op, int64_t m, int64_t n,
                               int64_t k) {
  if (auto contract = dyn_cast<vector::ContractionOp>(op)) {
    int64_t contractRank = contract.getIteratorTypes().size();
    if (contractRank < 3)
      return std::nullopt;
    SmallVector<int64_t> nativeSize(contractRank - 3, 1);
    nativeSize.append({m, n, k});
    return nativeSize;
  }
  if (auto writeOp = dyn_cast<vector::TransferWriteOp>(op)) {
    int64_t writeRank = writeOp.getVectorType().getRank();
    if (writeRank < 2)
      return std::nullopt;
    SmallVector<int64_t> nativeSize(writeRank - 2, 1);
    nativeSize.append({m, n});
    return nativeSize;
  }
  if (auto readOp = dyn_cast<vector::TransferReadOp>(op)) {
    // Match the slice shape expected by extract_strided_slice users.
    VectorType sliceType;
    for (Operation *users : op->getUsers()) {
      auto extract = dyn_cast<vector::ExtractStridedSliceOp>(users);
      if (!extract)
        return std::nullopt;
      auto vecType = cast<VectorType>(extract.getResult().getType());
      if (sliceType && sliceType != vecType)
        return std::nullopt;
      sliceType = vecType;
    }
    return llvm::to_vector(sliceType.getShape());
  }
  if (OpTrait::hasElementwiseMappableTraits(op) && op->getNumResults() == 1) {
    if (auto vecType = dyn_cast<VectorType>(op->getResultTypes()[0])) {
      if (vecType.getRank() < 2)
        return std::nullopt;
      // Prefer the slice shape of extract_strided_slice users if present.
      VectorType sliceType;
      for (Operation *users : op->getUsers()) {
        auto extract = dyn_cast<vector::ExtractStridedSliceOp>(users);
        if (!extract)
          return std::nullopt;
        auto vecType = cast<VectorType>(extract.getResult().getType());
        if (sliceType && sliceType != vecType)
          return std::nullopt;
        sliceType = vecType;
      }
      if (sliceType)
        return llvm::to_vector(sliceType.getShape());
      // Otherwise unroll trailing elementwise ops to the {m, n} output shape.
      SmallVector<int64_t> nativeSize(vecType.getRank() - 2, 1);
      nativeSize.append({m, n});
      return nativeSize;
    }
  }
  return std::nullopt;
}
void transform::ApplyUnrollVectorsSubgroupMmaOp::populatePatterns(
    RewritePatternSet &patterns) {
  auto unrollOrder = [](Operation *op) -> std::optional<SmallVector<int64_t>> {
    auto contract = dyn_cast<vector::ContractionOp>(op);
    if (!contract)
      return std::nullopt;
    return gpuMmaUnrollOrder(contract);
  };
  int64_t m = getM(), n = getN(), k = getK();
  auto nativeShapeFn = [=](Operation *op) {
    return getSubgroupMmaNativeVectorSize(op, m, n, k);
  };
  vector::populateVectorUnrollPatterns(
      patterns, vector::UnrollVectorOptions()
                    .setNativeShapeFn(nativeShapeFn)
                    .setUnrollTraversalOrderFn(unrollOrder));
}
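// Illustrative example (hypothetical sizes, editorial addition): with
// getM() = 16, getN() = 16 and getK() = 8, a vector.contract whose iterator
// types are [parallel, parallel, reduction] receives the native shape
// {16, 16, 8}, its vector.transfer_write users receive {16, 16}, and the
// traversal order visits the reduction dimension first so that the unrolled
// MMA ops can reuse the LHS fragment.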
/// Tag types selecting which kind of mapping attribute is verified below.
struct MappingKind {};
struct BlockMappingKind : MappingKind {};
struct ThreadMappingKind : MappingKind {};

static DiagnosedSilenceableFailure
definiteFailureHelper(std::optional<TransformOpInterface> transformOp,
                      Operation *target, const Twine &message) {
  if (transformOp.has_value())
    return transformOp->emitDefiniteFailure() << message;
  return emitDefiniteFailure(target, message);
}
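// Editorial note: in the transform dialect, a definite failure aborts the
// whole transform script, whereas a silenceable failure only reports that
// this particular application did not match. The helpers in this file use
// definite failures for malformed input IR and silenceable failures for
// benign mismatches (e.g. no unique top-level scf.forall).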
template <typename MappingKindType>
static DiagnosedSilenceableFailure
checkMappingAttributeTypes(std::optional<TransformOpInterface> transformOp,
                           scf::ForallOp forallOp) {
  if (!forallOp.getMapping().has_value()) {
    return definiteFailureHelper(transformOp, forallOp,
                                 "scf.forall op requires a mapping attribute");
  }

  bool hasBlockMapping = llvm::any_of(forallOp.getMapping().value(),
                                      llvm::IsaPred<GPUBlockMappingAttr>);
  bool hasWarpgroupMapping = llvm::any_of(
      forallOp.getMapping().value(), llvm::IsaPred<GPUWarpgroupMappingAttr>);
  bool hasWarpMapping = llvm::any_of(forallOp.getMapping().value(),
                                     llvm::IsaPred<GPUWarpMappingAttr>);
  bool hasThreadMapping = llvm::any_of(forallOp.getMapping().value(),
                                       llvm::IsaPred<GPUThreadMappingAttr>);
  bool hasLaneMapping = llvm::any_of(forallOp.getMapping().value(),
                                     llvm::IsaPred<GPULaneMappingAttr>);
  int64_t countMappingTypes = 0;
  countMappingTypes += hasBlockMapping ? 1 : 0;
  countMappingTypes += hasWarpgroupMapping ? 1 : 0;
  countMappingTypes += hasWarpMapping ? 1 : 0;
  countMappingTypes += hasThreadMapping ? 1 : 0;
  countMappingTypes += hasLaneMapping ? 1 : 0;
  if (countMappingTypes > 1) {
    return definiteFailureHelper(
        transformOp, forallOp,
        "cannot mix different mapping types, use nesting");
  }
  if (std::is_same<MappingKindType, BlockMappingKind>::value &&
      !hasBlockMapping) {
    return definiteFailureHelper(
        transformOp, forallOp,
        "scf.forall op requires a mapping attribute of kind 'block'");
  }
  if (std::is_same<MappingKindType, ThreadMappingKind>::value &&
      !hasLaneMapping && !hasThreadMapping && !hasWarpMapping &&
      !hasWarpgroupMapping) {
    return definiteFailureHelper(transformOp, forallOp,
                                 "scf.forall op requires a mapping attribute "
                                 "of kind 'thread' or 'warp'");
  }

  DenseSet<Attribute> seen;
  for (Attribute map : forallOp.getMapping()->getValue()) {
    if (seen.contains(map)) {
      return definiteFailureHelper(
          transformOp, forallOp,
          "duplicate attribute, cannot map different loops "
          "to the same mapping id");
    }
    seen.insert(map);
  }

  auto isLinear = [](DeviceMappingAttrInterface attr) {
    return attr.isLinearMapping();
  };
  if (llvm::any_of(forallOp.getDeviceMappingAttrs(), isLinear) &&
      !llvm::all_of(forallOp.getDeviceMappingAttrs(), isLinear)) {
    return definiteFailureHelper(
        transformOp, forallOp,
        "cannot mix linear and non-linear mapping modes");
  }

  FailureOr<DeviceMaskingAttrInterface> maybeMaskingAttr =
      forallOp.getDeviceMaskingAttr();
  if (succeeded(maybeMaskingAttr) && *maybeMaskingAttr &&
      !forallOp.usesLinearMapping()) {
    return definiteFailureHelper(
        transformOp, forallOp,
        "device masking is only available in linear mapping mode");
  }

  return DiagnosedSilenceableFailure::success();
}
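// Illustrative example (editorial addition, shown only as a comment) of what
// this check accepts and rejects:
//
//   // OK: a single mapping kind.
//   scf.forall (%i, %j) in (8, 16) { ... }
//       {mapping = [#gpu.block<y>, #gpu.block<x>]}
//
//   // Rejected: mixes 'block' and 'thread' kinds; use two nested scf.forall
//   // ops instead, the outer mapped to blocks and the inner to threads.
//   scf.forall (%i, %j) in (8, 16) { ... }
//       {mapping = [#gpu.block<x>, #gpu.thread<x>]}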
template <typename MappingKindType>
static DiagnosedSilenceableFailure
verifyGpuMapping(std::optional<TransformOpInterface> transformOp,
                 scf::ForallOp forallOp) {
  // Check the types of the mapping attributes match.
  DiagnosedSilenceableFailure typeRes =
      checkMappingAttributeTypes<MappingKindType>(transformOp, forallOp);
  if (!typeRes.succeeded())
    return typeRes;

  // Perform other non-type verifications.
  if (!forallOp.isNormalized())
    return definiteFailureHelper(transformOp, forallOp,
                                 "unsupported non-normalized loops");
  if (forallOp.getNumResults() > 0)
    return definiteFailureHelper(transformOp, forallOp,
                                 "only bufferized scf.forall can be mapped");
  bool useLinearMapping = forallOp.usesLinearMapping();
  int64_t maxNumMappingsSupported =
      useLinearMapping ? (getMaxEnumValForMappingId() -
                          static_cast<uint64_t>(MappingId::DimZ))
                       : 3;
  if (forallOp.getRank() > maxNumMappingsSupported) {
    return definiteFailureHelper(transformOp, forallOp,
                                 "scf.forall with rank > ")
           << maxNumMappingsSupported
           << " does not lower for the specified mapping attribute type";
  }
  auto numParallelIterations =
      getConstantIntValues(forallOp.getMixedUpperBound());
  if (!forallOp.isNormalized() || !numParallelIterations.has_value()) {
    return definiteFailureHelper(
        transformOp, forallOp,
        "requires statically sized, normalized forall op");
  }
  return DiagnosedSilenceableFailure::success();
}
/// Replace all uses of `OpTy` id ops whose dimension is known to have size 1
/// with `replacement` (typically a constant 0), to simplify the IR.
template <typename OpTy, typename OperationOrBlock>
static void
replaceUnitMappingIdsHelper(RewriterBase &rewriter, Location loc,
                            OperationOrBlock *parent, Value replacement,
                            ArrayRef<int64_t> availableMappingSizes) {
  parent->walk([&](OpTy idOp) {
    if (availableMappingSizes[static_cast<int64_t>(idOp.getDimension())] == 1)
      rewriter.replaceAllUsesWith(idOp.getResult(), replacement);
  });
}
static DiagnosedSilenceableFailure rewriteOneForallCommonImpl(
    RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
    scf::ForallOp forallOp, ArrayRef<int64_t> availableMappingSizes,
    ForallRewriteResult &result, const GpuIdBuilder &gpuIdBuilder) {
  LDBG() << "--start rewriteOneForallCommonImpl";

  // Step 0. GPU-specific verifications.
  auto numParallelIterations =
      getConstantIntValues(forallOp.getMixedUpperBound());
  assert(forallOp.isNormalized() && numParallelIterations.has_value() &&
         "requires statically sized, normalized forall op");

  // Step 1. Complete the mapping spec to a full mapping (with 1s) if needed.
  SmallVector<DeviceMappingAttrInterface> forallMappingAttrsVec =
      forallOp.getDeviceMappingAttrs();
  SetVector<Attribute> forallMappingAttrs;
  forallMappingAttrs.insert_range(forallMappingAttrsVec);
  auto comparator = [](Attribute a, Attribute b) -> bool {
    return cast<DeviceMappingAttrInterface>(a).getMappingId() <
           cast<DeviceMappingAttrInterface>(b).getMappingId();
  };

  // In the linear case, compute the max mapping to avoid needlessly mapping
  // all dimensions; in the 3-D case, map all dimensions.
  DeviceMappingAttrInterface maxMapping = cast<DeviceMappingAttrInterface>(
      *llvm::max_element(forallMappingAttrs, comparator));
  DeviceMappingAttrInterface maxLinearMapping;
  if (maxMapping.isLinearMapping())
    maxLinearMapping = maxMapping;
  SmallVector<int64_t> tmpMappingSizes = numParallelIterations.value();
  for (auto attr : gpuIdBuilder.mappingAttributes) {
    // If `attr` overflows the linear max mapping, skip it.
    if (maxLinearMapping && comparator(maxLinearMapping, attr))
      continue;
    // Otherwise, this is a newly inserted mapping id without a size: use 1.
    if (!forallMappingAttrs.insert(attr))
      continue;
    tmpMappingSizes.push_back(1);
  }
  LDBG() << "----tmpMappingSizes extracted from scf.forall op: "
         << llvm::interleaved(tmpMappingSizes);

  // Step 2. Sort the values by the corresponding DeviceMappingAttrInterface.
  SmallVector<int64_t> forallMappingSizes = getValuesSortedByKey(
      forallMappingAttrs.getArrayRef(), tmpMappingSizes, comparator);
  LDBG() << "----forallMappingSizes: " << llvm::interleaved(forallMappingSizes);
  LDBG() << "----forallMappingAttrs: " << llvm::interleaved(forallMappingAttrs);

  // Step 3. Generate the mappingIdOps using the provided id builder.
  Location loc = forallOp.getLoc();
  OpBuilder::InsertionGuard guard(rewriter);
  rewriter.setInsertionPoint(forallOp);
  SmallVector<int64_t> originalBasis(availableMappingSizes);
  bool originalBasisWasProvided = !originalBasis.empty();
  if (!originalBasisWasProvided) {
    LDBG() << "----originalBasis was not provided, deriving it and there will "
              "be no predication";
    originalBasis = forallMappingSizes;
    while (originalBasis.size() < 3)
      originalBasis.push_back(1);
  } else {
    LDBG() << "----originalBasis was provided, using it, there will be "
              "predication";
  }
  LDBG() << "------originalBasis: " << llvm::interleaved(originalBasis);

  IdBuilderResult builderResult =
      gpuIdBuilder.idBuilder(rewriter, loc, forallMappingSizes, originalBasis);
  if (!builderResult.errorMsg.empty())
    return definiteFailureHelper(transformOp, forallOp, builderResult.errorMsg);
  LDBG() << builderResult;

  // Step 4. Map the induction variables to the mappingIdOps, this may involve
  // a permutation.
  SmallVector<Value> mappingIdOps = builderResult.mappingIdOps;
  IRMapping bvm;
  for (auto [iv, dim] : llvm::zip_equal(
           forallOp.getInductionVars(),
           forallMappingAttrs.getArrayRef().take_front(forallOp.getRank()))) {
    auto mappingAttr = cast<DeviceMappingAttrInterface>(dim);
    Value peIdOp = mappingIdOps[mappingAttr.getRelativeIndex()];
    LDBG() << "----map: " << iv << " to " << peIdOp;
    bvm.map(iv, peIdOp);
  }

  // Step 5. If the originalBasis is already known, create conditionals to
  // predicate the region; otherwise the forall itself determines the basis
  // and no predication occurs.
  Value predicate;
  if (originalBasisWasProvided) {
    for (Value tmpPredicate : builderResult.predicateOps) {
      predicate = predicate ? arith::AndIOp::create(rewriter, loc, predicate,
                                                    tmpPredicate)
                            : tmpPredicate;
    }
  }

  // Step 6. Move the body of forallOp to the target block.
  rewriter.eraseOp(forallOp.getTerminator());
  Block *targetBlock;
  Block::iterator insertionPoint;
  if (predicate) {
    // If predicated, move at the beginning of the scf.if then-block.
    auto ifOp = scf::IfOp::create(rewriter, loc, predicate,
                                  /*withElseRegion=*/false);
    targetBlock = ifOp.thenBlock();
    insertionPoint = ifOp.thenBlock()->begin();
  } else {
    // Otherwise, move inline just at the rewriter insertion point.
    targetBlock = forallOp->getBlock();
    insertionPoint = rewriter.getInsertionPoint();
  }
  Block &sourceBlock = forallOp.getRegion().front();
  targetBlock->getOperations().splice(insertionPoint,
                                      sourceBlock.getOperations());

  // Step 7. RAUW induction variables with the mapping ids.
  for (Value loopIndex : forallOp.getInductionVars()) {
    Value threadIdx = bvm.lookup(loopIndex);
    rewriter.replaceAllUsesWith(loopIndex, threadIdx);
  }

  // Step 8. Erase the old op and report the result.
  rewriter.eraseOp(forallOp);

  LDBG() << "----result forallMappingSizes: "
         << llvm::interleaved(forallMappingSizes);
  LDBG() << "----result mappingIdOps: " << llvm::interleaved(mappingIdOps);

  result = ForallRewriteResult{forallMappingSizes, mappingIdOps};
  return DiagnosedSilenceableFailure::success();
}
DiagnosedSilenceableFailure mlir::transform::gpu::mapForallToBlocksImpl(
    RewriterBase &rewriter, TransformOpInterface transformOp,
    scf::ForallOp forallOp, SmallVectorImpl<int64_t> &gridDims,
    const GpuIdBuilder &gpuIdBuilder) {
  LDBG() << "Start mapForallToBlocksImpl";

  DiagnosedSilenceableFailure diag =
      verifyGpuMapping<BlockMappingKind>(transformOp, forallOp);
  if (!diag.succeeded())
    return diag;

  Location loc = forallOp.getLoc();
  Block *parentBlock = forallOp->getBlock();
  Value zero; // Early index constant 0; its creation at block start is elided.

  ForallRewriteResult rewriteResult;
  diag = rewriteOneForallCommonImpl(rewriter, transformOp, forallOp,
                                    /*availableMappingSizes=*/gridDims,
                                    rewriteResult, gpuIdBuilder);
  if (!diag.succeeded())
    return diag;

  // If gridDims were not provided, take them from the rewrite result.
  if (gridDims.empty()) {
    gridDims = rewriteResult.mappingSizes;
    while (gridDims.size() < 3)
      gridDims.push_back(1);
  }
  assert(gridDims.size() == 3 && "Need 3-D gridDims");

  // Replace ids of dimensions known to be 1 by 0 to simplify the IR.
  replaceUnitMappingIdsHelper<BlockDimOp>(rewriter, loc, parentBlock, zero,
                                          rewriteResult.mappingSizes);
  return DiagnosedSilenceableFailure::success();
}
static DiagnosedSilenceableFailure
findTopLevelForallOp(Operation *target, scf::ForallOp &topLevelForallOp,
                     TransformOpInterface transformOp) {
  auto walkResult = target->walk([&](scf::ForallOp forallOp) {
    if (forallOp->getParentOfType<scf::ForallOp>())
      return WalkResult::advance();
    if (topLevelForallOp)
      return WalkResult::interrupt();
    topLevelForallOp = forallOp;
    return WalkResult::advance();
  });

  if (walkResult.wasInterrupted() || !topLevelForallOp)
    return transformOp.emitSilenceableError()
           << "could not find a unique topLevel scf.forall";
  return DiagnosedSilenceableFailure::success();
}
DiagnosedSilenceableFailure transform::MapForallToBlocks::applyToOne(
    transform::TransformRewriter &rewriter, Operation *target,
    ApplyToEachResultList &results, transform::TransformState &state) {
  LaunchOp gpuLaunch = dyn_cast<LaunchOp>(target);
  auto transformOp = cast<TransformOpInterface>(getOperation());

  if (!getGenerateGpuLaunch() && !gpuLaunch) {
    DiagnosedSilenceableFailure diag =
        emitSilenceableError()
        << "Given target is not gpu.launch, set `generate_gpu_launch` "
           "attribute";
    diag.attachNote(target->getLoc()) << "when applied to this payload op";
    return diag;
  }

  scf::ForallOp topLevelForallOp;
  DiagnosedSilenceableFailure diag =
      findTopLevelForallOp(target, topLevelForallOp, transformOp);
  if (!diag.succeeded()) {
    diag.attachNote(target->getLoc()) << "when applied to this payload op";
    return diag;
  }
  assert(topLevelForallOp && "expect an scf.forall");

  SmallVector<int64_t> gridDims{getGridDims()};
  if (!getGenerateGpuLaunch() && gridDims.size() != 3)
    return transformOp.emitDefiniteFailure("transform require size-3 mapping");

  OpBuilder::InsertionGuard guard(rewriter);
  rewriter.setInsertionPoint(topLevelForallOp);

  // Generate the gpu.launch here and move the forall inside.
  if (getGenerateGpuLaunch()) {
    DiagnosedSilenceableFailure diag =
        createGpuLaunch(rewriter, target->getLoc(), transformOp, gpuLaunch);
    if (!diag.succeeded())
      return diag;

    rewriter.setInsertionPointToStart(&gpuLaunch.getBody().front());
    Operation *newForallOp = rewriter.clone(*topLevelForallOp);
    rewriter.eraseOp(topLevelForallOp);
    topLevelForallOp = cast<scf::ForallOp>(newForallOp);
  }

  // The block id builder adapts to whatever mapping is present.
  bool useLinearMapping = false;
  if (topLevelForallOp.getMapping())
    useLinearMapping = topLevelForallOp.usesLinearMapping();

  FailureOr<DeviceMaskingAttrInterface> maybeMaskingAttr =
      topLevelForallOp.getDeviceMaskingAttr();
  assert(succeeded(maybeMaskingAttr) && "unexpected failed maybeMaskingAttr");
  assert((!*maybeMaskingAttr || useLinearMapping) &&
         "masking requires linear mapping");

  GpuBlockIdBuilder gpuBlockIdBuilder(getContext(), useLinearMapping,
                                      *maybeMaskingAttr);

  diag = mlir::transform::gpu::mapForallToBlocksImpl(
      rewriter, transformOp, topLevelForallOp, gridDims, gpuBlockIdBuilder);
  if (!diag.succeeded())
    return diag;

  // Set the GPU launch configuration for the grid dims late, this is
  // subject to IR inspection.
  diag = alterGpuLaunch(rewriter, gpuLaunch,
                        cast<TransformOpInterface>(getOperation()), gridDims[0],
                        gridDims[1], gridDims[2]);

  results.push_back(gpuLaunch);
  return diag;
}
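// Illustrative transform-IR usage (assumed syntax, editorial addition; see
// the TransformOps tablegen for the authoritative assembly format):
//
//   %launch = transform.gpu.map_forall_to_blocks %func
//       generate_gpu_launch grid_dims = [32, 16, 1]
//       : (!transform.any_op) -> !transform.any_op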
LogicalResult transform::MapForallToBlocks::verify() {
  if (!getGridDims().empty() && getGridDims().size() != 3) {
    return emitOpError() << "transform requires empty or size-3 grid_dims";
  }
  return success();
}
static DiagnosedSilenceableFailure checkMappingSpec(
    std::optional<TransformOpInterface> transformOp, scf::ForallOp forallOp,
    ArrayRef<int64_t> numParallelIterations, ArrayRef<int64_t> blockOrGridSizes,
    int factor, bool useLinearMapping = false) {
  if (!useLinearMapping && blockOrGridSizes.front() % factor != 0) {
    return definiteFailureHelper(
        transformOp, forallOp,
        Twine("3-D mapping: size of threadIdx.x must be a multiple of ") +
            Twine(factor));
  }
  if (computeProduct(numParallelIterations) * factor >
      computeProduct(blockOrGridSizes)) {
    return definiteFailureHelper(
        transformOp, forallOp,
        Twine("the number of required parallel resources (blocks or "
              "threads) ") +
            Twine(computeProduct(numParallelIterations) * factor) +
            " overflows the number of available resources " +
            Twine(computeProduct(blockOrGridSizes)));
  }
  return DiagnosedSilenceableFailure::success();
}

static DiagnosedSilenceableFailure
getThreadIdBuilder(std::optional<TransformOpInterface> transformOp,
                   scf::ForallOp forallOp, ArrayRef<int64_t> blockSizes,
                   int64_t warpSize, GpuIdBuilder &gpuIdBuilder) {
  DeviceMappingAttrInterface mappingAttr =
      forallOp.getDeviceMappingAttrs().front();
  bool useLinearMapping = mappingAttr.isLinearMapping();

  // Sanity checks that may result in runtime verification errors.
  auto numParallelIterations =
      getConstantIntValues(forallOp.getMixedUpperBound());
  if (!forallOp.isNormalized() || !numParallelIterations.has_value()) {
    return definiteFailureHelper(
        transformOp, forallOp,
        "requires statically sized, normalized forall op");
  }
  int64_t factor = 1;
  if (isa<GPUWarpgroupMappingAttr>(mappingAttr)) {
    factor = GpuWarpgroupIdBuilder::kNumWarpsPerGroup * warpSize;
  } else if (isa<GPUWarpMappingAttr>(mappingAttr)) {
    factor = warpSize;
  }
  DiagnosedSilenceableFailure diag =
      checkMappingSpec(transformOp, forallOp, numParallelIterations.value(),
                       blockSizes, factor, useLinearMapping);
  if (!diag.succeeded())
    return diag;

  FailureOr<DeviceMaskingAttrInterface> maybeMaskingAttr =
      forallOp.getDeviceMaskingAttr();
  assert(succeeded(maybeMaskingAttr) && "unexpected failed maybeMaskingAttr");
  assert((!*maybeMaskingAttr || useLinearMapping) &&
         "masking requires linear mapping");
  gpuIdBuilder =
      TypeSwitch<DeviceMappingAttrInterface, GpuIdBuilder>(mappingAttr)
          .Case([&](GPUWarpgroupMappingAttr) {
            return GpuWarpgroupIdBuilder(/*...*/);
          })
          .Case([&](GPUWarpMappingAttr) { return GpuWarpIdBuilder(/*...*/); })
          .Case(
              [&](GPUThreadMappingAttr) { return GpuThreadIdBuilder(/*...*/); })
          .Case([&](GPULaneMappingAttr) { return GpuLaneIdBuilder(/*...*/); })
          .Default([&](DeviceMappingAttrInterface) -> GpuIdBuilder {
            llvm_unreachable("unknown mapping attribute");
          });
  return DiagnosedSilenceableFailure::success();
}
DiagnosedSilenceableFailure mlir::transform::gpu::mapOneForallToThreadsImpl(
    RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
    scf::ForallOp forallOp, ArrayRef<int64_t> blockSizes, int64_t warpSize,
    bool syncAfterDistribute) {
  DiagnosedSilenceableFailure diag =
      verifyGpuMapping<ThreadMappingKind>(transformOp, forallOp);
  if (!diag.succeeded())
    return diag;

  GpuIdBuilder gpuIdBuilder;
  diag = getThreadIdBuilder(transformOp, forallOp, blockSizes, warpSize,
                            gpuIdBuilder);
  if (!diag.succeeded())
    return diag;

  Location loc = forallOp.getLoc();
  OpBuilder::InsertionGuard g(rewriter);
  // Insert after the forall so a barrier can be added once it is erased.
  rewriter.setInsertionPointAfter(forallOp);
  ForallRewriteResult rewriteResult;
  diag = rewriteOneForallCommonImpl(rewriter, transformOp, forallOp, blockSizes,
                                    rewriteResult, gpuIdBuilder);
  if (!diag.succeeded())
    return diag;

  if (syncAfterDistribute)
    BarrierOp::create(rewriter, loc);
  return DiagnosedSilenceableFailure::success();
}
DiagnosedSilenceableFailure mlir::transform::gpu::mapNestedForallToThreadsImpl(
    RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
    Operation *target, ArrayRef<int64_t> blockDims, int64_t warpSize,
    bool syncAfterDistribute) {
  LDBG() << "Start mapNestedForallToThreadsImpl";
  if (blockDims.size() != 3) {
    return definiteFailureHelper(transformOp, target,
                                 "requires size-3 thread mapping");
  }

  // Create an early zero index value for replacements.
  Location loc = target->getLoc();
  Value zero = arith::ConstantIndexOp::create(rewriter, loc, 0);
  DiagnosedSilenceableFailure diag = DiagnosedSilenceableFailure::success();
  WalkResult walkResult = target->walk([&](scf::ForallOp forallOp) {
    diag = mapOneForallToThreadsImpl(rewriter, transformOp, forallOp, blockDims,
                                     warpSize, syncAfterDistribute);
    if (diag.isDefiniteFailure())
      return WalkResult::interrupt();
    if (diag.succeeded())
      return WalkResult::skip();
    return WalkResult::advance();
  });
  if (walkResult.wasInterrupted())
    return diag;

  // Replace ids of dimensions known to be 1 by 0 to simplify the IR.
  replaceUnitMappingIdsHelper<ThreadIdOp>(rewriter, loc, target, zero,
                                          blockDims);
  return DiagnosedSilenceableFailure::success();
}
DiagnosedSilenceableFailure transform::MapNestedForallToThreads::applyToOne(
    transform::TransformRewriter &rewriter, Operation *target,
    ApplyToEachResultList &results, TransformState &state) {
  LaunchOp gpuLaunch = dyn_cast<LaunchOp>(target);
  auto transformOp = cast<TransformOpInterface>(getOperation());

  if (!gpuLaunch)
    return emitSilenceableError() << "Given target is not a gpu.launch";

  SmallVector<int64_t> blockDims{getBlockDims()};
  DiagnosedSilenceableFailure diag =
      checkGpuLimits(transformOp, std::nullopt, std::nullopt, std::nullopt,
                     blockDims[0], blockDims[1], blockDims[2]);
  if (diag.isSilenceableFailure()) {
    diag.attachNote(getLoc()) << getBlockDimsAttrName() << " is too large";
    return diag;
  }

  // Set the GPU launch configuration for the block dims early, this is not
  // subject to IR inspection.
  diag = alterGpuLaunch(rewriter, gpuLaunch, transformOp, std::nullopt,
                        std::nullopt, std::nullopt, blockDims[0], blockDims[1],
                        blockDims[2]);

  rewriter.setInsertionPointToStart(&gpuLaunch.getBody().front());
  diag = mapNestedForallToThreadsImpl(rewriter, transformOp, gpuLaunch,
                                      blockDims, getWarpSize(),
                                      getSyncAfterDistribute());

  results.push_back(gpuLaunch.getOperation());
  return diag;
}
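// Illustrative transform-IR usage (assumed syntax, editorial addition):
//
//   transform.gpu.map_nested_forall_to_threads %gpu_launch
//       block_dims = [32, 4, 1]
//       : (!transform.any_op) -> !transform.any_op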
namespace {
class GPUTransformDialectExtension
    : public transform::TransformDialectExtension<
          GPUTransformDialectExtension> {
public:
  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(GPUTransformDialectExtension)

  GPUTransformDialectExtension() {
    declareGeneratedDialect<GPUDialect>();
    declareGeneratedDialect<amdgpu::AMDGPUDialect>();
    declareGeneratedDialect<arith::ArithDialect>();
    declareGeneratedDialect<scf::SCFDialect>();
    registerTransformOps<
#define GET_OP_LIST
#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.cpp.inc"
        >();
  }
};
} // namespace

void mlir::gpu::registerTransformDialectExtension(DialectRegistry &registry) {
  registry.addExtensions<GPUTransformDialectExtension>();
}

#define GET_OP_CLASSES
#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.cpp.inc"