#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Support/DebugLog.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InterleavedRange.h"
#include "llvm/Support/LogicalResult.h"

#define DEBUG_TYPE "gpu-transforms"
void transform::ApplyGPUToNVVMConversionPatternsOp::populatePatterns(
    TypeConverter &typeConverter, RewritePatternSet &patterns) {
  auto &llvmTypeConverter = static_cast<LLVMTypeConverter &>(typeConverter);
  // Map gpu.address_space values onto NVVM memory spaces.
  populateGpuMemorySpaceAttributeConversions(
      llvmTypeConverter, [](AddressSpace space) -> unsigned {
        switch (space) {
        case AddressSpace::Global:
          return static_cast<unsigned>(NVVM::NVVMMemorySpace::Global);
        case AddressSpace::Workgroup:
          return static_cast<unsigned>(NVVM::NVVMMemorySpace::Shared);
        case AddressSpace::Private:
          // Private memory lowers to allocas in the generic/default space.
          return static_cast<unsigned>(NVVM::NVVMMemorySpace::Generic);
        }
        llvm_unreachable("unknown address space enum value");
        return static_cast<unsigned>(NVVM::NVVMMemorySpace::Generic);
      });
  // gpu.mma_matrix types are converted to LLVM struct types.
  llvmTypeConverter.addConversion(
      [&](MMAMatrixType type) -> Type { return convertMMAToLLVMType(type); });
  populateGpuToNVVMConversionPatterns(llvmTypeConverter, patterns);
}
LogicalResult
transform::ApplyGPUToNVVMConversionPatternsOp::verifyTypeConverter(
    transform::TypeConverterBuilderOpInterface builder) {
  if (builder.getTypeConverterType() != "LLVMTypeConverter")
    return emitOpError("expected LLVMTypeConverter");
  return success();
}
void transform::ApplyGPUWwmaToNVVMConversionPatternsOp::populatePatterns(
    TypeConverter &typeConverter, RewritePatternSet &patterns) {
  auto &llvmTypeConverter = static_cast<LLVMTypeConverter &>(typeConverter);
  populateGpuWMMAToNVVMConversionPatterns(llvmTypeConverter, patterns);
}

LogicalResult
transform::ApplyGPUWwmaToNVVMConversionPatternsOp::verifyTypeConverter(
    transform::TypeConverterBuilderOpInterface builder) {
  if (builder.getTypeConverterType() != "LLVMTypeConverter")
    return emitOpError("expected LLVMTypeConverter");
  return success();
}
void transform::ApplyGPUSubgroupReduceToNVVMConversionPatternsOp::
    populatePatterns(TypeConverter &typeConverter,
                     RewritePatternSet &patterns) {
  auto &llvmTypeConverter = static_cast<LLVMTypeConverter &>(typeConverter);
  populateGpuSubgroupReduceOpLoweringPattern(llvmTypeConverter, patterns);
}

LogicalResult transform::ApplyGPUSubgroupReduceToNVVMConversionPatternsOp::
    verifyTypeConverter(transform::TypeConverterBuilderOpInterface builder) {
  if (builder.getTypeConverterType() != "LLVMTypeConverter")
    return emitOpError("expected LLVMTypeConverter");
  return success();
}
void transform::ApplyGPUToROCDLConversionPatternsOp::populatePatterns(
    TypeConverter &typeConverter, RewritePatternSet &patterns) {
  auto &llvmTypeConverter = static_cast<LLVMTypeConverter &>(typeConverter);
  // Map gpu.address_space values onto the AMDGPU/ROCDL address spaces.
  populateGpuMemorySpaceAttributeConversions(
      llvmTypeConverter, [](AddressSpace space) {
        switch (space) {
        case AddressSpace::Global:
          return ROCDL::ROCDLDialect::kGlobalMemoryAddressSpace;
        case AddressSpace::Workgroup:
          return ROCDL::ROCDLDialect::kSharedMemoryAddressSpace;
        case AddressSpace::Private:
          return ROCDL::ROCDLDialect::kPrivateMemoryAddressSpace;
        }
        llvm_unreachable("unknown address space enum value");
      });
  FailureOr<amdgpu::Chipset> maybeChipset =
      amdgpu::Chipset::parse(getChipset());
  assert(llvm::succeeded(maybeChipset) && "expected valid chipset");
  populateGpuToROCDLConversionPatterns(
      llvmTypeConverter, patterns, gpu::amd::Runtime::HIP, *maybeChipset);
}
LogicalResult
transform::ApplyGPUToROCDLConversionPatternsOp::verifyTypeConverter(
    transform::TypeConverterBuilderOpInterface builder) {
  FailureOr<amdgpu::Chipset> maybeChipset =
      amdgpu::Chipset::parse(getChipset());
  if (failed(maybeChipset)) {
    return emitOpError("Invalid chipset name: " + getChipset());
  }
  if (builder.getTypeConverterType() != "LLVMTypeConverter")
    return emitOpError("expected LLVMTypeConverter");
  return success();
}
void transform::ApplyGPUPromoteShuffleToAMDGPUPatternsOp::populatePatterns(
    RewritePatternSet &patterns) {
  std::optional<StringRef> chipsetName = getChipset();
  std::optional<amdgpu::Chipset> maybeChipset;
  if (chipsetName) {
    FailureOr<amdgpu::Chipset> parsedChipset =
        amdgpu::Chipset::parse(*chipsetName);
    assert(llvm::succeeded(parsedChipset) && "expected valid chipset");
    maybeChipset = parsedChipset;
  }
  populateGpuPromoteShuffleToAMDGPUPatterns(patterns, maybeChipset);
}
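// Illustrative, standalone sketch (not part of this file): the NVVM and ROCDL
// populatePatterns implementations above map gpu.address_space symbols onto
// numeric target address spaces. The concrete values below follow the common
// NVPTX and AMDGPU conventions (generic = 0, global = 1, shared/LDS = 3,
// AMDGPU private = 5); treat them as an assumption for illustration rather
// than a quote of the NVVM/ROCDL definitions.
namespace addrspace_sketch {
enum class GpuAddressSpace { Global, Workgroup, Private };

constexpr unsigned toNVPTX(GpuAddressSpace space) {
  switch (space) {
  case GpuAddressSpace::Global:    return 1; // NVPTX global
  case GpuAddressSpace::Workgroup: return 3; // NVPTX shared
  case GpuAddressSpace::Private:   return 0; // lowered to generic allocas
  }
  return 0;
}

constexpr unsigned toAMDGPU(GpuAddressSpace space) {
  switch (space) {
  case GpuAddressSpace::Global:    return 1; // AMDGPU global
  case GpuAddressSpace::Workgroup: return 3; // AMDGPU LDS
  case GpuAddressSpace::Private:   return 5; // AMDGPU private/scratch
  }
  return 0;
}

static_assert(toNVPTX(GpuAddressSpace::Workgroup) == 3, "shared memory");
static_assert(toAMDGPU(GpuAddressSpace::Private) == 5, "scratch memory");
} // namespace addrspace_sketch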
/// Pick an unrolling order that allows tensor core operations to reuse the
/// LHS register.
static std::optional<SmallVector<int64_t>>
gpuMmaUnrollOrder(vector::ContractionOp contract) {
  SmallVector<int64_t> order;
  // First make reduction the outer dimensions.
  for (auto [index, iter] : llvm::enumerate(contract.getIteratorTypes())) {
    if (vector::isReductionIterator(iter)) {
      order.push_back(index);
    }
  }

  llvm::SmallDenseSet<int64_t> dims;
  for (AffineExpr expr : contract.getIndexingMapsArray()[0].getResults()) {
    dims.insert(cast<AffineDimExpr>(expr).getPosition());
  }
  // Then parallel dimensions that are part of the LHS, to re-use the LHS.
  for (auto [index, iter] : llvm::enumerate(contract.getIteratorTypes())) {
    if (vector::isParallelIterator(iter) && dims.count(index)) {
      order.push_back(index);
    }
  }
  // Then the remaining parallel loops.
  for (auto [index, iter] : llvm::enumerate(contract.getIteratorTypes())) {
    if (vector::isParallelIterator(iter) && !dims.count(index)) {
      order.push_back(index);
    }
  }
  return order;
}
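// Illustrative, standalone sketch (not part of this file): for a plain matmul
// contraction with iterators [parallel, parallel, reduction] and an LHS
// indexing map using dimensions {0, 2} (i.e. (m, k)), the ordering logic above
// yields [2, 0, 1]: the reduction dimension first, then the parallel dimension
// appearing in the LHS, then the remaining parallel dimension.
#include <cassert>
#include <set>
#include <vector>

namespace unroll_order_sketch {
enum class Iter { Parallel, Reduction };

inline std::vector<int> mmaUnrollOrder(const std::vector<Iter> &iters,
                                       const std::set<int> &lhsDims) {
  std::vector<int> order;
  int e = static_cast<int>(iters.size());
  for (int i = 0; i < e; ++i)
    if (iters[i] == Iter::Reduction)
      order.push_back(i);
  for (int i = 0; i < e; ++i)
    if (iters[i] == Iter::Parallel && lhsDims.count(i))
      order.push_back(i);
  for (int i = 0; i < e; ++i)
    if (iters[i] == Iter::Parallel && !lhsDims.count(i))
      order.push_back(i);
  return order;
}

inline void checkMatmulOrder() {
  // (m, n, k) matmul: the LHS A(m, k) uses dimensions {0, 2}.
  std::vector<int> order = mmaUnrollOrder(
      {Iter::Parallel, Iter::Parallel, Iter::Reduction}, {0, 2});
  assert((order == std::vector<int>{2, 0, 1}) && "reduction first, then LHS");
}
} // namespace unroll_order_sketch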
/// Returns the target vector size for the target operation based on the
/// native vector size specified by `m`, `n` and `k`.
static std::optional<SmallVector<int64_t>>
getSubgroupMmaNativeVectorSize(Operation *op, int64_t m, int64_t n,
                               int64_t k) {
  if (auto contract = dyn_cast<vector::ContractionOp>(op)) {
    int64_t contractRank = contract.getIteratorTypes().size();
    if (contractRank < 3)
      return std::nullopt;
    SmallVector<int64_t> nativeSize(contractRank - 3, 1);
    nativeSize.append({m, n, k});
    return nativeSize;
  }
  if (auto writeOp = dyn_cast<vector::TransferWriteOp>(op)) {
    int64_t writeRank = writeOp.getVectorType().getRank();
    if (writeRank < 2)
      return std::nullopt;
    SmallVector<int64_t> nativeSize(writeRank - 2, 1);
    nativeSize.append({m, n});
    return nativeSize;
  }
  if (auto readOp = dyn_cast<vector::TransferReadOp>(op)) {
    // Match the shape expected by the extract_strided_slice users.
    VectorType sliceType;
    for (Operation *users : op->getUsers()) {
      auto extract = dyn_cast<vector::ExtractStridedSliceOp>(users);
      if (!extract)
        return std::nullopt;
      auto vecType = cast<VectorType>(extract.getResult().getType());
      if (sliceType && sliceType != vecType)
        return std::nullopt;
      sliceType = vecType;
    }
    return llvm::to_vector(sliceType.getShape());
  }
  if (OpTrait::hasElementwiseMappableTraits(op) && op->getNumResults() == 1) {
    if (auto vecType = dyn_cast<VectorType>(op->getResultTypes()[0])) {
      if (vecType.getRank() < 2)
        return std::nullopt;
      // Prefer the shape of an extract_strided_slice user if there is one.
      VectorType sliceType;
      for (Operation *users : op->getUsers()) {
        auto extract = dyn_cast<vector::ExtractStridedSliceOp>(users);
        if (!extract)
          return std::nullopt;
        auto vecType = cast<VectorType>(extract.getResult().getType());
        if (sliceType && sliceType != vecType)
          return std::nullopt;
        sliceType = vecType;
      }
      if (sliceType)
        return llvm::to_vector(sliceType.getShape());
      // Otherwise unroll elementwise ops to the output tile shape.
      SmallVector<int64_t> nativeSize(vecType.getRank() - 2, 1);
      nativeSize.append({m, n});
      return nativeSize;
    }
  }
  return std::nullopt;
}
void transform::ApplyUnrollVectorsSubgroupMmaOp::populatePatterns(
    RewritePatternSet &patterns) {
  auto unrollOrder = [](Operation *op) -> std::optional<SmallVector<int64_t>> {
    auto contract = dyn_cast<vector::ContractionOp>(op);
    if (!contract)
      return std::nullopt;
    return gpuMmaUnrollOrder(contract);
  };

  int64_t m = getM(), n = getN(), k = getK();
  auto nativeShapeFn =
      [m, n, k](Operation *op) -> std::optional<SmallVector<int64_t>> {
    return getSubgroupMmaNativeVectorSize(op, m, n, k);
  };
  vector::populateVectorUnrollPatterns(
      patterns, vector::UnrollVectorOptions()
                    .setNativeShapeFn(nativeShapeFn)
                    .setUnrollTraversalOrderFn(unrollOrder));
}
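// Illustrative, standalone sketch (not part of this file): the unroll patterns
// populated above split a large vector operation into tiles of the native mma
// shape. The helper below only computes how many tiles a shape decomposes
// into, assuming every dimension is a multiple of the native size.
#include <cassert>
#include <cstdint>
#include <vector>

namespace unroll_count_sketch {
inline int64_t numUnrolledTiles(const std::vector<int64_t> &shape,
                                const std::vector<int64_t> &nativeShape) {
  assert(shape.size() == nativeShape.size() && "rank mismatch");
  int64_t count = 1;
  for (size_t d = 0; d < shape.size(); ++d) {
    assert(shape[d] % nativeShape[d] == 0 && "expected divisible sizes");
    count *= shape[d] / nativeShape[d];
  }
  return count;
}

inline void checkTileCount() {
  // A 32x32x32 contraction with a 16x16x8 native shape yields 2*2*4 = 16 tiles.
  assert(numUnrolledTiles({32, 32, 32}, {16, 16, 8}) == 16);
}
} // namespace unroll_count_sketch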
/// Empty tag types used to select the mapping kind being verified.
struct MappingKind {};
struct BlockMappingKind : MappingKind {};
struct ThreadMappingKind : MappingKind {};

static DiagnosedSilenceableFailure
definiteFailureHelper(std::optional<TransformOpInterface> transformOp,
                      Operation *target, const Twine &message) {
  if (transformOp.has_value())
    return transformOp->emitDefiniteFailure() << message;
  return emitDefiniteFailure(target, message);
}
template <typename MappingKindType>
static DiagnosedSilenceableFailure
checkMappingAttributeTypes(std::optional<TransformOpInterface> transformOp,
                           scf::ForallOp forallOp) {
  if (!forallOp.getMapping().has_value()) {
    return definiteFailureHelper(transformOp, forallOp,
                                 "scf.forall op requires a mapping attribute");
  }

  bool hasBlockMapping = llvm::any_of(forallOp.getMapping().value(),
                                      llvm::IsaPred<GPUBlockMappingAttr>);
  bool hasWarpgroupMapping = llvm::any_of(
      forallOp.getMapping().value(), llvm::IsaPred<GPUWarpgroupMappingAttr>);
  bool hasWarpMapping = llvm::any_of(forallOp.getMapping().value(),
                                     llvm::IsaPred<GPUWarpMappingAttr>);
  bool hasThreadMapping = llvm::any_of(forallOp.getMapping().value(),
                                       llvm::IsaPred<GPUThreadMappingAttr>);
  bool hasLaneMapping = llvm::any_of(forallOp.getMapping().value(),
                                     llvm::IsaPred<GPULaneMappingAttr>);
  int64_t countMappingTypes = 0;
  countMappingTypes += hasBlockMapping ? 1 : 0;
  countMappingTypes += hasWarpgroupMapping ? 1 : 0;
  countMappingTypes += hasWarpMapping ? 1 : 0;
  countMappingTypes += hasThreadMapping ? 1 : 0;
  countMappingTypes += hasLaneMapping ? 1 : 0;
  if (countMappingTypes > 1) {
    return definiteFailureHelper(
        transformOp, forallOp,
        "cannot mix different mapping types, use nesting");
  }
  if (std::is_same<MappingKindType, BlockMappingKind>::value &&
      !hasBlockMapping) {
    return definiteFailureHelper(
        transformOp, forallOp,
        "scf.forall op requires a mapping attribute of kind 'block'");
  }
  if (std::is_same<MappingKindType, ThreadMappingKind>::value &&
      !hasLaneMapping && !hasThreadMapping && !hasWarpMapping &&
      !hasWarpgroupMapping) {
    return definiteFailureHelper(transformOp, forallOp,
                                 "scf.forall op requires a mapping attribute "
                                 "of kind 'thread' or 'warp'");
  }

  DenseSet<Attribute> seen;
  for (Attribute map : forallOp.getMapping()->getValue()) {
    if (seen.contains(map)) {
      return definiteFailureHelper(
          transformOp, forallOp,
          "duplicate attribute, cannot map different loops "
          "to the same mapping id");
    }
    seen.insert(map);
  }

  auto isLinear = [](DeviceMappingAttrInterface attr) {
    return attr.isLinearMapping();
  };
  if (llvm::any_of(forallOp.getDeviceMappingAttrs(), isLinear) &&
      !llvm::all_of(forallOp.getDeviceMappingAttrs(), isLinear)) {
    return definiteFailureHelper(
        transformOp, forallOp,
        "cannot mix linear and non-linear mapping modes");
  }

  FailureOr<DeviceMaskingAttrInterface> maybeMaskingAttr =
      forallOp.getDeviceMaskingAttr();
  if (succeeded(maybeMaskingAttr) && *maybeMaskingAttr &&
      !forallOp.usesLinearMapping()) {
    return definiteFailureHelper(
        transformOp, forallOp,
        "device masking is only available in linear mapping mode");
  }

  return DiagnosedSilenceableFailure::success();
}
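// Illustrative, standalone sketch (not part of this file): the verification
// above reduces to set logic over the mapping attributes: a single mapping
// kind (block, warpgroup, warp, thread, or lane), no repeated mapping id, and
// no mix of linear and non-linear modes. The toy model below captures that
// logic on plain structs.
#include <set>
#include <string>
#include <vector>

namespace mapping_check_sketch {
enum class Kind { Block, Warpgroup, Warp, Thread, Lane };

struct MappingAttr {
  Kind kind;
  int id;      // which dimension / linear slot this loop maps to
  bool linear; // linear vs. 3-D grid-style mapping
};

// Returns an error message, or an empty string when the combination is valid.
inline std::string verifyMapping(const std::vector<MappingAttr> &attrs) {
  std::set<Kind> kinds;
  std::set<int> ids;
  bool anyLinear = false, allLinear = true;
  for (const MappingAttr &a : attrs) {
    kinds.insert(a.kind);
    if (!ids.insert(a.id).second)
      return "duplicate attribute, cannot map different loops to the same "
             "mapping id";
    anyLinear |= a.linear;
    allLinear &= a.linear;
  }
  if (kinds.size() > 1)
    return "cannot mix different mapping types, use nesting";
  if (anyLinear && !allLinear)
    return "cannot mix linear and non-linear mapping modes";
  return "";
}
} // namespace mapping_check_sketch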
template <typename MappingKindType>
static DiagnosedSilenceableFailure
verifyGpuMapping(std::optional<TransformOpInterface> transformOp,
                 scf::ForallOp forallOp) {
  // Check that the types of the mapping attributes match.
  DiagnosedSilenceableFailure typeRes =
      checkMappingAttributeTypes<MappingKindType>(transformOp, forallOp);
  if (!typeRes.succeeded())
    return typeRes;

  // Perform other non-type verifications.
  if (!forallOp.isNormalized())
    return definiteFailureHelper(transformOp, forallOp,
                                 "unsupported non-normalized loops");
  if (forallOp.getNumResults() > 0)
    return definiteFailureHelper(transformOp, forallOp,
                                 "only bufferized scf.forall can be mapped");
  bool useLinearMapping = forallOp.usesLinearMapping();
  int64_t maxNumMappingsSupported =
      useLinearMapping ? (getMaxEnumValForMappingId() -
                          static_cast<uint64_t>(MappingId::DimZ))
                       : 3;
  if (forallOp.getRank() > maxNumMappingsSupported) {
    return definiteFailureHelper(transformOp, forallOp,
                                 "scf.forall with rank > ")
           << maxNumMappingsSupported
           << " does not lower for the specified mapping attribute type";
  }
  auto numParallelIterations =
      getConstantIntValues(forallOp.getMixedUpperBound());
  if (!forallOp.isNormalized() || !numParallelIterations.has_value()) {
    return definiteFailureHelper(
        transformOp, forallOp,
        "requires statically sized, normalized forall op");
  }
  return DiagnosedSilenceableFailure::success();
}
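// Illustrative, standalone sketch (not part of this file): the rank budget
// computed above depends on the mapping mode. Assuming the mapping ids follow
// a DimX/DimY/DimZ layout with a tail of linear ids (an assumption here, not a
// quote of the real enum), a non-linear mapping supports at most 3 loops while
// a linear mapping supports as many loops as there are linear ids.
#include <cstdint>

namespace mapping_budget_sketch {
// Hypothetical enum mirroring DimX, DimY, DimZ plus LinearDim0..LinearDim9.
enum class MappingId : uint64_t { DimX = 0, DimY, DimZ, Linear0, Last = 12 };

constexpr int64_t maxMappings(bool useLinearMapping) {
  return useLinearMapping
             ? static_cast<int64_t>(MappingId::Last) -
                   static_cast<int64_t>(MappingId::DimZ)
             : 3;
}

static_assert(maxMappings(false) == 3, "grid-style mappings are 3-D");
static_assert(maxMappings(true) == 10, "linear budget = number of linear ids");
} // namespace mapping_budget_sketch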
template <typename OpTy, typename OperationOrBlock>
static void
replaceUnitMappingIdsHelper(RewriterBase &rewriter, Location loc,
                            OperationOrBlock *parent, Value replacement,
                            ArrayRef<int64_t> availableMappingSizes) {
  parent->walk([&](OpTy idOp) {
    if (availableMappingSizes[static_cast<int64_t>(idOp.getDimension())] == 1)
      rewriter.replaceAllUsesWith(idOp.getResult(), replacement);
  });
}
    RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
  LDBG() << "--start rewriteOneForallCommonImpl";
  auto numParallelIterations =
  assert(forallOp.isNormalized() && numParallelIterations.has_value() &&
         "requires statically sized, normalized forall op");
      forallOp.getDeviceMappingAttrs();
  forallMappingAttrs.insert_range(forallMappingAttrsVec);
    return cast<DeviceMappingAttrInterface>(a).getMappingId() <
           cast<DeviceMappingAttrInterface>(b).getMappingId();
  DeviceMappingAttrInterface maxMapping = cast<DeviceMappingAttrInterface>(
      *llvm::max_element(forallMappingAttrs, comparator));
  DeviceMappingAttrInterface maxLinearMapping;
  if (maxMapping.isLinearMapping())
    maxLinearMapping = maxMapping;
    if (maxLinearMapping && comparator(maxLinearMapping, attr))
    if (!forallMappingAttrs.insert(attr))
    tmpMappingSizes.push_back(1);
  LDBG() << "----tmpMappingSizes extracted from scf.forall op: "
         << llvm::interleaved(tmpMappingSizes);
      forallMappingAttrs.getArrayRef(), tmpMappingSizes, comparator);
  LDBG() << "----forallMappingSizes: " << llvm::interleaved(forallMappingSizes);
  LDBG() << "----forallMappingAttrs: " << llvm::interleaved(forallMappingAttrs);
  bool originalBasisWasProvided = !originalBasis.empty();
  if (!originalBasisWasProvided) {
    LDBG() << "----originalBasis was not provided, deriving it and there will "
    originalBasis = forallMappingSizes;
    while (originalBasis.size() < 3)
      originalBasis.push_back(1);
    LDBG() << "----originalBasis was provided, using it, there will be "
  LDBG() << "------originalBasis: " << llvm::interleaved(originalBasis);
      gpuIdBuilder.idBuilder(rewriter, loc, forallMappingSizes, originalBasis);
  if (!builderResult.errorMsg.empty())
  LDBG() << builderResult;
  for (auto [iv, dim] : llvm::zip_equal(
           forallOp.getInductionVars(),
           forallMappingAttrs.getArrayRef().take_front(forallOp.getRank()))) {
    auto mappingAttr = cast<DeviceMappingAttrInterface>(dim);
    Value peIdOp = mappingIdOps[mappingAttr.getRelativeIndex()];
    LDBG() << "----map: " << iv << " to " << peIdOp;
  if (originalBasisWasProvided) {
      predicate = predicate ? arith::AndIOp::create(rewriter, loc, predicate,
  rewriter.eraseOp(forallOp.getTerminator());
    auto ifOp = scf::IfOp::create(rewriter, loc, predicate,
    targetBlock = ifOp.thenBlock();
    insertionPoint = ifOp.thenBlock()->begin();
    targetBlock = forallOp->getBlock();
  Block &sourceBlock = forallOp.getRegion().front();
  for (Value loopIndex : forallOp.getInductionVars()) {
  LDBG() << "----result forallMappingSizes: "
         << llvm::interleaved(forallMappingSizes);
  LDBG() << "----result mappingIdOps: " << llvm::interleaved(mappingIdOps);
    RewriterBase &rewriter, TransformOpInterface transformOp,
  LDBG() << "Start mapForallToBlocksImpl";
  if (!diag.succeeded())
  Block *parentBlock = forallOp->getBlock();
      rewriter, transformOp, forallOp, gridDims, rewriteResult, gpuIdBuilder);
  if (!diag.succeeded())
  if (gridDims.empty()) {
    while (gridDims.size() < 3)
      gridDims.push_back(1);
  assert(gridDims.size() == 3 && "Need 3-D gridDims");
static DiagnosedSilenceableFailure
findTopLevelForallOp(Operation *target, scf::ForallOp &topLevelForallOp,
                     TransformOpInterface transformOp) {
  auto walkResult = target->walk([&](scf::ForallOp forallOp) {
    if (forallOp->getParentOfType<scf::ForallOp>())
      return WalkResult::advance();
    if (topLevelForallOp)
      return WalkResult::interrupt();
    topLevelForallOp = forallOp;
    return WalkResult::advance();
  });

  if (walkResult.wasInterrupted() || !topLevelForallOp)
    return transformOp.emitSilenceableError()
           << "could not find a unique topLevel scf.forall";
  return DiagnosedSilenceableFailure::success();
}
  LaunchOp gpuLaunch = dyn_cast<LaunchOp>(target);
  auto transformOp = cast<TransformOpInterface>(getOperation());

  if (!getGenerateGpuLaunch() && !gpuLaunch) {
        emitSilenceableError()
        << "Given target is not gpu.launch, set `generate_gpu_launch` "
    diag.attachNote(target->getLoc()) << "when applied to this payload op";

  scf::ForallOp topLevelForallOp;
      target, topLevelForallOp, transformOp);
  if (!diag.succeeded()) {
    diag.attachNote(target->getLoc()) << "when applied to this payload op";
  assert(topLevelForallOp && "expect an scf.forall");

  if (!getGenerateGpuLaunch() && gridDims.size() != 3)
    return transformOp.emitDefiniteFailure("transform requires size-3 mapping");

  if (getGenerateGpuLaunch()) {
    if (!diag.succeeded())
    rewriter.eraseOp(topLevelForallOp);
    topLevelForallOp = cast<scf::ForallOp>(newForallOp);

  bool useLinearMapping = false;
  if (topLevelForallOp.getMapping())
    useLinearMapping = topLevelForallOp.usesLinearMapping();

  FailureOr<DeviceMaskingAttrInterface> maybeMaskingAttr =
      topLevelForallOp.getDeviceMaskingAttr();
  assert(succeeded(maybeMaskingAttr) && "unexpected failed maybeMaskingAttr");
  assert((!*maybeMaskingAttr || useLinearMapping) &&
         "masking requires linear mapping");

      rewriter, transformOp, topLevelForallOp, gridDims, gpuBlockIdBuilder);
  if (!diag.succeeded())
      cast<TransformOpInterface>(getOperation()), gridDims[0],
      gridDims[1], gridDims[2]);
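// Illustrative, standalone sketch (not part of this file): the block/thread id
// builders used by the forall rewrites above, when in linear mapping mode,
// essentially delinearize a single linear processing-element id into one id
// per mapped loop given the basis of available ids; the real GpuIdBuilder also
// materializes these ids as IR and emits predication. The arithmetic alone:
#include <cassert>
#include <cstdint>
#include <vector>

namespace delinearize_sketch {
// Splits `linearId` into one index per entry of `basis`, innermost first.
inline std::vector<int64_t> delinearize(int64_t linearId,
                                        const std::vector<int64_t> &basis) {
  std::vector<int64_t> ids(basis.size());
  for (size_t d = 0; d < basis.size(); ++d) {
    ids[d] = linearId % basis[d];
    linearId /= basis[d];
  }
  return ids;
}

inline void checkDelinearize() {
  // Linear id 37 over a {8, 8, 2} basis: 37 = 5 + 8 * (4 + 8 * 0).
  std::vector<int64_t> ids = delinearize(37, {8, 8, 2});
  assert((ids == std::vector<int64_t>{5, 4, 0}));
}
} // namespace delinearize_sketch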
LogicalResult transform::MapForallToBlocks::verify() {
  if (!getGridDims().empty() && getGridDims().size() != 3) {
    return emitOpError() << "transform requires empty or size-3 grid_dims";
  }
  return success();
}
static DiagnosedSilenceableFailure checkMappingSpec(
    std::optional<TransformOpInterface> transformOp, scf::ForallOp forallOp,
    ArrayRef<int64_t> numParallelIterations, ArrayRef<int64_t> blockOrGridSizes,
    int factor, bool useLinearMapping = false) {
  if (!useLinearMapping && blockOrGridSizes.front() % factor != 0) {
    return definiteFailureHelper(
        transformOp, forallOp,
        Twine("3-D mapping: size of threadIdx.x must be a multiple of ") +
            Twine(factor));
  }
  if (computeProduct(numParallelIterations) * factor >
      computeProduct(blockOrGridSizes)) {
    return definiteFailureHelper(
        transformOp, forallOp,
        Twine("the number of required parallel resources (blocks or "
              "threads) ") +
            Twine(computeProduct(numParallelIterations) * factor) +
            " overflows the number of available resources " +
            Twine(computeProduct(blockOrGridSizes)));
  }
  return DiagnosedSilenceableFailure::success();
}
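// Illustrative, standalone sketch (not part of this file): the overflow check
// above compares the product of the statically known trip counts, scaled by
// the warp or warpgroup factor, against the product of the available block or
// thread sizes. A hypothetical reduction of that check:
#include <cassert>
#include <cstdint>
#include <vector>

namespace resource_check_sketch {
inline int64_t product(const std::vector<int64_t> &sizes) {
  int64_t p = 1;
  for (int64_t s : sizes)
    p *= s;
  return p;
}

// True when `factor * prod(required)` parallel resources fit into the
// available basis, e.g. factor = warp size when mapping loops to warps.
inline bool fitsAvailableResources(const std::vector<int64_t> &required,
                                   const std::vector<int64_t> &available,
                                   int64_t factor) {
  return product(required) * factor <= product(available);
}

inline void checkExample() {
  // A 4x8 forall mapped to warps of size 32 needs 4 * 8 * 32 = 1024 threads
  // and fits exactly into a {32, 32, 1} thread block.
  assert(fitsAvailableResources({4, 8}, {32, 32, 1}, 32));
}
} // namespace resource_check_sketch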
  DeviceMappingAttrInterface mappingAttr =
      forallOp.getDeviceMappingAttrs().front();
  bool useLinearMapping = mappingAttr.isLinearMapping();

  auto numParallelIterations =
  if (!forallOp.isNormalized() || !numParallelIterations.has_value()) {
        transformOp, forallOp,
        "requires statically sized, normalized forall op");

  if (isa<GPUWarpgroupMappingAttr>(mappingAttr)) {
  } else if (isa<GPUWarpMappingAttr>(mappingAttr)) {
                                blockSizes, factor, useLinearMapping);
  if (!diag.succeeded())

  FailureOr<DeviceMaskingAttrInterface> maybeMaskingAttr =
      forallOp.getDeviceMaskingAttr();
  assert(succeeded(maybeMaskingAttr) && "unexpected failed maybeMaskingAttr");
  assert((!*maybeMaskingAttr || useLinearMapping) &&
         "masking requires linear mapping");

      .Case([&](GPUWarpgroupMappingAttr) {
      .Case([&](GPUWarpMappingAttr) {
      .Case([&](GPUThreadMappingAttr) {
      .Case([&](GPULaneMappingAttr) {
      .DefaultUnreachable("unknown mapping attribute");
    RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
    bool syncAfterDistribute) {
  if (!diag.succeeded())
      transformOp, forallOp, blockSizes, warpSize, gpuIdBuilder);
  if (!diag.succeeded())
      rewriter, transformOp, forallOp, blockSizes, rewriteResult, gpuIdBuilder);
  if (!diag.succeeded())
  if (syncAfterDistribute)
    BarrierOp::create(rewriter, loc);
    RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
    bool syncAfterDistribute) {
  LDBG() << "Start mapNestedForallToThreadsImpl";
  if (blockDims.size() != 3) {
        "requires size-3 thread mapping");
        rewriter, transformOp, forallOp, blockDims, warpSize,
        syncAfterDistribute);
    if (diag.isDefiniteFailure())
    if (diag.succeeded())
  LaunchOp gpuLaunch = dyn_cast<LaunchOp>(target);
  auto transformOp = cast<TransformOpInterface>(getOperation());

    return emitSilenceableError() << "Given target is not a gpu.launch";

      checkGpuLimits(transformOp, std::nullopt, std::nullopt, std::nullopt,
                     blockDims[0], blockDims[1], blockDims[2]);
  if (diag.isSilenceableFailure()) {
    diag.attachNote(getLoc()) << getBlockDimsAttrName() << " is too large";
      std::nullopt, std::nullopt, blockDims[0], blockDims[1],
      getWarpSize(), getSyncAfterDistribute());
  results.push_back(gpuLaunch.getOperation());
namespace {
class GPUTransformDialectExtension
    : public transform::TransformDialectExtension<
          GPUTransformDialectExtension> {
public:
  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(GPUTransformDialectExtension)

  GPUTransformDialectExtension() {
    declareGeneratedDialect<GPUDialect>();
    declareGeneratedDialect<amdgpu::AMDGPUDialect>();
    declareGeneratedDialect<arith::ArithDialect>();
    declareGeneratedDialect<scf::SCFDialect>();
    registerTransformOps<
#define GET_OP_LIST
#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.cpp.inc"
        >();
  }
};
} // namespace

#define GET_OP_CLASSES
#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.cpp.inc"
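// Typical use (hedged sketch, not part of this file): a tool that wants these
// transform ops available registers the extension into its DialectRegistry
// before creating an MLIRContext, through the registerTransformDialectExtension
// entry point declared in GPUTransformOps.h. The function name below is a
// hypothetical example.
#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h"
#include "mlir/IR/DialectRegistry.h"
#include "mlir/IR/MLIRContext.h"

static void setUpContextWithGpuTransforms() {
  mlir::DialectRegistry registry;
  mlir::gpu::registerTransformDialectExtension(registry);
  mlir::MLIRContext context(registry);
  // ... load and interpret transform scripts that use the GPU extension.
  (void)context;
}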