/// Checks that every attribute in the mapping is one of the expected mapping
/// attributes and that no attribute maps two loops to the same processor.
static DiagnosedSilenceableFailure checkAttributeType(
    ArrayRef<DeviceMappingAttrInterface> threadMappingAttributes,
    const std::optional<ArrayAttr> &foreachMapping,
    std::optional<TransformOpInterface> transformOp) {
  if (!foreachMapping.has_value())
    return transformOp->emitSilenceableError() << "mapping must be present";

  DenseSet<Attribute> seen;
  for (Attribute map : foreachMapping->getValue()) {
    if (!llvm::is_contained(threadMappingAttributes, map)) {
      return transformOp->emitDefiniteFailure()
             << "mapping must be one of " << threadMappingAttributes;
    }
    if (llvm::is_contained(seen, map)) {
      return transformOp->emitDefiniteFailure()
             << map
             << " is duplicated, cannot map different "
                "loops to the same processor";
    }
    seen.insert(map);
  }

  return DiagnosedSilenceableFailure::success();
}
static DiagnosedSilenceableFailure checkGpuLimits(
    TransformOpInterface transformOp, std::optional<int64_t> gridDimX,
    std::optional<int64_t> gridDimY, std::optional<int64_t> gridDimZ,
    std::optional<int64_t> blockDimX, std::optional<int64_t> blockDimY,
    std::optional<int64_t> blockDimZ) {
  static constexpr int maxTotalBlockdim = 1024;
  static constexpr int maxBlockdimx = 1024;
  static constexpr int maxBlockdimy = 1024;
  static constexpr int maxBlockdimz = 64;
  static constexpr int maxTotalGriddim = 2147483647;
  static constexpr int maxGriddimx = 2147483647;
  static constexpr int maxGriddimy = 65535;
  static constexpr int maxGriddimz = 65535;
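  // These bounds match CUDA's documented per-launch limits; dimensions that
  // were not provided default to 1 through value_or(1) below.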
  if ((blockDimX.value_or(1) * blockDimY.value_or(1) * blockDimZ.value_or(1)) >
          maxTotalBlockdim ||
      (gridDimX.value_or(1) * gridDimY.value_or(1) * gridDimZ.value_or(1)) >
          maxTotalGriddim ||
      blockDimX.value_or(1) > maxBlockdimx ||
      blockDimY.value_or(1) > maxBlockdimy ||
      blockDimZ.value_or(1) > maxBlockdimz ||
      gridDimY.value_or(1) > maxGriddimy ||
      gridDimZ.value_or(1) > maxGriddimz ||
      gridDimX.value_or(1) > maxGriddimx) {
    return transformOp.emitSilenceableError()
           << "Trying to launch a GPU kernel with gridDim = ("
           << gridDimX.value_or(1) << ", " << gridDimY.value_or(1) << ", "
           << gridDimZ.value_or(1) << ") blockDim = (" << blockDimX.value_or(1)
           << ", " << blockDimY.value_or(1) << ", " << blockDimZ.value_or(1)
           << "). It is larger than the limits.";
  }
  return DiagnosedSilenceableFailure::success();
}
static DiagnosedSilenceableFailure
createGpuLaunch(RewriterBase &rewriter, Location loc,
                TransformOpInterface transformOp, LaunchOp &launchOp,
                std::optional<int64_t> gridDimX = std::nullopt,
                std::optional<int64_t> gridDimY = std::nullopt,
                std::optional<int64_t> gridDimZ = std::nullopt,
                std::optional<int64_t> blockDimX = std::nullopt,
                std::optional<int64_t> blockDimY = std::nullopt,
                std::optional<int64_t> blockDimZ = std::nullopt) {
  DiagnosedSilenceableFailure diag =
      checkGpuLimits(transformOp, gridDimX, gridDimY, gridDimZ, blockDimX,
                     blockDimY, blockDimZ);
  if (!diag.succeeded())
    return diag;
  auto createConst = [&](int dim) {
    return rewriter.create<arith::ConstantIndexOp>(loc, dim);
  };
  OpBuilder::InsertionGuard guard(rewriter);
  Value one = createConst(1);
  Value gridSizeX = gridDimX.has_value() ? createConst(gridDimX.value()) : one;
  Value gridSizeY = gridDimY.has_value() ? createConst(gridDimY.value()) : one;
  Value gridSizeZ = gridDimZ.has_value() ? createConst(gridDimZ.value()) : one;
  Value blkSizeX = blockDimX.has_value() ? createConst(blockDimX.value()) : one;
  Value blkSizeY = blockDimY.has_value() ? createConst(blockDimY.value()) : one;
  Value blkSizeZ = blockDimZ.has_value() ? createConst(blockDimZ.value()) : one;
  launchOp = rewriter.create<LaunchOp>(loc, gridSizeX, gridSizeY, gridSizeZ,
                                       blkSizeX, blkSizeY, blkSizeZ);
  rewriter.setInsertionPointToEnd(&launchOp.getBody().front());
  rewriter.create<TerminatorOp>(loc);
  return DiagnosedSilenceableFailure::success();
}
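/// Alters the gridDim/blockDim operands of an existing gpu.launch op in place;
/// only the dimensions that are provided are overwritten.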
static DiagnosedSilenceableFailure
alterGpuLaunch(RewriterBase &rewriter, LaunchOp gpuLaunch,
               TransformOpInterface transformOp,
               std::optional<int64_t> gridDimX = std::nullopt,
               std::optional<int64_t> gridDimY = std::nullopt,
               std::optional<int64_t> gridDimZ = std::nullopt,
               std::optional<int64_t> blockDimX = std::nullopt,
               std::optional<int64_t> blockDimY = std::nullopt,
               std::optional<int64_t> blockDimZ = std::nullopt) {
  DiagnosedSilenceableFailure diag =
      checkGpuLimits(transformOp, gridDimX, gridDimY, gridDimZ, blockDimX,
                     blockDimY, blockDimZ);
  if (!diag.succeeded())
    return diag;
  KernelDim3 currentBlockdim = gpuLaunch.getBlockSizeOperandValues();
  OpBuilder::InsertionGuard guard(rewriter);
  rewriter.setInsertionPointAfterValue(currentBlockdim.x);
  auto createConstValue = [&](int dim) {
    return rewriter.create<arith::ConstantIndexOp>(currentBlockdim.x.getLoc(),
                                                   dim);
  };
  if (gridDimX.has_value())
    gpuLaunch.getGridSizeXMutable().assign(createConstValue(gridDimX.value()));
  if (gridDimY.has_value())
    gpuLaunch.getGridSizeYMutable().assign(createConstValue(gridDimY.value()));
  if (gridDimZ.has_value())
    gpuLaunch.getGridSizeZMutable().assign(createConstValue(gridDimZ.value()));
  if (blockDimX.has_value())
    gpuLaunch.getBlockSizeXMutable().assign(
        createConstValue(blockDimX.value()));
  if (blockDimY.has_value())
    gpuLaunch.getBlockSizeYMutable().assign(
        createConstValue(blockDimY.value()));
  if (blockDimZ.has_value())
    gpuLaunch.getBlockSizeZMutable().assign(
        createConstValue(blockDimZ.value()));
  return DiagnosedSilenceableFailure::success();
}
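/// Rewrites a top-level scf.foreach_thread that is mapped to blocks into
/// gpu.block_id based indexing. Illustrative input (attribute syntax assumed):
///
///   scf.foreach_thread (%i, %j) in (%c7, %c9) {
///     ...
///   } {mapping = [#gpu.block<x>, #gpu.block<y>]}
///
/// The induction variables %i and %j are replaced by gpu.block_id x and y,
/// the body is inlined before the loop, and the grid dimensions (7, 9, 1) are
/// reported back through `gridDims`.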
DiagnosedSilenceableFailure mlir::transform::gpu::mapForeachToBlocksImpl(
    RewriterBase &rewriter, scf::ForeachThreadOp foreachThreadOp,
    function_ref<void(RewriterBase &, scf::ForeachThreadOp,
                      SmallVectorImpl<Value> &)>
        blockIdGenerator,
    SmallVectorImpl<int64_t> &gridDims, TransformOpInterface transformOp,
    const ArrayRef<DeviceMappingAttrInterface> &mappingAttributes) {
  Location loc = foreachThreadOp->getLoc();

  if (foreachThreadOp.getNumResults() > 0)
    return transformOp.emitSilenceableError()
           << "only bufferized scf.foreach_thread lowers to "
              "gpu.block_id";
  if (foreachThreadOp.getNumThreads().size() > 3)
    return transformOp.emitSilenceableError()
           << "scf.foreach_thread with rank > 3 does not lower to "
              "gpu.block_id";
  if (llvm::any_of(foreachThreadOp.getNumThreads(), [](Value v) {
        return !v.getDefiningOp<arith::ConstantIndexOp>();
      })) {
    return transformOp.emitSilenceableError()
           << "unsupported dynamic griddim size";
  }
  SmallVector<Attribute> blockMapping =
      llvm::to_vector(foreachThreadOp.getMapping()->getValue());

  // Complete the blockMapping to a full x/y/z mapping, padding missing
  // dimensions with a constant 1 block count.
  SmallVector<Value> numBlocks =
      llvm::to_vector(foreachThreadOp.getNumThreads());
  Value one;
  for (auto attr : mappingAttributes) {
    if (std::find(blockMapping.begin(), blockMapping.end(), attr) ==
        blockMapping.end()) {
      blockMapping.push_back(attr);
      one = one ? one : rewriter.create<arith::ConstantIndexOp>(loc, 1);
      numBlocks.push_back(one);
    }
  }

  // Sort the block sizes by their DeviceMappingAttrInterface mapping id.
  auto comparator = [&](DeviceMappingAttrInterface a,
                        DeviceMappingAttrInterface b) -> bool {
    return a.getMappingId() < b.getMappingId();
  };
  SmallVector<Value> gridDimValues = scf::ForeachThreadOp::getValuesSortedByKey(
      blockMapping, numBlocks, comparator);
  for (Value v : gridDimValues)
    gridDims.push_back(v.getDefiningOp<arith::ConstantIndexOp>().value());

  // Generate the gpu.block_id ops and map each loop index to the block id
  // selected by its mapping attribute.
  SmallVector<Value> blockOps;
  blockIdGenerator(rewriter, foreachThreadOp, blockOps);
  IRMapping bvm;
  for (auto [blockIdx, blockDim] :
       llvm::zip(foreachThreadOp.getThreadIndices(), blockMapping)) {
    bvm.map(blockIdx,
            blockOps[static_cast<int64_t>(
                blockDim.cast<DeviceMappingAttrInterface>().getMappingId())]);
  }
  // Move the body of foreachThreadOp before the op itself. Erase the
  // terminator first, it is not used since we operate on buffers.
  rewriter.eraseOp(foreachThreadOp.getTerminator());
  Block *targetBlock = foreachThreadOp->getBlock();
  Block::iterator insertionPoint = Block::iterator(foreachThreadOp);
  Block &sourceBlock = foreachThreadOp.getRegion().front();
  targetBlock->getOperations().splice(insertionPoint,
                                      sourceBlock.getOperations());

  // RAUW the loop indices with the mapped block ids, then erase the old op.
  for (Value loopIndex : foreachThreadOp.getThreadIndices()) {
    Value blockIdx = bvm.lookup(loopIndex);
    rewriter.replaceAllUsesWith(loopIndex, blockIdx);
  }
  rewriter.eraseOp(foreachThreadOp);
  return DiagnosedSilenceableFailure::success();
}
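/// Walks `target` and reports the unique top-level scf.foreach_thread through
/// the output argument, failing if zero or several candidates are found.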
static DiagnosedSilenceableFailure
findTopLevelForeachThreadOp(Operation *target,
                            scf::ForeachThreadOp &topLevelForeachThreadOp,
                            TransformOpInterface transformOp) {
  auto walkResult = target->walk([&](scf::ForeachThreadOp foreachThreadOp) {
    if (foreachThreadOp->getParentOfType<scf::ForeachThreadOp>())
      return WalkResult::advance();
    if (topLevelForeachThreadOp)
      // A top-level scf.foreach_thread was already found, fail.
      return WalkResult::interrupt();
    topLevelForeachThreadOp = foreachThreadOp;
    return WalkResult::advance();
  });

  if (walkResult.wasInterrupted())
    return transformOp.emitSilenceableError()
           << "could not find a unique topLevel scf.foreach_thread";
  return DiagnosedSilenceableFailure::success();
}
static void generateGpuBlockIds(RewriterBase &rewriter,
                                scf::ForeachThreadOp foreachOp,
                                SmallVectorImpl<Value> &blockOps) {
  Location loc = foreachOp->getLoc();
  OpBuilder::InsertionGuard guard(rewriter);
  rewriter.setInsertionPoint(foreachOp);
  IndexType indexType = rewriter.getIndexType();
  blockOps = SmallVector<Value>{
      rewriter.create<BlockIdOp>(loc, indexType, Dimension::x),
      rewriter.create<BlockIdOp>(loc, indexType, Dimension::y),
      rewriter.create<BlockIdOp>(loc, indexType, Dimension::z)};
}
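/// Entry point of the MapForeachToBlocks transform op: maps the unique
/// top-level scf.foreach_thread of the target to GPU blocks, optionally
/// creating the surrounding gpu.launch first.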
DiagnosedSilenceableFailure
transform::MapForeachToBlocks::applyToOne(Operation *target,
                                          ApplyToEachResultList &results,
                                          transform::TransformState &state) {
  LaunchOp gpuLaunch = dyn_cast<LaunchOp>(target);
  auto transformOp = cast<TransformOpInterface>(getOperation());

  if (!getGenerateGpuLaunch() && !gpuLaunch) {
    DiagnosedSilenceableFailure diag =
        emitSilenceableError()
        << "Given target is not gpu.launch, set `generate_gpu_launch` "
           "attribute";
    diag.attachNote(target->getLoc()) << "when applied to this payload op";
    return diag;
  }
  scf::ForeachThreadOp topLevelForeachThreadOp;
  DiagnosedSilenceableFailure diag = findTopLevelForeachThreadOp(
      target, topLevelForeachThreadOp, transformOp);
  if (!diag.succeeded()) {
    diag.attachNote(target->getLoc()) << "when applied to this payload op";
    return diag;
  }
  SmallVector<int64_t> gridDim;
  IRRewriter rewriter(getContext());

  if (getGenerateGpuLaunch()) {
    // Create a surrounding gpu.launch (with unit sizes for now) and clone the
    // top-level scf.foreach_thread into its body.
    diag = createGpuLaunch(rewriter, target->getLoc(), transformOp, gpuLaunch);
    if (!diag.succeeded()) {
      return diag;
    }
    rewriter.setInsertionPointToStart(&gpuLaunch.getBody().front());
    Operation *newForeachThreadOp = rewriter.clone(*topLevelForeachThreadOp);
    rewriter.eraseOp(topLevelForeachThreadOp);
    topLevelForeachThreadOp = cast<scf::ForeachThreadOp>(newForeachThreadOp);
  }
  SmallVector<DeviceMappingAttrInterface> blockMappingAttributes = {
      GPUBlockMappingAttr::get(getContext(), Blocks::DimX),
      GPUBlockMappingAttr::get(getContext(), Blocks::DimY),
      GPUBlockMappingAttr::get(getContext(), Blocks::DimZ)};

  diag = checkAttributeType(blockMappingAttributes,
                            topLevelForeachThreadOp.getMapping(), transformOp);
  if (diag.succeeded())
    diag = mlir::transform::gpu::mapForeachToBlocksImpl(
        rewriter, topLevelForeachThreadOp, generateGpuBlockIds, gridDim,
        transformOp, blockMappingAttributes);

  if (diag.succeeded()) {
    diag = alterGpuLaunch(rewriter, gpuLaunch,
                          cast<TransformOpInterface>(getOperation()),
                          gridDim[0], gridDim[1], gridDim[2]);
  }

  results.push_back(gpuLaunch.getOperation());
  return diag;
}
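/// Rewrites one thread-mapped scf.foreach_thread into gpu.thread_id based
/// indexing, guarding the body with an scf.if when the loop extent is smaller
/// than the launch blockDim and optionally inserting a trailing gpu.barrier.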
static DiagnosedSilenceableFailure rewriteOneForeachThreadToGpuThreads(
    RewriterBase &rewriter, scf::ForeachThreadOp foreachThreadOp,
    const SmallVectorImpl<int64_t> &globalBlockDims, bool syncAfterDistribute,
    std::optional<TransformOpInterface> transformOp,
    const ArrayRef<DeviceMappingAttrInterface> &threadMappingAttributes) {
  // Emit a silenceable error through the transform op when one is provided,
  // and a definite failure on the loop otherwise.
  auto failureHelper =
      [&](const Twine &message) -> DiagnosedSilenceableFailure {
    if (transformOp.has_value()) {
      return transformOp->emitSilenceableError() << message;
    }
    return emitDefiniteFailure(foreachThreadOp, message);
  };

  Location loc = foreachThreadOp->getLoc();
  if (foreachThreadOp.getNumResults() > 0)
    return failureHelper(
        "only bufferized scf.foreach_thread lowers to gpu.thread_id");
  if (foreachThreadOp.getNumThreads().size() > 3)
    return failureHelper(
        "scf.foreach_thread with rank > 3 does not lower to gpu.thread_id");
  if (llvm::any_of(foreachThreadOp.getNumThreads(), [](Value v) {
        return !v.getDefiningOp<arith::ConstantIndexOp>();
      })) {
    return failureHelper("unsupported dynamic blockdim size");
  }
  if (!foreachThreadOp.getMapping().has_value())
    return failureHelper("mapping must be present");
  SmallVector<Attribute> threadMapping =
      llvm::to_vector(foreachThreadOp.getMapping()->getValue());

  // Complete the threadMapping to a full x/y/z mapping, padding missing
  // dimensions with a constant 1 thread count.
  SmallVector<Value> numThreads =
      llvm::to_vector(foreachThreadOp.getNumThreads());
  Value one;
  for (auto attr : threadMappingAttributes) {
    if (std::find(threadMapping.begin(), threadMapping.end(), attr) ==
        threadMapping.end()) {
      threadMapping.push_back(attr);
      one = one ? one : rewriter.create<arith::ConstantIndexOp>(loc, 1);
      numThreads.push_back(one);
    }
  }
  // Sort the thread counts by their DeviceMappingAttrInterface mapping id.
  auto comparator = [&](DeviceMappingAttrInterface a,
                        DeviceMappingAttrInterface b) -> bool {
    return a.getMappingId() < b.getMappingId();
  };
  SmallVector<Value> blockDimValues =
      scf::ForeachThreadOp::getValuesSortedByKey(threadMapping, numThreads,
                                                 comparator);
  SmallVector<int64_t> blockDims =
      llvm::to_vector(llvm::map_range(blockDimValues, [](Value v) {
        return v.getDefiningOp<arith::ConstantIndexOp>().value();
      }));
  // Create the gpu.thread_id ops that will replace the loop indices.
  IndexType indexType = rewriter.getIndexType();
  SmallVector<Value> threadOps{
      rewriter.create<ThreadIdOp>(loc, indexType, Dimension::x),
      rewriter.create<ThreadIdOp>(loc, indexType, Dimension::y),
      rewriter.create<ThreadIdOp>(loc, indexType, Dimension::z)};
  // A launch dimension of 1 implies the corresponding thread id is always 0.
  Value zero = rewriter.create<arith::ConstantIndexOp>(loc, 0);
  for (size_t i : llvm::seq(size_t(0), globalBlockDims.size())) {
    if (globalBlockDims[i] == 1)
      threadOps[i] = zero;
  }
  // Map each loop index to the thread id selected by its mapping attribute.
  IRMapping bvm;
  for (auto [blockIdx, blockDim] :
       llvm::zip(foreachThreadOp.getThreadIndices(), threadMapping)) {
    bvm.map(blockIdx,
            threadOps[blockDim.cast<DeviceMappingAttrInterface>()
                          .getMappingId()]);
  }
  // Build the predicate that deactivates the threads that fall outside of the
  // loop extent when the loop is smaller than the launch blockDim.
  Value predicate;
  for (auto [threadId, blockDim, globalBlockDim] :
       llvm::zip(threadOps, blockDims, globalBlockDims)) {
    if (blockDim > globalBlockDim) {
      return failureHelper(
          "The requested GPU threads are fewer than the number of loop trip "
          "counts. Try to tile scf.foreach_thread before mapping or set "
          "small blockDim.");
    }
    if (blockDim == globalBlockDim)
      continue;
    Value blockIdx = rewriter.create<arith::ConstantIndexOp>(loc, blockDim);
    Value tmpPredicate = rewriter.create<arith::CmpIOp>(
        loc, arith::CmpIPredicate::ult, threadId, blockIdx);
    predicate = predicate ? rewriter.create<arith::AndIOp>(loc, predicate,
                                                           tmpPredicate)
                          : tmpPredicate;
  }

  // Move the body of foreachThreadOp. Erase the terminator first, it will not
  // be used since we are on buffers.
  rewriter.eraseOp(foreachThreadOp.getTerminator());
  Block *targetBlock;
  Block::iterator insertionPoint;
  if (predicate) {
    // If predicated, move the body at the beginning of the scf.if block.
    auto ifOp = rewriter.create<scf::IfOp>(loc, predicate,
                                           /*withElseRegion=*/false);
    targetBlock = ifOp.thenBlock();
    insertionPoint = ifOp.thenBlock()->begin();
  } else {
    targetBlock = foreachThreadOp->getBlock();
    insertionPoint = Block::iterator(foreachThreadOp);
  }
  Block &sourceBlock = foreachThreadOp.getRegion().front();
  targetBlock->getOperations().splice(insertionPoint,
                                      sourceBlock.getOperations());

  // RAUW the loop indices with the mapped thread ids.
  for (Value loopIndex : foreachThreadOp.getThreadIndices()) {
    Value threadIdx = bvm.lookup(loopIndex);
    rewriter.replaceAllUsesWith(loopIndex, threadIdx);
  }

  if (syncAfterDistribute)
    rewriter.create<BarrierOp>(loc);

  rewriter.eraseOp(foreachThreadOp);
  return DiagnosedSilenceableFailure::success();
}
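/// Walks all scf.foreach_thread ops nested under `target` and rewrites each
/// one that uses the given thread mapping attributes to gpu.thread_id based
/// code.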
DiagnosedSilenceableFailure mlir::transform::gpu::mapNestedForeachToThreadsImpl(
    RewriterBase &rewriter, Operation *target,
    const SmallVectorImpl<int64_t> &blockDim, bool syncAfterDistribute,
    std::optional<TransformOpInterface> transformOp,
    const ArrayRef<DeviceMappingAttrInterface> &threadMappingAttributes) {
  DiagnosedSilenceableFailure diag = DiagnosedSilenceableFailure::success();
  target->walk([&](scf::ForeachThreadOp foreachThreadOp) {
    diag = checkAttributeType(threadMappingAttributes,
                              foreachThreadOp.getMapping(), transformOp);
    if (diag.succeeded()) {
      rewriter.setInsertionPoint(foreachThreadOp);
      diag = rewriteOneForeachThreadToGpuThreads(
          rewriter, foreachThreadOp, blockDim, syncAfterDistribute, transformOp,
          threadMappingAttributes);
    }
  });
  return diag;
}
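/// Entry point of the MapNestedForeachToThreads transform op: distributes the
/// nested scf.foreach_thread ops of a gpu.launch onto threads and updates the
/// launch blockDim accordingly.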
DiagnosedSilenceableFailure transform::MapNestedForeachToThreads::applyToOne(
    Operation *target, ApplyToEachResultList &results,
    transform::TransformState &state) {
  LaunchOp gpuLaunch = dyn_cast<LaunchOp>(target);
  auto transformOp = cast<TransformOpInterface>(getOperation());

  if (!gpuLaunch)
    return emitSilenceableError() << "Given target is not gpu.launch";

  SmallVector<int64_t> blockDim = extractFromI64ArrayAttr(getBlockDim());
  blockDim.resize(/*size=*/3, /*value=*/1);

  DiagnosedSilenceableFailure diag =
      checkGpuLimits(transformOp, std::nullopt, std::nullopt, std::nullopt,
                     blockDim[0], blockDim[1], blockDim[2]);
  if (diag.isSilenceableFailure()) {
    diag.attachNote(getLoc()) << getBlockDimAttrName() << " is very large";
    return diag;
  }
  MLIRContext *ctx = getContext();
  IRRewriter rewriter(ctx);
  SmallVector<DeviceMappingAttrInterface> threadMappingAttributes = {
      GPUThreadMappingAttr::get(ctx, Threads::DimX),
      GPUThreadMappingAttr::get(ctx, Threads::DimY),
      GPUThreadMappingAttr::get(ctx, Threads::DimZ)};

  diag = mlir::transform::gpu::mapNestedForeachToThreadsImpl(
      rewriter, target, blockDim, getSyncAfterDistribute(), transformOp,
      threadMappingAttributes);

  if (diag.succeeded()) {
    diag = alterGpuLaunch(rewriter, gpuLaunch, transformOp, std::nullopt,
                          std::nullopt, std::nullopt, blockDim[0], blockDim[1],
                          blockDim[2]);
  }

  results.push_back(gpuLaunch.getOperation());
  return diag;
}
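/// Transform dialect extension that registers the GPU mapping transform ops
/// and declares the dialects they may generate.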
namespace {
class GPUTransformDialectExtension
    : public transform::TransformDialectExtension<
          GPUTransformDialectExtension> {
public:
  GPUTransformDialectExtension() {
    declareDependentDialect<pdl::PDLDialect>();
    declareGeneratedDialect<scf::SCFDialect>();
    declareGeneratedDialect<arith::ArithDialect>();
    declareGeneratedDialect<GPUDialect>();
    registerTransformOps<
#define GET_OP_LIST
#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.cpp.inc"
        >();
  }
};
} // namespace
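// Registration entry point; the enclosing namespace is assumed to match the
// declaration in GPUTransformOps.h.
void mlir::gpu::registerTransformDialectExtension(DialectRegistry &registry) {
  registry.addExtensions<GPUTransformDialectExtension>();
}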
#define GET_OP_CLASSES
#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.cpp.inc"