doxygen/LowerGpuOpsToROCDLOps%5F8cpp%5Fsource.html

 //===- LowerGpuOpsToROCDLOps.cpp - MLIR GPU to ROCDL lowering passes ------===//

 //

 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

 // See https://llvm.org/LICENSE.txt for license information.

 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

 //

 //===----------------------------------------------------------------------===//

 //

 // This file implements a pass to generate ROCDLIR operations for higher-level

 // GPU operations.

 //

 //===----------------------------------------------------------------------===//


 #include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h"

 #include "mlir/Dialect/Arith/Transforms/Passes.h"

 #include "mlir/Pass/Pass.h"

 #include "mlir/Pass/PassManager.h"

 #include "mlir/Transforms/Passes.h"


 #include "mlir/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.h"

 #include "mlir/Conversion/ConvertToLLVM/ToLLVMInterface.h"

 #include "mlir/Conversion/ConvertToLLVM/ToLLVMPass.h"

 #include "mlir/Conversion/GPUCommon/GPUCommonPass.h"

 #include "mlir/Conversion/LLVMCommon/ConversionTarget.h"

 #include "mlir/Conversion/LLVMCommon/LoweringOptions.h"

 #include "mlir/Conversion/LLVMCommon/Pattern.h"

 #include "mlir/Conversion/LLVMCommon/TypeConverter.h"

 #include "mlir/Conversion/MathToLLVM/MathToLLVM.h"

 #include "mlir/Conversion/MathToROCDL/MathToROCDL.h"

 #include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"

 #include "mlir/Dialect/ControlFlow/IR/ControlFlow.h"

 #include "mlir/Dialect/Func/IR/FuncOps.h"

 #include "mlir/Dialect/GPU/IR/GPUDialect.h"

 #include "mlir/Dialect/GPU/Transforms/Passes.h"

 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"

 #include "mlir/Dialect/LLVMIR/ROCDLDialect.h"

 #include "mlir/Dialect/Math/IR/Math.h"

 #include "mlir/Dialect/MemRef/IR/MemRef.h"

 #include "mlir/Dialect/Vector/IR/VectorOps.h"

 #include "mlir/IR/BuiltinAttributes.h"

 #include "mlir/Pass/Pass.h"

 #include "mlir/Transforms/DialectConversion.h"

 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"

 #include "llvm/Support/FormatVariadic.h"


 #include "../GPUCommon/GPUOpsLowering.h"

 #include "../GPUCommon/IndexIntrinsicsOpLowering.h"


 namespace mlir {

 #define GEN_PASS_DEF_CONVERTGPUOPSTOROCDLOPS

 #include "mlir/Conversion/Passes.h.inc"

 } // namespace mlir


 using namespace mlir;


 /// Adapts the integer-typed `value` to the index bitwidth configured on
 /// `converter`.
 ///
 /// Emits an `LLVM::SExtOp` when the index type is wider than `value`, an
 /// `LLVM::TruncOp` when it is narrower, and hands `value` back untouched when
 /// both widths already agree. `value` must have an `IntegerType` (enforced by
 /// the `cast`).
 static Value truncOrExtToLLVMType(ConversionPatternRewriter &rewriter,
                                   Location loc, Value value,
                                   const LLVMTypeConverter &converter) {
   const unsigned sourceWidth = cast<IntegerType>(value.getType()).getWidth();
   const unsigned targetWidth = converter.getIndexTypeBitwidth();
   IntegerType targetType =
       IntegerType::get(rewriter.getContext(), targetWidth);

   // Widths already match: no conversion op is needed.
   if (targetWidth == sourceWidth)
     return value;

   // TODO: use <=> in C++20.
   if (targetWidth > sourceWidth)
     return rewriter.create<LLVM::SExtOp>(loc, targetType, value);
   return rewriter.create<LLVM::TruncOp>(loc, targetType, value);
 }


 /// Reports whether `func` may be invoked through the bare pointer calling
 /// convention: every argument whose type is a `BaseMemRefType` must satisfy
 /// `LLVMTypeConverter::canConvertToBarePtr`. Arguments of any other type never
 /// disqualify the function.
 static bool canBeCalledWithBarePointers(gpu::GPUFuncOp func) {
   for (Type argType : func.getArgumentTypes()) {
     auto memrefType = dyn_cast<BaseMemRefType>(argType);
     // One non-convertible memref argument is enough to reject the function.
     if (memrefType && !LLVMTypeConverter::canConvertToBarePtr(memrefType))
       return false;
   }
   return true;
 }


 /// Emits the ops that compute the current lane id and returns it as an i32
 /// value: a `ROCDL::MbcntLoOp` on (-1, 0) whose result is fed, together with
 /// -1, into a `ROCDL::MbcntHiOp`.
 ///
 /// `indexBitwidth` is not read by this function; the result is always 32 bits
 /// wide and callers are responsible for any widening or narrowing.
 static Value getLaneId(ConversionPatternRewriter &rewriter, Location loc,
                        const unsigned indexBitwidth) {
   Type i32 = rewriter.getI32Type();
   Value zeroConst = rewriter.create<arith::ConstantIntOp>(loc, 0, 32);
   Value allOnes = rewriter.create<arith::ConstantIntOp>(loc, -1, 32);
   Value lowCount = rewriter.create<ROCDL::MbcntLoOp>(
       loc, i32, ValueRange{allOnes, zeroConst});
   return rewriter.create<ROCDL::MbcntHiOp>(loc, i32,
                                            ValueRange{allOnes, lowCount});
 }

 /// Default LLVM data layout string used by the lowering pass in this file.
 /// When the module being lowered carries no LLVM data layout attribute, the
 /// pass installs this string as that attribute and builds the lowering
 /// options' `llvm::DataLayout` from it.
 static constexpr StringLiteral amdgcnDataLayout =

     "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"

     "-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:"

     "32-v32:"

     "32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:"

     "64-S32-A5-G1-ni:7:8:9";


 namespace {

 /// Conversion pattern that replaces `gpu::LaneIdOp` with the ROCDL mbcnt
 /// sequence, resized to the index bitwidth of the type converter.
 ///
 /// The lane id computation and the width adjustment are delegated to the
 /// file-local helpers `getLaneId` and `truncOrExtToLLVMType` so that this
 /// pattern and the shuffle lowering share a single implementation.
 struct GPULaneIdOpToROCDL : ConvertOpToLLVMPattern<gpu::LaneIdOp> {
   using ConvertOpToLLVMPattern<gpu::LaneIdOp>::ConvertOpToLLVMPattern;

   /// Always succeeds: emits the lane id ops and replaces `op` with the
   /// (possibly extended or truncated) result.
   LogicalResult
   matchAndRewrite(gpu::LaneIdOp op, gpu::LaneIdOp::Adaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
     Location loc = op->getLoc();
     const LLVMTypeConverter &converter = *getTypeConverter();

     // convert to:  %mlo = call @llvm.amdgcn.mbcnt.lo(-1, 0)
     // followed by: %lid = call @llvm.amdgcn.mbcnt.hi(-1, %mlo)
     Value laneId = getLaneId(rewriter, loc, converter.getIndexTypeBitwidth());

     // Truncate or extend the result depending on the index bitwidth specified
     // by the LLVMTypeConverter options.
     laneId = truncOrExtToLLVMType(rewriter, loc, laneId, converter);

     rewriter.replaceOp(op, {laneId});
     return success();
   }
 };


 /// Conversion pattern that replaces `gpu::SubgroupSizeOp` with a
 /// `ROCDL::WavefrontSizeOp` producing an i32, then resizes that value to the
 /// converter's index bitwidth.
 ///
 /// When the source op carries an upper-bound attribute, the new op is given a
 /// constant range whose lower end is 64 for chipsets with major version below
 /// 10 and 32 otherwise, and whose upper end is the attribute value plus one.
 /// Without the attribute, a null range is passed.
 struct GPUSubgroupSizeOpToROCDL : ConvertOpToLLVMPattern<gpu::SubgroupSizeOp> {
   using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;

   /// Builds the pattern for the given target `chipset`.
   GPUSubgroupSizeOpToROCDL(const LLVMTypeConverter &converter,
                            amdgpu::Chipset chipset)
       : ConvertOpToLLVMPattern<gpu::SubgroupSizeOp>(converter),
         chipset(chipset) {}

   LogicalResult
   matchAndRewrite(gpu::SubgroupSizeOp op, gpu::SubgroupSizeOp::Adaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
     Location loc = op.getLoc();

     // Only attach a range when the source op states an upper bound.
     LLVM::ConstantRangeAttr range = nullptr;
     if (auto upperBound = op.getUpperBoundAttr()) {
       const int smallestWaveSize = chipset.majorVersion < 10 ? 64 : 32;
       range = rewriter.getAttr<LLVM::ConstantRangeAttr>(
           /*bitWidth=*/32, /*lower=*/smallestWaveSize,
           /*upper=*/upperBound.getInt() + 1);
     }

     Value waveSize = rewriter.create<ROCDL::WavefrontSizeOp>(
         loc, rewriter.getI32Type(), range);
     Value resized =
         truncOrExtToLLVMType(rewriter, loc, waveSize, *getTypeConverter());
     rewriter.replaceOp(op, {resized});
     return success();
   }

   /// Target chipset; its major version selects the lower end of the range.
   const amdgpu::Chipset chipset;
 };


 /// Conversion pattern that replaces `gpu::ShuffleOp` with LLVM-dialect
 /// arithmetic plus one `ROCDL::DsBpermuteOp` per 32-bit piece of the shuffled
 /// value. The op is replaced by two values: the shuffled value and the
 /// "source lane is active" predicate.
 struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {

   using ConvertOpToLLVMPattern<gpu::ShuffleOp>::ConvertOpToLLVMPattern;


   /// Lowers a shuffle to the corresponding ROCDL ops.

   ///

   /// Use the `width` argument to see if src lane is participating.

   /// If not the dstLane would be itself.

   ///

   ///  Shuffle with DS Bpermute:

   ///   let shflMode = [xor, up, down, idx]

   ///   let width = 32(usually warpsize), step = [1, 2, 4, 8, 16, ... , width].

   ///   1. curLaneId = using mbcnt.lo + mbcnt.hi

   ///   2. widthOrZeroIfOutside = (curLaneId + width) & -width

   ///   3. dstLane = shflMode(curLaneId, step)

   ///   4. isActiveSrcLane = dstLane < widthOrZeroIfOutside

   ///   5. dstLane = isActiveSrcLane ? dstLane : curLaneId

   ///   6. dwordAlignedDstLane = dstLane * 4 or dstLane << 2.

   ///   7. bpermute(dwordAlignedDstLane, shfl_value).

   ///

   LogicalResult

   matchAndRewrite(gpu::ShuffleOp op, OpAdaptor adaptor,

                   ConversionPatternRewriter &rewriter) const override {

     Location loc = op->getLoc();

     Value initShflValue = adaptor.getValue();


     // Step 1: current lane id as an i32 value.
     const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth();

     Value srcLaneId = getLaneId(rewriter, loc, indexBitwidth);


     // Step 2: (srcLaneId + width) & -width, with -width built as 0 - width.
     auto int32Type = IntegerType::get(rewriter.getContext(), 32);

     Value width = adaptor.getWidth();

     Value zero = rewriter.create<LLVM::ConstantOp>(loc, int32Type, 0);

     Value negwidth = rewriter.create<LLVM::SubOp>(loc, int32Type, zero, width);

     Value add = rewriter.create<LLVM::AddOp>(loc, int32Type, srcLaneId, width);

     Value widthOrZeroIfOutside =

         rewriter.create<LLVM::AndOp>(loc, int32Type, add, negwidth);

     Value dstLane;


     // Step 3: combine the lane id with the offset according to the shuffle
     // mode; IDX uses the offset operand directly as the lane.
     switch (op.getMode()) {

     case gpu::ShuffleMode::UP:

       dstLane = rewriter.create<LLVM::SubOp>(loc, int32Type, srcLaneId,

                                              adaptor.getOffset());

       break;

     case gpu::ShuffleMode::DOWN:

       dstLane = rewriter.create<LLVM::AddOp>(loc, int32Type, srcLaneId,

                                              adaptor.getOffset());

       break;

     case gpu::ShuffleMode::XOR:

       dstLane = rewriter.create<LLVM::XOrOp>(loc, int32Type, srcLaneId,

                                              adaptor.getOffset());

       break;

     case gpu::ShuffleMode::IDX:

       dstLane = adaptor.getOffset();

       break;

     }

     // Steps 4-5: signed compare against the bound from step 2; when the
     // compare fails, fall back to the current lane.
     Value isActiveSrcLane = rewriter.create<LLVM::ICmpOp>(

         loc, LLVM::ICmpPredicate::slt, dstLane, widthOrZeroIfOutside);

     Value selectDstLane = rewriter.create<LLVM::SelectOp>(loc, isActiveSrcLane,

                                                           dstLane, srcLaneId);

     // Step 6: shift left by two, i.e. multiply the selected lane by 4.
     Value two = rewriter.create<LLVM::ConstantOp>(loc, int32Type, 2);

     Value dwordAlignedDstLane =

         rewriter.create<LLVM::ShlOp>(loc, int32Type, selectDstLane, two);


     // Step 7: split the value into i32 pieces, permute each piece with the
     // same lane operand, then reassemble a value of the original type.
     SmallVector<Value> decomposed =

         LLVM::decomposeValue(rewriter, loc, initShflValue, int32Type);

     SmallVector<Value> swizzled;

     for (Value v : decomposed) {

       Value res = rewriter.create<ROCDL::DsBpermuteOp>(loc, int32Type,

                                                        dwordAlignedDstLane, v);

       swizzled.emplace_back(res);

     }

     Value shflValue =

         LLVM::composeValue(rewriter, loc, swizzled, initShflValue.getType());

     // Two replacement values: the shuffled value and the activity predicate.
     rewriter.replaceOp(op, {shflValue, isActiveSrcLane});

     return success();

   }

 };


 /// Import the GPU Ops to ROCDL Patterns.

 #include "GPUToROCDL.cpp.inc"


 // A pass that replaces all occurrences of GPU device operations with their
 // corresponding ROCDL equivalent.
 //
 // This pass only handles device code and is not meant to be run on GPU host
 // code.
 struct LowerGpuOpsToROCDLOpsPass final
     : public impl::ConvertGpuOpsToROCDLOpsBase<LowerGpuOpsToROCDLOpsPass> {
   LowerGpuOpsToROCDLOpsPass() = default;

   /// Programmatic constructor. Each argument is copied into the pass option of
   /// the same name, but only when that option reports zero occurrences.
   LowerGpuOpsToROCDLOpsPass(const std::string &chipset, unsigned indexBitwidth,
                             bool useBarePtrCallConv,
                             gpu::amd::Runtime runtime) {
     if (this->chipset.getNumOccurrences() == 0)
       this->chipset = chipset;
     if (this->indexBitwidth.getNumOccurrences() == 0)
       this->indexBitwidth = indexBitwidth;
     if (this->useBarePtrCallConv.getNumOccurrences() == 0)
       this->useBarePtrCallConv = useBarePtrCallConv;
     if (this->runtime.getNumOccurrences() == 0)
       this->runtime = runtime;
   }

   /// Registers the base class's dependent dialects plus the extension that
   /// loads the dialects needed for conversion to LLVM.
   void getDependentDialects(DialectRegistry &registry) const override {
     Base::getDependentDialects(registry);
     registerConvertToLLVMDependentDialectLoading(registry);
   }

   /// Lowers the `gpu.module` this pass runs on. In order: ensures a data
   /// layout attribute, validates the chipset and the bare-pointer option,
   /// applies greedy in-dialect rewrites, runs the partial conversion to
   /// LLVM/ROCDL, and finally derives the flat work group size attribute from
   /// the required work group size on each lowered function. Every failure
   /// path signals pass failure and returns immediately.
   void runOnOperation() override {
     gpu::GPUModuleOp m = getOperation();
     MLIRContext *ctx = m.getContext();

     // Install the default data layout when the module does not specify one.
     auto llvmDataLayout = m->getAttrOfType<StringAttr>(
         LLVM::LLVMDialect::getDataLayoutAttrName());
     if (!llvmDataLayout) {
       llvmDataLayout = StringAttr::get(ctx, amdgcnDataLayout);
       m->setAttr(LLVM::LLVMDialect::getDataLayoutAttrName(), llvmDataLayout);
     }
     // Request C wrapper emission.
     for (auto func : m.getOps<func::FuncOp>()) {
       func->setAttr(LLVM::LLVMDialect::getEmitCWrapperAttrName(),
                     UnitAttr::get(ctx));
     }

     FailureOr<amdgpu::Chipset> maybeChipset = amdgpu::Chipset::parse(chipset);
     if (failed(maybeChipset)) {
       emitError(UnknownLoc::get(ctx), "Invalid chipset name: " + chipset);
       return signalPassFailure();
     }

     /// Customize the bitwidth used for the device side index computations.
     LowerToLLVMOptions options(
         ctx, DataLayout(cast<DataLayoutOpInterface>(m.getOperation())));
     options.dataLayout = llvm::DataLayout(llvmDataLayout.getValue());
     if (indexBitwidth != kDeriveIndexBitwidthFromDataLayout)
       options.overrideIndexBitwidth(indexBitwidth);

     if (useBarePtrCallConv) {
       options.useBarePtrCallConv = true;
       // The walk is interrupted by the first gpu.func that cannot use bare
       // pointers.
       WalkResult canUseBarePointers =
           m.walk([](gpu::GPUFuncOp func) -> WalkResult {
             if (canBeCalledWithBarePointers(func))
               return WalkResult::advance();
             return WalkResult::interrupt();
           });
       if (canUseBarePointers.wasInterrupted()) {
         emitError(UnknownLoc::get(ctx),
                   "bare pointer calling convention requires all memrefs to "
                   "have static shape and use the identity map");
         return signalPassFailure();
       }
     }

     // Apply in-dialect lowering. In-dialect lowering will replace
     // ops which need to be lowered further, which is not supported by a
     // single conversion pass.
     {
       RewritePatternSet patterns(ctx);
       populateGpuRewritePatterns(patterns);
       populateGpuPromoteShuffleToAMDGPUPatterns(patterns);
       (void)applyPatternsGreedily(m, std::move(patterns));
     }

     // Map GPU address spaces to the numeric address spaces used below.
     LLVMTypeConverter converter(ctx, options);
     populateGpuMemorySpaceAttributeConversions(
         converter, [](gpu::AddressSpace space) {
           switch (space) {
           case gpu::AddressSpace::Global:
             return 1;
           case gpu::AddressSpace::Workgroup:
             return 3;
           case gpu::AddressSpace::Private:
             return 5;
           }
           llvm_unreachable("unknown address space enum value");
           return 0;
         });

     RewritePatternSet llvmPatterns(ctx);
     LLVMConversionTarget target(getContext());

     // Collect conversion patterns from every loaded dialect that implements
     // the conversion interface, restricted to `allowedDialects` when that
     // option is non-empty.
     llvm::SmallDenseSet<StringRef> allowedDialectsSet(allowedDialects.begin(),
                                                       allowedDialects.end());
     for (Dialect *dialect : ctx->getLoadedDialects()) {
       bool allowed = allowedDialectsSet.contains(dialect->getNamespace());
       // Empty `allowedDialectsSet` means all dialects are allowed.
       if (!allowedDialectsSet.empty() && !allowed)
         continue;

       auto iface = dyn_cast<ConvertToLLVMPatternInterface>(dialect);
       if (!iface) {
         // Error out if dialect was explicitly specified but doesn't implement
         // conversion interface.
         if (allowed) {
           m.emitError()
               << "dialect does not implement ConvertToLLVMPatternInterface: "
               << dialect->getNamespace();
           return signalPassFailure();
         }
         continue;
       }

       iface->populateConvertToLLVMConversionPatterns(target, converter,
                                                      llvmPatterns);
     }

     populateAMDGPUToROCDLConversionPatterns(converter, llvmPatterns,
                                             *maybeChipset);
     populateGpuToROCDLConversionPatterns(converter, llvmPatterns, runtime,
                                          *maybeChipset);
     configureGpuToROCDLConversionLegality(target);
     // Stop here on failure: the attribute rewrite below is only meaningful
     // for a successfully converted module.
     if (failed(applyPartialConversion(m, target, std::move(llvmPatterns))))
       return signalPassFailure();

     auto *rocdlDialect = getContext().getLoadedDialect<ROCDL::ROCDLDialect>();
     auto reqdWorkGroupSizeAttrHelper =
         rocdlDialect->getReqdWorkGroupSizeAttrHelper();
     auto flatWorkGroupSizeAttrHelper =
         rocdlDialect->getFlatWorkGroupSizeAttrHelper();
     // Manually rewrite known block size attributes so the LLVMIR translation
     // infrastructure can pick them up.
     m.walk([&](LLVM::LLVMFuncOp op) {
       if (reqdWorkGroupSizeAttrHelper.isAttrPresent(op)) {
         auto blockSizes = reqdWorkGroupSizeAttrHelper.getAttr(op);
         // Also set up the rocdl.flat_work_group_size attribute to prevent
         // conflicting metadata.
         uint32_t flatSize = 1;
         for (uint32_t size : blockSizes.asArrayRef()) {
           flatSize *= size;
         }
         // The attribute value is "<flatSize>,<flatSize>".
         StringAttr flatSizeAttr =
             StringAttr::get(ctx, Twine(flatSize) + "," + Twine(flatSize));
         flatWorkGroupSizeAttrHelper.setAttr(op, flatSizeAttr);
       }
     });
   }
 };


 } // namespace


 /// Sets up `target` for the GPU-to-ROCDL conversion: `func.func` and the GPU
 /// dialect are illegal, the LLVM and ROCDL dialects are legal, and a fixed
 /// list of LLVM math ops is illegal. Of that list, `LLVM::ExpOp` and
 /// `LLVM::LogOp` are then re-registered as dynamically legal when at least one
 /// operand has type f32. `gpu.yield` and `gpu.module` remain legal.
 void mlir::configureGpuToROCDLConversionLegality(ConversionTarget &target) {
   target.addIllegalOp<func::FuncOp>();
   target.addLegalDialect<::mlir::LLVM::LLVMDialect, ROCDL::ROCDLDialect>();
   target.addIllegalDialect<gpu::GPUDialect>();
   target.addIllegalOp<LLVM::CosOp, LLVM::ExpOp, LLVM::Exp2Op, LLVM::FCeilOp,
                       LLVM::FFloorOp, LLVM::FRemOp, LLVM::LogOp, LLVM::Log10Op,
                       LLVM::Log2Op, LLVM::PowOp, LLVM::SinOp>();

   // These ops are legal for f32 type. This registration must come after the
   // blanket illegal registration above so that it takes effect for them.
   auto hasF32Operand = [](Operation *op) {
     for (Type operandType : op->getOperandTypes())
       if (isa<Float32Type>(operandType))
         return true;
     return false;
   };
   target.addDynamicallyLegalOp<LLVM::ExpOp, LLVM::LogOp>(hasF32Operand);

   // TODO: Remove once we support replacing non-root ops.
   target.addLegalOp<gpu::YieldOp, gpu::GPUModuleOp>();
 }


 void mlir::populateGpuToROCDLConversionPatterns(

     const LLVMTypeConverter &converter, RewritePatternSet &patterns,

     mlir::gpu::amd::Runtime runtime, amdgpu::Chipset chipset) {

   using gpu::index_lowering::IndexKind;

   using gpu::index_lowering::IntrType;

   using mlir::gpu::amd::Runtime;

   auto *rocdlDialect =

       converter.getContext().getLoadedDialect<ROCDL::ROCDLDialect>();

   populateWithGenerated(patterns);

   patterns.add<

       gpu::index_lowering::OpLowering<gpu::ThreadIdOp, ROCDL::ThreadIdXOp,

                                       ROCDL::ThreadIdYOp, ROCDL::ThreadIdZOp>>(

       converter, IndexKind::Block, IntrType::Id);

   patterns.add<gpu::index_lowering::OpLowering<

       gpu::BlockIdOp, ROCDL::BlockIdXOp, ROCDL::BlockIdYOp, ROCDL::BlockIdZOp>>(

       converter, IndexKind::Grid, IntrType::Id);

   patterns.add<

       gpu::index_lowering::OpLowering<gpu::BlockDimOp, ROCDL::BlockDimXOp,

                                       ROCDL::BlockDimYOp, ROCDL::BlockDimZOp>>(

       converter, IndexKind::Block, IntrType::Dim);

   patterns.add<gpu::index_lowering::OpLowering<

       gpu::GridDimOp, ROCDL::GridDimXOp, ROCDL::GridDimYOp, ROCDL::GridDimZOp>>(

       converter, IndexKind::Grid, IntrType::Dim);

   patterns.add<GPUReturnOpLowering>(converter);

   patterns.add<GPUFuncOpLowering>(

       converter,

       GPUFuncOpLoweringOptions{

           /*allocaAddrSpace=*/ROCDL::ROCDLDialect::kPrivateMemoryAddressSpace,

           /*workgroupAddrSpace=*/ROCDL::ROCDLDialect::kSharedMemoryAddressSpace,

           rocdlDialect->getKernelAttrHelper().getName(),

           rocdlDialect->getReqdWorkGroupSizeAttrHelper().getName()});

   if (Runtime::HIP == runtime) {

     patterns.add<GPUPrintfOpToHIPLowering>(converter);

   } else if (Runtime::OpenCL == runtime) {

     // Use address space = 4 to match the OpenCL definition of printf()

     patterns.add<GPUPrintfOpToLLVMCallLowering>(converter, /*addressSpace=*/4);

   }

   // TODO: Add alignment for workgroup memory

   patterns.add<GPUDynamicSharedMemoryOpLowering>(converter);


   patterns.add<GPUShuffleOpLowering, GPULaneIdOpToROCDL>(converter);

   patterns.add<GPUSubgroupSizeOpToROCDL>(converter, chipset);


   populateMathToROCDLConversionPatterns(converter, patterns);

 }


 /// Factory for the GPU-to-ROCDL lowering pass: constructs a
 /// `LowerGpuOpsToROCDLOpsPass` with the given chipset name, index bitwidth,
 /// bare-pointer calling convention flag and runtime, and returns ownership of
 /// it to the caller.
 std::unique_ptr<OperationPass<gpu::GPUModuleOp>>

 mlir::createLowerGpuOpsToROCDLOpsPass(const std::string &chipset,

                                       unsigned indexBitwidth,

                                       bool useBarePtrCallConv,

                                       gpu::amd::Runtime runtime) {

   return std::make_unique<LowerGpuOpsToROCDLOpsPass>(

       chipset, indexBitwidth, useBarePtrCallConv, runtime);

 }

AMDGPUDialect.h

AMDGPUToROCDL.h

ConversionTarget.h

DialectConversion.h

Passes.h

Passes.h

FuncOps.h

GPUCommonPass.h

GPUDialect.h

GPUToROCDLPass.h

GreedyPatternRewriteDriver.h

getContext
static MLIRContext * getContext(OpFoldResult val)
Definition: IndexingUtils.cpp:295

LLVMDialect.h

canBeCalledWithBarePointers
static bool canBeCalledWithBarePointers(gpu::GPUFuncOp func)
Returns true if the given gpu.func can be safely called using the bare pointer calling convention.
Definition: LowerGpuOpsToROCDLOps.cpp:77

amdgcnDataLayout
static constexpr StringLiteral amdgcnDataLayout
Definition: LowerGpuOpsToROCDLOps.cpp:96

getLaneId
static Value getLaneId(ConversionPatternRewriter &rewriter, Location loc, const unsigned indexBitwidth)
Definition: LowerGpuOpsToROCDLOps.cpp:85

truncOrExtToLLVMType
static Value truncOrExtToLLVMType(ConversionPatternRewriter &rewriter, Location loc, Value value, const LLVMTypeConverter &converter)
Definition: LowerGpuOpsToROCDLOps.cpp:58

LoweringOptions.h

MathToLLVM.h

MathToROCDL.h

options
static llvm::ManagedStatic< PassManagerOptions > options
Definition: PassManagerOptions.cpp:89

PassManager.h

ROCDLDialect.h

ToLLVMInterface.h

ToLLVMPass.h

Passes.h

TypeConverter.h

VectorOps.h

llvm::SmallVector
Definition: LLVM.h:72

mlir::Builder::getI32Type
IntegerType getI32Type()
Definition: Builders.cpp:65

mlir::Builder::getContext
MLIRContext * getContext() const
Definition: Builders.h:55

mlir::Builder::getAttr
Attr getAttr(Args &&...args)
Get or construct an instance of the attribute Attr with provided arguments.
Definition: Builders.h:96

mlir::ConversionPatternRewriter
This class implements a pattern rewriter for use with ConversionPatterns.
Definition: DialectConversion.h:726

mlir::ConversionPatternRewriter::replaceOp
void replaceOp(Operation *op, ValueRange newValues) override
Replace the given operation with the new values.
Definition: DialectConversion.cpp:1665

mlir::ConversionTarget
This class describes a specific conversion target.
Definition: DialectConversion.h:868

mlir::ConversionTarget::addLegalOp
void addLegalOp(OperationName op)
Register the given operations as legal.
Definition: DialectConversion.h:913

mlir::ConversionTarget::addLegalDialect
void addLegalDialect(StringRef name, Names... names)
Register the operations of the given dialects as legal.
Definition: DialectConversion.h:995

mlir::ConversionTarget::addDynamicallyLegalOp
void addDynamicallyLegalOp(OperationName op, const DynamicLegalityCallbackFn &callback)
Register the given operation as dynamically legal and set the dynamic legalization callback to the on...
Definition: DialectConversion.h:928

mlir::ConversionTarget::addIllegalDialect
void addIllegalDialect(StringRef name, Names... names)
Register the operations of the given dialects as illegal, i.e.
Definition: DialectConversion.h:1030

mlir::ConversionTarget::addIllegalOp
void addIllegalOp(OperationName op)
Register the given operation as illegal, i.e.
Definition: DialectConversion.h:952

mlir::ConvertOpToLLVMPattern
Utility class for operation conversions targeting the LLVM dialect that match exactly one source oper...
Definition: Pattern.h:195

mlir::ConvertOpToLLVMPattern::ConvertOpToLLVMPattern
ConvertOpToLLVMPattern(const LLVMTypeConverter &typeConverter, PatternBenefit benefit=1)
Definition: Pattern.h:201

mlir::DataLayout
The main mechanism for performing data layout queries.
Definition: DataLayoutInterfaces.h:220

mlir::DialectRegistry
The DialectRegistry maps a dialect namespace to a constructor for the matching dialect.
Definition: DialectRegistry.h:139

mlir::Dialect
Dialects are groups of MLIR operations, types and attributes, as well as behavior associated with the...
Definition: Dialect.h:38

mlir::LLVMConversionTarget
Derived class that automatically populates legalization information for different LLVM ops.
Definition: ConversionTarget.h:17

mlir::LLVMTypeConverter
Conversion from types to the LLVM IR dialect.
Definition: TypeConverter.h:35

mlir::LLVMTypeConverter::canConvertToBarePtr
static bool canConvertToBarePtr(BaseMemRefType type)
Check if a memref type can be converted to a bare pointer.
Definition: TypeConverter.cpp:593

mlir::LLVMTypeConverter::getContext
MLIRContext & getContext() const
Returns the MLIR context.
Definition: TypeConverter.cpp:276

mlir::LLVMTypeConverter::getIndexTypeBitwidth
unsigned getIndexTypeBitwidth() const
Gets the bitwidth of the index type when converted to LLVM.
Definition: TypeConverter.h:143

mlir::Location
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Definition: Location.h:76

mlir::LowerToLLVMOptions
Options to control the LLVM lowering.
Definition: LoweringOptions.h:30

mlir::MLIRContext
MLIRContext is the top-level object for a collection of MLIR operations.
Definition: MLIRContext.h:60

mlir::MLIRContext::getLoadedDialect
Dialect * getLoadedDialect(StringRef name)
Get a registered IR dialect with the given namespace.
Definition: MLIRContext.cpp:435

mlir::MLIRContext::getLoadedDialects
std::vector< Dialect * > getLoadedDialects()
Return information about all IR dialects loaded in the context.
Definition: MLIRContext.cpp:415

mlir::OpBuilder::create
Operation * create(const OperationState &state)
Creates an operation given the fields represented as an OperationState.
Definition: Builders.cpp:455

mlir::Operation
Operation is the basic unit of execution within MLIR.
Definition: Operation.h:88

mlir::RewritePatternSet
Definition: PatternMatch.h:772

mlir::Type
Instances of the Type class are uniqued, have an immutable identifier and an optional mutable compone...
Definition: Types.h:74

mlir::ValueRange
This class provides an abstraction over the different types of ranges over Values.
Definition: ValueRange.h:387

mlir::Value
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition: Value.h:96

mlir::Value::getType
Type getType() const
Return the type of this value.
Definition: Value.h:105

mlir::WalkResult
A utility result that is used to signal how to proceed with an ongoing walk:
Definition: Visitors.h:33

mlir::WalkResult::advance
static WalkResult advance()
Definition: Visitors.h:51

mlir::WalkResult::wasInterrupted
bool wasInterrupted() const
Returns true if the walk was interrupted.
Definition: Visitors.h:55

mlir::WalkResult::interrupt
static WalkResult interrupt()
Definition: Visitors.h:50

Pattern.h

Pass.h

ControlFlow.h

Math.h

MemRef.h

BuiltinAttributes.h

mlir::LLVM::composeValue
Value composeValue(OpBuilder &builder, Location loc, ValueRange src, Type dstType)
Composes a set of src values into a single value of type dstType through series of bitcasts and vecto...
Definition: Pattern.cpp:448

mlir::LLVM::decomposeValue
SmallVector< Value > decomposeValue(OpBuilder &builder, Location loc, Value src, Type dstType)
Decomposes a src value into a set of values of type dstType through series of bitcasts and vector ops...
Definition: Pattern.cpp:409

mlir::gpu::amd::Runtime
Runtime
Potential runtimes for AMD GPU kernels.
Definition: Runtimes.h:15

mlir::gpu::amd::OpenCL
@ OpenCL
Definition: Runtimes.h:18

mlir::gpu::amd::HIP
@ HIP
Definition: Runtimes.h:17

mlir::gpu::index_lowering::IndexKind
IndexKind
Definition: IndexIntrinsicsOpLowering.h:20

mlir::gpu::index_lowering::IntrType
IntrType
Definition: IndexIntrinsicsOpLowering.h:21

mlir
Include the generated interface declarations.
Definition: LocalAliasAnalysis.h:20

mlir::populateGpuToROCDLConversionPatterns
void populateGpuToROCDLConversionPatterns(const LLVMTypeConverter &converter, RewritePatternSet &patterns, gpu::amd::Runtime runtime, amdgpu::Chipset chipset)
Collect a set of patterns to convert from the GPU dialect to ROCDL.
Definition: LowerGpuOpsToROCDLOps.cpp:419

mlir::kDeriveIndexBitwidthFromDataLayout
static constexpr unsigned kDeriveIndexBitwidthFromDataLayout
Value to pass as bitwidth for the index type when the converter is expected to derive the bitwidth fr...
Definition: LoweringOptions.h:26

mlir::applyPatternsGreedily
LogicalResult applyPatternsGreedily(Region &region, const FrozenRewritePatternSet &patterns, GreedyRewriteConfig config=GreedyRewriteConfig(), bool *changed=nullptr)
Rewrite ops in the given region, which must be isolated from above, by repeatedly applying the highes...
Definition: GreedyPatternRewriteDriver.cpp:898

mlir::populateGpuRewritePatterns
void populateGpuRewritePatterns(RewritePatternSet &patterns)
Collect all patterns to rewrite ops within the GPU dialect.
Definition: Passes.h:91

mlir::emitError
InFlightDiagnostic emitError(Location loc)
Utility method to emit an error message using this location.
Definition: Diagnostics.cpp:328

mlir::configureGpuToROCDLConversionLegality
void configureGpuToROCDLConversionLegality(ConversionTarget &target)
Configure target to convert from the GPU dialect to ROCDL.
Definition: LowerGpuOpsToROCDLOps.cpp:403

mlir::createLowerGpuOpsToROCDLOpsPass
std::unique_ptr< OperationPass< gpu::GPUModuleOp > > createLowerGpuOpsToROCDLOpsPass(const std::string &chipset="gfx900", unsigned indexBitwidth=kDeriveIndexBitwidthFromDataLayout, bool useBarePtrCallConv=false, gpu::amd::Runtime runtime=gpu::amd::Runtime::Unknown)
Creates a pass that lowers GPU dialect operations to ROCDL counterparts.
Definition: LowerGpuOpsToROCDLOps.cpp:466

mlir::patterns
const FrozenRewritePatternSet & patterns
Definition: GreedyPatternRewriteDriver.h:283

mlir::registerConvertToLLVMDependentDialectLoading
void registerConvertToLLVMDependentDialectLoading(DialectRegistry &registry)
Register the extension that will load dependent dialects for LLVM conversion.
Definition: ConvertToLLVMPass.cpp:279

mlir::populateGpuMemorySpaceAttributeConversions
void populateGpuMemorySpaceAttributeConversions(TypeConverter &typeConverter, const MemorySpaceMapping &mapping)
Populates memory space attribute conversion rules for lowering gpu.address_space to integer values.
Definition: GPUOpsLowering.cpp:811

mlir::populateAMDGPUToROCDLConversionPatterns
void populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter, RewritePatternSet &patterns, amdgpu::Chipset chipset)
Note: This function will also add conversions for the AMDGPU-specific address spaces,...
Definition: AMDGPUToROCDL.cpp:1716

mlir::get
auto get(MLIRContext *context, Ts &&...params)
Helper method that injects context only if needed, this helps unify some of the attribute constructio...
Definition: BytecodeImplementation.h:509

mlir::populateGpuPromoteShuffleToAMDGPUPatterns
void populateGpuPromoteShuffleToAMDGPUPatterns(RewritePatternSet &patterns)
Tries to promote gpu.shuffles to specialized AMDGPU intrinsics.
Definition: PromoteShuffleToAMDGPU.cpp:61

mlir::applyPartialConversion
LogicalResult applyPartialConversion(ArrayRef< Operation * > ops, const ConversionTarget &target, const FrozenRewritePatternSet &patterns, ConversionConfig config=ConversionConfig())
Below we define several entry points for operation conversion.
Definition: DialectConversion.cpp:3390

mlir::populateMathToROCDLConversionPatterns
void populateMathToROCDLConversionPatterns(const LLVMTypeConverter &converter, RewritePatternSet &patterns)
Populate the given list with patterns that convert from Math to ROCDL calls.
Definition: MathToROCDL.cpp:48

mlir::GPUDynamicSharedMemoryOpLowering
Lowering for gpu.dynamic.shared.memory to LLVM dialect.
Definition: GPUOpsLowering.h:42

mlir::GPUFuncOpLoweringOptions
Definition: GPUOpsLowering.h:60

mlir::GPUFuncOpLowering
Definition: GPUOpsLowering.h:83

mlir::GPUPrintfOpToHIPLowering
The lowering of gpu.printf to a call to HIP hostcalls.
Definition: GPUOpsLowering.h:129

mlir::GPUPrintfOpToLLVMCallLowering
The lowering of gpu.printf to a call to an external printf() function.
Definition: GPUOpsLowering.h:144

mlir::GPUReturnOpLowering
Definition: GPUOpsLowering.h:168

mlir::amdgpu::Chipset
Represents the amdgpu gfx chipset version, e.g., gfx90a, gfx942, gfx1103.
Definition: Chipset.h:22

mlir::amdgpu::Chipset::parse
static FailureOr< Chipset > parse(StringRef name)
Parses the chipset version string and returns the chipset on success, and failure otherwise.
Definition: Chipset.cpp:14

mlir::gpu::index_lowering::OpLowering
Definition: IndexIntrinsicsOpLowering.h:33