doxygen/EmulateAtomics_8cpp_source.html

 //===- EmulateAtomics.cpp - Emulate unsupported AMDGPU atomics ------===//

 //

 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

 // See https://llvm.org/LICENSE.txt for license information.

 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

 //

 //===----------------------------------------------------------------------===//


 #include "mlir/Dialect/AMDGPU/Transforms/Passes.h"


 #include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"

 #include "mlir/Dialect/AMDGPU/Utils/Chipset.h"

 #include "mlir/Dialect/Arith/IR/Arith.h"

 #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"

 #include "mlir/Dialect/Vector/IR/VectorOps.h"

 #include "mlir/IR/BuiltinAttributes.h"

 #include "mlir/IR/TypeUtilities.h"

 #include "mlir/Transforms/DialectConversion.h"


 namespace mlir::amdgpu {

 #define GEN_PASS_DEF_AMDGPUEMULATEATOMICSPASS

 #include "mlir/Dialect/AMDGPU/Transforms/Passes.h.inc"

 } // namespace mlir::amdgpu


 using namespace mlir;

 using namespace mlir::amdgpu;


 namespace {

 // Pass that rewrites AMDGPU buffer atomics the selected chipset cannot execute
 // into compare-and-swap loops. The base class comes from the tablegen-generated
 // definitions pulled in through GEN_PASS_DEF_AMDGPUEMULATEATOMICSPASS above.
 struct AmdgpuEmulateAtomicsPass

     : public amdgpu::impl::AmdgpuEmulateAtomicsPassBase<

           AmdgpuEmulateAtomicsPass> {

   // Inherit the constructors of the generated base class.
   using AmdgpuEmulateAtomicsPassBase<

       AmdgpuEmulateAtomicsPass>::AmdgpuEmulateAtomicsPassBase;

   // Parses the `chipset` option, populates the emulation patterns for it and
   // runs a partial conversion over the operation the pass is anchored on.
   void runOnOperation() override;

 };


 // Conversion pattern that replaces a buffer atomic `AtomicOp` with an initial
 // buffer load followed by a loop that combines the operand with the last
 // observed value using `ArithOp` and tries to publish the result with
 // RawBufferAtomicCmpswapOp, retrying until the swap observes the expected
 // value.
 template <typename AtomicOp, typename ArithOp>

 struct RawBufferAtomicByCasPattern : public OpConversionPattern<AtomicOp> {

   using OpConversionPattern<AtomicOp>::OpConversionPattern;

   // Adaptor type giving access to the already-converted operands of AtomicOp.
   using Adaptor = typename AtomicOp::Adaptor;


   // Performs the rewrite described above; defined out of line further down in
   // this file.
   LogicalResult

   matchAndRewrite(AtomicOp atomicOp, Adaptor adaptor,

                   ConversionPatternRewriter &rewriter) const override;

 };

 } // namespace


 namespace {

 // Selects how patchOperandSegmentSizes adjusts the first entry of the
 // operandSegmentSizes attribute when an atomic is rewritten into another op.
 enum class DataArgAction : unsigned char {

   // Repeat the first entry in front of the existing ones (used when building
   // the compare-and-swap, which takes two leading value operands).
   Duplicate,

   // Remove the first entry (used when building the load, which takes no
   // leading value operand).
   Drop,

 };

 } // namespace


 // When a general buffer atomic is rewritten into a load or into a CAS, the
 // number of operands changes, and so the number of entries required in
 // operandSegmentSizes changes with it. This helper copies `attrs` into
 // `newAttrs`, rewriting only the operandSegmentSizes entry according to
 // `action`: Drop removes its first element, Duplicate repeats its first
 // element at the front. All other attributes, including unknown ones, are
 // forwarded untouched so that they are preserved on the replacement op instead
 // of being discarded.
 static void patchOperandSegmentSizes(ArrayRef<NamedAttribute> attrs,
                                      SmallVectorImpl<NamedAttribute> &newAttrs,
                                      DataArgAction action) {
   newAttrs.reserve(attrs.size());
   for (NamedAttribute namedAttr : attrs) {
     const bool isSegmentSizes =
         namedAttr.getName().getValue() == "operandSegmentSizes";
     if (!isSegmentSizes) {
       // Anything other than the segment sizes is carried over as is.
       newAttrs.push_back(namedAttr);
       continue;
     }

     auto oldSegments = cast<DenseI32ArrayAttr>(namedAttr.getValue());
     MLIRContext *ctx = oldSegments.getContext();
     ArrayRef<int32_t> oldSizes = oldSegments.asArrayRef();

     DenseI32ArrayAttr patchedSegments;
     switch (action) {
     case DataArgAction::Duplicate: {
       // Leading entry twice, then the remaining original entries.
       SmallVector<int32_t> grownSizes;
       grownSizes.push_back(oldSizes[0]);
       grownSizes.append(oldSizes.begin(), oldSizes.end());
       patchedSegments = DenseI32ArrayAttr::get(ctx, grownSizes);
       break;
     }
     case DataArgAction::Drop:
       // Everything except the leading entry.
       patchedSegments = DenseI32ArrayAttr::get(ctx, oldSizes.drop_front());
       break;
     }
     newAttrs.push_back(NamedAttribute(namedAttr.getName(), patchedSegments));
   }
 }


 // Helper that flattens a vector value into a single integer scalar holding all
 // of its bits. Values that are not vectors are returned unchanged.
 static Value flattenVecToBits(ConversionPatternRewriter &rewriter, Location loc,
                               Value val) {
   if (auto vecTy = dyn_cast<VectorType>(val.getType())) {
     // Total number of bits in the vector determines the integer width.
     int64_t totalBits =
         vecTy.getElementTypeBitWidth() * vecTy.getNumElements();
     Type wideIntTy = rewriter.getIntegerType(totalBits);
     auto singleElemVecTy = VectorType::get({1}, wideIntTy);

     // Reinterpret as a one-element vector of the wide integer, then pull that
     // element out as a scalar.
     Value asSingleElem =
         vector::BitCastOp::create(rewriter, loc, singleElemVecTy, val);
     Value asScalar = vector::ExtractOp::create(rewriter, loc, asSingleElem, 0);
     return asScalar;
   }
   return val;
 }


 // Rewrites `atomicOp` into:
 //   - a RawBufferLoadOp in the current block that seeds the loop,
 //   - a loop block that applies ArithOp to the operand and the last observed
 //     value and attempts a RawBufferAtomicCmpswapOp with the result,
 //   - a conditional branch that leaves the loop when the value returned by the
 //     cmpswap is bitwise equal to the value the loop iteration started from,
 //     and otherwise retries with the returned value.
 // The original op is erased and the rewrite always reports success.
 template <typename AtomicOp, typename ArithOp>
 LogicalResult RawBufferAtomicByCasPattern<AtomicOp, ArithOp>::matchAndRewrite(
     AtomicOp atomicOp, Adaptor adaptor,
     ConversionPatternRewriter &rewriter) const {
   Location loc = atomicOp.getLoc();
   ArrayRef<NamedAttribute> sourceAttrs = atomicOp->getAttrs();

   // The first converted operand is the value to combine; every operand after
   // it is forwarded unchanged to the load and the cmpswap.
   ValueRange allOperands = adaptor.getOperands();
   Value data = allOperands[0];
   ValueRange forwardedArgs = allOperands.drop_front();
   Type loadedTy = data.getType();

   // Seed the loop with the current contents of the buffer location.
   SmallVector<NamedAttribute> attrsForLoad;
   patchOperandSegmentSizes(sourceAttrs, attrsForLoad, DataArgAction::Drop);
   Value seedValue = RawBufferLoadOp::create(rewriter, loc, loadedTy,
                                             forwardedArgs, attrsForLoad);

   // Split the block at the atomic and insert a loop block, carrying the last
   // observed value as its argument, in front of the continuation.
   Block *entryBlock = rewriter.getInsertionBlock();
   Block *continuation =
       rewriter.splitBlock(entryBlock, rewriter.getInsertionPoint());
   Block *retryBlock = rewriter.createBlock(continuation, {loadedTy}, {loc});

   rewriter.setInsertionPointToEnd(entryBlock);
   cf::BranchOp::create(rewriter, loc, retryBlock, seedValue);

   // Loop body: compute the new value and try to swap it in.
   rewriter.setInsertionPointToEnd(retryBlock);
   Value expected = retryBlock->getArgument(0);
   Value combined = ArithOp::create(rewriter, loc, data, expected);
   Type combinedTy = combined.getType();

   SmallVector<NamedAttribute> attrsForCmpswap;
   patchOperandSegmentSizes(sourceAttrs, attrsForCmpswap,
                            DataArgAction::Duplicate);
   SmallVector<Value> argsForCmpswap = {combined, expected};
   argsForCmpswap.append(forwardedArgs.begin(), forwardedArgs.end());
   Value observed = RawBufferAtomicCmpswapOp::create(
       rewriter, loc, combinedTy, argsForCmpswap, attrsForCmpswap);

   // We care about exact bitwise equality here, so do some bitcasts.
   // These will fold away during lowering to the ROCDL dialect, where
   // an int->float bitcast is introduced to account for the fact that cmpswap
   // only takes integer arguments.
   Value expectedBits = flattenVecToBits(rewriter, loc, expected);
   Value observedBits = flattenVecToBits(rewriter, loc, observed);
   if (auto floatTy = dyn_cast<FloatType>(combinedTy)) {
     // Scalar floats are compared through an integer of the same width.
     Type sameWidthInt = rewriter.getIntegerType(floatTy.getWidth());
     expectedBits =
         arith::BitcastOp::create(rewriter, loc, sameWidthInt, expected);
     observedBits =
         arith::BitcastOp::create(rewriter, loc, sameWidthInt, observed);
   }

   // Leave the loop once the cmpswap saw the value we expected; otherwise go
   // around again using the value it actually observed.
   Value swapSucceeded =
       arith::CmpIOp::create(rewriter, loc, arith::CmpIPredicate::eq,
                             observedBits, expectedBits);
   cf::CondBranchOp::create(rewriter, loc, swapSucceeded, continuation,
                            ValueRange{}, retryBlock, observed);

   rewriter.eraseOp(atomicOp);
   return success();
 }


 /// Configures `target` so that buffer atomics unsupported on `chipset` are
 /// illegal, and adds to `patterns` the compare-and-swap based rewrites that
 /// emulate them.
 ///
 /// \param target   Conversion target whose legality rules for
 ///                 RawBufferAtomicFaddOp and RawBufferAtomicFmaxOp are set
 ///                 based on `chipset`.
 /// \param patterns Pattern set that receives the CAS-loop patterns for fadd,
 ///                 fmax, smax and umin. This function does not itself mark
 ///                 smax or umin illegal.
 /// \param chipset  Chipset the code is being compiled for.
 /// \param benefit  Benefit assigned to every pattern added here.
 void mlir::amdgpu::populateAmdgpuEmulateAtomicsPatterns(
     ConversionTarget &target, RewritePatternSet &patterns, Chipset chipset,
     PatternBenefit benefit) {
   // gfx10 and everything older than gfx908 have no atomic adds.
   const bool lacksAtomicFadd =
       chipset.majorVersion == 10 || chipset < Chipset(9, 0, 8);
   if (lacksAtomicFadd)
     target.addIllegalOp<RawBufferAtomicFaddOp>();

   // gfx11 has no fp16 atomics.
   if (chipset.majorVersion == 11) {
     target.addDynamicallyLegalOp<RawBufferAtomicFaddOp>(
         [](RawBufferAtomicFaddOp op) -> bool {
           Type elemType = getElementTypeOrSelf(op.getValue().getType());
           return !isa<Float16Type, BFloat16Type>(elemType);
         });
   }

   // gfx9 has no to very limited support for floating-point min and max.
   if (chipset.majorVersion == 9) {
     if (chipset >= Chipset(9, 0, 0xa)) {
       // gfx90a supports f64 max (and min, but we don't have a min wrapper right
       // now) but all other types need to be emulated.
       target.addDynamicallyLegalOp<RawBufferAtomicFmaxOp>(
           [](RawBufferAtomicFmaxOp op) -> bool {
             return op.getValue().getType().isF64();
           });
     } else {
       target.addIllegalOp<RawBufferAtomicFmaxOp>();
     }

     // TODO(https://github.com/llvm/llvm-project/issues/129206): Refactor
     // this to avoid hardcoding ISA version: gfx950 has bf16 atomics.
     //
     // Only refine the fadd rule on chips that have atomic adds at all.
     // Registering a dynamic-legality rule for a pre-gfx908 chip would replace
     // the blanket illegal marking made above and leave non-bf16 fadd legal
     // on hardware that cannot execute it.
     if (!lacksAtomicFadd && chipset < Chipset(9, 5, 0)) {
       target.addDynamicallyLegalOp<RawBufferAtomicFaddOp>(
           [](RawBufferAtomicFaddOp op) -> bool {
             Type elemType = getElementTypeOrSelf(op.getValue().getType());
             return !isa<BFloat16Type>(elemType);
           });
     }
   }

   patterns.add<
       RawBufferAtomicByCasPattern<RawBufferAtomicFaddOp, arith::AddFOp>,
       RawBufferAtomicByCasPattern<RawBufferAtomicFmaxOp, arith::MaximumFOp>,
       RawBufferAtomicByCasPattern<RawBufferAtomicSmaxOp, arith::MaxSIOp>,
       RawBufferAtomicByCasPattern<RawBufferAtomicUminOp, arith::MinUIOp>>(
       patterns.getContext(), benefit);
 }


 void AmdgpuEmulateAtomicsPass::runOnOperation() {

   Operation *op = getOperation();

   FailureOr<Chipset> maybeChipset = Chipset::parse(chipset);

   if (failed(maybeChipset)) {

     emitError(op->getLoc(), "Invalid chipset name: " + chipset);

     return signalPassFailure();

   }


   MLIRContext &ctx = getContext();

   ConversionTarget target(ctx);

   RewritePatternSet patterns(&ctx);

   target.markUnknownOpDynamicallyLegal(

       [](Operation *op) -> bool { return true; });


   populateAmdgpuEmulateAtomicsPatterns(target, patterns, *maybeChipset);

   if (failed(applyPartialConversion(op, target, std::move(patterns))))

     return signalPassFailure();

 }

AMDGPUDialect.h

Chipset.h

ControlFlowOps.h

DialectConversion.h

Passes.h

flattenVecToBits
static Value flattenVecToBits(ConversionPatternRewriter &rewriter, Location loc, Value val)
Definition: EmulateAtomics.cpp:92

patchOperandSegmentSizes
static void patchOperandSegmentSizes(ArrayRef< NamedAttribute > attrs, SmallVectorImpl< NamedAttribute > &newAttrs, DataArgAction action)
Definition: EmulateAtomics.cpp:60

getContext
static MLIRContext * getContext(OpFoldResult val)
Definition: IndexingUtils.cpp:277

TypeUtilities.h

VectorOps.h

llvm::ArrayRef
Definition: LLVM.h:48

llvm::SmallVectorImpl
Definition: LLVM.h:74

llvm::SmallVector
Definition: LLVM.h:72

mlir::Block
Block represents an ordered list of Operations.
Definition: Block.h:33

mlir::Block::getArgument
BlockArgument getArgument(unsigned i)
Definition: Block.h:129

mlir::Builder::getIntegerType
IntegerType getIntegerType(unsigned width)
Definition: Builders.cpp:67

mlir::ConversionPatternRewriter
This class implements a pattern rewriter for use with ConversionPatterns.
Definition: DialectConversion.h:839

mlir::ConversionPatternRewriter::eraseOp
void eraseOp(Operation *op) override
PatternRewriter hook for erasing a dead operation.
Definition: DialectConversion.cpp:2139

mlir::ConversionTarget
This class describes a specific conversion target.
Definition: DialectConversion.h:1034

mlir::ConversionTarget::addDynamicallyLegalOp
void addDynamicallyLegalOp(OperationName op, const DynamicLegalityCallbackFn &callback)
Register the given operation as dynamically legal and set the dynamic legalization callback to the on...
Definition: DialectConversion.h:1094

mlir::ConversionTarget::addIllegalOp
void addIllegalOp(OperationName op)
Register the given operation as illegal, i.e.
Definition: DialectConversion.h:1118

mlir::Location
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Definition: Location.h:76

mlir::MLIRContext
MLIRContext is the top-level object for a collection of MLIR operations.
Definition: MLIRContext.h:63

mlir::NamedAttribute
NamedAttribute represents a combination of a name and an Attribute value.
Definition: Attributes.h:164

mlir::OpBuilder::getInsertionPoint
Block::iterator getInsertionPoint() const
Returns the current insertion point of the builder.
Definition: Builders.h:445

mlir::OpBuilder::createBlock
Block * createBlock(Region *parent, Region::iterator insertPt={}, TypeRange argTypes={}, ArrayRef< Location > locs={})
Add new block with 'argTypes' arguments and set the insertion point to the end of it.
Definition: Builders.cpp:430

mlir::OpBuilder::setInsertionPointToEnd
void setInsertionPointToEnd(Block *block)
Sets the insertion point to the end of the specified block.
Definition: Builders.h:436

mlir::OpBuilder::getInsertionBlock
Block * getInsertionBlock() const
Return the block the current insertion point belongs to.
Definition: Builders.h:442

mlir::OpConversionPattern
OpConversionPattern is a wrapper around ConversionPattern that allows for matching and rewriting agai...
Definition: DialectConversion.h:684

mlir::Operation
Operation is the basic unit of execution within MLIR.
Definition: Operation.h:88

mlir::Operation::getLoc
Location getLoc()
The source location the operation was defined or derived from.
Definition: Operation.h:223

mlir::PatternBenefit
This class represents the benefit of a pattern match in a unitless scheme that ranges from 0 (very li...
Definition: PatternMatch.h:34

mlir::RewritePatternSet
Definition: PatternMatch.h:816

mlir::RewriterBase::splitBlock
Block * splitBlock(Block *block, Block::iterator before)
Split the operations starting at "before" (inclusive) out of the given block into a new block,...
Definition: PatternMatch.cpp:350

mlir::Type
Instances of the Type class are uniqued, have an immutable identifier and an optional mutable compone...
Definition: Types.h:74

mlir::ValueRange
This class provides an abstraction over the different types of ranges over Values.
Definition: ValueRange.h:387

mlir::Value
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition: Value.h:96

mlir::Value::getType
Type getType() const
Return the type of this value.
Definition: Value.h:105

mlir::detail::DenseArrayAttrImpl< int32_t >

mlir::detail::DenseArrayAttrImpl< int32_t >::get
static DenseArrayAttrImpl get(MLIRContext *context, ArrayRef< int32_t > content)
Builder from ArrayRef<T>.
Definition: BuiltinAttributes.cpp:872

Arith.h

BuiltinAttributes.h

mlir::amdgpu
Definition: GPUToROCDLPass.h:24

mlir::amdgpu::populateAmdgpuEmulateAtomicsPatterns
void populateAmdgpuEmulateAtomicsPatterns(ConversionTarget &target, RewritePatternSet &patterns, Chipset chipset, PatternBenefit benefit=1)
Definition: EmulateAtomics.cpp:166

mlir::remark::failed
detail::InFlightRemark failed(Location loc, RemarkOpts opts)
Report an optimization remark that failed.
Definition: Remarks.h:561

mlir
Include the generated interface declarations.
Definition: LocalAliasAnalysis.h:20

mlir::emitError
InFlightDiagnostic emitError(Location loc)
Utility method to emit an error message using this location.
Definition: Diagnostics.cpp:332

mlir::getElementTypeOrSelf
Type getElementTypeOrSelf(Type type)
Return the element type or return the type itself.
Definition: TypeUtilities.cpp:23

mlir::patterns
const FrozenRewritePatternSet & patterns
Definition: GreedyPatternRewriteDriver.h:283

mlir::get
auto get(MLIRContext *context, Ts &&...params)
Helper method that injects context only if needed, this helps unify some of the attribute constructio...
Definition: BytecodeImplementation.h:509

mlir::applyPartialConversion
LogicalResult applyPartialConversion(ArrayRef< Operation * > ops, const ConversionTarget &target, const FrozenRewritePatternSet &patterns, ConversionConfig config=ConversionConfig())
Below we define several entry points for operation conversion.
Definition: DialectConversion.cpp:4147

mlir::amdgpu::Chipset
Represents the amdgpu gfx chipset version, e.g., gfx90a, gfx942, gfx1103.
Definition: Chipset.h:22

mlir::amdgpu::Chipset::majorVersion
unsigned majorVersion
Definition: Chipset.h:23

mlir::amdgpu::Chipset::parse
static FailureOr< Chipset > parse(StringRef name)
Parses the chipset version string and returns the chipset on success, and failure otherwise.
Definition: Chipset.cpp:14