doxygen/AffineDataCopyGeneration_8cpp_source.html

 //===- AffineDataCopyGeneration.cpp - Explicit memref copying pass ------*-===//

 //

 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

 // See https://llvm.org/LICENSE.txt for license information.

 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

 //

 //===----------------------------------------------------------------------===//

 //

 // This file implements a pass to automatically promote accessed memref regions

 // to buffers in a faster memory space that is explicitly managed, with the

 // necessary data movement operations performed through either regular

 // point-wise load/store's or DMAs. Such explicit copying (also referred to as

 // array packing/unpacking in the literature), when done on arrays that exhibit

 // reuse, results in near elimination of conflict misses, TLB misses, reduced

 // use of hardware prefetch streams, and reduced false sharing. It is also

 // necessary for hardware that explicitly managed levels in the memory

 // hierarchy, and where DMAs may have to be used. This optimization is often

 // performed on already tiled code.

 //

 //===----------------------------------------------------------------------===//


 #include "mlir/Dialect/Affine/Passes.h"


 #include "mlir/Dialect/Affine/Analysis/Utils.h"

 #include "mlir/Dialect/Affine/IR/AffineOps.h"

 #include "mlir/Dialect/Affine/LoopUtils.h"

 #include "mlir/Dialect/Arith/IR/Arith.h"

 #include "mlir/Dialect/Func/IR/FuncOps.h"

 #include "mlir/Dialect/MemRef/IR/MemRef.h"

 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"

 #include "llvm/ADT/MapVector.h"

 #include "llvm/Support/CommandLine.h"

 #include "llvm/Support/Debug.h"

 #include <algorithm>

 #include <optional>


 namespace mlir {

 namespace affine {

 #define GEN_PASS_DEF_AFFINEDATACOPYGENERATION

 #include "mlir/Dialect/Affine/Passes.h.inc"

 } // namespace affine

 } // namespace mlir


 #define DEBUG_TYPE "affine-data-copy-generate"


 using namespace mlir;

 using namespace mlir::affine;


 namespace {


 /// Replaces all loads and stores on memref's living in 'slowMemorySpace' by

 /// introducing copy operations to transfer data into `fastMemorySpace` and

 /// rewriting the original load's/store's to instead load/store from the

 /// allocated fast memory buffers. Additional options specify the identifier

 /// corresponding to the fast memory space and the amount of fast memory space

 /// available. The pass traverses through the nesting structure, recursing to

 /// inner levels if necessary to determine at what depth copies need to be

 /// placed so that the allocated buffers fit within the memory capacity

 /// provided.

 // TODO: We currently can't generate copies correctly when stores

 // are strided. Check for strided stores.

 struct AffineDataCopyGeneration

     : public affine::impl::AffineDataCopyGenerationBase<

           AffineDataCopyGeneration> {

   AffineDataCopyGeneration() = default;

   explicit AffineDataCopyGeneration(unsigned slowMemorySpace,

                                     unsigned fastMemorySpace,

                                     unsigned tagMemorySpace,

                                     int minDmaTransferSize,

                                     uint64_t fastMemCapacityBytes) {

     this->slowMemorySpace = slowMemorySpace;

     this->fastMemorySpace = fastMemorySpace;

     this->tagMemorySpace = tagMemorySpace;

     this->minDmaTransferSize = minDmaTransferSize;

     this->fastMemoryCapacity = fastMemCapacityBytes / 1024;

   }


   void runOnOperation() override;

   void runOnBlock(Block *block, DenseSet<Operation *> &copyNests);


   // Constant zero index to avoid too many duplicates.

   Value zeroIndex = nullptr;

 };


 } // namespace


 /// Generates copies for memref's living in 'slowMemorySpace' into newly created

 /// buffers in 'fastMemorySpace', and replaces memory operations to the former

 /// by the latter.

 std::unique_ptr<OperationPass<func::FuncOp>>

 mlir::affine::createAffineDataCopyGenerationPass(

     unsigned slowMemorySpace, unsigned fastMemorySpace, unsigned tagMemorySpace,

     int minDmaTransferSize, uint64_t fastMemCapacityBytes) {

   return std::make_unique<AffineDataCopyGeneration>(

       slowMemorySpace, fastMemorySpace, tagMemorySpace, minDmaTransferSize,

       fastMemCapacityBytes);

 }

 std::unique_ptr<OperationPass<func::FuncOp>>

 mlir::affine::createAffineDataCopyGenerationPass() {

   return std::make_unique<AffineDataCopyGeneration>();

 }


 /// Generate copies for this block. The block is partitioned into separate

 /// ranges: each range is either a sequence of one or more operations starting

 /// and ending with an affine load or store op, or just an affine.for op (which

 /// could have other affine for op's nested within).

 void AffineDataCopyGeneration::runOnBlock(Block *block,

                                           DenseSet<Operation *> &copyNests) {

   if (block->empty())

     return;


   uint64_t fastMemCapacityBytes =

       fastMemoryCapacity != std::numeric_limits<uint64_t>::max()

           ? fastMemoryCapacity * 1024

           : fastMemoryCapacity;

   AffineCopyOptions copyOptions = {generateDma, slowMemorySpace,

                                    fastMemorySpace, tagMemorySpace,

                                    fastMemCapacityBytes};


   // Every affine.for op in the block starts and ends a block range for copying;

   // in addition, a contiguous sequence of operations starting with a

   // load/store op but not including any copy nests themselves is also

   // identified as a copy block range. Straightline code (a contiguous chunk of

   // operations excluding AffineForOp's) are always assumed to not exhaust

   // memory. As a result, this approach is conservative in some cases at the

   // moment; we do a check later and report an error with location info.


   // Get to the first load, store, or for op (that is not a copy nest itself).

   auto curBegin = llvm::find_if(*block, [&](Operation &op) {

     return isa<AffineLoadOp, AffineStoreOp, AffineForOp>(op) &&

            copyNests.count(&op) == 0;

   });


   // Create [begin, end) ranges.

   auto it = curBegin;

   while (it != block->end()) {

     AffineForOp forOp;

     // If you hit a non-copy for loop, we will split there.

     if ((forOp = dyn_cast<AffineForOp>(&*it)) && copyNests.count(forOp) == 0) {

       // Perform the copying up unti this 'for' op first.

       (void)affineDataCopyGenerate(/*begin=*/curBegin, /*end=*/it, copyOptions,

                                    /*filterMemRef=*/std::nullopt, copyNests);


       // Returns true if the footprint is known to exceed capacity.

       auto exceedsCapacity = [&](AffineForOp forOp) {

         std::optional<int64_t> footprint =

             getMemoryFootprintBytes(forOp,

                                     /*memorySpace=*/0);

         return (footprint.has_value() &&

                 static_cast<uint64_t>(*footprint) > fastMemCapacityBytes);

       };


       // If the memory footprint of the 'affine.for' loop is higher than fast

       // memory capacity (when provided), we recurse to copy at an inner level

       // until we find a depth at which footprint fits in fast mem capacity. If

       // the footprint can't be calculated, we assume for now it fits. Recurse

       // inside if footprint for 'forOp' exceeds capacity, or when

       // skipNonUnitStrideLoops is set and the step size is not one.

       bool recurseInner = skipNonUnitStrideLoops ? forOp.getStep() != 1

                                                  : exceedsCapacity(forOp);

       if (recurseInner) {

         // We'll recurse and do the copies at an inner level for 'forInst'.

         // Recurse onto the body of this loop.

         runOnBlock(forOp.getBody(), copyNests);

       } else {

         // We have enough capacity, i.e., copies will be computed for the

         // portion of the block until 'it', and for 'it', which is 'forOp'. Note

         // that for the latter, the copies are placed just before this loop (for

         // incoming copies) and right after (for outgoing ones).


         // Inner loop copies have their own scope - we don't thus update

         // consumed capacity. The footprint check above guarantees this inner

         // loop's footprint fits.

         (void)affineDataCopyGenerate(/*begin=*/it, /*end=*/std::next(it),

                                      copyOptions,

                                      /*filterMemRef=*/std::nullopt, copyNests);

       }

       // Get to the next load or store op after 'forOp'.

       curBegin = std::find_if(std::next(it), block->end(), [&](Operation &op) {

         return isa<AffineLoadOp, AffineStoreOp, AffineForOp>(op) &&

                copyNests.count(&op) == 0;

       });

       it = curBegin;

     } else {

       assert(copyNests.count(&*it) == 0 &&

              "all copy nests generated should have been skipped above");

       // We simply include this op in the current range and continue for more.

       ++it;

     }

   }


   // Generate the copy for the final block range.

   if (curBegin != block->end()) {

     // Can't be a terminator because it would have been skipped above.

     assert(!curBegin->hasTrait<OpTrait::IsTerminator>() &&

            "can't be a terminator");

     // Exclude the affine.yield - hence, the std::prev.

     (void)affineDataCopyGenerate(/*begin=*/curBegin,

                                  /*end=*/std::prev(block->end()), copyOptions,

                                  /*filterMemRef=*/std::nullopt, copyNests);

   }

 }


 void AffineDataCopyGeneration::runOnOperation() {

   func::FuncOp f = getOperation();

   OpBuilder topBuilder(f.getBody());

   zeroIndex = topBuilder.create<arith::ConstantIndexOp>(f.getLoc(), 0);


   // Nests that are copy-in's or copy-out's; the root AffineForOps of those

   // nests are stored herein.

   DenseSet<Operation *> copyNests;


   // Clear recorded copy nests.

   copyNests.clear();


   for (auto &block : f)

     runOnBlock(&block, copyNests);


   // Promote any single iteration loops in the copy nests and collect

   // load/stores to simplify.

   SmallVector<Operation *, 4> copyOps;

   for (Operation *nest : copyNests)

     // With a post order walk, the erasure of loops does not affect

     // continuation of the walk or the collection of load/store ops.

     nest->walk([&](Operation *op) {

       if (auto forOp = dyn_cast<AffineForOp>(op))

         (void)promoteIfSingleIteration(forOp);

       else if (isa<AffineLoadOp, AffineStoreOp>(op))

         copyOps.push_back(op);

     });


   // Promoting single iteration loops could lead to simplification of

   // contained load's/store's, and the latter could anyway also be

   // canonicalized.

   RewritePatternSet patterns(&getContext());

   AffineLoadOp::getCanonicalizationPatterns(patterns, &getContext());

   AffineStoreOp::getCanonicalizationPatterns(patterns, &getContext());

   FrozenRewritePatternSet frozenPatterns(std::move(patterns));

   (void)applyOpPatternsGreedily(

       copyOps, frozenPatterns,

       GreedyRewriteConfig().setStrictness(

           GreedyRewriteStrictness::ExistingAndNewOps));

 }

AffineOps.h

Utils.h

Passes.h

FuncOps.h

GreedyPatternRewriteDriver.h

getContext
static MLIRContext * getContext(OpFoldResult val)
Definition: IndexingUtils.cpp:296

LoopUtils.h

max
static Value max(ImplicitLocOpBuilder &builder, Value value, Value bound)
Definition: PolynomialApproximation.cpp:213

llvm::DenseSet
Definition: LLVM.h:59

llvm::SmallVector
Definition: LLVM.h:72

mlir::Block
Block represents an ordered list of Operations.
Definition: Block.h:33

mlir::Block::empty
bool empty()
Definition: Block.h:148

mlir::Block::end
iterator end()
Definition: Block.h:144

mlir::FrozenRewritePatternSet
This class represents a frozen set of patterns that can be processed by a pattern applicator.
Definition: FrozenRewritePatternSet.h:24

mlir::GreedyRewriteConfig
This class allows control over how the GreedyPatternRewriteDriver works.
Definition: GreedyPatternRewriteDriver.h:43

mlir::GreedyRewriteConfig::setStrictness
GreedyRewriteConfig & setStrictness(GreedyRewriteStrictness mode)
Definition: GreedyPatternRewriteDriver.h:113

mlir::OpBuilder
This class helps build Operations.
Definition: Builders.h:205

mlir::OpTrait::IsTerminator
This class provides the API for ops that are known to be terminators.
Definition: OpDefinition.h:772

mlir::Operation
Operation is the basic unit of execution within MLIR.
Definition: Operation.h:88

mlir::RewritePatternSet
Definition: PatternMatch.h:771

mlir::Value
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition: Value.h:96

Arith.h

MemRef.h

mlir::affine
Definition: AffineToStandard.h:23

mlir::affine::promoteIfSingleIteration
LogicalResult promoteIfSingleIteration(AffineForOp forOp)
Promotes the loop body of a AffineForOp to its containing block if the loop was known to have a singl...
Definition: LoopUtils.cpp:118

mlir::affine::affineDataCopyGenerate
LogicalResult affineDataCopyGenerate(Block::iterator begin, Block::iterator end, const AffineCopyOptions &copyOptions, std::optional< Value > filterMemRef, DenseSet< Operation * > &copyNests)
Performs explicit copying for the contiguous sequence of operations in the block iterator range [‘beg...
Definition: LoopUtils.cpp:2306

mlir::affine::createAffineDataCopyGenerationPass
std::unique_ptr< OperationPass< func::FuncOp > > createAffineDataCopyGenerationPass(unsigned slowMemorySpace, unsigned fastMemorySpace, unsigned tagMemorySpace=0, int minDmaTransferSize=1024, uint64_t fastMemCapacityBytes=std::numeric_limits< uint64_t >::max())
Performs packing (or explicit copying) of accessed memref regions into buffers in the specified faste...
Definition: AffineDataCopyGeneration.cpp:91

mlir::affine::getMemoryFootprintBytes
std::optional< int64_t > getMemoryFootprintBytes(AffineForOp forOp, int memorySpace=-1)
Gets the memory footprint of all data touched in the specified memory space in bytes; if the memory s...
Definition: Utils.cpp:2086

mlir
Include the generated interface declarations.
Definition: LocalAliasAnalysis.h:20

mlir::applyOpPatternsGreedily
LogicalResult applyOpPatternsGreedily(ArrayRef< Operation * > ops, const FrozenRewritePatternSet &patterns, GreedyRewriteConfig config=GreedyRewriteConfig(), bool *changed=nullptr, bool *allErased=nullptr)
Rewrite the specified ops by repeatedly applying the highest benefit patterns in a greedy worklist dr...
Definition: GreedyPatternRewriteDriver.cpp:1016

mlir::patterns
const FrozenRewritePatternSet & patterns
Definition: GreedyPatternRewriteDriver.h:283

mlir::GreedyRewriteStrictness::ExistingAndNewOps
@ ExistingAndNewOps
Only pre-existing and newly created ops are processed.

mlir::affine::AffineCopyOptions
Explicit copy / DMA generation options for mlir::affineDataCopyGenerate.
Definition: LoopUtils.h:165