24 #include "llvm/Support/Debug.h"
25 #include "llvm/Support/raw_ostream.h"
28 #define DEBUG_TYPE "affine-fusion-utils"
38 if (
auto loadOp = dyn_cast<AffineReadOpInterface>(op)) {
39 if (values.count(loadOp.getMemRef()) == 0)
40 values[loadOp.getMemRef()] =
false;
41 }
else if (
auto storeOp = dyn_cast<AffineWriteOpInterface>(op)) {
42 values[storeOp.getMemRef()] =
true;
52 if (
auto loadOp = dyn_cast<AffineReadOpInterface>(op))
53 return values.count(loadOp.getMemRef()) > 0 && values[loadOp.getMemRef()];
54 if (
auto storeOp = dyn_cast<AffineWriteOpInterface>(op))
55 return values.count(storeOp.getMemRef()) > 0;
103 if (isa<AffineReadOpInterface, AffineWriteOpInterface>(op)) {
115 if (llvm::is_contained(loops, cast<AffineForOp>(opB))) {
133 AffineForOp dstForOp) {
135 auto forOpA = isSrcForOpBeforeDstForOp ? srcForOp : dstForOp;
136 auto forOpB = isSrcForOpBeforeDstForOp ? dstForOp : srcForOp;
154 if (firstDepOpA->
isBeforeInBlock(lastDepOpB) || firstDepOpA == lastDepOpB)
172 bool hasIfOp =
false;
174 if (isa<AffineReadOpInterface, AffineWriteOpInterface>(op))
175 loadAndStoreOps.push_back(op);
176 else if (isa<AffineIfOp>(op))
199 auto loadOp = dyn_cast<AffineReadOpInterface>(dstOp);
200 Value memref = loadOp ? loadOp.getMemRef()
201 : cast<AffineWriteOpInterface>(dstOp).getMemRef();
202 if (producerConsumerMemrefs.count(memref) > 0)
203 targetDstOps.push_back(dstOp);
206 assert(!targetDstOps.empty() &&
207 "No dependences between 'srcForOp' and 'dstForOp'?");
213 if (all_of(targetDstOps, llvm::IsaPred<AffineReadOpInterface>))
218 for (
unsigned i = 0, e = targetDstOps.size(); i < e; ++i) {
221 for (
unsigned j = 0;
j < e; ++
j) {
222 auto *dstOpInst = targetDstOps[
j];
225 unsigned numCommonLoops =
227 for (
unsigned d = 1; d <= numCommonLoops + 1; ++d) {
234 loopDepth =
std::min(loopDepth, d - 1);
248 AffineForOp dstForOp,
249 unsigned dstLoopDepth,
253 if (dstLoopDepth == 0) {
254 LLVM_DEBUG(llvm::dbgs() <<
"Cannot fuse loop nests at depth 0\n");
258 auto *block = srcForOp->getBlock();
259 if (block != dstForOp->getBlock()) {
260 LLVM_DEBUG(llvm::dbgs() <<
"Cannot fuse loop nests in different blocks\n");
267 LLVM_DEBUG(llvm::dbgs() <<
"Fusion would violate dependences in block\n");
272 bool isSrcForOpBeforeDstForOp = srcForOp->isBeforeInBlock(dstForOp);
274 auto forOpA = isSrcForOpBeforeDstForOp ? srcForOp : dstForOp;
275 auto forOpB = isSrcForOpBeforeDstForOp ? dstForOp : srcForOp;
280 LLVM_DEBUG(llvm::dbgs() <<
"Fusing loops with affine.if unsupported\n");
287 LLVM_DEBUG(llvm::dbgs() <<
"Fusing loops with affine.if unsupported\n");
297 assert(isSrcForOpBeforeDstForOp &&
"Unexpected forward slice fusion");
299 LLVM_DEBUG(llvm::dbgs() <<
"Fusion would violate loop dependences\n");
305 unsigned numCommonLoops =
315 strategyOpsA.append(opsA.begin(), opsA.end());
321 if (isa<AffineWriteOpInterface>(op))
322 strategyOpsA.push_back(op);
329 auto load = dyn_cast<AffineReadOpInterface>(op);
331 strategyOpsA.push_back(op);
339 strategyOpsA, opsB, dstLoopDepth, numCommonLoops,
340 isSrcForOpBeforeDstForOp, srcSlice);
342 LLVM_DEBUG(llvm::dbgs() <<
"computeSliceUnion failed\n");
345 if (sliceComputationResult.
value ==
347 LLVM_DEBUG(llvm::dbgs() <<
"Incorrect slice computation\n");
357 bool siblingFusionUser) {
360 if (!tripCount || *tripCount != 1)
362 auto *parentOp = forOp->getParentOp();
363 if (!isa<AffineForOp>(parentOp))
366 llvm::append_range(newOperands,
367 forOp.getBody()->getTerminator()->getOperands());
369 int64_t parentOpNumResults = parentOp->getNumResults();
371 AffineForOp parentForOp = forOp->getParentOfType<AffineForOp>();
372 AffineForOp newLoop =
373 cast<AffineForOp>(*parentForOp.replaceWithAdditionalYields(
374 rewriter, forOp.getInits(),
false,
384 if (siblingFusionUser) {
385 for (
unsigned i = 0, e = forOp.getNumResults(); i != e; ++i) {
388 forwardSlice.set_union(tmpForwardSlice);
392 for (
unsigned i = 0, e = forOp.getNumResults(); i != e; ++i) {
393 forOp.getResult(i).replaceAllUsesWith(
394 newLoop.getResult(i + parentOpNumResults));
398 if (siblingFusionUser) {
400 for (
Operation *op : llvm::reverse(forwardSlice))
401 op->moveAfter(newLoop);
404 auto iv = forOp.getInductionVar();
405 iv.replaceAllUsesWith(newLoop.getInductionVar());
407 auto forOpIterArgs = forOp.getRegionIterArgs();
408 for (
auto it : llvm::zip(forOpIterArgs, newLoop.getRegionIterArgs().take_back(
409 forOpIterArgs.size()))) {
410 std::get<0>(it).replaceAllUsesWith(std::get<1>(it));
414 forOp.getBody()->back().erase();
415 auto *parentBlock = forOp->getBlock();
417 forOp.getBody()->getOperations());
426 bool isInnermostSiblingInsertion) {
430 b.
clone(*srcForOp, mapper);
434 for (
unsigned i = 0, e = srcSlice.
ivs.size(); i < e; ++i) {
439 sliceLoops.push_back(forOp);
443 forOp.setLowerBound(lbOperands, lbMap);
448 forOp.setUpperBound(ubOperands, ubMap);
452 llvm::SmallDenseMap<Operation *, uint64_t, 8> sliceTripCountMap;
453 auto srcIsUnitSlice = [&]() {
458 for (AffineForOp forOp : sliceLoops) {
460 isInnermostSiblingInsertion && srcIsUnitSlice())
475 auto walkResult = forOpRoot.walk([&](AffineForOp forOp) {
476 auto *childForOp = forOp.getOperation();
477 auto *parentForOp = forOp->getParentOp();
478 if (forOp != forOpRoot) {
479 if (!isa<AffineForOp>(parentForOp)) {
480 LLVM_DEBUG(llvm::dbgs() <<
"Expected parent AffineForOp\n");
484 stats->
loopMap[parentForOp].push_back(forOp);
490 for (
auto &op : *forOp.getBody()) {
491 if (!isa<AffineForOp, AffineIfOp>(op))
499 if (!maybeConstTripCount) {
501 LLVM_DEBUG(llvm::dbgs() <<
"Non-constant trip count unsupported\n");
508 return !walkResult.wasInterrupted();
528 llvm::SmallDenseMap<Operation *, uint64_t, 8> *tripCountOverrideMap,
532 int64_t opCount = stats.
opCountMap[forOp] - 1;
533 if (stats.
loopMap.count(forOp) > 0) {
534 for (
auto childForOp : stats.
loopMap[forOp]) {
540 if (computeCostMap) {
541 auto it = computeCostMap->find(forOp);
542 if (it != computeCostMap->end()) {
543 opCount += it->second;
548 if (tripCountOverrideMap) {
549 auto it = tripCountOverrideMap->find(forOp);
550 if (it != tripCountOverrideMap->end()) {
551 tripCount = it->second;
555 return tripCount * opCount;
575 AffineForOp dstForOp,
578 int64_t *computeCost) {
579 llvm::SmallDenseMap<Operation *, uint64_t, 8> sliceTripCountMap;
587 assert(sliceIterationCount > 0);
588 bool storeLoadFwdGuaranteed = (sliceIterationCount == 1);
589 auto *insertPointParent = slice.
insertPoint->getParentOp();
592 if (storeLoadFwdGuaranteed) {
595 unsigned storeCount = 0;
596 llvm::SmallDenseSet<Value, 4> storeMemrefs;
597 srcForOp.walk([&](AffineWriteOpInterface storeOp) {
598 storeMemrefs.insert(storeOp.getMemRef());
603 computeCostMap[insertPointParent] = -storeCount;
606 for (
Value memref : storeMemrefs) {
608 if (!isa<AffineReadOpInterface>(user))
614 if (llvm::is_contained(loops, cast<AffineForOp>(insertPointParent))) {
615 if (
auto forOp = dyn_cast_or_null<AffineForOp>(user->getParentOp())) {
616 if (computeCostMap.count(forOp) == 0)
617 computeCostMap[forOp] = 0;
618 computeCostMap[forOp] -= 1;
627 srcForOp, srcStats, &sliceTripCountMap, &computeCostMap);
630 computeCostMap[insertPointParent] = sliceComputeCost;
634 nullptr, &computeCostMap);
647 if (
auto storeOp = dyn_cast<AffineWriteOpInterface>(op))
648 srcStoreMemRefs.insert(storeOp.getMemRef());
653 if (
auto loadOp = dyn_cast<AffineReadOpInterface>(op))
654 if (srcStoreMemRefs.count(loadOp.getMemRef()) > 0)
655 producerConsumerMemrefs.insert(loadOp.getMemRef());
static void getLoadAndStoreMemRefAccesses(Operation *opA, DenseMap< Value, bool > &values)
static bool gatherLoadsAndStores(AffineForOp forOp, SmallVectorImpl< Operation * > &loadAndStoreOps)
static Operation * getLastDependentOpInRange(Operation *opA, Operation *opB)
static int64_t getComputeCostHelper(Operation *forOp, LoopNestStats &stats, llvm::SmallDenseMap< Operation *, uint64_t, 8 > *tripCountOverrideMap, DenseMap< Operation *, int64_t > *computeCostMap)
static Operation * getFirstDependentOpInRange(Operation *opA, Operation *opB)
static bool isDependentLoadOrStoreOp(Operation *op, DenseMap< Value, bool > &values)
Returns true if 'op' is a load or store operation which access a memref accessed 'values' and at leas...
static LogicalResult promoteSingleIterReductionLoop(AffineForOp forOp, bool siblingFusionUser)
Patch the loop body of a forOp that is a single iteration reduction loop into its containing block.
static Operation * getFusedLoopNestInsertionPoint(AffineForOp srcForOp, AffineForOp dstForOp)
static unsigned getMaxLoopDepth(ArrayRef< Operation * > srcOps, ArrayRef< Operation * > dstOps)
Returns the maximum loop depth at which we could fuse producer loop 'srcForOp' into consumer loop 'ds...
static Value min(ImplicitLocOpBuilder &builder, Value value, Value bound)
A multi-dimensional affine map Affine map's are immutable like Type's, and they are uniqued.
OpListType::iterator iterator
OpListType::reverse_iterator reverse_iterator
This is a utility class for mapping one set of IR entities to another.
auto lookupOrNull(T from) const
Lookup a mapped value within the map.
This class coordinates rewriting a piece of IR outside of a pattern rewrite, providing a way to keep ...
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
This class helps build Operations.
Operation * clone(Operation &op, IRMapping &mapper)
Creates a deep copy of the specified operation, remapping any operands that use values outside of the...
Operation is the basic unit of execution within MLIR.
bool isBeforeInBlock(Operation *other)
Given an operation 'other' that is within the same parent block, return whether the current operation...
std::enable_if_t< llvm::function_traits< std::decay_t< FnT > >::num_args==1, RetT > walk(FnT &&callback)
Walk the operation by calling the callback for each nested operation (including this one),...
user_range getUsers()
Returns a range of all users.
result_range getResults()
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
static WalkResult advance()
static WalkResult interrupt()
Describes the fusion strategy to be used in the Affine loop fusion utilities.
StrategyEnum getStrategy() const
Returns the fusion strategy.
Value getSiblingFusionMemRef() const
Returns the memref attached to this sibling fusion strategy.
std::optional< uint64_t > getConstantTripCount(AffineForOp forOp)
Returns the trip count of the loop if it's a constant, std::nullopt otherwise.
LogicalResult promoteIfSingleIteration(AffineForOp forOp)
Promotes the loop body of a AffineForOp to its containing block if the loop was known to have a singl...
bool getFusionComputeCost(AffineForOp srcForOp, LoopNestStats &srcStats, AffineForOp dstForOp, LoopNestStats &dstStats, const ComputationSliceState &slice, int64_t *computeCost)
Computes and returns in 'computeCost', the total compute cost of fusing the 'slice' of the loop nest ...
SliceComputationResult computeSliceUnion(ArrayRef< Operation * > opsA, ArrayRef< Operation * > opsB, unsigned loopDepth, unsigned numCommonLoops, bool isBackwardSlice, ComputationSliceState *sliceUnion)
Computes in 'sliceUnion' the union of all slice bounds computed at 'loopDepth' between all dependent ...
bool isLoopParallelAndContainsReduction(AffineForOp forOp)
Returns whether a loop is a parallel loop and contains a reduction loop.
unsigned getNumCommonSurroundingLoops(Operation &a, Operation &b)
Returns the number of surrounding loops common to both A and B.
void gatherProducerConsumerMemrefs(ArrayRef< Operation * > srcOps, ArrayRef< Operation * > dstOps, DenseSet< Value > &producerConsumerMemrefs)
Returns in 'producerConsumerMemrefs' the memrefs involved in a producer-consumer dependence between w...
AffineForOp getForInductionVarOwner(Value val)
Returns the loop parent of an induction variable.
DependenceResult checkMemrefAccessDependence(const MemRefAccess &srcAccess, const MemRefAccess &dstAccess, unsigned loopDepth, FlatAffineValueConstraints *dependenceConstraints=nullptr, SmallVector< DependenceComponent, 2 > *dependenceComponents=nullptr, bool allowRAR=false)
void canonicalizeMapAndOperands(AffineMap *map, SmallVectorImpl< Value > *operands)
Modifies both map and operands in-place so as to:
int64_t getComputeCost(AffineForOp forOp, LoopNestStats &stats)
Computes the total cost of the loop nest rooted at 'forOp' using 'stats'.
void fuseLoops(AffineForOp srcForOp, AffineForOp dstForOp, const ComputationSliceState &srcSlice, bool isInnermostSiblingInsertionFusion=false)
Fuses 'srcForOp' into 'dstForOp' with destination loop block insertion point and source slice loop bo...
void getAffineForIVs(Operation &op, SmallVectorImpl< AffineForOp > *loops)
Populates 'loops' with IVs of the affine.for ops surrounding 'op' ordered from the outermost 'affine....
unsigned getInnermostCommonLoopDepth(ArrayRef< Operation * > ops, SmallVectorImpl< AffineForOp > *surroundingLoops=nullptr)
Returns the innermost common loop depth for the set of operations in 'ops'.
bool getLoopNestStats(AffineForOp forOp, LoopNestStats *stats)
Collect loop nest statistics (eg.
bool hasDependence(DependenceResult result)
Utility function that returns true if the provided DependenceResult corresponds to a dependence resul...
uint64_t getSliceIterationCount(const llvm::SmallDenseMap< Operation *, uint64_t, 8 > &sliceTripCountMap)
Return the number of iterations for the slicetripCountMap provided.
FusionResult canFuseLoops(AffineForOp srcForOp, AffineForOp dstForOp, unsigned dstLoopDepth, ComputationSliceState *srcSlice, FusionStrategy fusionStrategy=FusionStrategy::Generic)
Checks the feasibility of fusing the loop nest rooted at 'srcForOp' into the loop nest rooted at 'dst...
bool buildSliceTripCountMap(const ComputationSliceState &slice, llvm::SmallDenseMap< Operation *, uint64_t, 8 > *tripCountMap)
Builds a map 'tripCountMap' from AffineForOp to constant trip count for loop nest surrounding represe...
Include the generated interface declarations.
SetVector< Operation * > topologicalSort(const SetVector< Operation * > &toSort)
Sorts all operations in toSort topologically while also considering region semantics.
void getForwardSlice(Operation *op, SetVector< Operation * > *forwardSlice, const ForwardSliceOptions &options={})
Fills forwardSlice with the computed forward slice (i.e.
ComputationSliceState aggregates loop IVs, loop bound AffineMaps and their associated operands for a ...
SmallVector< Value, 4 > ivs
std::vector< SmallVector< Value, 4 > > ubOperands
SmallVector< AffineMap, 4 > ubs
SmallVector< AffineMap, 4 > lbs
Block::iterator insertPoint
std::vector< SmallVector< Value, 4 > > lbOperands
Checks whether two accesses to the same memref access the same element.
LoopNestStats aggregates various per-loop statistics (eg.
DenseMap< Operation *, uint64_t > opCountMap
Map from AffineForOp to count of operations in its loop body.
DenseMap< Operation *, SmallVector< AffineForOp, 2 > > loopMap
Map from AffineForOp to immediate child AffineForOps in its loop body.
DenseMap< Operation *, uint64_t > tripCountMap
Map from AffineForOp to its constant trip count.
Encapsulates a memref load or store access information.
Enumerates different result statuses of slice computation by computeSliceUnion
enum mlir::affine::SliceComputationResult::ResultEnum value
Eliminates variable at the specified position using Fourier-Motzkin variable elimination.