23 #include "llvm/Support/Debug.h"
24 #include "llvm/Support/raw_ostream.h"
27 #define DEBUG_TYPE "loop-fusion-utils"
37 if (
auto loadOp = dyn_cast<AffineReadOpInterface>(op)) {
38 if (values.count(loadOp.getMemRef()) == 0)
39 values[loadOp.getMemRef()] =
false;
40 }
else if (
auto storeOp = dyn_cast<AffineWriteOpInterface>(op)) {
41 values[storeOp.getMemRef()] =
true;
51 if (
auto loadOp = dyn_cast<AffineReadOpInterface>(op)) {
52 return values.count(loadOp.getMemRef()) > 0 && values[loadOp.getMemRef()];
54 if (
auto storeOp = dyn_cast<AffineWriteOpInterface>(op)) {
55 return values.count(storeOp.getMemRef()) > 0;
104 if (isa<AffineReadOpInterface, AffineWriteOpInterface>(op)) {
116 if (llvm::is_contained(loops, cast<AffineForOp>(opB))) {
134 AffineForOp dstForOp) {
136 auto forOpA = isSrcForOpBeforeDstForOp ? srcForOp : dstForOp;
137 auto forOpB = isSrcForOpBeforeDstForOp ? dstForOp : srcForOp;
153 if (firstDepOpA !=
nullptr) {
154 if (lastDepOpB !=
nullptr) {
155 if (firstDepOpA->
isBeforeInBlock(lastDepOpB) || firstDepOpA == lastDepOpB)
173 bool hasIfOp =
false;
175 if (isa<AffineReadOpInterface, AffineWriteOpInterface>(op))
176 loadAndStoreOps.push_back(op);
177 else if (isa<AffineIfOp>(op))
200 auto loadOp = dyn_cast<AffineReadOpInterface>(dstOp);
201 Value memref = loadOp ? loadOp.getMemRef()
202 : cast<AffineWriteOpInterface>(dstOp).getMemRef();
203 if (producerConsumerMemrefs.count(memref) > 0)
204 targetDstOps.push_back(dstOp);
207 assert(!targetDstOps.empty() &&
208 "No dependences between 'srcForOp' and 'dstForOp'?");
214 if (all_of(targetDstOps,
215 [&](
Operation *op) {
return isa<AffineReadOpInterface>(op); }))
220 for (
unsigned i = 0, e = targetDstOps.size(); i < e; ++i) {
221 auto *srcOpInst = targetDstOps[i];
223 for (
unsigned j = 0;
j < e; ++
j) {
224 auto *dstOpInst = targetDstOps[
j];
227 unsigned numCommonLoops =
229 for (
unsigned d = 1; d <= numCommonLoops + 1; ++d) {
236 loopDepth =
std::min(loopDepth, d - 1);
251 AffineForOp dstForOp,
252 unsigned dstLoopDepth,
256 if (dstLoopDepth == 0) {
257 LLVM_DEBUG(llvm::dbgs() <<
"Cannot fuse loop nests at depth 0\n");
261 auto *block = srcForOp->getBlock();
262 if (block != dstForOp->getBlock()) {
263 LLVM_DEBUG(llvm::dbgs() <<
"Cannot fuse loop nests in different blocks\n");
270 LLVM_DEBUG(llvm::dbgs() <<
"Fusion would violate dependences in block\n");
275 bool isSrcForOpBeforeDstForOp = srcForOp->isBeforeInBlock(dstForOp);
277 auto forOpA = isSrcForOpBeforeDstForOp ? srcForOp : dstForOp;
278 auto forOpB = isSrcForOpBeforeDstForOp ? dstForOp : srcForOp;
283 LLVM_DEBUG(llvm::dbgs() <<
"Fusing loops with affine.if unsupported\n");
290 LLVM_DEBUG(llvm::dbgs() <<
"Fusing loops with affine.if unsupported\n");
300 assert(isSrcForOpBeforeDstForOp &&
"Unexpected forward slice fusion");
302 LLVM_DEBUG(llvm::dbgs() <<
"Fusion would violate loop dependences\n");
308 unsigned numCommonLoops =
318 strategyOpsA.append(opsA.begin(), opsA.end());
324 if (isa<AffineWriteOpInterface>(op))
325 strategyOpsA.push_back(op);
332 auto load = dyn_cast<AffineReadOpInterface>(op);
334 strategyOpsA.push_back(op);
342 strategyOpsA, opsB, dstLoopDepth, numCommonLoops,
343 isSrcForOpBeforeDstForOp, srcSlice);
345 LLVM_DEBUG(llvm::dbgs() <<
"computeSliceUnion failed\n");
348 if (sliceComputationResult.
value ==
350 LLVM_DEBUG(llvm::dbgs() <<
"Incorrect slice computation\n");
360 bool siblingFusionUser) {
363 if (!tripCount || *tripCount != 1)
365 auto *parentOp = forOp->getParentOp();
366 if (!isa<AffineForOp>(parentOp))
369 llvm::append_range(newOperands,
370 forOp.getBody()->getTerminator()->getOperands());
372 int64_t parentOpNumResults = parentOp->getNumResults();
374 AffineForOp parentForOp = forOp->getParentOfType<AffineForOp>();
375 AffineForOp newLoop =
376 cast<AffineForOp>(*parentForOp.replaceWithAdditionalYields(
377 rewriter, forOp.getInits(),
false,
387 if (siblingFusionUser) {
388 for (
unsigned i = 0, e = forOp.getNumResults(); i != e; ++i) {
391 forwardSlice.set_union(tmpForwardSlice);
395 for (
unsigned i = 0, e = forOp.getNumResults(); i != e; ++i) {
396 forOp.getResult(i).replaceAllUsesWith(
397 newLoop.getResult(i + parentOpNumResults));
401 if (siblingFusionUser) {
403 for (
Operation *op : llvm::reverse(forwardSlice))
407 auto iv = forOp.getInductionVar();
408 iv.replaceAllUsesWith(newLoop.getInductionVar());
410 auto forOpIterArgs = forOp.getRegionIterArgs();
411 for (
auto it : llvm::zip(forOpIterArgs, newLoop.getRegionIterArgs().take_back(
412 forOpIterArgs.size()))) {
413 std::get<0>(it).replaceAllUsesWith(std::get<1>(it));
417 forOp.getBody()->back().erase();
418 auto *parentBlock = forOp->getBlock();
420 forOp.getBody()->getOperations());
429 bool isInnermostSiblingInsertion) {
433 b.
clone(*srcForOp, mapper);
437 for (
unsigned i = 0, e = srcSlice.
ivs.size(); i < e; ++i) {
442 sliceLoops.push_back(forOp);
446 forOp.setLowerBound(lbOperands, lbMap);
451 forOp.setUpperBound(ubOperands, ubMap);
455 llvm::SmallDenseMap<Operation *, uint64_t, 8> sliceTripCountMap;
456 auto srcIsUnitSlice = [&]() {
461 for (AffineForOp forOp : sliceLoops) {
463 isInnermostSiblingInsertion && srcIsUnitSlice())
478 auto walkResult = forOpRoot.walk([&](AffineForOp forOp) {
479 auto *childForOp = forOp.getOperation();
480 auto *parentForOp = forOp->getParentOp();
481 if (forOp != forOpRoot) {
482 if (!isa<AffineForOp>(parentForOp)) {
483 LLVM_DEBUG(llvm::dbgs() <<
"Expected parent AffineForOp\n");
487 stats->
loopMap[parentForOp].push_back(forOp);
493 for (
auto &op : *forOp.getBody()) {
494 if (!isa<AffineForOp, AffineIfOp>(op))
502 if (!maybeConstTripCount) {
504 LLVM_DEBUG(llvm::dbgs() <<
"Non-constant trip count unsupported\n");
511 return !walkResult.wasInterrupted();
531 llvm::SmallDenseMap<Operation *, uint64_t, 8> *tripCountOverrideMap,
535 int64_t opCount = stats.
opCountMap[forOp] - 1;
536 if (stats.
loopMap.count(forOp) > 0) {
537 for (
auto childForOp : stats.
loopMap[forOp]) {
543 if (computeCostMap !=
nullptr) {
544 auto it = computeCostMap->find(forOp);
545 if (it != computeCostMap->end()) {
546 opCount += it->second;
551 if (tripCountOverrideMap !=
nullptr) {
552 auto it = tripCountOverrideMap->find(forOp);
553 if (it != tripCountOverrideMap->end()) {
554 tripCount = it->second;
558 return tripCount * opCount;
578 AffineForOp dstForOp,
581 int64_t *computeCost) {
582 llvm::SmallDenseMap<Operation *, uint64_t, 8> sliceTripCountMap;
590 assert(sliceIterationCount > 0);
591 bool storeLoadFwdGuaranteed = (sliceIterationCount == 1);
592 auto *insertPointParent = slice.
insertPoint->getParentOp();
596 if (storeLoadFwdGuaranteed) {
599 unsigned storeCount = 0;
600 llvm::SmallDenseSet<Value, 4> storeMemrefs;
602 if (
auto storeOp = dyn_cast<AffineWriteOpInterface>(op)) {
603 storeMemrefs.insert(storeOp.getMemRef());
609 computeCostMap[insertPointParent] = -storeCount;
612 for (
Value memref : storeMemrefs) {
613 for (
auto *user : memref.getUsers()) {
614 if (dyn_cast<AffineReadOpInterface>(user)) {
619 if (llvm::is_contained(loops, cast<AffineForOp>(insertPointParent))) {
621 dyn_cast_or_null<AffineForOp>(user->getParentOp())) {
622 if (computeCostMap.count(forOp) == 0)
623 computeCostMap[forOp] = 0;
624 computeCostMap[forOp] -= 1;
634 srcForOp, srcStats, &sliceTripCountMap, &computeCostMap);
637 computeCostMap[insertPointParent] = sliceComputeCost;
641 nullptr, &computeCostMap);
654 if (
auto storeOp = dyn_cast<AffineWriteOpInterface>(op))
655 srcStoreMemRefs.insert(storeOp.getMemRef());
660 if (
auto loadOp = dyn_cast<AffineReadOpInterface>(op))
661 if (srcStoreMemRefs.count(loadOp.getMemRef()) > 0)
662 producerConsumerMemrefs.insert(loadOp.getMemRef());
static void getLoadAndStoreMemRefAccesses(Operation *opA, DenseMap< Value, bool > &values)
static bool gatherLoadsAndStores(AffineForOp forOp, SmallVectorImpl< Operation * > &loadAndStoreOps)
static Operation * getLastDependentOpInRange(Operation *opA, Operation *opB)
static int64_t getComputeCostHelper(Operation *forOp, LoopNestStats &stats, llvm::SmallDenseMap< Operation *, uint64_t, 8 > *tripCountOverrideMap, DenseMap< Operation *, int64_t > *computeCostMap)
static Operation * getFirstDependentOpInRange(Operation *opA, Operation *opB)
static bool isDependentLoadOrStoreOp(Operation *op, DenseMap< Value, bool > &values)
Returns true if 'op' is a load or store operation which access a memref accessed 'values' and at leas...
static LogicalResult promoteSingleIterReductionLoop(AffineForOp forOp, bool siblingFusionUser)
Patch the loop body of a forOp that is a single iteration reduction loop into its containing block.
static Operation * getFusedLoopNestInsertionPoint(AffineForOp srcForOp, AffineForOp dstForOp)
static unsigned getMaxLoopDepth(ArrayRef< Operation * > srcOps, ArrayRef< Operation * > dstOps)
Returns the maximum loop depth at which we could fuse producer loop 'srcForOp' into consumer loop 'ds...
static Value min(ImplicitLocOpBuilder &builder, Value value, Value bound)
A multi-dimensional affine map Affine map's are immutable like Type's, and they are uniqued.
OpListType::iterator iterator
OpListType::reverse_iterator reverse_iterator
This is a utility class for mapping one set of IR entities to another.
auto lookupOrNull(T from) const
Lookup a mapped value within the map.
This class coordinates rewriting a piece of IR outside of a pattern rewrite, providing a way to keep ...
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
This class helps build Operations.
Operation * clone(Operation &op, IRMapping &mapper)
Creates a deep copy of the specified operation, remapping any operands that use values outside of the...
Operation is the basic unit of execution within MLIR.
bool isBeforeInBlock(Operation *other)
Given an operation 'other' that is within the same parent block, return whether the current operation...
std::enable_if_t< llvm::function_traits< std::decay_t< FnT > >::num_args==1, RetT > walk(FnT &&callback)
Walk the operation by calling the callback for each nested operation (including this one),...
user_range getUsers()
Returns a range of all users.
result_range getResults()
void moveAfter(Operation *existingOp)
Unlink this operation from its current block and insert it right after existingOp which may be in the...
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
static WalkResult advance()
static WalkResult interrupt()
Describes the fusion strategy to be used in the Affine loop fusion utilities.
StrategyEnum getStrategy() const
Returns the fusion strategy.
Value getSiblingFusionMemRef() const
Returns the memref attached to this sibling fusion strategy.
std::optional< uint64_t > getConstantTripCount(AffineForOp forOp)
Returns the trip count of the loop if it's a constant, std::nullopt otherwise.
LogicalResult promoteIfSingleIteration(AffineForOp forOp)
Promotes the loop body of a AffineForOp to its containing block if the loop was known to have a singl...
bool getFusionComputeCost(AffineForOp srcForOp, LoopNestStats &srcStats, AffineForOp dstForOp, LoopNestStats &dstStats, const ComputationSliceState &slice, int64_t *computeCost)
Computes and returns in 'computeCost', the total compute cost of fusing the 'slice' of the loop nest ...
SliceComputationResult computeSliceUnion(ArrayRef< Operation * > opsA, ArrayRef< Operation * > opsB, unsigned loopDepth, unsigned numCommonLoops, bool isBackwardSlice, ComputationSliceState *sliceUnion)
Computes in 'sliceUnion' the union of all slice bounds computed at 'loopDepth' between all dependent ...
bool isLoopParallelAndContainsReduction(AffineForOp forOp)
Returns whether a loop is a parallel loop and contains a reduction loop.
unsigned getNumCommonSurroundingLoops(Operation &a, Operation &b)
Returns the number of surrounding loops common to both A and B.
void gatherProducerConsumerMemrefs(ArrayRef< Operation * > srcOps, ArrayRef< Operation * > dstOps, DenseSet< Value > &producerConsumerMemrefs)
Returns in 'producerConsumerMemrefs' the memrefs involved in a producer-consumer dependence between w...
AffineForOp getForInductionVarOwner(Value val)
Returns the loop parent of an induction variable.
DependenceResult checkMemrefAccessDependence(const MemRefAccess &srcAccess, const MemRefAccess &dstAccess, unsigned loopDepth, FlatAffineValueConstraints *dependenceConstraints=nullptr, SmallVector< DependenceComponent, 2 > *dependenceComponents=nullptr, bool allowRAR=false)
void canonicalizeMapAndOperands(AffineMap *map, SmallVectorImpl< Value > *operands)
Modifies both map and operands in-place so as to:
int64_t getComputeCost(AffineForOp forOp, LoopNestStats &stats)
Computes the total cost of the loop nest rooted at 'forOp' using 'stats'.
void fuseLoops(AffineForOp srcForOp, AffineForOp dstForOp, const ComputationSliceState &srcSlice, bool isInnermostSiblingInsertionFusion=false)
Fuses 'srcForOp' into 'dstForOp' with destination loop block insertion point and source slice loop bo...
void getAffineForIVs(Operation &op, SmallVectorImpl< AffineForOp > *loops)
Populates 'loops' with IVs of the affine.for ops surrounding 'op' ordered from the outermost 'affine....
unsigned getInnermostCommonLoopDepth(ArrayRef< Operation * > ops, SmallVectorImpl< AffineForOp > *surroundingLoops=nullptr)
Returns the innermost common loop depth for the set of operations in 'ops'.
bool getLoopNestStats(AffineForOp forOp, LoopNestStats *stats)
Collect loop nest statistics (eg.
bool hasDependence(DependenceResult result)
Utility function that returns true if the provided DependenceResult corresponds to a dependence resul...
uint64_t getSliceIterationCount(const llvm::SmallDenseMap< Operation *, uint64_t, 8 > &sliceTripCountMap)
Return the number of iterations for the slicetripCountMap provided.
FusionResult canFuseLoops(AffineForOp srcForOp, AffineForOp dstForOp, unsigned dstLoopDepth, ComputationSliceState *srcSlice, FusionStrategy fusionStrategy=FusionStrategy::Generic)
Checks the feasibility of fusing the loop nest rooted at 'srcForOp' into the loop nest rooted at 'dst...
bool buildSliceTripCountMap(const ComputationSliceState &slice, llvm::SmallDenseMap< Operation *, uint64_t, 8 > *tripCountMap)
Builds a map 'tripCountMap' from AffineForOp to constant trip count for loop nest surrounding represe...
This header declares functions that assist transformations in the MemRef dialect.
LogicalResult failure(bool isFailure=true)
Utility function to generate a LogicalResult.
void getForwardSlice(Operation *op, SetVector< Operation * > *forwardSlice, ForwardSliceOptions options={})
Fills forwardSlice with the computed forward slice (i.e.
LogicalResult success(bool isSuccess=true)
Utility function to generate a LogicalResult.
SetVector< Operation * > topologicalSort(const SetVector< Operation * > &toSort)
Multi-root DAG topological sort.
This class represents an efficient way to signal success or failure.
ComputationSliceState aggregates loop IVs, loop bound AffineMaps and their associated operands for a ...
SmallVector< Value, 4 > ivs
std::vector< SmallVector< Value, 4 > > ubOperands
SmallVector< AffineMap, 4 > ubs
SmallVector< AffineMap, 4 > lbs
Block::iterator insertPoint
std::vector< SmallVector< Value, 4 > > lbOperands
Checks whether two accesses to the same memref access the same element.
LoopNestStats aggregates various per-loop statistics (eg.
DenseMap< Operation *, uint64_t > opCountMap
Map from AffineForOp to count of operations in its loop body.
DenseMap< Operation *, SmallVector< AffineForOp, 2 > > loopMap
Map from AffineForOp to immediate child AffineForOps in its loop body.
DenseMap< Operation *, uint64_t > tripCountMap
Map from AffineForOp to its constant trip count.
Encapsulates a memref load or store access information.
Enumerates different result statuses of slice computation by computeSliceUnion
enum mlir::affine::SliceComputationResult::ResultEnum value
Eliminates variable at the specified position using Fourier-Motzkin variable elimination.