24 #include "llvm/Support/Debug.h"
25 #include "llvm/Support/raw_ostream.h"
28 #define DEBUG_TYPE "loop-fusion-utils"
38 if (
auto loadOp = dyn_cast<AffineReadOpInterface>(op)) {
39 if (values.count(loadOp.getMemRef()) == 0)
40 values[loadOp.getMemRef()] =
false;
41 }
else if (
auto storeOp = dyn_cast<AffineWriteOpInterface>(op)) {
42 values[storeOp.getMemRef()] =
true;
52 if (
auto loadOp = dyn_cast<AffineReadOpInterface>(op)) {
53 return values.count(loadOp.getMemRef()) > 0 && values[loadOp.getMemRef()];
55 if (
auto storeOp = dyn_cast<AffineWriteOpInterface>(op)) {
56 return values.count(storeOp.getMemRef()) > 0;
105 if (isa<AffineReadOpInterface, AffineWriteOpInterface>(op)) {
117 if (llvm::is_contained(loops, cast<AffineForOp>(opB))) {
135 AffineForOp dstForOp) {
137 auto forOpA = isSrcForOpBeforeDstForOp ? srcForOp : dstForOp;
138 auto forOpB = isSrcForOpBeforeDstForOp ? dstForOp : srcForOp;
156 if (firstDepOpA->
isBeforeInBlock(lastDepOpB) || firstDepOpA == lastDepOpB)
174 bool hasIfOp =
false;
176 if (isa<AffineReadOpInterface, AffineWriteOpInterface>(op))
177 loadAndStoreOps.push_back(op);
178 else if (isa<AffineIfOp>(op))
201 auto loadOp = dyn_cast<AffineReadOpInterface>(dstOp);
202 Value memref = loadOp ? loadOp.getMemRef()
203 : cast<AffineWriteOpInterface>(dstOp).getMemRef();
204 if (producerConsumerMemrefs.count(memref) > 0)
205 targetDstOps.push_back(dstOp);
208 assert(!targetDstOps.empty() &&
209 "No dependences between 'srcForOp' and 'dstForOp'?");
215 if (all_of(targetDstOps, llvm::IsaPred<AffineReadOpInterface>))
220 for (
unsigned i = 0, e = targetDstOps.size(); i < e; ++i) {
223 for (
unsigned j = 0;
j < e; ++
j) {
224 auto *dstOpInst = targetDstOps[
j];
227 unsigned numCommonLoops =
229 for (
unsigned d = 1; d <= numCommonLoops + 1; ++d) {
236 loopDepth =
std::min(loopDepth, d - 1);
250 AffineForOp dstForOp,
251 unsigned dstLoopDepth,
255 if (dstLoopDepth == 0) {
256 LLVM_DEBUG(llvm::dbgs() <<
"Cannot fuse loop nests at depth 0\n");
260 auto *block = srcForOp->getBlock();
261 if (block != dstForOp->getBlock()) {
262 LLVM_DEBUG(llvm::dbgs() <<
"Cannot fuse loop nests in different blocks\n");
269 LLVM_DEBUG(llvm::dbgs() <<
"Fusion would violate dependences in block\n");
274 bool isSrcForOpBeforeDstForOp = srcForOp->isBeforeInBlock(dstForOp);
276 auto forOpA = isSrcForOpBeforeDstForOp ? srcForOp : dstForOp;
277 auto forOpB = isSrcForOpBeforeDstForOp ? dstForOp : srcForOp;
282 LLVM_DEBUG(llvm::dbgs() <<
"Fusing loops with affine.if unsupported\n");
289 LLVM_DEBUG(llvm::dbgs() <<
"Fusing loops with affine.if unsupported\n");
299 assert(isSrcForOpBeforeDstForOp &&
"Unexpected forward slice fusion");
301 LLVM_DEBUG(llvm::dbgs() <<
"Fusion would violate loop dependences\n");
307 unsigned numCommonLoops =
317 strategyOpsA.append(opsA.begin(), opsA.end());
323 if (isa<AffineWriteOpInterface>(op))
324 strategyOpsA.push_back(op);
331 auto load = dyn_cast<AffineReadOpInterface>(op);
333 strategyOpsA.push_back(op);
341 strategyOpsA, opsB, dstLoopDepth, numCommonLoops,
342 isSrcForOpBeforeDstForOp, srcSlice);
344 LLVM_DEBUG(llvm::dbgs() <<
"computeSliceUnion failed\n");
347 if (sliceComputationResult.
value ==
349 LLVM_DEBUG(llvm::dbgs() <<
"Incorrect slice computation\n");
359 bool siblingFusionUser) {
362 if (!tripCount || *tripCount != 1)
364 auto *parentOp = forOp->getParentOp();
365 if (!isa<AffineForOp>(parentOp))
368 llvm::append_range(newOperands,
369 forOp.getBody()->getTerminator()->getOperands());
371 int64_t parentOpNumResults = parentOp->getNumResults();
373 AffineForOp parentForOp = forOp->getParentOfType<AffineForOp>();
374 AffineForOp newLoop =
375 cast<AffineForOp>(*parentForOp.replaceWithAdditionalYields(
376 rewriter, forOp.getInits(),
false,
386 if (siblingFusionUser) {
387 for (
unsigned i = 0, e = forOp.getNumResults(); i != e; ++i) {
390 forwardSlice.set_union(tmpForwardSlice);
394 for (
unsigned i = 0, e = forOp.getNumResults(); i != e; ++i) {
395 forOp.getResult(i).replaceAllUsesWith(
396 newLoop.getResult(i + parentOpNumResults));
400 if (siblingFusionUser) {
402 for (
Operation *op : llvm::reverse(forwardSlice))
403 op->moveAfter(newLoop);
406 auto iv = forOp.getInductionVar();
407 iv.replaceAllUsesWith(newLoop.getInductionVar());
409 auto forOpIterArgs = forOp.getRegionIterArgs();
410 for (
auto it : llvm::zip(forOpIterArgs, newLoop.getRegionIterArgs().take_back(
411 forOpIterArgs.size()))) {
412 std::get<0>(it).replaceAllUsesWith(std::get<1>(it));
416 forOp.getBody()->back().erase();
417 auto *parentBlock = forOp->getBlock();
419 forOp.getBody()->getOperations());
428 bool isInnermostSiblingInsertion) {
432 b.
clone(*srcForOp, mapper);
436 for (
unsigned i = 0, e = srcSlice.
ivs.size(); i < e; ++i) {
441 sliceLoops.push_back(forOp);
445 forOp.setLowerBound(lbOperands, lbMap);
450 forOp.setUpperBound(ubOperands, ubMap);
454 llvm::SmallDenseMap<Operation *, uint64_t, 8> sliceTripCountMap;
455 auto srcIsUnitSlice = [&]() {
460 for (AffineForOp forOp : sliceLoops) {
462 isInnermostSiblingInsertion && srcIsUnitSlice())
477 auto walkResult = forOpRoot.walk([&](AffineForOp forOp) {
478 auto *childForOp = forOp.getOperation();
479 auto *parentForOp = forOp->getParentOp();
480 if (forOp != forOpRoot) {
481 if (!isa<AffineForOp>(parentForOp)) {
482 LLVM_DEBUG(llvm::dbgs() <<
"Expected parent AffineForOp\n");
486 stats->
loopMap[parentForOp].push_back(forOp);
492 for (
auto &op : *forOp.getBody()) {
493 if (!isa<AffineForOp, AffineIfOp>(op))
501 if (!maybeConstTripCount) {
503 LLVM_DEBUG(llvm::dbgs() <<
"Non-constant trip count unsupported\n");
510 return !walkResult.wasInterrupted();
530 llvm::SmallDenseMap<Operation *, uint64_t, 8> *tripCountOverrideMap,
534 int64_t opCount = stats.
opCountMap[forOp] - 1;
535 if (stats.
loopMap.count(forOp) > 0) {
536 for (
auto childForOp : stats.
loopMap[forOp]) {
542 if (computeCostMap) {
543 auto it = computeCostMap->find(forOp);
544 if (it != computeCostMap->end()) {
545 opCount += it->second;
550 if (tripCountOverrideMap) {
551 auto it = tripCountOverrideMap->find(forOp);
552 if (it != tripCountOverrideMap->end()) {
553 tripCount = it->second;
557 return tripCount * opCount;
577 AffineForOp dstForOp,
580 int64_t *computeCost) {
581 llvm::SmallDenseMap<Operation *, uint64_t, 8> sliceTripCountMap;
589 assert(sliceIterationCount > 0);
590 bool storeLoadFwdGuaranteed = (sliceIterationCount == 1);
591 auto *insertPointParent = slice.
insertPoint->getParentOp();
594 if (storeLoadFwdGuaranteed) {
597 unsigned storeCount = 0;
598 llvm::SmallDenseSet<Value, 4> storeMemrefs;
599 srcForOp.walk([&](AffineWriteOpInterface storeOp) {
600 storeMemrefs.insert(storeOp.getMemRef());
605 computeCostMap[insertPointParent] = -storeCount;
608 for (
Value memref : storeMemrefs) {
610 if (!isa<AffineReadOpInterface>(user))
616 if (llvm::is_contained(loops, cast<AffineForOp>(insertPointParent))) {
617 if (
auto forOp = dyn_cast_or_null<AffineForOp>(user->getParentOp())) {
618 if (computeCostMap.count(forOp) == 0)
619 computeCostMap[forOp] = 0;
620 computeCostMap[forOp] -= 1;
629 srcForOp, srcStats, &sliceTripCountMap, &computeCostMap);
632 computeCostMap[insertPointParent] = sliceComputeCost;
636 nullptr, &computeCostMap);
649 if (
auto storeOp = dyn_cast<AffineWriteOpInterface>(op))
650 srcStoreMemRefs.insert(storeOp.getMemRef());
655 if (
auto loadOp = dyn_cast<AffineReadOpInterface>(op))
656 if (srcStoreMemRefs.count(loadOp.getMemRef()) > 0)
657 producerConsumerMemrefs.insert(loadOp.getMemRef());
static void getLoadAndStoreMemRefAccesses(Operation *opA, DenseMap< Value, bool > &values)
static bool gatherLoadsAndStores(AffineForOp forOp, SmallVectorImpl< Operation * > &loadAndStoreOps)
static Operation * getLastDependentOpInRange(Operation *opA, Operation *opB)
static int64_t getComputeCostHelper(Operation *forOp, LoopNestStats &stats, llvm::SmallDenseMap< Operation *, uint64_t, 8 > *tripCountOverrideMap, DenseMap< Operation *, int64_t > *computeCostMap)
static Operation * getFirstDependentOpInRange(Operation *opA, Operation *opB)
static bool isDependentLoadOrStoreOp(Operation *op, DenseMap< Value, bool > &values)
Returns true if 'op' is a load or store operation which access a memref accessed 'values' and at leas...
static LogicalResult promoteSingleIterReductionLoop(AffineForOp forOp, bool siblingFusionUser)
Patch the loop body of a forOp that is a single iteration reduction loop into its containing block.
static Operation * getFusedLoopNestInsertionPoint(AffineForOp srcForOp, AffineForOp dstForOp)
static unsigned getMaxLoopDepth(ArrayRef< Operation * > srcOps, ArrayRef< Operation * > dstOps)
Returns the maximum loop depth at which we could fuse producer loop 'srcForOp' into consumer loop 'ds...
static Value min(ImplicitLocOpBuilder &builder, Value value, Value bound)
A multi-dimensional affine map Affine map's are immutable like Type's, and they are uniqued.
OpListType::iterator iterator
OpListType::reverse_iterator reverse_iterator
This is a utility class for mapping one set of IR entities to another.
auto lookupOrNull(T from) const
Lookup a mapped value within the map.
This class coordinates rewriting a piece of IR outside of a pattern rewrite, providing a way to keep ...
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
This class helps build Operations.
Operation * clone(Operation &op, IRMapping &mapper)
Creates a deep copy of the specified operation, remapping any operands that use values outside of the...
Operation is the basic unit of execution within MLIR.
bool isBeforeInBlock(Operation *other)
Given an operation 'other' that is within the same parent block, return whether the current operation...
std::enable_if_t< llvm::function_traits< std::decay_t< FnT > >::num_args==1, RetT > walk(FnT &&callback)
Walk the operation by calling the callback for each nested operation (including this one),...
user_range getUsers()
Returns a range of all users.
result_range getResults()
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
static WalkResult advance()
static WalkResult interrupt()
Describes the fusion strategy to be used in the Affine loop fusion utilities.
StrategyEnum getStrategy() const
Returns the fusion strategy.
Value getSiblingFusionMemRef() const
Returns the memref attached to this sibling fusion strategy.
std::optional< uint64_t > getConstantTripCount(AffineForOp forOp)
Returns the trip count of the loop if it's a constant, std::nullopt otherwise.
LogicalResult promoteIfSingleIteration(AffineForOp forOp)
Promotes the loop body of a AffineForOp to its containing block if the loop was known to have a singl...
bool getFusionComputeCost(AffineForOp srcForOp, LoopNestStats &srcStats, AffineForOp dstForOp, LoopNestStats &dstStats, const ComputationSliceState &slice, int64_t *computeCost)
Computes and returns in 'computeCost', the total compute cost of fusing the 'slice' of the loop nest ...
SliceComputationResult computeSliceUnion(ArrayRef< Operation * > opsA, ArrayRef< Operation * > opsB, unsigned loopDepth, unsigned numCommonLoops, bool isBackwardSlice, ComputationSliceState *sliceUnion)
Computes in 'sliceUnion' the union of all slice bounds computed at 'loopDepth' between all dependent ...
bool isLoopParallelAndContainsReduction(AffineForOp forOp)
Returns whether a loop is a parallel loop and contains a reduction loop.
unsigned getNumCommonSurroundingLoops(Operation &a, Operation &b)
Returns the number of surrounding loops common to both A and B.
void gatherProducerConsumerMemrefs(ArrayRef< Operation * > srcOps, ArrayRef< Operation * > dstOps, DenseSet< Value > &producerConsumerMemrefs)
Returns in 'producerConsumerMemrefs' the memrefs involved in a producer-consumer dependence between w...
AffineForOp getForInductionVarOwner(Value val)
Returns the loop parent of an induction variable.
DependenceResult checkMemrefAccessDependence(const MemRefAccess &srcAccess, const MemRefAccess &dstAccess, unsigned loopDepth, FlatAffineValueConstraints *dependenceConstraints=nullptr, SmallVector< DependenceComponent, 2 > *dependenceComponents=nullptr, bool allowRAR=false)
void canonicalizeMapAndOperands(AffineMap *map, SmallVectorImpl< Value > *operands)
Modifies both map and operands in-place so as to:
int64_t getComputeCost(AffineForOp forOp, LoopNestStats &stats)
Computes the total cost of the loop nest rooted at 'forOp' using 'stats'.
void fuseLoops(AffineForOp srcForOp, AffineForOp dstForOp, const ComputationSliceState &srcSlice, bool isInnermostSiblingInsertionFusion=false)
Fuses 'srcForOp' into 'dstForOp' with destination loop block insertion point and source slice loop bo...
void getAffineForIVs(Operation &op, SmallVectorImpl< AffineForOp > *loops)
Populates 'loops' with IVs of the affine.for ops surrounding 'op' ordered from the outermost 'affine....
unsigned getInnermostCommonLoopDepth(ArrayRef< Operation * > ops, SmallVectorImpl< AffineForOp > *surroundingLoops=nullptr)
Returns the innermost common loop depth for the set of operations in 'ops'.
bool getLoopNestStats(AffineForOp forOp, LoopNestStats *stats)
Collect loop nest statistics (eg.
bool hasDependence(DependenceResult result)
Utility function that returns true if the provided DependenceResult corresponds to a dependence resul...
uint64_t getSliceIterationCount(const llvm::SmallDenseMap< Operation *, uint64_t, 8 > &sliceTripCountMap)
Return the number of iterations for the slicetripCountMap provided.
FusionResult canFuseLoops(AffineForOp srcForOp, AffineForOp dstForOp, unsigned dstLoopDepth, ComputationSliceState *srcSlice, FusionStrategy fusionStrategy=FusionStrategy::Generic)
Checks the feasibility of fusing the loop nest rooted at 'srcForOp' into the loop nest rooted at 'dst...
bool buildSliceTripCountMap(const ComputationSliceState &slice, llvm::SmallDenseMap< Operation *, uint64_t, 8 > *tripCountMap)
Builds a map 'tripCountMap' from AffineForOp to constant trip count for loop nest surrounding represe...
Include the generated interface declarations.
SetVector< Operation * > topologicalSort(const SetVector< Operation * > &toSort)
Sorts all operations in toSort topologically while also considering region semantics.
void getForwardSlice(Operation *op, SetVector< Operation * > *forwardSlice, const ForwardSliceOptions &options={})
Fills forwardSlice with the computed forward slice (i.e.
ComputationSliceState aggregates loop IVs, loop bound AffineMaps and their associated operands for a ...
SmallVector< Value, 4 > ivs
std::vector< SmallVector< Value, 4 > > ubOperands
SmallVector< AffineMap, 4 > ubs
SmallVector< AffineMap, 4 > lbs
Block::iterator insertPoint
std::vector< SmallVector< Value, 4 > > lbOperands
Checks whether two accesses to the same memref access the same element.
LoopNestStats aggregates various per-loop statistics (eg.
DenseMap< Operation *, uint64_t > opCountMap
Map from AffineForOp to count of operations in its loop body.
DenseMap< Operation *, SmallVector< AffineForOp, 2 > > loopMap
Map from AffineForOp to immediate child AffineForOps in its loop body.
DenseMap< Operation *, uint64_t > tripCountMap
Map from AffineForOp to its constant trip count.
Encapsulates a memref load or store access information.
Enumerates different result statuses of slice computation by computeSliceUnion
enum mlir::affine::SliceComputationResult::ResultEnum value
Eliminates variable at the specified position using Fourier-Motzkin variable elimination.