23 #include "llvm/Support/Debug.h"
24 #include "llvm/Support/raw_ostream.h"
27 #define DEBUG_TYPE "loop-fusion-utils"
37 if (
auto loadOp = dyn_cast<AffineReadOpInterface>(op)) {
38 if (values.count(loadOp.getMemRef()) == 0)
39 values[loadOp.getMemRef()] =
false;
40 }
else if (
auto storeOp = dyn_cast<AffineWriteOpInterface>(op)) {
41 values[storeOp.getMemRef()] =
true;
51 if (
auto loadOp = dyn_cast<AffineReadOpInterface>(op)) {
52 return values.count(loadOp.getMemRef()) > 0 && values[loadOp.getMemRef()];
54 if (
auto storeOp = dyn_cast<AffineWriteOpInterface>(op)) {
55 return values.count(storeOp.getMemRef()) > 0;
104 if (isa<AffineReadOpInterface, AffineWriteOpInterface>(op)) {
116 if (llvm::is_contained(loops, cast<AffineForOp>(opB))) {
134 AffineForOp dstForOp) {
136 auto forOpA = isSrcForOpBeforeDstForOp ? srcForOp : dstForOp;
137 auto forOpB = isSrcForOpBeforeDstForOp ? dstForOp : srcForOp;
155 if (firstDepOpA->
isBeforeInBlock(lastDepOpB) || firstDepOpA == lastDepOpB)
173 bool hasIfOp =
false;
175 if (isa<AffineReadOpInterface, AffineWriteOpInterface>(op))
176 loadAndStoreOps.push_back(op);
177 else if (isa<AffineIfOp>(op))
200 auto loadOp = dyn_cast<AffineReadOpInterface>(dstOp);
201 Value memref = loadOp ? loadOp.getMemRef()
202 : cast<AffineWriteOpInterface>(dstOp).getMemRef();
203 if (producerConsumerMemrefs.count(memref) > 0)
204 targetDstOps.push_back(dstOp);
207 assert(!targetDstOps.empty() &&
208 "No dependences between 'srcForOp' and 'dstForOp'?");
214 if (all_of(targetDstOps, llvm::IsaPred<AffineReadOpInterface>))
219 for (
unsigned i = 0, e = targetDstOps.size(); i < e; ++i) {
222 for (
unsigned j = 0;
j < e; ++
j) {
223 auto *dstOpInst = targetDstOps[
j];
226 unsigned numCommonLoops =
228 for (
unsigned d = 1; d <= numCommonLoops + 1; ++d) {
235 loopDepth =
std::min(loopDepth, d - 1);
249 AffineForOp dstForOp,
250 unsigned dstLoopDepth,
254 if (dstLoopDepth == 0) {
255 LLVM_DEBUG(llvm::dbgs() <<
"Cannot fuse loop nests at depth 0\n");
259 auto *block = srcForOp->getBlock();
260 if (block != dstForOp->getBlock()) {
261 LLVM_DEBUG(llvm::dbgs() <<
"Cannot fuse loop nests in different blocks\n");
268 LLVM_DEBUG(llvm::dbgs() <<
"Fusion would violate dependences in block\n");
273 bool isSrcForOpBeforeDstForOp = srcForOp->isBeforeInBlock(dstForOp);
275 auto forOpA = isSrcForOpBeforeDstForOp ? srcForOp : dstForOp;
276 auto forOpB = isSrcForOpBeforeDstForOp ? dstForOp : srcForOp;
281 LLVM_DEBUG(llvm::dbgs() <<
"Fusing loops with affine.if unsupported\n");
288 LLVM_DEBUG(llvm::dbgs() <<
"Fusing loops with affine.if unsupported\n");
298 assert(isSrcForOpBeforeDstForOp &&
"Unexpected forward slice fusion");
300 LLVM_DEBUG(llvm::dbgs() <<
"Fusion would violate loop dependences\n");
306 unsigned numCommonLoops =
316 strategyOpsA.append(opsA.begin(), opsA.end());
322 if (isa<AffineWriteOpInterface>(op))
323 strategyOpsA.push_back(op);
330 auto load = dyn_cast<AffineReadOpInterface>(op);
332 strategyOpsA.push_back(op);
340 strategyOpsA, opsB, dstLoopDepth, numCommonLoops,
341 isSrcForOpBeforeDstForOp, srcSlice);
343 LLVM_DEBUG(llvm::dbgs() <<
"computeSliceUnion failed\n");
346 if (sliceComputationResult.
value ==
348 LLVM_DEBUG(llvm::dbgs() <<
"Incorrect slice computation\n");
358 bool siblingFusionUser) {
361 if (!tripCount || *tripCount != 1)
363 auto *parentOp = forOp->getParentOp();
364 if (!isa<AffineForOp>(parentOp))
367 llvm::append_range(newOperands,
368 forOp.getBody()->getTerminator()->getOperands());
370 int64_t parentOpNumResults = parentOp->getNumResults();
372 AffineForOp parentForOp = forOp->getParentOfType<AffineForOp>();
373 AffineForOp newLoop =
374 cast<AffineForOp>(*parentForOp.replaceWithAdditionalYields(
375 rewriter, forOp.getInits(),
false,
385 if (siblingFusionUser) {
386 for (
unsigned i = 0, e = forOp.getNumResults(); i != e; ++i) {
389 forwardSlice.set_union(tmpForwardSlice);
393 for (
unsigned i = 0, e = forOp.getNumResults(); i != e; ++i) {
394 forOp.getResult(i).replaceAllUsesWith(
395 newLoop.getResult(i + parentOpNumResults));
399 if (siblingFusionUser) {
401 for (
Operation *op : llvm::reverse(forwardSlice))
405 auto iv = forOp.getInductionVar();
406 iv.replaceAllUsesWith(newLoop.getInductionVar());
408 auto forOpIterArgs = forOp.getRegionIterArgs();
409 for (
auto it : llvm::zip(forOpIterArgs, newLoop.getRegionIterArgs().take_back(
410 forOpIterArgs.size()))) {
411 std::get<0>(it).replaceAllUsesWith(std::get<1>(it));
415 forOp.getBody()->back().erase();
416 auto *parentBlock = forOp->getBlock();
418 forOp.getBody()->getOperations());
427 bool isInnermostSiblingInsertion) {
431 b.
clone(*srcForOp, mapper);
435 for (
unsigned i = 0, e = srcSlice.
ivs.size(); i < e; ++i) {
440 sliceLoops.push_back(forOp);
444 forOp.setLowerBound(lbOperands, lbMap);
449 forOp.setUpperBound(ubOperands, ubMap);
453 llvm::SmallDenseMap<Operation *, uint64_t, 8> sliceTripCountMap;
454 auto srcIsUnitSlice = [&]() {
459 for (AffineForOp forOp : sliceLoops) {
461 isInnermostSiblingInsertion && srcIsUnitSlice())
476 auto walkResult = forOpRoot.walk([&](AffineForOp forOp) {
477 auto *childForOp = forOp.getOperation();
478 auto *parentForOp = forOp->getParentOp();
479 if (forOp != forOpRoot) {
480 if (!isa<AffineForOp>(parentForOp)) {
481 LLVM_DEBUG(llvm::dbgs() <<
"Expected parent AffineForOp\n");
485 stats->
loopMap[parentForOp].push_back(forOp);
491 for (
auto &op : *forOp.getBody()) {
492 if (!isa<AffineForOp, AffineIfOp>(op))
500 if (!maybeConstTripCount) {
502 LLVM_DEBUG(llvm::dbgs() <<
"Non-constant trip count unsupported\n");
509 return !walkResult.wasInterrupted();
529 llvm::SmallDenseMap<Operation *, uint64_t, 8> *tripCountOverrideMap,
533 int64_t opCount = stats.
opCountMap[forOp] - 1;
534 if (stats.
loopMap.count(forOp) > 0) {
535 for (
auto childForOp : stats.
loopMap[forOp]) {
541 if (computeCostMap) {
542 auto it = computeCostMap->find(forOp);
543 if (it != computeCostMap->end()) {
544 opCount += it->second;
549 if (tripCountOverrideMap) {
550 auto it = tripCountOverrideMap->find(forOp);
551 if (it != tripCountOverrideMap->end()) {
552 tripCount = it->second;
556 return tripCount * opCount;
576 AffineForOp dstForOp,
579 int64_t *computeCost) {
580 llvm::SmallDenseMap<Operation *, uint64_t, 8> sliceTripCountMap;
588 assert(sliceIterationCount > 0);
589 bool storeLoadFwdGuaranteed = (sliceIterationCount == 1);
590 auto *insertPointParent = slice.
insertPoint->getParentOp();
593 if (storeLoadFwdGuaranteed) {
596 unsigned storeCount = 0;
597 llvm::SmallDenseSet<Value, 4> storeMemrefs;
598 srcForOp.walk([&](AffineWriteOpInterface storeOp) {
599 storeMemrefs.insert(storeOp.getMemRef());
604 computeCostMap[insertPointParent] = -storeCount;
607 for (
Value memref : storeMemrefs) {
609 if (!isa<AffineReadOpInterface>(user))
615 if (llvm::is_contained(loops, cast<AffineForOp>(insertPointParent))) {
616 if (
auto forOp = dyn_cast_or_null<AffineForOp>(user->getParentOp())) {
617 if (computeCostMap.count(forOp) == 0)
618 computeCostMap[forOp] = 0;
619 computeCostMap[forOp] -= 1;
628 srcForOp, srcStats, &sliceTripCountMap, &computeCostMap);
631 computeCostMap[insertPointParent] = sliceComputeCost;
635 nullptr, &computeCostMap);
648 if (
auto storeOp = dyn_cast<AffineWriteOpInterface>(op))
649 srcStoreMemRefs.insert(storeOp.getMemRef());
654 if (
auto loadOp = dyn_cast<AffineReadOpInterface>(op))
655 if (srcStoreMemRefs.count(loadOp.getMemRef()) > 0)
656 producerConsumerMemrefs.insert(loadOp.getMemRef());
static void getLoadAndStoreMemRefAccesses(Operation *opA, DenseMap< Value, bool > &values)
static bool gatherLoadsAndStores(AffineForOp forOp, SmallVectorImpl< Operation * > &loadAndStoreOps)
static Operation * getLastDependentOpInRange(Operation *opA, Operation *opB)
static int64_t getComputeCostHelper(Operation *forOp, LoopNestStats &stats, llvm::SmallDenseMap< Operation *, uint64_t, 8 > *tripCountOverrideMap, DenseMap< Operation *, int64_t > *computeCostMap)
static Operation * getFirstDependentOpInRange(Operation *opA, Operation *opB)
static bool isDependentLoadOrStoreOp(Operation *op, DenseMap< Value, bool > &values)
Returns true if 'op' is a load or store operation which access a memref accessed 'values' and at leas...
static LogicalResult promoteSingleIterReductionLoop(AffineForOp forOp, bool siblingFusionUser)
Patch the loop body of a forOp that is a single iteration reduction loop into its containing block.
static Operation * getFusedLoopNestInsertionPoint(AffineForOp srcForOp, AffineForOp dstForOp)
static unsigned getMaxLoopDepth(ArrayRef< Operation * > srcOps, ArrayRef< Operation * > dstOps)
Returns the maximum loop depth at which we could fuse producer loop 'srcForOp' into consumer loop 'ds...
static Value min(ImplicitLocOpBuilder &builder, Value value, Value bound)
A multi-dimensional affine map Affine map's are immutable like Type's, and they are uniqued.
OpListType::iterator iterator
OpListType::reverse_iterator reverse_iterator
This is a utility class for mapping one set of IR entities to another.
auto lookupOrNull(T from) const
Lookup a mapped value within the map.
This class coordinates rewriting a piece of IR outside of a pattern rewrite, providing a way to keep ...
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
This class helps build Operations.
Operation * clone(Operation &op, IRMapping &mapper)
Creates a deep copy of the specified operation, remapping any operands that use values outside of the...
Operation is the basic unit of execution within MLIR.
bool isBeforeInBlock(Operation *other)
Given an operation 'other' that is within the same parent block, return whether the current operation...
std::enable_if_t< llvm::function_traits< std::decay_t< FnT > >::num_args==1, RetT > walk(FnT &&callback)
Walk the operation by calling the callback for each nested operation (including this one),...
user_range getUsers()
Returns a range of all users.
result_range getResults()
void moveAfter(Operation *existingOp)
Unlink this operation from its current block and insert it right after existingOp which may be in the...
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
static WalkResult advance()
static WalkResult interrupt()
Describes the fusion strategy to be used in the Affine loop fusion utilities.
StrategyEnum getStrategy() const
Returns the fusion strategy.
Value getSiblingFusionMemRef() const
Returns the memref attached to this sibling fusion strategy.
std::optional< uint64_t > getConstantTripCount(AffineForOp forOp)
Returns the trip count of the loop if it's a constant, std::nullopt otherwise.
LogicalResult promoteIfSingleIteration(AffineForOp forOp)
Promotes the loop body of a AffineForOp to its containing block if the loop was known to have a singl...
bool getFusionComputeCost(AffineForOp srcForOp, LoopNestStats &srcStats, AffineForOp dstForOp, LoopNestStats &dstStats, const ComputationSliceState &slice, int64_t *computeCost)
Computes and returns in 'computeCost', the total compute cost of fusing the 'slice' of the loop nest ...
SliceComputationResult computeSliceUnion(ArrayRef< Operation * > opsA, ArrayRef< Operation * > opsB, unsigned loopDepth, unsigned numCommonLoops, bool isBackwardSlice, ComputationSliceState *sliceUnion)
Computes in 'sliceUnion' the union of all slice bounds computed at 'loopDepth' between all dependent ...
bool isLoopParallelAndContainsReduction(AffineForOp forOp)
Returns whether a loop is a parallel loop and contains a reduction loop.
unsigned getNumCommonSurroundingLoops(Operation &a, Operation &b)
Returns the number of surrounding loops common to both A and B.
void gatherProducerConsumerMemrefs(ArrayRef< Operation * > srcOps, ArrayRef< Operation * > dstOps, DenseSet< Value > &producerConsumerMemrefs)
Returns in 'producerConsumerMemrefs' the memrefs involved in a producer-consumer dependence between w...
AffineForOp getForInductionVarOwner(Value val)
Returns the loop parent of an induction variable.
DependenceResult checkMemrefAccessDependence(const MemRefAccess &srcAccess, const MemRefAccess &dstAccess, unsigned loopDepth, FlatAffineValueConstraints *dependenceConstraints=nullptr, SmallVector< DependenceComponent, 2 > *dependenceComponents=nullptr, bool allowRAR=false)
void canonicalizeMapAndOperands(AffineMap *map, SmallVectorImpl< Value > *operands)
Modifies both map and operands in-place so as to:
int64_t getComputeCost(AffineForOp forOp, LoopNestStats &stats)
Computes the total cost of the loop nest rooted at 'forOp' using 'stats'.
void fuseLoops(AffineForOp srcForOp, AffineForOp dstForOp, const ComputationSliceState &srcSlice, bool isInnermostSiblingInsertionFusion=false)
Fuses 'srcForOp' into 'dstForOp' with destination loop block insertion point and source slice loop bo...
void getAffineForIVs(Operation &op, SmallVectorImpl< AffineForOp > *loops)
Populates 'loops' with IVs of the affine.for ops surrounding 'op' ordered from the outermost 'affine....
unsigned getInnermostCommonLoopDepth(ArrayRef< Operation * > ops, SmallVectorImpl< AffineForOp > *surroundingLoops=nullptr)
Returns the innermost common loop depth for the set of operations in 'ops'.
bool getLoopNestStats(AffineForOp forOp, LoopNestStats *stats)
Collect loop nest statistics (eg.
bool hasDependence(DependenceResult result)
Utility function that returns true if the provided DependenceResult corresponds to a dependence resul...
uint64_t getSliceIterationCount(const llvm::SmallDenseMap< Operation *, uint64_t, 8 > &sliceTripCountMap)
Return the number of iterations for the slicetripCountMap provided.
FusionResult canFuseLoops(AffineForOp srcForOp, AffineForOp dstForOp, unsigned dstLoopDepth, ComputationSliceState *srcSlice, FusionStrategy fusionStrategy=FusionStrategy::Generic)
Checks the feasibility of fusing the loop nest rooted at 'srcForOp' into the loop nest rooted at 'dst...
bool buildSliceTripCountMap(const ComputationSliceState &slice, llvm::SmallDenseMap< Operation *, uint64_t, 8 > *tripCountMap)
Builds a map 'tripCountMap' from AffineForOp to constant trip count for loop nest surrounding represe...
Include the generated interface declarations.
LogicalResult failure(bool isFailure=true)
Utility function to generate a LogicalResult.
LogicalResult success(bool isSuccess=true)
Utility function to generate a LogicalResult.
SetVector< Operation * > topologicalSort(const SetVector< Operation * > &toSort)
Multi-root DAG topological sort.
void getForwardSlice(Operation *op, SetVector< Operation * > *forwardSlice, const ForwardSliceOptions &options={})
Fills forwardSlice with the computed forward slice (i.e.
This class represents an efficient way to signal success or failure.
ComputationSliceState aggregates loop IVs, loop bound AffineMaps and their associated operands for a ...
SmallVector< Value, 4 > ivs
std::vector< SmallVector< Value, 4 > > ubOperands
SmallVector< AffineMap, 4 > ubs
SmallVector< AffineMap, 4 > lbs
Block::iterator insertPoint
std::vector< SmallVector< Value, 4 > > lbOperands
Checks whether two accesses to the same memref access the same element.
LoopNestStats aggregates various per-loop statistics (eg.
DenseMap< Operation *, uint64_t > opCountMap
Map from AffineForOp to count of operations in its loop body.
DenseMap< Operation *, SmallVector< AffineForOp, 2 > > loopMap
Map from AffineForOp to immediate child AffineForOps in its loop body.
DenseMap< Operation *, uint64_t > tripCountMap
Map from AffineForOp to its constant trip count.
Encapsulates a memref load or store access information.
Enumerates different result statuses of slice computation by computeSliceUnion
enum mlir::affine::SliceComputationResult::ResultEnum value
Eliminates variable at the specified position using Fourier-Motzkin variable elimination.