//===- LoopFusionUtils.cpp ---- Utilities for loop fusion ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements loop fusion transformation utility functions.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/Affine/LoopFusionUtils.h"

#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h"
#include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h"
#include "mlir/Dialect/Affine/Analysis/Utils.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/IR/IRMapping.h"
#include "mlir/IR/Operation.h"
#include "mlir/IR/PatternMatch.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <optional>

#define DEBUG_TYPE "loop-fusion-utils"

using namespace mlir;
using namespace mlir::affine;

// Gathers all load and store memref accesses in 'opA' into 'values', where
// 'values[memref] == true' for each store operation.
static void getLoadAndStoreMemRefAccesses(Operation *opA,
                                          DenseMap<Value, bool> &values) {
  opA->walk([&](Operation *op) {
    if (auto loadOp = dyn_cast<AffineReadOpInterface>(op)) {
      if (values.count(loadOp.getMemRef()) == 0)
        values[loadOp.getMemRef()] = false;
    } else if (auto storeOp = dyn_cast<AffineWriteOpInterface>(op)) {
      values[storeOp.getMemRef()] = true;
    }
  });
}

/// Returns true if 'op' is a load or store operation that accesses a memref
/// recorded in 'values', where at least one of the accesses is a store
/// operation. Returns false otherwise.
static bool isDependentLoadOrStoreOp(Operation *op,
                                     DenseMap<Value, bool> &values) {
  if (auto loadOp = dyn_cast<AffineReadOpInterface>(op)) {
    return values.count(loadOp.getMemRef()) > 0 && values[loadOp.getMemRef()];
  }
  if (auto storeOp = dyn_cast<AffineWriteOpInterface>(op)) {
    return values.count(storeOp.getMemRef()) > 0;
  }
  return false;
}

// Returns the first operation in range ('opA', 'opB') which has a data
// dependence on 'opA'. Returns 'nullptr' if no dependence exists.
static Operation *getFirstDependentOpInRange(Operation *opA, Operation *opB) {
  // Record memref values from all loads/store in loop nest rooted at 'opA'.
  // Map from memref value to bool which is true if store, false otherwise.
  DenseMap<Value, bool> values;
  getLoadAndStoreMemRefAccesses(opA, values);

  // For each 'opX' in block in range ('opA', 'opB'), check if there is a data
  // dependence from 'opA' to 'opX' ('opA' and 'opX' access the same memref
  // and at least one of the accesses is a store).
  Operation *firstDepOp = nullptr;
  for (Block::iterator it = std::next(Block::iterator(opA));
       it != Block::iterator(opB); ++it) {
    Operation *opX = &(*it);
    opX->walk([&](Operation *op) {
      if (!firstDepOp && isDependentLoadOrStoreOp(op, values))
        firstDepOp = opX;
    });
    if (firstDepOp)
      break;
  }
  return firstDepOp;
}

// Returns the last operation 'opX' in range ('opA', 'opB'), for which there
// exists a data dependence from 'opX' to 'opB'.
// Returns 'nullptr' if no dependence exists.
static Operation *getLastDependentOpInRange(Operation *opA, Operation *opB) {
  // Record memref values from all loads/store in loop nest rooted at 'opB'.
  // Map from memref value to bool which is true if store, false otherwise.
  DenseMap<Value, bool> values;
  getLoadAndStoreMemRefAccesses(opB, values);

  // For each 'opX' in block in range ('opA', 'opB') in reverse order,
  // check if there is a data dependence from 'opX' to 'opB':
  // *) 'opX' and 'opB' access the same memref and at least one of the accesses
  //    is a store.
  // *) 'opX' produces an SSA Value which is used by 'opB'.
  Operation *lastDepOp = nullptr;
  for (Block::reverse_iterator it = std::next(Block::reverse_iterator(opB));
       it != Block::reverse_iterator(opA); ++it) {
    Operation *opX = &(*it);
    opX->walk([&](Operation *op) {
      if (isa<AffineReadOpInterface, AffineWriteOpInterface>(op)) {
        if (isDependentLoadOrStoreOp(op, values)) {
          lastDepOp = opX;
          return WalkResult::interrupt();
        }
        return WalkResult::advance();
      }
      for (Value value : op->getResults()) {
        for (Operation *user : value.getUsers()) {
          SmallVector<AffineForOp, 4> loops;
          // Check if any loop in loop nest surrounding 'user' is 'opB'.
          getAffineForIVs(*user, &loops);
          if (llvm::is_contained(loops, cast<AffineForOp>(opB))) {
            lastDepOp = opX;
            return WalkResult::interrupt();
          }
        }
      }
      return WalkResult::advance();
    });
    if (lastDepOp)
      break;
  }
  return lastDepOp;
}

// Computes and returns an insertion point operation, before which the
// fused <srcForOp, dstForOp> loop nest can be inserted while preserving
// dependences. Returns nullptr if no such insertion point is found.
static Operation *getFusedLoopNestInsertionPoint(AffineForOp srcForOp,
                                                 AffineForOp dstForOp) {
  bool isSrcForOpBeforeDstForOp = srcForOp->isBeforeInBlock(dstForOp);
  auto forOpA = isSrcForOpBeforeDstForOp ? srcForOp : dstForOp;
  auto forOpB = isSrcForOpBeforeDstForOp ? dstForOp : srcForOp;

  Operation *firstDepOpA = getFirstDependentOpInRange(forOpA, forOpB);
  Operation *lastDepOpB = getLastDependentOpInRange(forOpA, forOpB);
  // Block:
  //      ...
  //  |-- opA
  //  |   ...
  //  |   lastDepOpB --|
  //  |   ...          |
  //  |-> firstDepOpA  |
  //      ...          |
  //      opB <---------
  //
  // Valid insertion point range: (lastDepOpB, firstDepOpA)
  //
  if (firstDepOpA != nullptr) {
    if (lastDepOpB != nullptr) {
      if (firstDepOpA->isBeforeInBlock(lastDepOpB) || firstDepOpA == lastDepOpB)
        // No valid insertion point exists which preserves dependences.
        return nullptr;
    }
    // Return insertion point in valid range closest to 'opB'.
    // TODO: Consider other insertion points in valid range.
    return firstDepOpA;
  }
  // No dependences from 'opA' to operation in range ('opA', 'opB'), return
  // 'opB' insertion point.
  return forOpB;
}

// Gathers all load and store ops in loop nest rooted at 'forOp' into
// 'loadAndStoreOps'.
static bool
gatherLoadsAndStores(AffineForOp forOp,
                     SmallVectorImpl<Operation *> &loadAndStoreOps) {
  bool hasIfOp = false;
  forOp.walk([&](Operation *op) {
    if (isa<AffineReadOpInterface, AffineWriteOpInterface>(op))
      loadAndStoreOps.push_back(op);
    else if (isa<AffineIfOp>(op))
      hasIfOp = true;
  });
  return !hasIfOp;
}

/// Returns the maximum loop depth at which we could fuse producer loop
/// 'srcForOp' into consumer loop 'dstForOp' without violating data dependences.
// TODO: Generalize this check for sibling and more generic fusion scenarios.
// TODO: Support forward slice fusion.
static unsigned getMaxLoopDepth(ArrayRef<Operation *> srcOps,
                                ArrayRef<Operation *> dstOps) {
  if (dstOps.empty())
    // Expected at least one memory operation.
    // TODO: Revisit this case with a specific example.
    return 0;

  // Filter out ops in 'dstOps' that do not use the producer-consumer memref so
  // that they are not considered for analysis.
  DenseSet<Value> producerConsumerMemrefs;
  gatherProducerConsumerMemrefs(srcOps, dstOps, producerConsumerMemrefs);
  SmallVector<Operation *, 4> targetDstOps;
  for (Operation *dstOp : dstOps) {
    auto loadOp = dyn_cast<AffineReadOpInterface>(dstOp);
    Value memref = loadOp ? loadOp.getMemRef()
                          : cast<AffineWriteOpInterface>(dstOp).getMemRef();
    if (producerConsumerMemrefs.count(memref) > 0)
      targetDstOps.push_back(dstOp);
  }

  assert(!targetDstOps.empty() &&
         "No dependences between 'srcForOp' and 'dstForOp'?");

  // Compute the innermost common loop depth for loads and stores.
  unsigned loopDepth = getInnermostCommonLoopDepth(targetDstOps);

  // Return common loop depth for loads if there are no store ops.
  if (all_of(targetDstOps,
             [&](Operation *op) { return isa<AffineReadOpInterface>(op); }))
    return loopDepth;

  // Check dependences on all pairs of ops in 'targetDstOps' and store the
  // minimum loop depth at which a dependence is satisfied.
  for (unsigned i = 0, e = targetDstOps.size(); i < e; ++i) {
    auto *srcOpInst = targetDstOps[i];
    MemRefAccess srcAccess(srcOpInst);
    for (unsigned j = 0; j < e; ++j) {
      auto *dstOpInst = targetDstOps[j];
      MemRefAccess dstAccess(dstOpInst);

      unsigned numCommonLoops =
          getNumCommonSurroundingLoops(*srcOpInst, *dstOpInst);
      for (unsigned d = 1; d <= numCommonLoops + 1; ++d) {
        // TODO: Cache dependence analysis results, check cache here.
        DependenceResult result =
            checkMemrefAccessDependence(srcAccess, dstAccess, d);
        if (hasDependence(result)) {
          // Store minimum loop depth and break because we want the min 'd' at
          // which there is a dependence.
          loopDepth = std::min(loopDepth, d - 1);
          break;
        }
      }
    }
  }

  return loopDepth;
}

// TODO: Prevent fusion of loop nests with side-effecting operations.
// TODO: This pass performs some computation that is the same for all the depths
// (e.g., getMaxLoopDepth). Implement a version of this utility that processes
// all the depths at once or only the legal maximal depth for maximal fusion.
FusionResult mlir::affine::canFuseLoops(AffineForOp srcForOp,
                                        AffineForOp dstForOp,
                                        unsigned dstLoopDepth,
                                        ComputationSliceState *srcSlice,
                                        FusionStrategy fusionStrategy) {
  // Return 'failure' if 'dstLoopDepth == 0'.
  if (dstLoopDepth == 0) {
    LLVM_DEBUG(llvm::dbgs() << "Cannot fuse loop nests at depth 0\n");
    return FusionResult::FailPrecondition;
  }
  // Return 'failure' if 'srcForOp' and 'dstForOp' are not in the same block.
  auto *block = srcForOp->getBlock();
  if (block != dstForOp->getBlock()) {
    LLVM_DEBUG(llvm::dbgs() << "Cannot fuse loop nests in different blocks\n");
    return FusionResult::FailPrecondition;
  }

  // Return 'failure' if no valid insertion point for fused loop nest in 'block'
  // exists which would preserve dependences.
  if (!getFusedLoopNestInsertionPoint(srcForOp, dstForOp)) {
    LLVM_DEBUG(llvm::dbgs() << "Fusion would violate dependences in block\n");
    return FusionResult::FailBlockDependence;
  }

  // Check if 'srcForOp' precedes 'dstForOp' in 'block'.
  bool isSrcForOpBeforeDstForOp = srcForOp->isBeforeInBlock(dstForOp);
  // 'forOpA' executes before 'forOpB' in 'block'.
  auto forOpA = isSrcForOpBeforeDstForOp ? srcForOp : dstForOp;
  auto forOpB = isSrcForOpBeforeDstForOp ? dstForOp : srcForOp;

  // Gather all load and store from 'forOpA' which precedes 'forOpB' in 'block'.
  SmallVector<Operation *, 4> opsA;
  if (!gatherLoadsAndStores(forOpA, opsA)) {
    LLVM_DEBUG(llvm::dbgs() << "Fusing loops with affine.if unsupported\n");
    return FusionResult::FailPrecondition;
  }

  // Gather all load and store from 'forOpB' which succeeds 'forOpA' in 'block'.
  SmallVector<Operation *, 4> opsB;
  if (!gatherLoadsAndStores(forOpB, opsB)) {
    LLVM_DEBUG(llvm::dbgs() << "Fusing loops with affine.if unsupported\n");
    return FusionResult::FailPrecondition;
  }

  // Return 'failure' if fusing loops at depth 'dstLoopDepth' wouldn't preserve
  // loop dependences.
  // TODO: Enable this check for sibling and more generic loop fusion
  // strategies.
  if (fusionStrategy.getStrategy() == FusionStrategy::ProducerConsumer) {
    // TODO: 'getMaxLoopDepth' does not support forward slice fusion.
    assert(isSrcForOpBeforeDstForOp && "Unexpected forward slice fusion");
    if (getMaxLoopDepth(opsA, opsB) < dstLoopDepth) {
      LLVM_DEBUG(llvm::dbgs() << "Fusion would violate loop dependences\n");
      return FusionResult::FailFusionDependence;
    }
  }

  // Calculate the number of common loops surrounding 'srcForOp' and 'dstForOp'.
  unsigned numCommonLoops =
      affine::getNumCommonSurroundingLoops(*srcForOp, *dstForOp);

  // Filter out ops in 'opsA' to compute the slice union based on the
  // assumptions made by the fusion strategy.
  SmallVector<Operation *, 4> strategyOpsA;
  switch (fusionStrategy.getStrategy()) {
  case FusionStrategy::Generic:
    // Generic fusion. Take into account all the memory operations to compute
    // the slice union.
    strategyOpsA.append(opsA.begin(), opsA.end());
    break;
  case FusionStrategy::ProducerConsumer:
    // Producer-consumer fusion (AffineLoopFusion pass) only takes into
    // account stores in 'srcForOp' to compute the slice union.
    for (Operation *op : opsA) {
      if (isa<AffineWriteOpInterface>(op))
        strategyOpsA.push_back(op);
    }
    break;
  case FusionStrategy::Sibling:
    // Sibling fusion (AffineLoopFusion pass) only takes into account the loads
    // to 'memref' in 'srcForOp' to compute the slice union.
    for (Operation *op : opsA) {
      auto load = dyn_cast<AffineReadOpInterface>(op);
      if (load && load.getMemRef() == fusionStrategy.getSiblingFusionMemRef())
        strategyOpsA.push_back(op);
    }
    break;
  }

  // Compute union of computation slices computed between all pairs of ops
  // from 'forOpA' and 'forOpB'.
  SliceComputationResult sliceComputationResult = affine::computeSliceUnion(
      strategyOpsA, opsB, dstLoopDepth, numCommonLoops,
      isSrcForOpBeforeDstForOp, srcSlice);
  if (sliceComputationResult.value == SliceComputationResult::GenericFailure) {
    LLVM_DEBUG(llvm::dbgs() << "computeSliceUnion failed\n");
    return FusionResult::FailPrecondition;
  }
  if (sliceComputationResult.value ==
      SliceComputationResult::IncorrectSliceFailure) {
    LLVM_DEBUG(llvm::dbgs() << "Incorrect slice computation\n");
    return FusionResult::FailIncorrectSlice;
  }

  return FusionResult::Success;
}

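// Example usage (editor's sketch, not part of the upstream implementation):
// a driver can probe destination depths from deepest to shallowest to find the
// maximal legal producer-consumer fusion depth. 'srcNest', 'dstNest' and
// 'maxDstDepth' are hypothetical values supplied by the caller.
//
//   ComputationSliceState srcSlice;
//   unsigned fusionDepth = 0;
//   for (unsigned d = maxDstDepth; d >= 1; --d) {
//     ComputationSliceState candidate;
//     if (affine::canFuseLoops(srcNest, dstNest, /*dstLoopDepth=*/d,
//                              &candidate, FusionStrategy::ProducerConsumer)
//             .value == FusionResult::Success) {
//       srcSlice = candidate;
//       fusionDepth = d;
//       break;
//     }
//   }
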
/// Patch the loop body of a forOp that is a single iteration reduction loop
/// into its containing block.
static LogicalResult promoteSingleIterReductionLoop(AffineForOp forOp,
                                                    bool siblingFusionUser) {
  // Check if the reduction loop is a single iteration loop.
  std::optional<uint64_t> tripCount = getConstantTripCount(forOp);
  if (!tripCount || *tripCount != 1)
    return failure();
  auto *parentOp = forOp->getParentOp();
  if (!isa<AffineForOp>(parentOp))
    return failure();
  SmallVector<Value> newOperands;
  llvm::append_range(newOperands,
                     forOp.getBody()->getTerminator()->getOperands());
  IRRewriter rewriter(parentOp->getContext());
  int64_t parentOpNumResults = parentOp->getNumResults();
  // Replace the parent loop and add iter operands and results from the
  // `forOp`.
  AffineForOp parentForOp = forOp->getParentOfType<AffineForOp>();
  AffineForOp newLoop =
      cast<AffineForOp>(*parentForOp.replaceWithAdditionalYields(
          rewriter, forOp.getInits(), /*replaceInitOperandUsesInLoop=*/false,
          [&](OpBuilder &b, Location loc, ArrayRef<BlockArgument> newBbArgs) {
            return newOperands;
          }));

  // For sibling-fusion users, collect operations that use the results of the
  // `forOp` outside the new parent loop that has absorbed all its iter args
  // and operands. These operations will be moved later after the results
  // have been replaced.
  SetVector<Operation *> forwardSlice;
  if (siblingFusionUser) {
    for (unsigned i = 0, e = forOp.getNumResults(); i != e; ++i) {
      SetVector<Operation *> tmpForwardSlice;
      getForwardSlice(forOp.getResult(i), &tmpForwardSlice);
      forwardSlice.set_union(tmpForwardSlice);
    }
  }
  // Update the results of the `forOp` in the new loop.
  for (unsigned i = 0, e = forOp.getNumResults(); i != e; ++i) {
    forOp.getResult(i).replaceAllUsesWith(
        newLoop.getResult(i + parentOpNumResults));
  }
  // For sibling-fusion users, move operations that use the results of the
  // `forOp` outside the new parent loop.
  if (siblingFusionUser) {
    topologicalSort(forwardSlice);
    for (Operation *op : llvm::reverse(forwardSlice))
      op->moveAfter(newLoop);
  }
  // Replace the induction variable.
  auto iv = forOp.getInductionVar();
  iv.replaceAllUsesWith(newLoop.getInductionVar());
  // Replace the iter args.
  auto forOpIterArgs = forOp.getRegionIterArgs();
  for (auto it : llvm::zip(forOpIterArgs, newLoop.getRegionIterArgs().take_back(
                                              forOpIterArgs.size()))) {
    std::get<0>(it).replaceAllUsesWith(std::get<1>(it));
  }
  // Move the loop body operations, except for its terminator, to the loop's
  // containing block.
  forOp.getBody()->back().erase();
  auto *parentBlock = forOp->getBlock();
  parentBlock->getOperations().splice(Block::iterator(forOp),
                                      forOp.getBody()->getOperations());
  forOp.erase();
  return success();
}

/// Fuses 'srcForOp' into 'dstForOp' with destination loop block insertion point
/// and source slice loop bounds specified in 'srcSlice'.
void mlir::affine::fuseLoops(AffineForOp srcForOp, AffineForOp dstForOp,
                             const ComputationSliceState &srcSlice,
                             bool isInnermostSiblingInsertion) {
  // Clone 'srcForOp' into 'dstForOp' at 'srcSlice->insertPoint'.
  OpBuilder b(srcSlice.insertPoint->getBlock(), srcSlice.insertPoint);
  IRMapping mapper;
  b.clone(*srcForOp, mapper);

  // Update 'sliceLoopNest' upper and lower bounds from computed 'srcSlice'.
  SmallVector<AffineForOp, 4> sliceLoops;
  for (unsigned i = 0, e = srcSlice.ivs.size(); i < e; ++i) {
    auto loopIV = mapper.lookupOrNull(srcSlice.ivs[i]);
    if (!loopIV)
      continue;
    auto forOp = getForInductionVarOwner(loopIV);
    sliceLoops.push_back(forOp);
    if (AffineMap lbMap = srcSlice.lbs[i]) {
      auto lbOperands = srcSlice.lbOperands[i];
      canonicalizeMapAndOperands(&lbMap, &lbOperands);
      forOp.setLowerBound(lbOperands, lbMap);
    }
    if (AffineMap ubMap = srcSlice.ubs[i]) {
      auto ubOperands = srcSlice.ubOperands[i];
      canonicalizeMapAndOperands(&ubMap, &ubOperands);
      forOp.setUpperBound(ubOperands, ubMap);
    }
  }

  llvm::SmallDenseMap<Operation *, uint64_t, 8> sliceTripCountMap;
  auto srcIsUnitSlice = [&]() {
    return (buildSliceTripCountMap(srcSlice, &sliceTripCountMap) &&
            (getSliceIterationCount(sliceTripCountMap) == 1));
  };
  // Fix up and if possible, eliminate single iteration loops.
  for (AffineForOp forOp : sliceLoops) {
    if (isLoopParallelAndContainsReduction(forOp) &&
        isInnermostSiblingInsertion && srcIsUnitSlice())
      // Patch reduction loop - only ones that are sibling-fused with the
      // destination loop - into the parent loop.
      (void)promoteSingleIterReductionLoop(forOp, true);
    else
      // Promote any single iteration slice loops.
      (void)promoteIfSingleIteration(forOp);
  }
}

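// Example usage (editor's sketch): the typical check-then-transform sequence.
// 'srcNest', 'dstNest' and 'depth' are hypothetical; removing the original
// producer nest once it is known to be dead is left to the caller.
//
//   ComputationSliceState slice;
//   if (affine::canFuseLoops(srcNest, dstNest, /*dstLoopDepth=*/depth, &slice,
//                            FusionStrategy::ProducerConsumer)
//           .value == FusionResult::Success)
//     affine::fuseLoops(srcNest, dstNest, slice);
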
/// Collect loop nest statistics (e.g. loop trip count and operation count)
/// in 'stats' for loop nest rooted at 'forOp'. Returns true on success,
/// returns false otherwise.
bool mlir::affine::getLoopNestStats(AffineForOp forOpRoot,
                                    LoopNestStats *stats) {
  auto walkResult = forOpRoot.walk([&](AffineForOp forOp) {
    auto *childForOp = forOp.getOperation();
    auto *parentForOp = forOp->getParentOp();
    if (forOp != forOpRoot) {
      if (!isa<AffineForOp>(parentForOp)) {
        LLVM_DEBUG(llvm::dbgs() << "Expected parent AffineForOp\n");
        return WalkResult::interrupt();
      }
      // Add mapping to 'forOp' from its parent AffineForOp.
      stats->loopMap[parentForOp].push_back(forOp);
    }

    // Record the number of operations in the body of 'forOp'.
    unsigned count = 0;
    stats->opCountMap[childForOp] = 0;
    for (auto &op : *forOp.getBody()) {
      if (!isa<AffineForOp, AffineIfOp>(op))
        ++count;
    }
    stats->opCountMap[childForOp] = count;

    // Record trip count for 'forOp'. Set flag if trip count is not
    // constant.
    std::optional<uint64_t> maybeConstTripCount = getConstantTripCount(forOp);
    if (!maybeConstTripCount) {
      // Currently only constant trip count loop nests are supported.
      LLVM_DEBUG(llvm::dbgs() << "Non-constant trip count unsupported\n");
      return WalkResult::interrupt();
    }

    stats->tripCountMap[childForOp] = *maybeConstTripCount;
    return WalkResult::advance();
  });
  return !walkResult.wasInterrupted();
}

// Computes the total cost of the loop nest rooted at 'forOp'.
// Currently, the total cost is computed by counting the total operation
// instance count (i.e. total number of operations in the loop body * loop
// trip count) for the entire loop nest.
// If 'tripCountOverrideMap' is non-null, overrides the trip count for loops
// specified in the map when computing the total op instance count.
// NOTEs: 1) This is used to compute the cost of computation slices, which are
// sliced along the iteration dimension, and thus reduce the trip count.
// If 'computeCostMap' is non-null, the total op count for forOps specified
// in the map is increased (not overridden) by adding the op count from the
// map to the existing op count for the for loop. This is done before
// multiplying by the loop's trip count, and is used to model the cost of
// inserting a sliced loop nest of known cost into the loop's body.
// 2) This is also used to compute the cost of fusing a slice of some loop nest
// within another loop.
static int64_t getComputeCostHelper(
    Operation *forOp, LoopNestStats &stats,
    llvm::SmallDenseMap<Operation *, uint64_t, 8> *tripCountOverrideMap,
    DenseMap<Operation *, int64_t> *computeCostMap) {
  // 'opCount' is the total number of operations in one iteration of 'forOp'
  // body, minus the terminator op which is a no-op.
  int64_t opCount = stats.opCountMap[forOp] - 1;
  if (stats.loopMap.count(forOp) > 0) {
    for (auto childForOp : stats.loopMap[forOp]) {
      opCount += getComputeCostHelper(childForOp, stats, tripCountOverrideMap,
                                      computeCostMap);
    }
  }
  // Add in additional op instances from slice (if specified in map).
  if (computeCostMap != nullptr) {
    auto it = computeCostMap->find(forOp);
    if (it != computeCostMap->end()) {
      opCount += it->second;
    }
  }
  // Override trip count (if specified in map).
  int64_t tripCount = stats.tripCountMap[forOp];
  if (tripCountOverrideMap != nullptr) {
    auto it = tripCountOverrideMap->find(forOp);
    if (it != tripCountOverrideMap->end()) {
      tripCount = it->second;
    }
  }
  // Returns the total number of dynamic instances of operations in loop body.
  return tripCount * opCount;
}

/// Computes the total cost of the loop nest rooted at 'forOp' using 'stats'.
/// Currently, the total cost is computed by counting the total operation
/// instance count (i.e. total number of operations in the loop body * loop
/// trip count) for the entire loop nest.
int64_t mlir::affine::getComputeCost(AffineForOp forOp, LoopNestStats &stats) {
  return getComputeCostHelper(forOp, stats,
                              /*tripCountOverrideMap=*/nullptr,
                              /*computeCostMap=*/nullptr);
}

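// Example usage (editor's sketch): collecting stats and querying the cost of a
// single nest. 'forNest' is a hypothetical AffineForOp whose loops all have
// constant trip counts.
//
//   LoopNestStats stats;
//   if (getLoopNestStats(forNest, &stats)) {
//     int64_t cost = getComputeCost(forNest, stats);
//     LLVM_DEBUG(llvm::dbgs() << "op instance count: " << cost << "\n");
//   }
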
/// Computes and returns in 'computeCost', the total compute cost of fusing the
/// 'slice' of the loop nest rooted at 'srcForOp' into 'dstForOp'. Currently,
/// the total cost is computed by counting the total operation instance count
/// (i.e. total number of operations in the loop body * loop trip count) for
/// the entire loop nest.
bool mlir::affine::getFusionComputeCost(AffineForOp srcForOp,
                                        LoopNestStats &srcStats,
                                        AffineForOp dstForOp,
                                        LoopNestStats &dstStats,
                                        const ComputationSliceState &slice,
                                        int64_t *computeCost) {
  llvm::SmallDenseMap<Operation *, uint64_t, 8> sliceTripCountMap;
  DenseMap<Operation *, int64_t> computeCostMap;

  // Build trip count map for computation slice.
  if (!buildSliceTripCountMap(slice, &sliceTripCountMap))
    return false;
  // Checks whether a store to load forwarding will happen.
  int64_t sliceIterationCount = getSliceIterationCount(sliceTripCountMap);
  assert(sliceIterationCount > 0);
  bool storeLoadFwdGuaranteed = (sliceIterationCount == 1);
  auto *insertPointParent = slice.insertPoint->getParentOp();

  // The store and loads to this memref will disappear.
  // TODO: Add load coalescing to memref data flow opt pass.
  if (storeLoadFwdGuaranteed) {
    // Subtract from operation count the loads/store we expect load/store
    // forwarding to remove.
    unsigned storeCount = 0;
    llvm::SmallDenseSet<Value, 4> storeMemrefs;
    srcForOp.walk([&](Operation *op) {
      if (auto storeOp = dyn_cast<AffineWriteOpInterface>(op)) {
        storeMemrefs.insert(storeOp.getMemRef());
        ++storeCount;
      }
    });
    // Subtract out any store ops in single-iteration src slice loop nest.
    if (storeCount > 0)
      computeCostMap[insertPointParent] = -static_cast<int64_t>(storeCount);
    // Subtract out any load users of 'storeMemrefs' nested below
    // 'insertPointParent'.
    for (Value memref : storeMemrefs) {
      for (auto *user : memref.getUsers()) {
        if (dyn_cast<AffineReadOpInterface>(user)) {
          SmallVector<AffineForOp, 4> loops;
          // Check if any loop in loop nest surrounding 'user' is
          // 'insertPointParent'.
          getAffineForIVs(*user, &loops);
          if (llvm::is_contained(loops, cast<AffineForOp>(insertPointParent))) {
            if (auto forOp =
                    dyn_cast_or_null<AffineForOp>(user->getParentOp())) {
              if (computeCostMap.count(forOp) == 0)
                computeCostMap[forOp] = 0;
              computeCostMap[forOp] -= 1;
            }
          }
        }
      }
    }
  }

  // Compute op instance count for the src loop nest with iteration slicing.
  int64_t sliceComputeCost = getComputeCostHelper(
      srcForOp, srcStats, &sliceTripCountMap, &computeCostMap);

  // Compute cost of fusion for this depth.
  computeCostMap[insertPointParent] = sliceComputeCost;

  *computeCost =
      getComputeCostHelper(dstForOp, dstStats,
                           /*tripCountOverrideMap=*/nullptr, &computeCostMap);
  return true;
}

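// Example usage (editor's sketch): estimating the additional computation a
// candidate fusion would introduce, in the spirit of the profitability
// heuristics used by producer-consumer fusion drivers. 'srcNest', 'dstNest',
// their stats, and 'slice' are hypothetical inputs from earlier analysis.
//
//   int64_t srcCost = getComputeCost(srcNest, srcStats);
//   int64_t dstCost = getComputeCost(dstNest, dstStats);
//   int64_t fusedCost;
//   if (getFusionComputeCost(srcNest, srcStats, dstNest, dstStats, slice,
//                            &fusedCost)) {
//     double additionalComputeFraction =
//         fusedCost / static_cast<double>(srcCost + dstCost) - 1;
//     LLVM_DEBUG(llvm::dbgs() << "additional compute fraction: "
//                             << additionalComputeFraction << "\n");
//   }
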
/// Returns in 'producerConsumerMemrefs' the memrefs involved in a
/// producer-consumer dependence between write ops in 'srcOps' and read ops in
/// 'dstOps'.
void mlir::affine::gatherProducerConsumerMemrefs(
    ArrayRef<Operation *> srcOps, ArrayRef<Operation *> dstOps,
    DenseSet<Value> &producerConsumerMemrefs) {
  // Gather memrefs from stores in 'srcOps'.
  DenseSet<Value> srcStoreMemRefs;
  for (Operation *op : srcOps)
    if (auto storeOp = dyn_cast<AffineWriteOpInterface>(op))
      srcStoreMemRefs.insert(storeOp.getMemRef());

  // Compute the intersection between memrefs from stores in 'srcOps' and
  // memrefs from loads in 'dstOps'.
  for (Operation *op : dstOps)
    if (auto loadOp = dyn_cast<AffineReadOpInterface>(op))
      if (srcStoreMemRefs.count(loadOp.getMemRef()) > 0)
        producerConsumerMemrefs.insert(loadOp.getMemRef());
}