MLIR  21.0.0git
LoopFusionUtils.cpp
Go to the documentation of this file.
1 //===- LoopFusionUtils.cpp ---- Utilities for loop fusion ----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements loop fusion transformation utility functions.
10 //
11 //===----------------------------------------------------------------------===//
12 
21 #include "mlir/IR/IRMapping.h"
22 #include "mlir/IR/Operation.h"
23 #include "mlir/IR/PatternMatch.h"
24 #include "llvm/Support/Debug.h"
25 #include "llvm/Support/raw_ostream.h"
26 #include <optional>
27 
28 #define DEBUG_TYPE "affine-fusion-utils"
29 
30 using namespace mlir;
31 using namespace mlir::affine;
32 
33 // Gathers all load and store memref accesses in 'opA' into 'values', where
34 // 'values[memref] == true' for each store operation.
36  DenseMap<Value, bool> &values) {
37  opA->walk([&](Operation *op) {
38  if (auto loadOp = dyn_cast<AffineReadOpInterface>(op)) {
39  if (values.count(loadOp.getMemRef()) == 0)
40  values[loadOp.getMemRef()] = false;
41  } else if (auto storeOp = dyn_cast<AffineWriteOpInterface>(op)) {
42  values[storeOp.getMemRef()] = true;
43  }
44  });
45 }
46 
47 /// Returns true if 'op' is a load or store operation which access a memref
48 /// accessed 'values' and at least one of the access is a store operation.
49 /// Returns false otherwise.
51  DenseMap<Value, bool> &values) {
52  if (auto loadOp = dyn_cast<AffineReadOpInterface>(op))
53  return values.count(loadOp.getMemRef()) > 0 && values[loadOp.getMemRef()];
54  if (auto storeOp = dyn_cast<AffineWriteOpInterface>(op))
55  return values.count(storeOp.getMemRef()) > 0;
56  return false;
57 }
58 
59 // Returns the first operation in range ('opA', 'opB') which has a data
60 // dependence on 'opA'. Returns 'nullptr' of no dependence exists.
62  // Record memref values from all loads/store in loop nest rooted at 'opA'.
63  // Map from memref value to bool which is true if store, false otherwise.
64  DenseMap<Value, bool> values;
65  getLoadAndStoreMemRefAccesses(opA, values);
66 
67  // For each 'opX' in block in range ('opA', 'opB'), check if there is a data
68  // dependence from 'opA' to 'opX' ('opA' and 'opX' access the same memref
69  // and at least one of the accesses is a store).
70  Operation *firstDepOp = nullptr;
71  for (Block::iterator it = std::next(Block::iterator(opA));
72  it != Block::iterator(opB); ++it) {
73  Operation *opX = &(*it);
74  opX->walk([&](Operation *op) {
75  if (!firstDepOp && isDependentLoadOrStoreOp(op, values))
76  firstDepOp = opX;
77  });
78  if (firstDepOp)
79  break;
80  }
81  return firstDepOp;
82 }
83 
84 // Returns the last operation 'opX' in range ('opA', 'opB'), for which there
85 // exists a data dependence from 'opX' to 'opB'.
86 // Returns 'nullptr' of no dependence exists.
88  // Record memref values from all loads/store in loop nest rooted at 'opB'.
89  // Map from memref value to bool which is true if store, false otherwise.
90  DenseMap<Value, bool> values;
91  getLoadAndStoreMemRefAccesses(opB, values);
92 
93  // For each 'opX' in block in range ('opA', 'opB') in reverse order,
94  // check if there is a data dependence from 'opX' to 'opB':
95  // *) 'opX' and 'opB' access the same memref and at least one of the accesses
96  // is a store.
97  // *) 'opX' produces an SSA Value which is used by 'opB'.
98  Operation *lastDepOp = nullptr;
99  for (Block::reverse_iterator it = std::next(Block::reverse_iterator(opB));
100  it != Block::reverse_iterator(opA); ++it) {
101  Operation *opX = &(*it);
102  opX->walk([&](Operation *op) {
103  if (isa<AffineReadOpInterface, AffineWriteOpInterface>(op)) {
104  if (isDependentLoadOrStoreOp(op, values)) {
105  lastDepOp = opX;
106  return WalkResult::interrupt();
107  }
108  return WalkResult::advance();
109  }
110  for (Value value : op->getResults()) {
111  for (Operation *user : value.getUsers()) {
113  // Check if any loop in loop nest surrounding 'user' is 'opB'.
114  getAffineForIVs(*user, &loops);
115  if (llvm::is_contained(loops, cast<AffineForOp>(opB))) {
116  lastDepOp = opX;
117  return WalkResult::interrupt();
118  }
119  }
120  }
121  return WalkResult::advance();
122  });
123  if (lastDepOp)
124  break;
125  }
126  return lastDepOp;
127 }
128 
129 // Computes and returns an insertion point operation, before which the
130 // the fused <srcForOp, dstForOp> loop nest can be inserted while preserving
131 // dependences. Returns nullptr if no such insertion point is found.
132 static Operation *getFusedLoopNestInsertionPoint(AffineForOp srcForOp,
133  AffineForOp dstForOp) {
134  bool isSrcForOpBeforeDstForOp = srcForOp->isBeforeInBlock(dstForOp);
135  auto forOpA = isSrcForOpBeforeDstForOp ? srcForOp : dstForOp;
136  auto forOpB = isSrcForOpBeforeDstForOp ? dstForOp : srcForOp;
137 
138  Operation *firstDepOpA = getFirstDependentOpInRange(forOpA, forOpB);
139  Operation *lastDepOpB = getLastDependentOpInRange(forOpA, forOpB);
140  // Block:
141  // ...
142  // |-- opA
143  // | ...
144  // | lastDepOpB --|
145  // | ... |
146  // |-> firstDepOpA |
147  // ... |
148  // opB <---------
149  //
150  // Valid insertion point range: (lastDepOpB, firstDepOpA)
151  //
152  if (firstDepOpA) {
153  if (lastDepOpB) {
154  if (firstDepOpA->isBeforeInBlock(lastDepOpB) || firstDepOpA == lastDepOpB)
155  // No valid insertion point exists which preserves dependences.
156  return nullptr;
157  }
158  // Return insertion point in valid range closest to 'opB'.
159  // TODO: Consider other insertion points in valid range.
160  return firstDepOpA;
161  }
162  // No dependences from 'opA' to operation in range ('opA', 'opB'), return
163  // 'opB' insertion point.
164  return forOpB;
165 }
166 
167 // Gathers all load and store ops in loop nest rooted at 'forOp' into
168 // 'loadAndStoreOps'.
169 static bool
170 gatherLoadsAndStores(AffineForOp forOp,
171  SmallVectorImpl<Operation *> &loadAndStoreOps) {
172  bool hasIfOp = false;
173  forOp.walk([&](Operation *op) {
174  if (isa<AffineReadOpInterface, AffineWriteOpInterface>(op))
175  loadAndStoreOps.push_back(op);
176  else if (isa<AffineIfOp>(op))
177  hasIfOp = true;
178  });
179  return !hasIfOp;
180 }
181 
182 /// Returns the maximum loop depth at which we could fuse producer loop
183 /// 'srcForOp' into consumer loop 'dstForOp' without violating data dependences.
184 // TODO: Generalize this check for sibling and more generic fusion scenarios.
185 // TODO: Support forward slice fusion.
186 static unsigned getMaxLoopDepth(ArrayRef<Operation *> srcOps,
187  ArrayRef<Operation *> dstOps) {
188  if (dstOps.empty())
189  // Expected at least one memory operation.
190  // TODO: Revisit this case with a specific example.
191  return 0;
192 
193  // Filter out ops in 'dstOps' that do not use the producer-consumer memref so
194  // that they are not considered for analysis.
195  DenseSet<Value> producerConsumerMemrefs;
196  gatherProducerConsumerMemrefs(srcOps, dstOps, producerConsumerMemrefs);
197  SmallVector<Operation *, 4> targetDstOps;
198  for (Operation *dstOp : dstOps) {
199  auto loadOp = dyn_cast<AffineReadOpInterface>(dstOp);
200  Value memref = loadOp ? loadOp.getMemRef()
201  : cast<AffineWriteOpInterface>(dstOp).getMemRef();
202  if (producerConsumerMemrefs.count(memref) > 0)
203  targetDstOps.push_back(dstOp);
204  }
205 
206  assert(!targetDstOps.empty() &&
207  "No dependences between 'srcForOp' and 'dstForOp'?");
208 
209  // Compute the innermost common loop depth for loads and stores.
210  unsigned loopDepth = getInnermostCommonLoopDepth(targetDstOps);
211 
212  // Return common loop depth for loads if there are no store ops.
213  if (all_of(targetDstOps, llvm::IsaPred<AffineReadOpInterface>))
214  return loopDepth;
215 
216  // Check dependences on all pairs of ops in 'targetDstOps' and store the
217  // minimum loop depth at which a dependence is satisfied.
218  for (unsigned i = 0, e = targetDstOps.size(); i < e; ++i) {
219  Operation *srcOpInst = targetDstOps[i];
220  MemRefAccess srcAccess(srcOpInst);
221  for (unsigned j = 0; j < e; ++j) {
222  auto *dstOpInst = targetDstOps[j];
223  MemRefAccess dstAccess(dstOpInst);
224 
225  unsigned numCommonLoops =
226  getNumCommonSurroundingLoops(*srcOpInst, *dstOpInst);
227  for (unsigned d = 1; d <= numCommonLoops + 1; ++d) {
228  // TODO: Cache dependence analysis results, check cache here.
229  DependenceResult result =
230  checkMemrefAccessDependence(srcAccess, dstAccess, d);
231  if (hasDependence(result)) {
232  // Store minimum loop depth and break because we want the min 'd' at
233  // which there is a dependence.
234  loopDepth = std::min(loopDepth, d - 1);
235  break;
236  }
237  }
238  }
239  }
240 
241  return loopDepth;
242 }
243 
244 // TODO: This pass performs some computation that is the same for all the depths
245 // (e.g., getMaxLoopDepth). Implement a version of this utility that processes
246 // all the depths at once or only the legal maximal depth for maximal fusion.
248  AffineForOp dstForOp,
249  unsigned dstLoopDepth,
250  ComputationSliceState *srcSlice,
251  FusionStrategy fusionStrategy) {
252  // Return 'failure' if 'dstLoopDepth == 0'.
253  if (dstLoopDepth == 0) {
254  LLVM_DEBUG(llvm::dbgs() << "Cannot fuse loop nests at depth 0\n");
256  }
257  // Return 'failure' if 'srcForOp' and 'dstForOp' are not in the same block.
258  auto *block = srcForOp->getBlock();
259  if (block != dstForOp->getBlock()) {
260  LLVM_DEBUG(llvm::dbgs() << "Cannot fuse loop nests in different blocks\n");
262  }
263 
264  // Return 'failure' if no valid insertion point for fused loop nest in 'block'
265  // exists which would preserve dependences.
266  if (!getFusedLoopNestInsertionPoint(srcForOp, dstForOp)) {
267  LLVM_DEBUG(llvm::dbgs() << "Fusion would violate dependences in block\n");
269  }
270 
271  // Check if 'srcForOp' precedes 'dstForOp' in 'block'.
272  bool isSrcForOpBeforeDstForOp = srcForOp->isBeforeInBlock(dstForOp);
273  // 'forOpA' executes before 'forOpB' in 'block'.
274  auto forOpA = isSrcForOpBeforeDstForOp ? srcForOp : dstForOp;
275  auto forOpB = isSrcForOpBeforeDstForOp ? dstForOp : srcForOp;
276 
277  // Gather all load and store from 'forOpA' which precedes 'forOpB' in 'block'.
279  if (!gatherLoadsAndStores(forOpA, opsA)) {
280  LLVM_DEBUG(llvm::dbgs() << "Fusing loops with affine.if unsupported\n");
282  }
283 
284  // Gather all load and store from 'forOpB' which succeeds 'forOpA' in 'block'.
286  if (!gatherLoadsAndStores(forOpB, opsB)) {
287  LLVM_DEBUG(llvm::dbgs() << "Fusing loops with affine.if unsupported\n");
289  }
290 
291  // Return 'failure' if fusing loops at depth 'dstLoopDepth' wouldn't preserve
292  // loop dependences.
293  // TODO: Enable this check for sibling and more generic loop fusion
294  // strategies.
295  if (fusionStrategy.getStrategy() == FusionStrategy::ProducerConsumer) {
296  // TODO: 'getMaxLoopDepth' does not support forward slice fusion.
297  assert(isSrcForOpBeforeDstForOp && "Unexpected forward slice fusion");
298  if (getMaxLoopDepth(opsA, opsB) < dstLoopDepth) {
299  LLVM_DEBUG(llvm::dbgs() << "Fusion would violate loop dependences\n");
301  }
302  }
303 
304  // Calculate the number of common loops surrounding 'srcForOp' and 'dstForOp'.
305  unsigned numCommonLoops =
306  affine::getNumCommonSurroundingLoops(*srcForOp, *dstForOp);
307 
308  // Filter out ops in 'opsA' to compute the slice union based on the
309  // assumptions made by the fusion strategy.
310  SmallVector<Operation *, 4> strategyOpsA;
311  switch (fusionStrategy.getStrategy()) {
313  // Generic fusion. Take into account all the memory operations to compute
314  // the slice union.
315  strategyOpsA.append(opsA.begin(), opsA.end());
316  break;
318  // Producer-consumer fusion (AffineLoopFusion pass) only takes into
319  // account stores in 'srcForOp' to compute the slice union.
320  for (Operation *op : opsA) {
321  if (isa<AffineWriteOpInterface>(op))
322  strategyOpsA.push_back(op);
323  }
324  break;
326  // Sibling fusion (AffineLoopFusion pass) only takes into account the loads
327  // to 'memref' in 'srcForOp' to compute the slice union.
328  for (Operation *op : opsA) {
329  auto load = dyn_cast<AffineReadOpInterface>(op);
330  if (load && load.getMemRef() == fusionStrategy.getSiblingFusionMemRef())
331  strategyOpsA.push_back(op);
332  }
333  break;
334  }
335 
336  // Compute union of computation slices computed between all pairs of ops
337  // from 'forOpA' and 'forOpB'.
338  SliceComputationResult sliceComputationResult = affine::computeSliceUnion(
339  strategyOpsA, opsB, dstLoopDepth, numCommonLoops,
340  isSrcForOpBeforeDstForOp, srcSlice);
341  if (sliceComputationResult.value == SliceComputationResult::GenericFailure) {
342  LLVM_DEBUG(llvm::dbgs() << "computeSliceUnion failed\n");
344  }
345  if (sliceComputationResult.value ==
347  LLVM_DEBUG(llvm::dbgs() << "Incorrect slice computation\n");
349  }
350 
351  return FusionResult::Success;
352 }
353 
354 /// Patch the loop body of a forOp that is a single iteration reduction loop
355 /// into its containing block.
356 static LogicalResult promoteSingleIterReductionLoop(AffineForOp forOp,
357  bool siblingFusionUser) {
358  // Check if the reduction loop is a single iteration loop.
359  std::optional<uint64_t> tripCount = getConstantTripCount(forOp);
360  if (!tripCount || *tripCount != 1)
361  return failure();
362  auto *parentOp = forOp->getParentOp();
363  if (!isa<AffineForOp>(parentOp))
364  return failure();
365  SmallVector<Value> newOperands;
366  llvm::append_range(newOperands,
367  forOp.getBody()->getTerminator()->getOperands());
368  IRRewriter rewriter(parentOp->getContext());
369  int64_t parentOpNumResults = parentOp->getNumResults();
370  // Replace the parent loop and add iteroperands and results from the `forOp`.
371  AffineForOp parentForOp = forOp->getParentOfType<AffineForOp>();
372  AffineForOp newLoop =
373  cast<AffineForOp>(*parentForOp.replaceWithAdditionalYields(
374  rewriter, forOp.getInits(), /*replaceInitOperandUsesInLoop=*/false,
375  [&](OpBuilder &b, Location loc, ArrayRef<BlockArgument> newBbArgs) {
376  return newOperands;
377  }));
378 
379  // For sibling-fusion users, collect operations that use the results of the
380  // `forOp` outside the new parent loop that has absorbed all its iter args
381  // and operands. These operations will be moved later after the results
382  // have been replaced.
383  SetVector<Operation *> forwardSlice;
384  if (siblingFusionUser) {
385  for (unsigned i = 0, e = forOp.getNumResults(); i != e; ++i) {
386  SetVector<Operation *> tmpForwardSlice;
387  getForwardSlice(forOp.getResult(i), &tmpForwardSlice);
388  forwardSlice.set_union(tmpForwardSlice);
389  }
390  }
391  // Update the results of the `forOp` in the new loop.
392  for (unsigned i = 0, e = forOp.getNumResults(); i != e; ++i) {
393  forOp.getResult(i).replaceAllUsesWith(
394  newLoop.getResult(i + parentOpNumResults));
395  }
396  // For sibling-fusion users, move operations that use the results of the
397  // `forOp` outside the new parent loop
398  if (siblingFusionUser) {
399  topologicalSort(forwardSlice);
400  for (Operation *op : llvm::reverse(forwardSlice))
401  op->moveAfter(newLoop);
402  }
403  // Replace the induction variable.
404  auto iv = forOp.getInductionVar();
405  iv.replaceAllUsesWith(newLoop.getInductionVar());
406  // Replace the iter args.
407  auto forOpIterArgs = forOp.getRegionIterArgs();
408  for (auto it : llvm::zip(forOpIterArgs, newLoop.getRegionIterArgs().take_back(
409  forOpIterArgs.size()))) {
410  std::get<0>(it).replaceAllUsesWith(std::get<1>(it));
411  }
412  // Move the loop body operations, except for its terminator, to the loop's
413  // containing block.
414  forOp.getBody()->back().erase();
415  auto *parentBlock = forOp->getBlock();
416  parentBlock->getOperations().splice(Block::iterator(forOp),
417  forOp.getBody()->getOperations());
418  forOp.erase();
419  return success();
420 }
421 
422 /// Fuses 'srcForOp' into 'dstForOp' with destination loop block insertion point
423 /// and source slice loop bounds specified in 'srcSlice'.
424 void mlir::affine::fuseLoops(AffineForOp srcForOp, AffineForOp dstForOp,
425  const ComputationSliceState &srcSlice,
426  bool isInnermostSiblingInsertion) {
427  // Clone 'srcForOp' into 'dstForOp' at 'srcSlice->insertPoint'.
428  OpBuilder b(srcSlice.insertPoint->getBlock(), srcSlice.insertPoint);
429  IRMapping mapper;
430  b.clone(*srcForOp, mapper);
431 
432  // Update 'sliceLoopNest' upper and lower bounds from computed 'srcSlice'.
433  SmallVector<AffineForOp, 4> sliceLoops;
434  for (unsigned i = 0, e = srcSlice.ivs.size(); i < e; ++i) {
435  auto loopIV = mapper.lookupOrNull(srcSlice.ivs[i]);
436  if (!loopIV)
437  continue;
438  auto forOp = getForInductionVarOwner(loopIV);
439  sliceLoops.push_back(forOp);
440  if (AffineMap lbMap = srcSlice.lbs[i]) {
441  auto lbOperands = srcSlice.lbOperands[i];
442  canonicalizeMapAndOperands(&lbMap, &lbOperands);
443  forOp.setLowerBound(lbOperands, lbMap);
444  }
445  if (AffineMap ubMap = srcSlice.ubs[i]) {
446  auto ubOperands = srcSlice.ubOperands[i];
447  canonicalizeMapAndOperands(&ubMap, &ubOperands);
448  forOp.setUpperBound(ubOperands, ubMap);
449  }
450  }
451 
452  llvm::SmallDenseMap<Operation *, uint64_t, 8> sliceTripCountMap;
453  auto srcIsUnitSlice = [&]() {
454  return (buildSliceTripCountMap(srcSlice, &sliceTripCountMap) &&
455  (getSliceIterationCount(sliceTripCountMap) == 1));
456  };
457  // Fix up and if possible, eliminate single iteration loops.
458  for (AffineForOp forOp : sliceLoops) {
460  isInnermostSiblingInsertion && srcIsUnitSlice())
461  // Patch reduction loop - only ones that are sibling-fused with the
462  // destination loop - into the parent loop.
463  (void)promoteSingleIterReductionLoop(forOp, true);
464  else
465  // Promote any single iteration slice loops.
466  (void)promoteIfSingleIteration(forOp);
467  }
468 }
469 
470 /// Collect loop nest statistics (eg. loop trip count and operation count)
471 /// in 'stats' for loop nest rooted at 'forOp'. Returns true on success,
472 /// returns false otherwise.
473 bool mlir::affine::getLoopNestStats(AffineForOp forOpRoot,
474  LoopNestStats *stats) {
475  auto walkResult = forOpRoot.walk([&](AffineForOp forOp) {
476  auto *childForOp = forOp.getOperation();
477  auto *parentForOp = forOp->getParentOp();
478  if (forOp != forOpRoot) {
479  if (!isa<AffineForOp>(parentForOp)) {
480  LLVM_DEBUG(llvm::dbgs() << "Expected parent AffineForOp\n");
481  return WalkResult::interrupt();
482  }
483  // Add mapping to 'forOp' from its parent AffineForOp.
484  stats->loopMap[parentForOp].push_back(forOp);
485  }
486 
487  // Record the number of op operations in the body of 'forOp'.
488  unsigned count = 0;
489  stats->opCountMap[childForOp] = 0;
490  for (auto &op : *forOp.getBody()) {
491  if (!isa<AffineForOp, AffineIfOp>(op))
492  ++count;
493  }
494  stats->opCountMap[childForOp] = count;
495 
496  // Record trip count for 'forOp'. Set flag if trip count is not
497  // constant.
498  std::optional<uint64_t> maybeConstTripCount = getConstantTripCount(forOp);
499  if (!maybeConstTripCount) {
500  // Currently only constant trip count loop nests are supported.
501  LLVM_DEBUG(llvm::dbgs() << "Non-constant trip count unsupported\n");
502  return WalkResult::interrupt();
503  }
504 
505  stats->tripCountMap[childForOp] = *maybeConstTripCount;
506  return WalkResult::advance();
507  });
508  return !walkResult.wasInterrupted();
509 }
510 
511 // Computes the total cost of the loop nest rooted at 'forOp'.
512 // Currently, the total cost is computed by counting the total operation
513 // instance count (i.e. total number of operations in the loop bodyloop
514 // operation count * loop trip count) for the entire loop nest.
515 // If 'tripCountOverrideMap' is non-null, overrides the trip count for loops
516 // specified in the map when computing the total op instance count.
517 // NOTEs: 1) This is used to compute the cost of computation slices, which are
518 // sliced along the iteration dimension, and thus reduce the trip count.
519 // If 'computeCostMap' is non-null, the total op count for forOps specified
520 // in the map is increased (not overridden) by adding the op count from the
521 // map to the existing op count for the for loop. This is done before
522 // multiplying by the loop's trip count, and is used to model the cost of
523 // inserting a sliced loop nest of known cost into the loop's body.
524 // 2) This is also used to compute the cost of fusing a slice of some loop nest
525 // within another loop.
526 static int64_t getComputeCostHelper(
527  Operation *forOp, LoopNestStats &stats,
528  llvm::SmallDenseMap<Operation *, uint64_t, 8> *tripCountOverrideMap,
529  DenseMap<Operation *, int64_t> *computeCostMap) {
530  // 'opCount' is the total number operations in one iteration of 'forOp' body,
531  // minus terminator op which is a no-op.
532  int64_t opCount = stats.opCountMap[forOp] - 1;
533  if (stats.loopMap.count(forOp) > 0) {
534  for (auto childForOp : stats.loopMap[forOp]) {
535  opCount += getComputeCostHelper(childForOp, stats, tripCountOverrideMap,
536  computeCostMap);
537  }
538  }
539  // Add in additional op instances from slice (if specified in map).
540  if (computeCostMap) {
541  auto it = computeCostMap->find(forOp);
542  if (it != computeCostMap->end()) {
543  opCount += it->second;
544  }
545  }
546  // Override trip count (if specified in map).
547  int64_t tripCount = stats.tripCountMap[forOp];
548  if (tripCountOverrideMap) {
549  auto it = tripCountOverrideMap->find(forOp);
550  if (it != tripCountOverrideMap->end()) {
551  tripCount = it->second;
552  }
553  }
554  // Returns the total number of dynamic instances of operations in loop body.
555  return tripCount * opCount;
556 }
557 
558 /// Computes the total cost of the loop nest rooted at 'forOp' using 'stats'.
559 /// Currently, the total cost is computed by counting the total operation
560 /// instance count (i.e. total number of operations in the loop body * loop
561 /// trip count) for the entire loop nest.
562 int64_t mlir::affine::getComputeCost(AffineForOp forOp, LoopNestStats &stats) {
563  return getComputeCostHelper(forOp, stats,
564  /*tripCountOverrideMap=*/nullptr,
565  /*computeCostMap=*/nullptr);
566 }
567 
568 /// Computes and returns in 'computeCost', the total compute cost of fusing the
569 /// 'slice' of the loop nest rooted at 'srcForOp' into 'dstForOp'. Currently,
570 /// the total cost is computed by counting the total operation instance count
571 /// (i.e. total number of operations in the loop body * loop trip count) for
572 /// the entire loop nest.
573 bool mlir::affine::getFusionComputeCost(AffineForOp srcForOp,
574  LoopNestStats &srcStats,
575  AffineForOp dstForOp,
576  LoopNestStats &dstStats,
577  const ComputationSliceState &slice,
578  int64_t *computeCost) {
579  llvm::SmallDenseMap<Operation *, uint64_t, 8> sliceTripCountMap;
580  DenseMap<Operation *, int64_t> computeCostMap;
581 
582  // Build trip count map for computation slice.
583  if (!buildSliceTripCountMap(slice, &sliceTripCountMap))
584  return false;
585  // Checks whether a store to load forwarding will happen.
586  int64_t sliceIterationCount = getSliceIterationCount(sliceTripCountMap);
587  assert(sliceIterationCount > 0);
588  bool storeLoadFwdGuaranteed = (sliceIterationCount == 1);
589  auto *insertPointParent = slice.insertPoint->getParentOp();
590 
591  // The store and loads to this memref will disappear.
592  if (storeLoadFwdGuaranteed) {
593  // Subtract from operation count the loads/store we expect load/store
594  // forwarding to remove.
595  unsigned storeCount = 0;
596  llvm::SmallDenseSet<Value, 4> storeMemrefs;
597  srcForOp.walk([&](AffineWriteOpInterface storeOp) {
598  storeMemrefs.insert(storeOp.getMemRef());
599  ++storeCount;
600  });
601  // Subtract out any store ops in single-iteration src slice loop nest.
602  if (storeCount > 0)
603  computeCostMap[insertPointParent] = -storeCount;
604  // Subtract out any load users of 'storeMemrefs' nested below
605  // 'insertPointParent'.
606  for (Value memref : storeMemrefs) {
607  for (Operation *user : memref.getUsers()) {
608  if (!isa<AffineReadOpInterface>(user))
609  continue;
611  // Check if any loop in loop nest surrounding 'user' is
612  // 'insertPointParent'.
613  getAffineForIVs(*user, &loops);
614  if (llvm::is_contained(loops, cast<AffineForOp>(insertPointParent))) {
615  if (auto forOp = dyn_cast_or_null<AffineForOp>(user->getParentOp())) {
616  if (computeCostMap.count(forOp) == 0)
617  computeCostMap[forOp] = 0;
618  computeCostMap[forOp] -= 1;
619  }
620  }
621  }
622  }
623  }
624 
625  // Compute op instance count for the src loop nest with iteration slicing.
626  int64_t sliceComputeCost = getComputeCostHelper(
627  srcForOp, srcStats, &sliceTripCountMap, &computeCostMap);
628 
629  // Compute cost of fusion for this depth.
630  computeCostMap[insertPointParent] = sliceComputeCost;
631 
632  *computeCost =
633  getComputeCostHelper(dstForOp, dstStats,
634  /*tripCountOverrideMap=*/nullptr, &computeCostMap);
635  return true;
636 }
637 
638 /// Returns in 'producerConsumerMemrefs' the memrefs involved in a
639 /// producer-consumer dependence between write ops in 'srcOps' and read ops in
640 /// 'dstOps'.
643  DenseSet<Value> &producerConsumerMemrefs) {
644  // Gather memrefs from stores in 'srcOps'.
645  DenseSet<Value> srcStoreMemRefs;
646  for (Operation *op : srcOps)
647  if (auto storeOp = dyn_cast<AffineWriteOpInterface>(op))
648  srcStoreMemRefs.insert(storeOp.getMemRef());
649 
650  // Compute the intersection between memrefs from stores in 'srcOps' and
651  // memrefs from loads in 'dstOps'.
652  for (Operation *op : dstOps)
653  if (auto loadOp = dyn_cast<AffineReadOpInterface>(op))
654  if (srcStoreMemRefs.count(loadOp.getMemRef()) > 0)
655  producerConsumerMemrefs.insert(loadOp.getMemRef());
656 }
static void getLoadAndStoreMemRefAccesses(Operation *opA, DenseMap< Value, bool > &values)
static bool gatherLoadsAndStores(AffineForOp forOp, SmallVectorImpl< Operation * > &loadAndStoreOps)
static Operation * getLastDependentOpInRange(Operation *opA, Operation *opB)
static int64_t getComputeCostHelper(Operation *forOp, LoopNestStats &stats, llvm::SmallDenseMap< Operation *, uint64_t, 8 > *tripCountOverrideMap, DenseMap< Operation *, int64_t > *computeCostMap)
static Operation * getFirstDependentOpInRange(Operation *opA, Operation *opB)
static bool isDependentLoadOrStoreOp(Operation *op, DenseMap< Value, bool > &values)
Returns true if 'op' is a load or store operation which access a memref accessed 'values' and at leas...
static LogicalResult promoteSingleIterReductionLoop(AffineForOp forOp, bool siblingFusionUser)
Patch the loop body of a forOp that is a single iteration reduction loop into its containing block.
static Operation * getFusedLoopNestInsertionPoint(AffineForOp srcForOp, AffineForOp dstForOp)
static unsigned getMaxLoopDepth(ArrayRef< Operation * > srcOps, ArrayRef< Operation * > dstOps)
Returns the maximum loop depth at which we could fuse producer loop 'srcForOp' into consumer loop 'ds...
static Value min(ImplicitLocOpBuilder &builder, Value value, Value bound)
A multi-dimensional affine map Affine map's are immutable like Type's, and they are uniqued.
Definition: AffineMap.h:46
OpListType::iterator iterator
Definition: Block.h:140
OpListType::reverse_iterator reverse_iterator
Definition: Block.h:141
This is a utility class for mapping one set of IR entities to another.
Definition: IRMapping.h:26
auto lookupOrNull(T from) const
Lookup a mapped value within the map.
Definition: IRMapping.h:58
This class coordinates rewriting a piece of IR outside of a pattern rewrite, providing a way to keep ...
Definition: PatternMatch.h:784
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Definition: Location.h:66
This class helps build Operations.
Definition: Builders.h:205
Operation * clone(Operation &op, IRMapping &mapper)
Creates a deep copy of the specified operation, remapping any operands that use values outside of the...
Definition: Builders.cpp:544
Operation is the basic unit of execution within MLIR.
Definition: Operation.h:88
bool isBeforeInBlock(Operation *other)
Given an operation 'other' that is within the same parent block, return whether the current operation...
Definition: Operation.cpp:386
std::enable_if_t< llvm::function_traits< std::decay_t< FnT > >::num_args==1, RetT > walk(FnT &&callback)
Walk the operation by calling the callback for each nested operation (including this one),...
Definition: Operation.h:798
user_range getUsers()
Returns a range of all users.
Definition: Operation.h:874
result_range getResults()
Definition: Operation.h:415
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition: Value.h:96
static WalkResult advance()
Definition: Visitors.h:51
static WalkResult interrupt()
Definition: Visitors.h:50
Describes the fusion strategy to be used in the Affine loop fusion utilities.
StrategyEnum getStrategy() const
Returns the fusion strategy.
Value getSiblingFusionMemRef() const
Returns the memref attached to this sibling fusion strategy.
std::optional< uint64_t > getConstantTripCount(AffineForOp forOp)
Returns the trip count of the loop if it's a constant, std::nullopt otherwise.
LogicalResult promoteIfSingleIteration(AffineForOp forOp)
Promotes the loop body of a AffineForOp to its containing block if the loop was known to have a singl...
Definition: LoopUtils.cpp:118
bool getFusionComputeCost(AffineForOp srcForOp, LoopNestStats &srcStats, AffineForOp dstForOp, LoopNestStats &dstStats, const ComputationSliceState &slice, int64_t *computeCost)
Computes and returns in 'computeCost', the total compute cost of fusing the 'slice' of the loop nest ...
SliceComputationResult computeSliceUnion(ArrayRef< Operation * > opsA, ArrayRef< Operation * > opsB, unsigned loopDepth, unsigned numCommonLoops, bool isBackwardSlice, ComputationSliceState *sliceUnion)
Computes in 'sliceUnion' the union of all slice bounds computed at 'loopDepth' between all dependent ...
Definition: Utils.cpp:1553
bool isLoopParallelAndContainsReduction(AffineForOp forOp)
Returns whether a loop is a parallel loop and contains a reduction loop.
Definition: Utils.cpp:2101
unsigned getNumCommonSurroundingLoops(Operation &a, Operation &b)
Returns the number of surrounding loops common to both A and B.
Definition: Utils.cpp:2030
void gatherProducerConsumerMemrefs(ArrayRef< Operation * > srcOps, ArrayRef< Operation * > dstOps, DenseSet< Value > &producerConsumerMemrefs)
Returns in 'producerConsumerMemrefs' the memrefs involved in a producer-consumer dependence between w...
AffineForOp getForInductionVarOwner(Value val)
Returns the loop parent of an induction variable.
Definition: AffineOps.cpp:2587
DependenceResult checkMemrefAccessDependence(const MemRefAccess &srcAccess, const MemRefAccess &dstAccess, unsigned loopDepth, FlatAffineValueConstraints *dependenceConstraints=nullptr, SmallVector< DependenceComponent, 2 > *dependenceComponents=nullptr, bool allowRAR=false)
void canonicalizeMapAndOperands(AffineMap *map, SmallVectorImpl< Value > *operands)
Modifies both map and operands in-place so as to:
Definition: AffineOps.cpp:1449
int64_t getComputeCost(AffineForOp forOp, LoopNestStats &stats)
Computes the total cost of the loop nest rooted at 'forOp' using 'stats'.
void fuseLoops(AffineForOp srcForOp, AffineForOp dstForOp, const ComputationSliceState &srcSlice, bool isInnermostSiblingInsertionFusion=false)
Fuses 'srcForOp' into 'dstForOp' with destination loop block insertion point and source slice loop bo...
void getAffineForIVs(Operation &op, SmallVectorImpl< AffineForOp > *loops)
Populates 'loops' with IVs of the affine.for ops surrounding 'op' ordered from the outermost 'affine....
Definition: Utils.cpp:762
unsigned getInnermostCommonLoopDepth(ArrayRef< Operation * > ops, SmallVectorImpl< AffineForOp > *surroundingLoops=nullptr)
Returns the innermost common loop depth for the set of operations in 'ops'.
Definition: Utils.cpp:1521
bool getLoopNestStats(AffineForOp forOp, LoopNestStats *stats)
Collect loop nest statistics (eg.
bool hasDependence(DependenceResult result)
Utility function that returns true if the provided DependenceResult corresponds to a dependence resul...
uint64_t getSliceIterationCount(const llvm::SmallDenseMap< Operation *, uint64_t, 8 > &sliceTripCountMap)
Return the number of iterations for the slicetripCountMap provided.
Definition: Utils.cpp:1778
FusionResult canFuseLoops(AffineForOp srcForOp, AffineForOp dstForOp, unsigned dstLoopDepth, ComputationSliceState *srcSlice, FusionStrategy fusionStrategy=FusionStrategy::Generic)
Checks the feasibility of fusing the loop nest rooted at 'srcForOp' into the loop nest rooted at 'dst...
bool buildSliceTripCountMap(const ComputationSliceState &slice, llvm::SmallDenseMap< Operation *, uint64_t, 8 > *tripCountMap)
Builds a map 'tripCountMap' from AffineForOp to constant trip count for loop nest surrounding represe...
Definition: Utils.cpp:1740
Include the generated interface declarations.
SetVector< Operation * > topologicalSort(const SetVector< Operation * > &toSort)
Sorts all operations in toSort topologically while also considering region semantics.
void getForwardSlice(Operation *op, SetVector< Operation * > *forwardSlice, const ForwardSliceOptions &options={})
Fills forwardSlice with the computed forward slice (i.e.
ComputationSliceState aggregates loop IVs, loop bound AffineMaps and their associated operands for a ...
Definition: Utils.h:310
SmallVector< Value, 4 > ivs
Definition: Utils.h:313
std::vector< SmallVector< Value, 4 > > ubOperands
Definition: Utils.h:321
SmallVector< AffineMap, 4 > ubs
Definition: Utils.h:317
SmallVector< AffineMap, 4 > lbs
Definition: Utils.h:315
std::vector< SmallVector< Value, 4 > > lbOperands
Definition: Utils.h:319
Checks whether two accesses to the same memref access the same element.
LoopNestStats aggregates various per-loop statistics (eg.
DenseMap< Operation *, uint64_t > opCountMap
Map from AffineForOp to count of operations in its loop body.
DenseMap< Operation *, SmallVector< AffineForOp, 2 > > loopMap
Map from AffineForOp to immediate child AffineForOps in its loop body.
DenseMap< Operation *, uint64_t > tripCountMap
Map from AffineForOp to its constant trip count.
Encapsulates a memref load or store access information.
Enumerates different result statuses of slice computation by computeSliceUnion
Definition: Utils.h:297
enum mlir::affine::SliceComputationResult::ResultEnum value
Eliminates variable at the specified position using Fourier-Motzkin variable elimination.