MLIR  22.0.0git
LoopPipelining.cpp
Go to the documentation of this file.
1 //===- LoopPipelining.cpp - Code to perform loop software pipelining-------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements loop software pipelining
10 //
11 //===----------------------------------------------------------------------===//
12 
18 #include "mlir/IR/IRMapping.h"
19 #include "mlir/IR/PatternMatch.h"
21 #include "llvm/ADT/MapVector.h"
22 #include "llvm/Support/Debug.h"
23 #include "llvm/Support/MathExtras.h"
24 
25 #define DEBUG_TYPE "scf-loop-pipelining"
26 #define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
27 #define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")
28 
29 using namespace mlir;
30 using namespace mlir::scf;
31 
32 namespace {
33 
34 /// Helper to keep internal information during pipelining transformation.
35 struct LoopPipelinerInternal {
36  /// Coarse liverange information for ops used across stages.
37  struct LiverangeInfo {
38  unsigned lastUseStage = 0;
39  unsigned defStage = 0;
40  };
41 
42 protected:
43  ForOp forOp;
44  unsigned maxStage = 0;
46  std::vector<Operation *> opOrder;
47  Value ub;
48  Value lb;
49  Value step;
50  bool dynamicLoop;
51  PipeliningOption::AnnotationlFnType annotateFn = nullptr;
52  bool peelEpilogue;
53  PipeliningOption::PredicateOpFn predicateFn = nullptr;
54 
55  // When peeling the kernel we generate several version of each value for
56  // different stage of the prologue. This map tracks the mapping between
57  // original Values in the loop and the different versions
58  // peeled from the loop.
60 
61  /// Assign a value to `valueMapping`, this means `val` represents the version
62  /// `idx` of `key` in the epilogue.
63  void setValueMapping(Value key, Value el, int64_t idx);
64 
65  /// Return the defining op of the given value, if the Value is an argument of
66  /// the loop return the associated defining op in the loop and its distance to
67  /// the Value.
68  std::pair<Operation *, int64_t> getDefiningOpAndDistance(Value value);
69 
70  /// Return true if the schedule is possible and return false otherwise. A
71  /// schedule is correct if all definitions are scheduled before uses.
72  bool verifySchedule();
73 
74 public:
75  /// Initalize the information for the given `op`, return true if it
76  /// satisfies the pre-condition to apply pipelining.
77  bool initializeLoopInfo(ForOp op, const PipeliningOption &options);
78  /// Emits the prologue, this creates `maxStage - 1` part which will contain
79  /// operations from stages [0; i], where i is the part index.
80  LogicalResult emitPrologue(RewriterBase &rewriter);
81  /// Gather liverange information for Values that are used in a different stage
82  /// than its definition.
83  llvm::MapVector<Value, LiverangeInfo> analyzeCrossStageValues();
84  scf::ForOp createKernelLoop(
85  const llvm::MapVector<Value, LiverangeInfo> &crossStageValues,
86  RewriterBase &rewriter,
87  llvm::DenseMap<std::pair<Value, unsigned>, unsigned> &loopArgMap);
88  /// Emits the pipelined kernel. This clones loop operations following user
89  /// order and remaps operands defined in a different stage as their use.
90  LogicalResult createKernel(
91  scf::ForOp newForOp,
92  const llvm::MapVector<Value, LiverangeInfo> &crossStageValues,
93  const llvm::DenseMap<std::pair<Value, unsigned>, unsigned> &loopArgMap,
94  RewriterBase &rewriter);
95  /// Emits the epilogue, this creates `maxStage - 1` part which will contain
96  /// operations from stages [i; maxStage], where i is the part index.
97  LogicalResult emitEpilogue(RewriterBase &rewriter,
98  llvm::SmallVector<Value> &returnValues);
99 };
100 
101 bool LoopPipelinerInternal::initializeLoopInfo(
102  ForOp op, const PipeliningOption &options) {
103  LDBG("Start initializeLoopInfo");
104  forOp = op;
105  ub = forOp.getUpperBound();
106  lb = forOp.getLowerBound();
107  step = forOp.getStep();
108 
109  std::vector<std::pair<Operation *, unsigned>> schedule;
110  options.getScheduleFn(forOp, schedule);
111  if (schedule.empty()) {
112  LDBG("--empty schedule -> BAIL");
113  return false;
114  }
115 
116  opOrder.reserve(schedule.size());
117  for (auto &opSchedule : schedule) {
118  maxStage = std::max(maxStage, opSchedule.second);
119  stages[opSchedule.first] = opSchedule.second;
120  opOrder.push_back(opSchedule.first);
121  }
122 
123  dynamicLoop = true;
124  auto upperBoundCst = getConstantIntValue(ub);
125  auto lowerBoundCst = getConstantIntValue(lb);
126  auto stepCst = getConstantIntValue(step);
127  if (!upperBoundCst || !lowerBoundCst || !stepCst) {
128  if (!options.supportDynamicLoops) {
129  LDBG("--dynamic loop not supported -> BAIL");
130  return false;
131  }
132  } else {
133  int64_t ubImm = upperBoundCst.value();
134  int64_t lbImm = lowerBoundCst.value();
135  int64_t stepImm = stepCst.value();
136  if (stepImm <= 0) {
137  LDBG("--invalid loop step -> BAIL");
138  return false;
139  }
140  int64_t numIteration = llvm::divideCeilSigned(ubImm - lbImm, stepImm);
141  if (numIteration >= maxStage) {
142  dynamicLoop = false;
143  } else if (!options.supportDynamicLoops) {
144  LDBG("--fewer loop iterations than pipeline stages -> BAIL");
145  return false;
146  }
147  }
148  peelEpilogue = options.peelEpilogue;
149  predicateFn = options.predicateFn;
150  if ((!peelEpilogue || dynamicLoop) && predicateFn == nullptr) {
151  LDBG("--no epilogue or predicate set -> BAIL");
152  return false;
153  }
154 
155  // All operations need to have a stage.
156  for (Operation &op : forOp.getBody()->without_terminator()) {
157  if (!stages.contains(&op)) {
158  op.emitOpError("not assigned a pipeline stage");
159  LDBG("--op not assigned a pipeline stage: " << op << " -> BAIL");
160  return false;
161  }
162  }
163 
164  if (!verifySchedule()) {
165  LDBG("--invalid schedule: " << op << " -> BAIL");
166  return false;
167  }
168 
169  // Currently, we do not support assigning stages to ops in nested regions. The
170  // block of all operations assigned a stage should be the single `scf.for`
171  // body block.
172  for (const auto &[op, stageNum] : stages) {
173  (void)stageNum;
174  if (op == forOp.getBody()->getTerminator()) {
175  op->emitError("terminator should not be assigned a stage");
176  LDBG("--terminator should not be assigned stage: " << *op << " -> BAIL");
177  return false;
178  }
179  if (op->getBlock() != forOp.getBody()) {
180  op->emitOpError("the owning Block of all operations assigned a stage "
181  "should be the loop body block");
182  LDBG("--the owning Block of all operations assigned a stage "
183  "should be the loop body block: "
184  << *op << " -> BAIL");
185  return false;
186  }
187  }
188 
189  // Support only loop-carried dependencies with a distance of one iteration or
190  // those defined outside of the loop. This means that any dependency within a
191  // loop should either be on the immediately preceding iteration, the current
192  // iteration, or on variables whose values are set before entering the loop.
193  if (llvm::any_of(forOp.getBody()->getTerminator()->getOperands(),
194  [this](Value operand) {
195  Operation *def = operand.getDefiningOp();
196  return !def ||
197  (!stages.contains(def) && forOp->isAncestor(def));
198  })) {
199  LDBG("--only support loop carried dependency with a distance of 1 or "
200  "defined outside of the loop -> BAIL");
201  return false;
202  }
203  annotateFn = options.annotateFn;
204  return true;
205 }
206 
207 /// Find operands of all the nested operations within `op`.
208 static SetVector<Value> getNestedOperands(Operation *op) {
209  SetVector<Value> operands;
210  op->walk([&](Operation *nestedOp) {
211  operands.insert_range(nestedOp->getOperands());
212  });
213  return operands;
214 }
215 
216 /// Compute unrolled cycles of each op (consumer) and verify that each op is
217 /// scheduled after its operands (producers) while adjusting for the distance
218 /// between producer and consumer.
219 bool LoopPipelinerInternal::verifySchedule() {
220  int64_t numCylesPerIter = opOrder.size();
221  // Pre-compute the unrolled cycle of each op.
222  DenseMap<Operation *, int64_t> unrolledCyles;
223  for (int64_t cycle = 0; cycle < numCylesPerIter; cycle++) {
224  Operation *def = opOrder[cycle];
225  auto it = stages.find(def);
226  assert(it != stages.end());
227  int64_t stage = it->second;
228  unrolledCyles[def] = cycle + stage * numCylesPerIter;
229  }
230  for (Operation *consumer : opOrder) {
231  int64_t consumerCycle = unrolledCyles[consumer];
232  for (Value operand : getNestedOperands(consumer)) {
233  auto [producer, distance] = getDefiningOpAndDistance(operand);
234  if (!producer)
235  continue;
236  auto it = unrolledCyles.find(producer);
237  // Skip producer coming from outside the loop.
238  if (it == unrolledCyles.end())
239  continue;
240  int64_t producerCycle = it->second;
241  if (consumerCycle < producerCycle - numCylesPerIter * distance) {
242  consumer->emitError("operation scheduled before its operands");
243  return false;
244  }
245  }
246  }
247  return true;
248 }
249 
250 /// Clone `op` and call `callback` on the cloned op's oeprands as well as any
251 /// operands of nested ops that:
252 /// 1) aren't defined within the new op or
253 /// 2) are block arguments.
254 static Operation *
255 cloneAndUpdateOperands(RewriterBase &rewriter, Operation *op,
256  function_ref<void(OpOperand *newOperand)> callback) {
257  Operation *clone = rewriter.clone(*op);
258  clone->walk<WalkOrder::PreOrder>([&](Operation *nested) {
259  // 'clone' itself will be visited first.
260  for (OpOperand &operand : nested->getOpOperands()) {
261  Operation *def = operand.get().getDefiningOp();
262  if ((def && !clone->isAncestor(def)) || isa<BlockArgument>(operand.get()))
263  callback(&operand);
264  }
265  });
266  return clone;
267 }
268 
/// Emit the prologue at the current insertion point of `rewriter`.
///
/// For each part `i` in [0, maxStage), the ops whose stage is <= `i` are
/// cloned in `opOrder`; their operands found in `valueMapping` are replaced
/// by version `i - stage`. The results of each clone, and the loop-carried
/// arguments they feed, are recorded in `valueMapping` for later parts.
/// For dynamic loops each clone is also passed to `predicateFn`.
///
/// Returns failure if `predicateFn` returns a null operation.
269 LogicalResult LoopPipelinerInternal::emitPrologue(RewriterBase &rewriter) {
270  // Initialize the iteration argument to the loop initial values.
271  for (auto [arg, operand] :
272  llvm::zip(forOp.getRegionIterArgs(), forOp.getInitsMutable())) {
273  setValueMapping(arg, operand.get(), 0);
274  }
275  auto yield = cast<scf::YieldOp>(forOp.getBody()->getTerminator());
276  Location loc = forOp.getLoc();
 // One predicate per prologue part; entries stay null unless the loop is
 // dynamic.
277  SmallVector<Value> predicates(maxStage);
278  for (int64_t i = 0; i < maxStage; i++) {
279  if (dynamicLoop) {
280  Type t = ub.getType();
281  // pred = ub > lb + (i * step)
282  Value iv = rewriter.create<arith::AddIOp>(
283  loc, lb,
284  rewriter.create<arith::MulIOp>(
285  loc, step,
286  rewriter.create<arith::ConstantOp>(
287  loc, rewriter.getIntegerAttr(t, i))));
288  predicates[i] = rewriter.create<arith::CmpIOp>(
289  loc, arith::CmpIPredicate::slt, iv, ub);
290  }
291 
292  // special handling for induction variable as the increment is implicit.
293  // iv = lb + i * step
294  Type t = lb.getType();
295  Value iv = rewriter.create<arith::AddIOp>(
296  loc, lb,
297  rewriter.create<arith::MulIOp>(
298  loc, step,
299  rewriter.create<arith::ConstantOp>(loc,
300  rewriter.getIntegerAttr(t, i))));
301  setValueMapping(forOp.getInductionVar(), iv, i);
302  for (Operation *op : opOrder) {
 // Ops of later stages are not part of prologue part `i`.
303  if (stages[op] > i)
304  continue;
305  Operation *newOp =
306  cloneAndUpdateOperands(rewriter, op, [&](OpOperand *newOperand) {
307  auto it = valueMapping.find(newOperand->get());
308  if (it != valueMapping.end()) {
309  Value replacement = it->second[i - stages[op]];
310  newOperand->set(replacement);
311  }
312  });
 // The predicate (and value version) used for the op is selected by
 // `i - stages[op]`.
313  int predicateIdx = i - stages[op];
314  if (predicates[predicateIdx]) {
315  OpBuilder::InsertionGuard insertGuard(rewriter);
316  newOp = predicateFn(rewriter, newOp, predicates[predicateIdx]);
317  if (newOp == nullptr)
318  return failure();
319  }
320  if (annotateFn)
321  annotateFn(newOp, PipeliningOption::PipelinerPart::Prologue, i);
322  for (unsigned destId : llvm::seq(unsigned(0), op->getNumResults())) {
323  Value source = newOp->getResult(destId);
324  // If the value is a loop carried dependency update the loop argument
325  for (OpOperand &operand : yield->getOpOperands()) {
326  if (operand.get() != op->getResult(destId))
327  continue;
328  if (predicates[predicateIdx] &&
329  !forOp.getResult(operand.getOperandNumber()).use_empty()) {
330  // If the value is used outside the loop, we need to make sure we
331  // return the correct version of it.
332  Value prevValue = valueMapping
333  [forOp.getRegionIterArgs()[operand.getOperandNumber()]]
334  [i - stages[op]];
335  source = rewriter.create<arith::SelectOp>(
336  loc, predicates[predicateIdx], source, prevValue);
337  }
 // The yielded value becomes the next version of the matching
 // iteration argument.
338  setValueMapping(forOp.getRegionIterArgs()[operand.getOperandNumber()],
339  source, i - stages[op] + 1);
340  }
341  setValueMapping(op->getResult(destId), newOp->getResult(destId),
342  i - stages[op]);
343  }
344  }
345  }
346  return success();
347 }
348 
349 llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo>
350 LoopPipelinerInternal::analyzeCrossStageValues() {
351  llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo> crossStageValues;
352  for (Operation *op : opOrder) {
353  unsigned stage = stages[op];
354 
355  auto analyzeOperand = [&](OpOperand &operand) {
356  auto [def, distance] = getDefiningOpAndDistance(operand.get());
357  if (!def)
358  return;
359  auto defStage = stages.find(def);
360  if (defStage == stages.end() || defStage->second == stage ||
361  defStage->second == stage + distance)
362  return;
363  assert(stage > defStage->second);
364  LiverangeInfo &info = crossStageValues[operand.get()];
365  info.defStage = defStage->second;
366  info.lastUseStage = std::max(info.lastUseStage, stage);
367  };
368 
369  for (OpOperand &operand : op->getOpOperands())
370  analyzeOperand(operand);
371  visitUsedValuesDefinedAbove(op->getRegions(), [&](OpOperand *operand) {
372  analyzeOperand(*operand);
373  });
374  }
375  return crossStageValues;
376 }
377 
378 std::pair<Operation *, int64_t>
379 LoopPipelinerInternal::getDefiningOpAndDistance(Value value) {
380  int64_t distance = 0;
381  if (auto arg = dyn_cast<BlockArgument>(value)) {
382  if (arg.getOwner() != forOp.getBody())
383  return {nullptr, 0};
384  // Ignore induction variable.
385  if (arg.getArgNumber() == 0)
386  return {nullptr, 0};
387  distance++;
388  value =
389  forOp.getBody()->getTerminator()->getOperand(arg.getArgNumber() - 1);
390  }
391  Operation *def = value.getDefiningOp();
392  if (!def)
393  return {nullptr, 0};
394  return {def, distance};
395 }
396 
397 scf::ForOp LoopPipelinerInternal::createKernelLoop(
398  const llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo>
399  &crossStageValues,
400  RewriterBase &rewriter,
401  llvm::DenseMap<std::pair<Value, unsigned>, unsigned> &loopArgMap) {
402  // Creates the list of initial values associated to values used across
403  // stages. The initial values come from the prologue created above.
404  // Keep track of the kernel argument associated to each version of the
405  // values passed to the kernel.
406  llvm::SmallVector<Value> newLoopArg;
407  // For existing loop argument initialize them with the right version from the
408  // prologue.
409  for (const auto &retVal :
410  llvm::enumerate(forOp.getBody()->getTerminator()->getOperands())) {
411  Operation *def = retVal.value().getDefiningOp();
412  assert(def && "Only support loop carried dependencies of distance of 1 or "
413  "outside the loop");
414  auto defStage = stages.find(def);
415  if (defStage != stages.end()) {
416  Value valueVersion =
417  valueMapping[forOp.getRegionIterArgs()[retVal.index()]]
418  [maxStage - defStage->second];
419  assert(valueVersion);
420  newLoopArg.push_back(valueVersion);
421  } else {
422  newLoopArg.push_back(forOp.getInitArgs()[retVal.index()]);
423  }
424  }
425  for (auto escape : crossStageValues) {
426  LiverangeInfo &info = escape.second;
427  Value value = escape.first;
428  for (unsigned stageIdx = 0; stageIdx < info.lastUseStage - info.defStage;
429  stageIdx++) {
430  Value valueVersion =
431  valueMapping[value][maxStage - info.lastUseStage + stageIdx];
432  assert(valueVersion);
433  newLoopArg.push_back(valueVersion);
434  loopArgMap[std::make_pair(value, info.lastUseStage - info.defStage -
435  stageIdx)] = newLoopArg.size() - 1;
436  }
437  }
438 
439  // Create the new kernel loop. When we peel the epilgue we need to peel
440  // `numStages - 1` iterations. Then we adjust the upper bound to remove those
441  // iterations.
442  Value newUb = forOp.getUpperBound();
443  if (peelEpilogue) {
444  Type t = ub.getType();
445  Location loc = forOp.getLoc();
446  // newUb = ub - maxStage * step
447  Value maxStageValue = rewriter.create<arith::ConstantOp>(
448  loc, rewriter.getIntegerAttr(t, maxStage));
449  Value maxStageByStep =
450  rewriter.create<arith::MulIOp>(loc, step, maxStageValue);
451  newUb = rewriter.create<arith::SubIOp>(loc, ub, maxStageByStep);
452  }
453  auto newForOp =
454  rewriter.create<scf::ForOp>(forOp.getLoc(), forOp.getLowerBound(), newUb,
455  forOp.getStep(), newLoopArg);
456  // When there are no iter args, the loop body terminator will be created.
457  // Since we always create it below, remove the terminator if it was created.
458  if (!newForOp.getBody()->empty())
459  rewriter.eraseOp(newForOp.getBody()->getTerminator());
460  return newForOp;
461 }
462 
/// Fill the body of `newForOp` with the pipelined kernel.
///
/// The scheduled ops are cloned in `opOrder`. Uses of the induction variable
/// are replaced by `iv + (maxStage - stage) * step`; operands defined in a
/// different stage are replaced by the region argument recorded in
/// `loopArgMap`. When the epilogue is not peeled, ops of every stage but the
/// last are passed to `predicateFn`. The yield returns the original
/// loop-carried values followed by the versions of the cross-stage values.
///
/// `valueMapping` is cleared on entry and refilled with results of
/// `newForOp`.
///
/// Returns failure if `predicateFn` returns a null operation.
463 LogicalResult LoopPipelinerInternal::createKernel(
464  scf::ForOp newForOp,
465  const llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo>
466  &crossStageValues,
467  const llvm::DenseMap<std::pair<Value, unsigned>, unsigned> &loopArgMap,
468  RewriterBase &rewriter) {
469  valueMapping.clear();
470 
471  // Create the kernel, we clone instruction based on the order given by
472  // user and remap operands coming from a previous stages.
473  rewriter.setInsertionPoint(newForOp.getBody(), newForOp.getBody()->begin());
474  IRMapping mapping;
475  mapping.map(forOp.getInductionVar(), newForOp.getInductionVar());
476  for (const auto &arg : llvm::enumerate(forOp.getRegionIterArgs())) {
477  mapping.map(arg.value(), newForOp.getRegionIterArgs()[arg.index()]);
478  }
 // `predicates[maxStage]` is never set, so ops of the last stage are not
 // predicated.
479  SmallVector<Value> predicates(maxStage + 1, nullptr);
480  if (!peelEpilogue) {
481  // Create a predicate for each stage except the last stage.
482  Location loc = newForOp.getLoc();
483  Type t = ub.getType();
484  for (unsigned i = 0; i < maxStage; i++) {
485  // c = ub - (maxStage - i) * step
486  Value c = rewriter.create<arith::SubIOp>(
487  loc, ub,
488  rewriter.create<arith::MulIOp>(
489  loc, step,
490  rewriter.create<arith::ConstantOp>(
491  loc, rewriter.getIntegerAttr(t, int64_t(maxStage - i)))));
492 
493  Value pred = rewriter.create<arith::CmpIOp>(
494  newForOp.getLoc(), arith::CmpIPredicate::slt,
495  newForOp.getInductionVar(), c);
496  predicates[i] = pred;
497  }
498  }
499  for (Operation *op : opOrder) {
500  int64_t useStage = stages[op];
501  auto *newOp = rewriter.clone(*op, mapping);
502  SmallVector<OpOperand *> operands;
503  // Collect all the operands for the cloned op and its nested ops.
504  op->walk([&operands](Operation *nestedOp) {
505  for (OpOperand &operand : nestedOp->getOpOperands()) {
506  operands.push_back(&operand);
507  }
508  });
509  for (OpOperand *operand : operands) {
 // `operand` belongs to the original op; `nestedNewOp` is the clone of
 // its owner.
510  Operation *nestedNewOp = mapping.lookup(operand->getOwner());
511  // Special case for the induction variable uses. We replace it with a
512  // version incremented based on the stage where it is used.
513  if (operand->get() == forOp.getInductionVar()) {
514  rewriter.setInsertionPoint(newOp);
515 
516  // offset = (maxStage - stages[op]) * step
517  Type t = step.getType();
518  Value offset = rewriter.create<arith::MulIOp>(
519  forOp.getLoc(), step,
520  rewriter.create<arith::ConstantOp>(
521  forOp.getLoc(),
522  rewriter.getIntegerAttr(t, maxStage - stages[op])));
523  Value iv = rewriter.create<arith::AddIOp>(
524  forOp.getLoc(), newForOp.getInductionVar(), offset);
525  nestedNewOp->setOperand(operand->getOperandNumber(), iv);
526  rewriter.setInsertionPointAfter(newOp);
527  continue;
528  }
529  Value source = operand->get();
530  auto arg = dyn_cast<BlockArgument>(source);
531  if (arg && arg.getOwner() == forOp.getBody()) {
 // Iteration argument: look at the value yielded for it.
532  Value ret = forOp.getBody()->getTerminator()->getOperand(
533  arg.getArgNumber() - 1);
534  Operation *dep = ret.getDefiningOp();
535  if (!dep)
536  continue;
537  auto stageDep = stages.find(dep);
538  if (stageDep == stages.end() || stageDep->second == useStage)
539  continue;
540  // If the value is a loop carried value coming from stage N + 1 remap,
541  // it will become a direct use.
542  if (stageDep->second == useStage + 1) {
543  nestedNewOp->setOperand(operand->getOperandNumber(),
544  mapping.lookupOrDefault(ret));
545  continue;
546  }
547  source = ret;
548  }
549  // For operands defined in a previous stage we need to remap it to use
550  // the correct region argument. We look for the right version of the
551  // Value based on the stage where it is used.
552  Operation *def = source.getDefiningOp();
553  if (!def)
554  continue;
555  auto stageDef = stages.find(def);
556  if (stageDef == stages.end() || stageDef->second == useStage)
557  continue;
558  auto remap = loopArgMap.find(
559  std::make_pair(operand->get(), useStage - stageDef->second));
560  assert(remap != loopArgMap.end());
561  nestedNewOp->setOperand(operand->getOperandNumber(),
562  newForOp.getRegionIterArgs()[remap->second]);
563  }
564 
565  if (predicates[useStage]) {
566  OpBuilder::InsertionGuard insertGuard(rewriter);
567  newOp = predicateFn(rewriter, newOp, predicates[useStage]);
568  if (!newOp)
569  return failure();
570  // Remap the results to the new predicated one.
571  for (auto values : llvm::zip(op->getResults(), newOp->getResults()))
572  mapping.map(std::get<0>(values), std::get<1>(values));
573  }
574  if (annotateFn)
575  annotateFn(newOp, PipeliningOption::PipelinerPart::Kernel, 0);
576  }
577 
578  // Collect the Values that need to be returned by the forOp. For each
579  // value we need to have `LastUseStage - DefStage` number of versions
580  // returned.
581  // We create a mapping between original values and the associated loop
582  // returned values that will be needed by the epilogue.
583  llvm::SmallVector<Value> yieldOperands;
584  for (OpOperand &yieldOperand :
585  forOp.getBody()->getTerminator()->getOpOperands()) {
586  Value source = mapping.lookupOrDefault(yieldOperand.get());
587  // When we don't peel the epilogue and the yield value is used outside the
588  // loop we need to make sure we return the version from numStages -
589  // defStage.
590  if (!peelEpilogue &&
591  !forOp.getResult(yieldOperand.getOperandNumber()).use_empty()) {
592  Operation *def = getDefiningOpAndDistance(yieldOperand.get()).first;
593  if (def) {
594  auto defStage = stages.find(def);
595  if (defStage != stages.end() && defStage->second < maxStage) {
 // Yield the new value only while the defining stage is
 // enabled; otherwise keep the current iteration argument.
596  Value pred = predicates[defStage->second];
597  source = rewriter.create<arith::SelectOp>(
598  pred.getLoc(), pred, source,
599  newForOp.getBody()
600  ->getArguments()[yieldOperand.getOperandNumber() + 1]);
601  }
602  }
603  }
604  yieldOperands.push_back(source);
605  }
606 
607  for (auto &it : crossStageValues) {
608  int64_t version = maxStage - it.second.lastUseStage + 1;
609  unsigned numVersionReturned = it.second.lastUseStage - it.second.defStage;
610  // add the original version to yield ops.
611  // If there is a live range spanning across more than 2 stages we need to
612  // add extra arg.
613  for (unsigned i = 1; i < numVersionReturned; i++) {
614  setValueMapping(it.first, newForOp->getResult(yieldOperands.size()),
615  version++);
616  yieldOperands.push_back(
617  newForOp.getBody()->getArguments()[yieldOperands.size() + 1 +
618  newForOp.getNumInductionVars()]);
619  }
620  setValueMapping(it.first, newForOp->getResult(yieldOperands.size()),
621  version++);
622  yieldOperands.push_back(mapping.lookupOrDefault(it.first));
623  }
624  // Map the yield operand to the forOp returned value.
625  for (const auto &retVal :
626  llvm::enumerate(forOp.getBody()->getTerminator()->getOperands())) {
627  Operation *def = retVal.value().getDefiningOp();
628  assert(def && "Only support loop carried dependencies of distance of 1 or "
629  "defined outside the loop");
630  auto defStage = stages.find(def);
631  if (defStage == stages.end()) {
 // The yielded value has no stage: every version is the value itself.
632  for (unsigned int stage = 1; stage <= maxStage; stage++)
633  setValueMapping(forOp.getRegionIterArgs()[retVal.index()],
634  retVal.value(), stage);
635  } else if (defStage->second > 0) {
636  setValueMapping(forOp.getRegionIterArgs()[retVal.index()],
637  newForOp->getResult(retVal.index()),
638  maxStage - defStage->second + 1);
639  }
640  }
641  rewriter.create<scf::YieldOp>(forOp.getLoc(), yieldOperands);
642  return success();
643 }
644 
/// Emit the epilogue at the current insertion point of `rewriter`.
///
/// The total number of iterations is recomputed from `ub`, `lb` and `step`;
/// one version of the induction variable is created per epilogue part,
/// starting at iteration `max(0, total_iters - maxStage)`. For each part `i`
/// in [1, maxStage], the ops whose stage is >= `i` are cloned in `opOrder`,
/// with operands replaced by the version `maxStage - stage + i` found in
/// `valueMapping`. For dynamic loops each clone is passed to `predicateFn`
/// and the live-out values are selected based on the predicate.
///
/// `returnValues` is updated in place with the values computed by the
/// epilogue for the results of the original loop.
///
/// Returns failure if `predicateFn` returns a null operation.
645 LogicalResult
646 LoopPipelinerInternal::emitEpilogue(RewriterBase &rewriter,
647  llvm::SmallVector<Value> &returnValues) {
648  Location loc = forOp.getLoc();
649  Type t = lb.getType();
650 
651  // Emit different versions of the induction variable. They will be
652  // removed by dead code if not used.
653 
 // Build an integer constant of the lower bound's type.
654  auto createConst = [&](int v) {
655  return rewriter.create<arith::ConstantOp>(loc,
656  rewriter.getIntegerAttr(t, v));
657  };
658 
659  // total_iterations = cdiv(range_diff, step);
660  // - range_diff = ub - lb
661  // - total_iterations = (range_diff + step + (step < 0 ? 1 : -1)) / step
662  Value zero = createConst(0);
663  Value one = createConst(1);
664  Value stepLessZero = rewriter.create<arith::CmpIOp>(
665  loc, arith::CmpIPredicate::slt, step, zero);
666  Value stepDecr =
667  rewriter.create<arith::SelectOp>(loc, stepLessZero, one, createConst(-1));
668 
669  Value rangeDiff = rewriter.create<arith::SubIOp>(loc, ub, lb);
670  Value rangeIncrStep = rewriter.create<arith::AddIOp>(loc, rangeDiff, step);
671  Value rangeDecr =
672  rewriter.create<arith::AddIOp>(loc, rangeIncrStep, stepDecr);
673  Value totalIterations = rewriter.create<arith::DivSIOp>(loc, rangeDecr, step);
674 
675  // If total_iters < max_stage, start the epilogue at zero to match the
676  // ramp-up in the prologue.
677  // start_iter = max(0, total_iters - max_stage)
678  Value iterI = rewriter.create<arith::SubIOp>(loc, totalIterations,
679  createConst(maxStage));
680  iterI = rewriter.create<arith::MaxSIOp>(loc, zero, iterI);
681 
682  // Capture predicates for dynamic loops.
 // Entry 0 is never set; entries [1, maxStage] are set for dynamic loops only.
683  SmallVector<Value> predicates(maxStage + 1);
684 
685  for (int64_t i = 1; i <= maxStage; i++) {
686  // newLastIter = lb + step * iterI
687  Value newlastIter = rewriter.create<arith::AddIOp>(
688  loc, lb, rewriter.create<arith::MulIOp>(loc, step, iterI));
689 
690  setValueMapping(forOp.getInductionVar(), newlastIter, i);
691 
692  // increment to next iterI
693  iterI = rewriter.create<arith::AddIOp>(loc, iterI, one);
694 
695  if (dynamicLoop) {
696  // Disable stages when `i` is greater than total_iters.
697  // pred = total_iters >= i
698  predicates[i] = rewriter.create<arith::CmpIOp>(
699  loc, arith::CmpIPredicate::sge, totalIterations, createConst(i));
700  }
701  }
702 
703  // Emit `maxStage` epilogue parts; part `i` includes operations from stages
704  // [i; maxStage].
705  for (int64_t i = 1; i <= maxStage; i++) {
 // For each loop result, the iteration argument and version produced by
 // this part (null when this part does not produce the result).
706  SmallVector<std::pair<Value, unsigned>> returnMap(returnValues.size());
707  for (Operation *op : opOrder) {
708  if (stages[op] < i)
709  continue;
710  unsigned currentVersion = maxStage - stages[op] + i;
711  unsigned nextVersion = currentVersion + 1;
712  Operation *newOp =
713  cloneAndUpdateOperands(rewriter, op, [&](OpOperand *newOperand) {
714  auto it = valueMapping.find(newOperand->get());
715  if (it != valueMapping.end()) {
716  Value replacement = it->second[currentVersion];
717  newOperand->set(replacement);
718  }
719  });
720  if (dynamicLoop) {
721  OpBuilder::InsertionGuard insertGuard(rewriter);
722  newOp = predicateFn(rewriter, newOp, predicates[currentVersion]);
723  if (!newOp)
724  return failure();
725  }
726  if (annotateFn)
727  annotateFn(newOp, PipeliningOption::PipelinerPart::Epilogue, i - 1);
728 
729  for (auto [opRes, newRes] :
730  llvm::zip(op->getResults(), newOp->getResults())) {
731  setValueMapping(opRes, newRes, currentVersion);
732  // If the value is a loop carried dependency update the loop argument
733  // mapping and keep track of the last version to replace the original
734  // forOp uses.
735  for (OpOperand &operand :
736  forOp.getBody()->getTerminator()->getOpOperands()) {
737  if (operand.get() != opRes)
738  continue;
739  // If the version is greater than maxStage it means it maps to the
740  // original forOp returned value.
741  unsigned ri = operand.getOperandNumber();
742  returnValues[ri] = newRes;
743  Value mapVal = forOp.getRegionIterArgs()[ri];
744  returnMap[ri] = std::make_pair(mapVal, currentVersion);
745  if (nextVersion <= maxStage)
746  setValueMapping(mapVal, newRes, nextVersion);
747  }
748  }
749  }
750  if (dynamicLoop) {
751  // Select return values from this stage (live outs) based on predication.
752  // If the stage is valid select the peeled value, else use previous stage
753  // value.
754  for (auto pair : llvm::enumerate(returnValues)) {
755  unsigned ri = pair.index();
756  auto [mapVal, currentVersion] = returnMap[ri];
757  if (mapVal) {
758  unsigned nextVersion = currentVersion + 1;
759  Value pred = predicates[currentVersion];
760  Value prevValue = valueMapping[mapVal][currentVersion];
761  auto selOp = rewriter.create<arith::SelectOp>(loc, pred, pair.value(),
762  prevValue);
763  returnValues[ri] = selOp;
764  if (nextVersion <= maxStage)
765  setValueMapping(mapVal, selOp, nextVersion);
766  }
767  }
768  }
769  }
770  return success();
771 }
772 
773 void LoopPipelinerInternal::setValueMapping(Value key, Value el, int64_t idx) {
774  auto it = valueMapping.find(key);
775  // If the value is not in the map yet add a vector big enough to store all
776  // versions.
777  if (it == valueMapping.end())
778  it =
779  valueMapping
780  .insert(std::make_pair(key, llvm::SmallVector<Value>(maxStage + 1)))
781  .first;
782  it->second[idx] = el;
783 }
784 
785 } // namespace
786 
787 FailureOr<ForOp> mlir::scf::pipelineForLoop(RewriterBase &rewriter, ForOp forOp,
788  const PipeliningOption &options,
789  bool *modifiedIR) {
790  if (modifiedIR)
791  *modifiedIR = false;
792  LoopPipelinerInternal pipeliner;
793  if (!pipeliner.initializeLoopInfo(forOp, options))
794  return failure();
795 
796  if (modifiedIR)
797  *modifiedIR = true;
798 
799  // 1. Emit prologue.
800  if (failed(pipeliner.emitPrologue(rewriter)))
801  return failure();
802 
803  // 2. Track values used across stages. When a value cross stages it will
804  // need to be passed as loop iteration arguments.
805  // We first collect the values that are used in a different stage than where
806  // they are defined.
807  llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo>
808  crossStageValues = pipeliner.analyzeCrossStageValues();
809 
810  // Mapping between original loop values used cross stage and the block
811  // arguments associated after pipelining. A Value may map to several
812  // arguments if its liverange spans across more than 2 stages.
813  llvm::DenseMap<std::pair<Value, unsigned>, unsigned> loopArgMap;
814  // 3. Create the new kernel loop and return the block arguments mapping.
815  ForOp newForOp =
816  pipeliner.createKernelLoop(crossStageValues, rewriter, loopArgMap);
817  // Create the kernel block, order ops based on user choice and remap
818  // operands.
819  if (failed(pipeliner.createKernel(newForOp, crossStageValues, loopArgMap,
820  rewriter)))
821  return failure();
822 
823  llvm::SmallVector<Value> returnValues =
824  newForOp.getResults().take_front(forOp->getNumResults());
825  if (options.peelEpilogue) {
826  // 4. Emit the epilogue after the new forOp.
827  rewriter.setInsertionPointAfter(newForOp);
828  if (failed(pipeliner.emitEpilogue(rewriter, returnValues)))
829  return failure();
830  }
831  // 5. Erase the original loop and replace the uses with the epilogue output.
832  if (forOp->getNumResults() > 0)
833  rewriter.replaceOp(forOp, returnValues);
834  else
835  rewriter.eraseOp(forOp);
836 
837  return newForOp;
838 }
839 
843 }
static Value createConst(Location loc, Type type, int value, PatternRewriter &rewriter)
Create an integer or index constant.
Definition: ExpandOps.cpp:28
#define LDBG(X)
static llvm::ManagedStatic< PassManagerOptions > options
static Value max(ImplicitLocOpBuilder &builder, Value value, Value bound)
IntegerAttr getIntegerAttr(Type type, int64_t value)
Definition: Builders.cpp:223
This is a utility class for mapping one set of IR entities to another.
Definition: IRMapping.h:26
auto lookupOrDefault(T from) const
Lookup a mapped value within the map.
Definition: IRMapping.h:65
auto lookup(T from) const
Lookup a mapped value within the map.
Definition: IRMapping.h:72
void map(Value from, Value to)
Inserts a new mapping for 'from' to 'to'.
Definition: IRMapping.h:30
IRValueT get() const
Return the current value being used by this operand.
Definition: UseDefLists.h:160
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Definition: Location.h:76
RAII guard to reset the insertion point of the builder when destroyed.
Definition: Builders.h:346
Operation * clone(Operation &op, IRMapping &mapper)
Creates a deep copy of the specified operation, remapping any operands that use values outside of the...
Definition: Builders.cpp:548
void setInsertionPoint(Block *block, Block::iterator insertPoint)
Set the insertion point to the specified location.
Definition: Builders.h:396
Operation * create(const OperationState &state)
Creates an operation given the fields represented as an OperationState.
Definition: Builders.cpp:452
void setInsertionPointAfter(Operation *op)
Sets the insertion point to the node after the specified operation, which will cause subsequent inser...
Definition: Builders.h:410
This class represents an operand of an operation.
Definition: Value.h:257
Operation is the basic unit of execution within MLIR.
Definition: Operation.h:88
void setOperand(unsigned idx, Value value)
Definition: Operation.h:351
OpResult getResult(unsigned idx)
Get the 'idx'th result of this operation.
Definition: Operation.h:407
std::enable_if_t< llvm::function_traits< std::decay_t< FnT > >::num_args==1, RetT > walk(FnT &&callback)
Walk the operation by calling the callback for each nested operation (including this one),...
Definition: Operation.h:797
MutableArrayRef< OpOperand > getOpOperands()
Definition: Operation.h:383
operand_range getOperands()
Returns an iterator over the underlying Values.
Definition: Operation.h:378
bool isAncestor(Operation *other)
Return true if this operation is an ancestor of the other operation.
Definition: Operation.h:263
result_range getResults()
Definition: Operation.h:415
This class coordinates the application of a rewrite on a set of IR, providing a way for clients to tr...
Definition: PatternMatch.h:358
virtual void replaceOp(Operation *op, ValueRange newValues)
Replace the results of the given (original) operation with the specified list of values (replacements...
virtual void eraseOp(Operation *op)
This method erases an operation that is known to have no uses.
Instances of the Type class are uniqued, have an immutable identifier and an optional mutable compone...
Definition: Types.h:74
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition: Value.h:96
Location getLoc() const
Return the location of this value.
Definition: Value.cpp:26
Operation * getDefiningOp() const
If this value is the result of an operation, return the operation that defines it.
Definition: Value.cpp:20
constexpr void enumerate(std::tuple< Tys... > &tuple, CallbackT &&callback)
Definition: Matchers.h:344
void populateSCFLoopPipeliningPatterns(RewritePatternSet &patterns, const PipeliningOption &options)
Populate patterns for SCF software pipelining transformation.
FailureOr< ForOp > pipelineForLoop(RewriterBase &rewriter, ForOp forOp, const PipeliningOption &options, bool *modifiedIR=nullptr)
Generate a pipelined version of the scf.for loop based on the schedule given as an option.
Include the generated interface declarations.
std::optional< int64_t > getConstantIntValue(OpFoldResult ofr)
If ofr is a constant integer or an IntegerAttr, return the integer.
const FrozenRewritePatternSet & patterns
Operation * clone(OpBuilder &b, Operation *op, TypeRange newResultTypes, ValueRange newOperands)
void visitUsedValuesDefinedAbove(Region &region, Region &limit, function_ref< void(OpOperand *)> callback)
Calls callback for each use of a value within region or its descendants that was defined at the ances...
Definition: RegionUtils.cpp:40
Options to dictate how loops should be pipelined.
Definition: Transforms.h:123
std::function< void(Operation *, PipelinerPart, unsigned)> AnnotationlFnType
Lambda called by the pipeliner to allow the user to annotate the IR while it is generated.
Definition: Transforms.h:141
std::function< Operation *(RewriterBase &, Operation *, Value)> PredicateOpFn
Definition: Transforms.h:164