MLIR  21.0.0git
LoopPipelining.cpp
Go to the documentation of this file.
1 //===- LoopPipelining.cpp - Code to perform loop software pipelining-------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements loop software pipelining
10 //
11 //===----------------------------------------------------------------------===//
12 
18 #include "mlir/IR/IRMapping.h"
19 #include "mlir/IR/PatternMatch.h"
21 #include "llvm/ADT/MapVector.h"
22 #include "llvm/Support/Debug.h"
23 #include "llvm/Support/MathExtras.h"
24 
25 #define DEBUG_TYPE "scf-loop-pipelining"
26 #define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
27 #define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")
28 
29 using namespace mlir;
30 using namespace mlir::scf;
31 
32 namespace {
33 
34 /// Helper to keep internal information during pipelining transformation.
35 struct LoopPipelinerInternal {
36  /// Coarse liverange information for ops used across stages.
37  struct LiverangeInfo {
38  unsigned lastUseStage = 0;
39  unsigned defStage = 0;
40  };
41 
42 protected:
43  ForOp forOp;
44  unsigned maxStage = 0;
46  std::vector<Operation *> opOrder;
47  Value ub;
48  Value lb;
49  Value step;
50  bool dynamicLoop;
51  PipeliningOption::AnnotationlFnType annotateFn = nullptr;
52  bool peelEpilogue;
53  PipeliningOption::PredicateOpFn predicateFn = nullptr;
54 
55  // When peeling the kernel we generate several version of each value for
56  // different stage of the prologue. This map tracks the mapping between
57  // original Values in the loop and the different versions
58  // peeled from the loop.
60 
61  /// Assign a value to `valueMapping`, this means `val` represents the version
62  /// `idx` of `key` in the epilogue.
63  void setValueMapping(Value key, Value el, int64_t idx);
64 
65  /// Return the defining op of the given value, if the Value is an argument of
66  /// the loop return the associated defining op in the loop and its distance to
67  /// the Value.
68  std::pair<Operation *, int64_t> getDefiningOpAndDistance(Value value);
69 
70  /// Return true if the schedule is possible and return false otherwise. A
71  /// schedule is correct if all definitions are scheduled before uses.
72  bool verifySchedule();
73 
74 public:
75  /// Initalize the information for the given `op`, return true if it
76  /// satisfies the pre-condition to apply pipelining.
77  bool initializeLoopInfo(ForOp op, const PipeliningOption &options);
78  /// Emits the prologue, this creates `maxStage - 1` part which will contain
79  /// operations from stages [0; i], where i is the part index.
80  LogicalResult emitPrologue(RewriterBase &rewriter);
81  /// Gather liverange information for Values that are used in a different stage
82  /// than its definition.
83  llvm::MapVector<Value, LiverangeInfo> analyzeCrossStageValues();
84  scf::ForOp createKernelLoop(
85  const llvm::MapVector<Value, LiverangeInfo> &crossStageValues,
86  RewriterBase &rewriter,
87  llvm::DenseMap<std::pair<Value, unsigned>, unsigned> &loopArgMap);
88  /// Emits the pipelined kernel. This clones loop operations following user
89  /// order and remaps operands defined in a different stage as their use.
90  LogicalResult createKernel(
91  scf::ForOp newForOp,
92  const llvm::MapVector<Value, LiverangeInfo> &crossStageValues,
93  const llvm::DenseMap<std::pair<Value, unsigned>, unsigned> &loopArgMap,
94  RewriterBase &rewriter);
95  /// Emits the epilogue, this creates `maxStage - 1` part which will contain
96  /// operations from stages [i; maxStage], where i is the part index.
97  LogicalResult emitEpilogue(RewriterBase &rewriter,
98  llvm::SmallVector<Value> &returnValues);
99 };
100 
101 bool LoopPipelinerInternal::initializeLoopInfo(
102  ForOp op, const PipeliningOption &options) {
103  LDBG("Start initializeLoopInfo");
104  forOp = op;
105  ub = forOp.getUpperBound();
106  lb = forOp.getLowerBound();
107  step = forOp.getStep();
108 
109  dynamicLoop = true;
110  auto upperBoundCst = getConstantIntValue(ub);
111  auto lowerBoundCst = getConstantIntValue(lb);
112  auto stepCst = getConstantIntValue(step);
113  if (!upperBoundCst || !lowerBoundCst || !stepCst) {
114  if (!options.supportDynamicLoops) {
115  LDBG("--dynamic loop not supported -> BAIL");
116  return false;
117  }
118  } else {
119  int64_t ubImm = upperBoundCst.value();
120  int64_t lbImm = lowerBoundCst.value();
121  int64_t stepImm = stepCst.value();
122  if (stepImm <= 0) {
123  LDBG("--invalid loop step -> BAIL");
124  return false;
125  }
126  int64_t numIteration = llvm::divideCeilSigned(ubImm - lbImm, stepImm);
127  if (numIteration > maxStage) {
128  dynamicLoop = false;
129  } else if (!options.supportDynamicLoops) {
130  LDBG("--fewer loop iterations than pipeline stages -> BAIL");
131  return false;
132  }
133  }
134  peelEpilogue = options.peelEpilogue;
135  predicateFn = options.predicateFn;
136  if ((!peelEpilogue || dynamicLoop) && predicateFn == nullptr) {
137  LDBG("--no epilogue or predicate set -> BAIL");
138  return false;
139  }
140  std::vector<std::pair<Operation *, unsigned>> schedule;
141  options.getScheduleFn(forOp, schedule);
142  if (schedule.empty()) {
143  LDBG("--empty schedule -> BAIL");
144  return false;
145  }
146 
147  opOrder.reserve(schedule.size());
148  for (auto &opSchedule : schedule) {
149  maxStage = std::max(maxStage, opSchedule.second);
150  stages[opSchedule.first] = opSchedule.second;
151  opOrder.push_back(opSchedule.first);
152  }
153 
154  // All operations need to have a stage.
155  for (Operation &op : forOp.getBody()->without_terminator()) {
156  if (!stages.contains(&op)) {
157  op.emitOpError("not assigned a pipeline stage");
158  LDBG("--op not assigned a pipeline stage: " << op << " -> BAIL");
159  return false;
160  }
161  }
162 
163  if (!verifySchedule()) {
164  LDBG("--invalid schedule: " << op << " -> BAIL");
165  return false;
166  }
167 
168  // Currently, we do not support assigning stages to ops in nested regions. The
169  // block of all operations assigned a stage should be the single `scf.for`
170  // body block.
171  for (const auto &[op, stageNum] : stages) {
172  (void)stageNum;
173  if (op == forOp.getBody()->getTerminator()) {
174  op->emitError("terminator should not be assigned a stage");
175  LDBG("--terminator should not be assigned stage: " << *op << " -> BAIL");
176  return false;
177  }
178  if (op->getBlock() != forOp.getBody()) {
179  op->emitOpError("the owning Block of all operations assigned a stage "
180  "should be the loop body block");
181  LDBG("--the owning Block of all operations assigned a stage "
182  "should be the loop body block: "
183  << *op << " -> BAIL");
184  return false;
185  }
186  }
187 
188  // Support only loop-carried dependencies with a distance of one iteration or
189  // those defined outside of the loop. This means that any dependency within a
190  // loop should either be on the immediately preceding iteration, the current
191  // iteration, or on variables whose values are set before entering the loop.
192  if (llvm::any_of(forOp.getBody()->getTerminator()->getOperands(),
193  [this](Value operand) {
194  Operation *def = operand.getDefiningOp();
195  return !def ||
196  (!stages.contains(def) && forOp->isAncestor(def));
197  })) {
198  LDBG("--only support loop carried dependency with a distance of 1 or "
199  "defined outside of the loop -> BAIL");
200  return false;
201  }
202  annotateFn = options.annotateFn;
203  return true;
204 }
205 
206 /// Find operands of all the nested operations within `op`.
207 static SetVector<Value> getNestedOperands(Operation *op) {
208  SetVector<Value> operands;
209  op->walk([&](Operation *nestedOp) {
210  operands.insert_range(nestedOp->getOperands());
211  });
212  return operands;
213 }
214 
215 /// Compute unrolled cycles of each op (consumer) and verify that each op is
216 /// scheduled after its operands (producers) while adjusting for the distance
217 /// between producer and consumer.
218 bool LoopPipelinerInternal::verifySchedule() {
219  int64_t numCylesPerIter = opOrder.size();
220  // Pre-compute the unrolled cycle of each op.
221  DenseMap<Operation *, int64_t> unrolledCyles;
222  for (int64_t cycle = 0; cycle < numCylesPerIter; cycle++) {
223  Operation *def = opOrder[cycle];
224  auto it = stages.find(def);
225  assert(it != stages.end());
226  int64_t stage = it->second;
227  unrolledCyles[def] = cycle + stage * numCylesPerIter;
228  }
229  for (Operation *consumer : opOrder) {
230  int64_t consumerCycle = unrolledCyles[consumer];
231  for (Value operand : getNestedOperands(consumer)) {
232  auto [producer, distance] = getDefiningOpAndDistance(operand);
233  if (!producer)
234  continue;
235  auto it = unrolledCyles.find(producer);
236  // Skip producer coming from outside the loop.
237  if (it == unrolledCyles.end())
238  continue;
239  int64_t producerCycle = it->second;
240  if (consumerCycle < producerCycle - numCylesPerIter * distance) {
241  consumer->emitError("operation scheduled before its operands");
242  return false;
243  }
244  }
245  }
246  return true;
247 }
248 
249 /// Clone `op` and call `callback` on the cloned op's oeprands as well as any
250 /// operands of nested ops that:
251 /// 1) aren't defined within the new op or
252 /// 2) are block arguments.
253 static Operation *
254 cloneAndUpdateOperands(RewriterBase &rewriter, Operation *op,
255  function_ref<void(OpOperand *newOperand)> callback) {
256  Operation *clone = rewriter.clone(*op);
257  clone->walk<WalkOrder::PreOrder>([&](Operation *nested) {
258  // 'clone' itself will be visited first.
259  for (OpOperand &operand : nested->getOpOperands()) {
260  Operation *def = operand.get().getDefiningOp();
261  if ((def && !clone->isAncestor(def)) || isa<BlockArgument>(operand.get()))
262  callback(&operand);
263  }
264  });
265  return clone;
266 }
267 
268 LogicalResult LoopPipelinerInternal::emitPrologue(RewriterBase &rewriter) {
269  // Initialize the iteration argument to the loop initial values.
270  for (auto [arg, operand] :
271  llvm::zip(forOp.getRegionIterArgs(), forOp.getInitsMutable())) {
272  setValueMapping(arg, operand.get(), 0);
273  }
274  auto yield = cast<scf::YieldOp>(forOp.getBody()->getTerminator());
275  Location loc = forOp.getLoc();
276  SmallVector<Value> predicates(maxStage);
277  for (int64_t i = 0; i < maxStage; i++) {
278  if (dynamicLoop) {
279  Type t = ub.getType();
280  // pred = ub > lb + (i * step)
281  Value iv = rewriter.create<arith::AddIOp>(
282  loc, lb,
283  rewriter.create<arith::MulIOp>(
284  loc, step,
285  rewriter.create<arith::ConstantOp>(
286  loc, rewriter.getIntegerAttr(t, i))));
287  predicates[i] = rewriter.create<arith::CmpIOp>(
288  loc, arith::CmpIPredicate::slt, iv, ub);
289  }
290 
291  // special handling for induction variable as the increment is implicit.
292  // iv = lb + i * step
293  Type t = lb.getType();
294  Value iv = rewriter.create<arith::AddIOp>(
295  loc, lb,
296  rewriter.create<arith::MulIOp>(
297  loc, step,
298  rewriter.create<arith::ConstantOp>(loc,
299  rewriter.getIntegerAttr(t, i))));
300  setValueMapping(forOp.getInductionVar(), iv, i);
301  for (Operation *op : opOrder) {
302  if (stages[op] > i)
303  continue;
304  Operation *newOp =
305  cloneAndUpdateOperands(rewriter, op, [&](OpOperand *newOperand) {
306  auto it = valueMapping.find(newOperand->get());
307  if (it != valueMapping.end()) {
308  Value replacement = it->second[i - stages[op]];
309  newOperand->set(replacement);
310  }
311  });
312  int predicateIdx = i - stages[op];
313  if (predicates[predicateIdx]) {
314  OpBuilder::InsertionGuard insertGuard(rewriter);
315  newOp = predicateFn(rewriter, newOp, predicates[predicateIdx]);
316  if (newOp == nullptr)
317  return failure();
318  }
319  if (annotateFn)
320  annotateFn(newOp, PipeliningOption::PipelinerPart::Prologue, i);
321  for (unsigned destId : llvm::seq(unsigned(0), op->getNumResults())) {
322  Value source = newOp->getResult(destId);
323  // If the value is a loop carried dependency update the loop argument
324  for (OpOperand &operand : yield->getOpOperands()) {
325  if (operand.get() != op->getResult(destId))
326  continue;
327  if (predicates[predicateIdx] &&
328  !forOp.getResult(operand.getOperandNumber()).use_empty()) {
329  // If the value is used outside the loop, we need to make sure we
330  // return the correct version of it.
331  Value prevValue = valueMapping
332  [forOp.getRegionIterArgs()[operand.getOperandNumber()]]
333  [i - stages[op]];
334  source = rewriter.create<arith::SelectOp>(
335  loc, predicates[predicateIdx], source, prevValue);
336  }
337  setValueMapping(forOp.getRegionIterArgs()[operand.getOperandNumber()],
338  source, i - stages[op] + 1);
339  }
340  setValueMapping(op->getResult(destId), newOp->getResult(destId),
341  i - stages[op]);
342  }
343  }
344  }
345  return success();
346 }
347 
348 llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo>
349 LoopPipelinerInternal::analyzeCrossStageValues() {
350  llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo> crossStageValues;
351  for (Operation *op : opOrder) {
352  unsigned stage = stages[op];
353 
354  auto analyzeOperand = [&](OpOperand &operand) {
355  auto [def, distance] = getDefiningOpAndDistance(operand.get());
356  if (!def)
357  return;
358  auto defStage = stages.find(def);
359  if (defStage == stages.end() || defStage->second == stage ||
360  defStage->second == stage + distance)
361  return;
362  assert(stage > defStage->second);
363  LiverangeInfo &info = crossStageValues[operand.get()];
364  info.defStage = defStage->second;
365  info.lastUseStage = std::max(info.lastUseStage, stage);
366  };
367 
368  for (OpOperand &operand : op->getOpOperands())
369  analyzeOperand(operand);
370  visitUsedValuesDefinedAbove(op->getRegions(), [&](OpOperand *operand) {
371  analyzeOperand(*operand);
372  });
373  }
374  return crossStageValues;
375 }
376 
377 std::pair<Operation *, int64_t>
378 LoopPipelinerInternal::getDefiningOpAndDistance(Value value) {
379  int64_t distance = 0;
380  if (auto arg = dyn_cast<BlockArgument>(value)) {
381  if (arg.getOwner() != forOp.getBody())
382  return {nullptr, 0};
383  // Ignore induction variable.
384  if (arg.getArgNumber() == 0)
385  return {nullptr, 0};
386  distance++;
387  value =
388  forOp.getBody()->getTerminator()->getOperand(arg.getArgNumber() - 1);
389  }
390  Operation *def = value.getDefiningOp();
391  if (!def)
392  return {nullptr, 0};
393  return {def, distance};
394 }
395 
396 scf::ForOp LoopPipelinerInternal::createKernelLoop(
397  const llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo>
398  &crossStageValues,
399  RewriterBase &rewriter,
400  llvm::DenseMap<std::pair<Value, unsigned>, unsigned> &loopArgMap) {
401  // Creates the list of initial values associated to values used across
402  // stages. The initial values come from the prologue created above.
403  // Keep track of the kernel argument associated to each version of the
404  // values passed to the kernel.
405  llvm::SmallVector<Value> newLoopArg;
406  // For existing loop argument initialize them with the right version from the
407  // prologue.
408  for (const auto &retVal :
409  llvm::enumerate(forOp.getBody()->getTerminator()->getOperands())) {
410  Operation *def = retVal.value().getDefiningOp();
411  assert(def && "Only support loop carried dependencies of distance of 1 or "
412  "outside the loop");
413  auto defStage = stages.find(def);
414  if (defStage != stages.end()) {
415  Value valueVersion =
416  valueMapping[forOp.getRegionIterArgs()[retVal.index()]]
417  [maxStage - defStage->second];
418  assert(valueVersion);
419  newLoopArg.push_back(valueVersion);
420  } else
421  newLoopArg.push_back(forOp.getInitArgs()[retVal.index()]);
422  }
423  for (auto escape : crossStageValues) {
424  LiverangeInfo &info = escape.second;
425  Value value = escape.first;
426  for (unsigned stageIdx = 0; stageIdx < info.lastUseStage - info.defStage;
427  stageIdx++) {
428  Value valueVersion =
429  valueMapping[value][maxStage - info.lastUseStage + stageIdx];
430  assert(valueVersion);
431  newLoopArg.push_back(valueVersion);
432  loopArgMap[std::make_pair(value, info.lastUseStage - info.defStage -
433  stageIdx)] = newLoopArg.size() - 1;
434  }
435  }
436 
437  // Create the new kernel loop. When we peel the epilgue we need to peel
438  // `numStages - 1` iterations. Then we adjust the upper bound to remove those
439  // iterations.
440  Value newUb = forOp.getUpperBound();
441  if (peelEpilogue) {
442  Type t = ub.getType();
443  Location loc = forOp.getLoc();
444  // newUb = ub - maxStage * step
445  Value maxStageValue = rewriter.create<arith::ConstantOp>(
446  loc, rewriter.getIntegerAttr(t, maxStage));
447  Value maxStageByStep =
448  rewriter.create<arith::MulIOp>(loc, step, maxStageValue);
449  newUb = rewriter.create<arith::SubIOp>(loc, ub, maxStageByStep);
450  }
451  auto newForOp =
452  rewriter.create<scf::ForOp>(forOp.getLoc(), forOp.getLowerBound(), newUb,
453  forOp.getStep(), newLoopArg);
454  // When there are no iter args, the loop body terminator will be created.
455  // Since we always create it below, remove the terminator if it was created.
456  if (!newForOp.getBody()->empty())
457  rewriter.eraseOp(newForOp.getBody()->getTerminator());
458  return newForOp;
459 }
460 
461 LogicalResult LoopPipelinerInternal::createKernel(
462  scf::ForOp newForOp,
463  const llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo>
464  &crossStageValues,
465  const llvm::DenseMap<std::pair<Value, unsigned>, unsigned> &loopArgMap,
466  RewriterBase &rewriter) {
467  valueMapping.clear();
468 
469  // Create the kernel, we clone instruction based on the order given by
470  // user and remap operands coming from a previous stages.
471  rewriter.setInsertionPoint(newForOp.getBody(), newForOp.getBody()->begin());
472  IRMapping mapping;
473  mapping.map(forOp.getInductionVar(), newForOp.getInductionVar());
474  for (const auto &arg : llvm::enumerate(forOp.getRegionIterArgs())) {
475  mapping.map(arg.value(), newForOp.getRegionIterArgs()[arg.index()]);
476  }
477  SmallVector<Value> predicates(maxStage + 1, nullptr);
478  if (!peelEpilogue) {
479  // Create a predicate for each stage except the last stage.
480  Location loc = newForOp.getLoc();
481  Type t = ub.getType();
482  for (unsigned i = 0; i < maxStage; i++) {
483  // c = ub - (maxStage - i) * step
484  Value c = rewriter.create<arith::SubIOp>(
485  loc, ub,
486  rewriter.create<arith::MulIOp>(
487  loc, step,
488  rewriter.create<arith::ConstantOp>(
489  loc, rewriter.getIntegerAttr(t, int64_t(maxStage - i)))));
490 
491  Value pred = rewriter.create<arith::CmpIOp>(
492  newForOp.getLoc(), arith::CmpIPredicate::slt,
493  newForOp.getInductionVar(), c);
494  predicates[i] = pred;
495  }
496  }
497  for (Operation *op : opOrder) {
498  int64_t useStage = stages[op];
499  auto *newOp = rewriter.clone(*op, mapping);
500  SmallVector<OpOperand *> operands;
501  // Collect all the operands for the cloned op and its nested ops.
502  op->walk([&operands](Operation *nestedOp) {
503  for (OpOperand &operand : nestedOp->getOpOperands()) {
504  operands.push_back(&operand);
505  }
506  });
507  for (OpOperand *operand : operands) {
508  Operation *nestedNewOp = mapping.lookup(operand->getOwner());
509  // Special case for the induction variable uses. We replace it with a
510  // version incremented based on the stage where it is used.
511  if (operand->get() == forOp.getInductionVar()) {
512  rewriter.setInsertionPoint(newOp);
513 
514  // offset = (maxStage - stages[op]) * step
515  Type t = step.getType();
516  Value offset = rewriter.create<arith::MulIOp>(
517  forOp.getLoc(), step,
518  rewriter.create<arith::ConstantOp>(
519  forOp.getLoc(),
520  rewriter.getIntegerAttr(t, maxStage - stages[op])));
521  Value iv = rewriter.create<arith::AddIOp>(
522  forOp.getLoc(), newForOp.getInductionVar(), offset);
523  nestedNewOp->setOperand(operand->getOperandNumber(), iv);
524  rewriter.setInsertionPointAfter(newOp);
525  continue;
526  }
527  Value source = operand->get();
528  auto arg = dyn_cast<BlockArgument>(source);
529  if (arg && arg.getOwner() == forOp.getBody()) {
530  Value ret = forOp.getBody()->getTerminator()->getOperand(
531  arg.getArgNumber() - 1);
532  Operation *dep = ret.getDefiningOp();
533  if (!dep)
534  continue;
535  auto stageDep = stages.find(dep);
536  if (stageDep == stages.end() || stageDep->second == useStage)
537  continue;
538  // If the value is a loop carried value coming from stage N + 1 remap,
539  // it will become a direct use.
540  if (stageDep->second == useStage + 1) {
541  nestedNewOp->setOperand(operand->getOperandNumber(),
542  mapping.lookupOrDefault(ret));
543  continue;
544  }
545  source = ret;
546  }
547  // For operands defined in a previous stage we need to remap it to use
548  // the correct region argument. We look for the right version of the
549  // Value based on the stage where it is used.
550  Operation *def = source.getDefiningOp();
551  if (!def)
552  continue;
553  auto stageDef = stages.find(def);
554  if (stageDef == stages.end() || stageDef->second == useStage)
555  continue;
556  auto remap = loopArgMap.find(
557  std::make_pair(operand->get(), useStage - stageDef->second));
558  assert(remap != loopArgMap.end());
559  nestedNewOp->setOperand(operand->getOperandNumber(),
560  newForOp.getRegionIterArgs()[remap->second]);
561  }
562 
563  if (predicates[useStage]) {
564  OpBuilder::InsertionGuard insertGuard(rewriter);
565  newOp = predicateFn(rewriter, newOp, predicates[useStage]);
566  if (!newOp)
567  return failure();
568  // Remap the results to the new predicated one.
569  for (auto values : llvm::zip(op->getResults(), newOp->getResults()))
570  mapping.map(std::get<0>(values), std::get<1>(values));
571  }
572  if (annotateFn)
573  annotateFn(newOp, PipeliningOption::PipelinerPart::Kernel, 0);
574  }
575 
576  // Collect the Values that need to be returned by the forOp. For each
577  // value we need to have `LastUseStage - DefStage` number of versions
578  // returned.
579  // We create a mapping between original values and the associated loop
580  // returned values that will be needed by the epilogue.
581  llvm::SmallVector<Value> yieldOperands;
582  for (OpOperand &yieldOperand :
583  forOp.getBody()->getTerminator()->getOpOperands()) {
584  Value source = mapping.lookupOrDefault(yieldOperand.get());
585  // When we don't peel the epilogue and the yield value is used outside the
586  // loop we need to make sure we return the version from numStages -
587  // defStage.
588  if (!peelEpilogue &&
589  !forOp.getResult(yieldOperand.getOperandNumber()).use_empty()) {
590  Operation *def = getDefiningOpAndDistance(yieldOperand.get()).first;
591  if (def) {
592  auto defStage = stages.find(def);
593  if (defStage != stages.end() && defStage->second < maxStage) {
594  Value pred = predicates[defStage->second];
595  source = rewriter.create<arith::SelectOp>(
596  pred.getLoc(), pred, source,
597  newForOp.getBody()
598  ->getArguments()[yieldOperand.getOperandNumber() + 1]);
599  }
600  }
601  }
602  yieldOperands.push_back(source);
603  }
604 
605  for (auto &it : crossStageValues) {
606  int64_t version = maxStage - it.second.lastUseStage + 1;
607  unsigned numVersionReturned = it.second.lastUseStage - it.second.defStage;
608  // add the original version to yield ops.
609  // If there is a live range spanning across more than 2 stages we need to
610  // add extra arg.
611  for (unsigned i = 1; i < numVersionReturned; i++) {
612  setValueMapping(it.first, newForOp->getResult(yieldOperands.size()),
613  version++);
614  yieldOperands.push_back(
615  newForOp.getBody()->getArguments()[yieldOperands.size() + 1 +
616  newForOp.getNumInductionVars()]);
617  }
618  setValueMapping(it.first, newForOp->getResult(yieldOperands.size()),
619  version++);
620  yieldOperands.push_back(mapping.lookupOrDefault(it.first));
621  }
622  // Map the yield operand to the forOp returned value.
623  for (const auto &retVal :
624  llvm::enumerate(forOp.getBody()->getTerminator()->getOperands())) {
625  Operation *def = retVal.value().getDefiningOp();
626  assert(def && "Only support loop carried dependencies of distance of 1 or "
627  "defined outside the loop");
628  auto defStage = stages.find(def);
629  if (defStage == stages.end()) {
630  for (unsigned int stage = 1; stage <= maxStage; stage++)
631  setValueMapping(forOp.getRegionIterArgs()[retVal.index()],
632  retVal.value(), stage);
633  } else if (defStage->second > 0) {
634  setValueMapping(forOp.getRegionIterArgs()[retVal.index()],
635  newForOp->getResult(retVal.index()),
636  maxStage - defStage->second + 1);
637  }
638  }
639  rewriter.create<scf::YieldOp>(forOp.getLoc(), yieldOperands);
640  return success();
641 }
642 
643 LogicalResult
644 LoopPipelinerInternal::emitEpilogue(RewriterBase &rewriter,
645  llvm::SmallVector<Value> &returnValues) {
646  Location loc = forOp.getLoc();
647  Type t = lb.getType();
648 
649  // Emit different versions of the induction variable. They will be
650  // removed by dead code if not used.
651 
652  auto createConst = [&](int v) {
653  return rewriter.create<arith::ConstantOp>(loc,
654  rewriter.getIntegerAttr(t, v));
655  };
656 
657  // total_iterations = cdiv(range_diff, step);
658  // - range_diff = ub - lb
659  // - total_iterations = (range_diff + step + (step < 0 ? 1 : -1)) / step
660  Value zero = createConst(0);
661  Value one = createConst(1);
662  Value stepLessZero = rewriter.create<arith::CmpIOp>(
663  loc, arith::CmpIPredicate::slt, step, zero);
664  Value stepDecr =
665  rewriter.create<arith::SelectOp>(loc, stepLessZero, one, createConst(-1));
666 
667  Value rangeDiff = rewriter.create<arith::SubIOp>(loc, ub, lb);
668  Value rangeIncrStep = rewriter.create<arith::AddIOp>(loc, rangeDiff, step);
669  Value rangeDecr =
670  rewriter.create<arith::AddIOp>(loc, rangeIncrStep, stepDecr);
671  Value totalIterations = rewriter.create<arith::DivSIOp>(loc, rangeDecr, step);
672 
673  // If total_iters < max_stage, start the epilogue at zero to match the
674  // ramp-up in the prologue.
675  // start_iter = max(0, total_iters - max_stage)
676  Value iterI = rewriter.create<arith::SubIOp>(loc, totalIterations,
677  createConst(maxStage));
678  iterI = rewriter.create<arith::MaxSIOp>(loc, zero, iterI);
679 
680  // Capture predicates for dynamic loops.
681  SmallVector<Value> predicates(maxStage + 1);
682 
683  for (int64_t i = 1; i <= maxStage; i++) {
684  // newLastIter = lb + step * iterI
685  Value newlastIter = rewriter.create<arith::AddIOp>(
686  loc, lb, rewriter.create<arith::MulIOp>(loc, step, iterI));
687 
688  setValueMapping(forOp.getInductionVar(), newlastIter, i);
689 
690  // increment to next iterI
691  iterI = rewriter.create<arith::AddIOp>(loc, iterI, one);
692 
693  if (dynamicLoop) {
694  // Disable stages when `i` is greater than total_iters.
695  // pred = total_iters >= i
696  predicates[i] = rewriter.create<arith::CmpIOp>(
697  loc, arith::CmpIPredicate::sge, totalIterations, createConst(i));
698  }
699  }
700 
701  // Emit `maxStage - 1` epilogue part that includes operations from stages
702  // [i; maxStage].
703  for (int64_t i = 1; i <= maxStage; i++) {
704  SmallVector<std::pair<Value, unsigned>> returnMap(returnValues.size());
705  for (Operation *op : opOrder) {
706  if (stages[op] < i)
707  continue;
708  unsigned currentVersion = maxStage - stages[op] + i;
709  unsigned nextVersion = currentVersion + 1;
710  Operation *newOp =
711  cloneAndUpdateOperands(rewriter, op, [&](OpOperand *newOperand) {
712  auto it = valueMapping.find(newOperand->get());
713  if (it != valueMapping.end()) {
714  Value replacement = it->second[currentVersion];
715  newOperand->set(replacement);
716  }
717  });
718  if (dynamicLoop) {
719  OpBuilder::InsertionGuard insertGuard(rewriter);
720  newOp = predicateFn(rewriter, newOp, predicates[currentVersion]);
721  if (!newOp)
722  return failure();
723  }
724  if (annotateFn)
725  annotateFn(newOp, PipeliningOption::PipelinerPart::Epilogue, i - 1);
726 
727  for (auto [opRes, newRes] :
728  llvm::zip(op->getResults(), newOp->getResults())) {
729  setValueMapping(opRes, newRes, currentVersion);
730  // If the value is a loop carried dependency update the loop argument
731  // mapping and keep track of the last version to replace the original
732  // forOp uses.
733  for (OpOperand &operand :
734  forOp.getBody()->getTerminator()->getOpOperands()) {
735  if (operand.get() != opRes)
736  continue;
737  // If the version is greater than maxStage it means it maps to the
738  // original forOp returned value.
739  unsigned ri = operand.getOperandNumber();
740  returnValues[ri] = newRes;
741  Value mapVal = forOp.getRegionIterArgs()[ri];
742  returnMap[ri] = std::make_pair(mapVal, currentVersion);
743  if (nextVersion <= maxStage)
744  setValueMapping(mapVal, newRes, nextVersion);
745  }
746  }
747  }
748  if (dynamicLoop) {
749  // Select return values from this stage (live outs) based on predication.
750  // If the stage is valid select the peeled value, else use previous stage
751  // value.
752  for (auto pair : llvm::enumerate(returnValues)) {
753  unsigned ri = pair.index();
754  auto [mapVal, currentVersion] = returnMap[ri];
755  if (mapVal) {
756  unsigned nextVersion = currentVersion + 1;
757  Value pred = predicates[currentVersion];
758  Value prevValue = valueMapping[mapVal][currentVersion];
759  auto selOp = rewriter.create<arith::SelectOp>(loc, pred, pair.value(),
760  prevValue);
761  returnValues[ri] = selOp;
762  if (nextVersion <= maxStage)
763  setValueMapping(mapVal, selOp, nextVersion);
764  }
765  }
766  }
767  }
768  return success();
769 }
770 
771 void LoopPipelinerInternal::setValueMapping(Value key, Value el, int64_t idx) {
772  auto it = valueMapping.find(key);
773  // If the value is not in the map yet add a vector big enough to store all
774  // versions.
775  if (it == valueMapping.end())
776  it =
777  valueMapping
778  .insert(std::make_pair(key, llvm::SmallVector<Value>(maxStage + 1)))
779  .first;
780  it->second[idx] = el;
781 }
782 
783 } // namespace
784 
785 FailureOr<ForOp> mlir::scf::pipelineForLoop(RewriterBase &rewriter, ForOp forOp,
786  const PipeliningOption &options,
787  bool *modifiedIR) {
788  if (modifiedIR)
789  *modifiedIR = false;
790  LoopPipelinerInternal pipeliner;
791  if (!pipeliner.initializeLoopInfo(forOp, options))
792  return failure();
793 
794  if (modifiedIR)
795  *modifiedIR = true;
796 
797  // 1. Emit prologue.
798  if (failed(pipeliner.emitPrologue(rewriter)))
799  return failure();
800 
801  // 2. Track values used across stages. When a value cross stages it will
802  // need to be passed as loop iteration arguments.
803  // We first collect the values that are used in a different stage than where
804  // they are defined.
805  llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo>
806  crossStageValues = pipeliner.analyzeCrossStageValues();
807 
808  // Mapping between original loop values used cross stage and the block
809  // arguments associated after pipelining. A Value may map to several
810  // arguments if its liverange spans across more than 2 stages.
811  llvm::DenseMap<std::pair<Value, unsigned>, unsigned> loopArgMap;
812  // 3. Create the new kernel loop and return the block arguments mapping.
813  ForOp newForOp =
814  pipeliner.createKernelLoop(crossStageValues, rewriter, loopArgMap);
815  // Create the kernel block, order ops based on user choice and remap
816  // operands.
817  if (failed(pipeliner.createKernel(newForOp, crossStageValues, loopArgMap,
818  rewriter)))
819  return failure();
820 
821  llvm::SmallVector<Value> returnValues =
822  newForOp.getResults().take_front(forOp->getNumResults());
823  if (options.peelEpilogue) {
824  // 4. Emit the epilogue after the new forOp.
825  rewriter.setInsertionPointAfter(newForOp);
826  if (failed(pipeliner.emitEpilogue(rewriter, returnValues)))
827  return failure();
828  }
829  // 5. Erase the original loop and replace the uses with the epilogue output.
830  if (forOp->getNumResults() > 0)
831  rewriter.replaceOp(forOp, returnValues);
832  else
833  rewriter.eraseOp(forOp);
834 
835  return newForOp;
836 }
837 
841 }
static Value createConst(Location loc, Type type, int value, PatternRewriter &rewriter)
Create an integer or index constant.
Definition: ExpandOps.cpp:27
#define LDBG(X)
static llvm::ManagedStatic< PassManagerOptions > options
static Value max(ImplicitLocOpBuilder &builder, Value value, Value bound)
IntegerAttr getIntegerAttr(Type type, int64_t value)
Definition: Builders.cpp:224
This is a utility class for mapping one set of IR entities to another.
Definition: IRMapping.h:26
auto lookupOrDefault(T from) const
Lookup a mapped value within the map.
Definition: IRMapping.h:65
auto lookup(T from) const
Lookup a mapped value within the map.
Definition: IRMapping.h:72
void map(Value from, Value to)
Inserts a new mapping for 'from' to 'to'.
Definition: IRMapping.h:30
IRValueT get() const
Return the current value being used by this operand.
Definition: UseDefLists.h:160
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around a LocationAttr.
Definition: Location.h:66
RAII guard to reset the insertion point of the builder when destroyed.
Definition: Builders.h:346
Operation * clone(Operation &op, IRMapping &mapper)
Creates a deep copy of the specified operation, remapping any operands that use values outside of the...
Definition: Builders.cpp:549
void setInsertionPoint(Block *block, Block::iterator insertPoint)
Set the insertion point to the specified location.
Definition: Builders.h:396
Operation * create(const OperationState &state)
Creates an operation given the fields represented as an OperationState.
Definition: Builders.cpp:453
void setInsertionPointAfter(Operation *op)
Sets the insertion point to the node after the specified operation, which will cause subsequent inser...
Definition: Builders.h:410
This class represents an operand of an operation.
Definition: Value.h:243
Operation is the basic unit of execution within MLIR.
Definition: Operation.h:88
void setOperand(unsigned idx, Value value)
Definition: Operation.h:351
OpResult getResult(unsigned idx)
Get the 'idx'th result of this operation.
Definition: Operation.h:407
std::enable_if_t< llvm::function_traits< std::decay_t< FnT > >::num_args==1, RetT > walk(FnT &&callback)
Walk the operation by calling the callback for each nested operation (including this one),...
Definition: Operation.h:798
MutableArrayRef< OpOperand > getOpOperands()
Definition: Operation.h:383
operand_range getOperands()
Returns an iterator on the underlying Value's.
Definition: Operation.h:378
bool isAncestor(Operation *other)
Return true if this operation is an ancestor of the other operation.
Definition: Operation.h:263
result_range getResults()
Definition: Operation.h:415
This class coordinates the application of a rewrite on a set of IR, providing a way for clients to tr...
Definition: PatternMatch.h:362
virtual void replaceOp(Operation *op, ValueRange newValues)
Replace the results of the given (original) operation with the specified list of values (replacements).
virtual void eraseOp(Operation *op)
This method erases an operation that is known to have no uses.
Instances of the Type class are uniqued, have an immutable identifier and an optional mutable compone...
Definition: Types.h:74
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition: Value.h:96
Location getLoc() const
Return the location of this value.
Definition: Value.cpp:26
Operation * getDefiningOp() const
If this value is the result of an operation, return the operation that defines it.
Definition: Value.cpp:20
constexpr void enumerate(std::tuple< Tys... > &tuple, CallbackT &&callback)
Definition: Matchers.h:344
void populateSCFLoopPipeliningPatterns(RewritePatternSet &patterns, const PipeliningOption &options)
Populate patterns for SCF software pipelining transformation.
FailureOr< ForOp > pipelineForLoop(RewriterBase &rewriter, ForOp forOp, const PipeliningOption &options, bool *modifiedIR=nullptr)
Generate a pipelined version of the scf.for loop based on the schedule given as option.
Include the generated interface declarations.
std::optional< int64_t > getConstantIntValue(OpFoldResult ofr)
If ofr is a constant integer or an IntegerAttr, return the integer.
const FrozenRewritePatternSet & patterns
Operation * clone(OpBuilder &b, Operation *op, TypeRange newResultTypes, ValueRange newOperands)
void visitUsedValuesDefinedAbove(Region &region, Region &limit, function_ref< void(OpOperand *)> callback)
Calls callback for each use of a value within region or its descendants that was defined at the ances...
Definition: RegionUtils.cpp:43
Options to dictate how loops should be pipelined.
Definition: Transforms.h:123
std::function< void(Operation *, PipelinerPart, unsigned)> AnnotationlFnType
Lambda called by the pipeliner to allow the user to annotate the IR while it is generated.
Definition: Transforms.h:141
std::function< Operation *(RewriterBase &, Operation *, Value)> PredicateOpFn
Definition: Transforms.h:164