25 #include "llvm/ADT/STLExtras.h"
26 #include "llvm/ADT/SetVector.h"
27 #include "llvm/ADT/SmallPtrSet.h"
28 #include "llvm/ADT/SmallVector.h"
29 #include "llvm/Support/Debug.h"
30 #include "llvm/Support/MathExtras.h"
35 #define DEBUG_TYPE "scf-utils"
36 #define DBGS() (llvm::dbgs() << '[' << DEBUG_TYPE << "] ")
37 #define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")
42 bool replaceIterOperandsUsesInLoop) {
48 assert(loopNest.size() <= 10 &&
49 "exceeded recursion limit when yielding value from loop nest");
81 if (loopNest.size() == 1) {
83 cast<scf::ForOp>(*loopNest.back().replaceWithAdditionalYields(
84 rewriter, newIterOperands, replaceIterOperandsUsesInLoop,
86 return {innerMostLoop};
96 innerNewBBArgs, newYieldValuesFn,
97 replaceIterOperandsUsesInLoop);
98 return llvm::to_vector(llvm::map_range(
99 newLoopNest.front().getResults().take_back(innerNewBBArgs.size()),
102 scf::ForOp outerMostLoop =
103 cast<scf::ForOp>(*loopNest.front().replaceWithAdditionalYields(
104 rewriter, newIterOperands, replaceIterOperandsUsesInLoop, fn));
105 newLoopNest.insert(newLoopNest.begin(), outerMostLoop);
122 func::CallOp *callOp) {
123 assert(!funcName.empty() &&
"funcName cannot be empty");
137 ValueRange outlinedValues(captures.getArrayRef());
144 outlinedFuncArgTypes.push_back(arg.getType());
145 outlinedFuncArgLocs.push_back(arg.getLoc());
147 for (
Value value : outlinedValues) {
148 outlinedFuncArgTypes.push_back(value.getType());
149 outlinedFuncArgLocs.push_back(value.getLoc());
151 FunctionType outlinedFuncType =
155 rewriter.
create<func::FuncOp>(loc, funcName, outlinedFuncType);
156 Block *outlinedFuncBody = outlinedFunc.addEntryBlock();
161 auto outlinedFuncBlockArgs = outlinedFuncBody->getArguments();
166 originalBlock, outlinedFuncBody,
167 outlinedFuncBlockArgs.take_front(numOriginalBlockArguments));
177 ®ion, region.
begin(),
178 TypeRange{outlinedFuncArgTypes}.take_front(numOriginalBlockArguments),
180 .take_front(numOriginalBlockArguments));
185 llvm::append_range(callValues, newBlock->
getArguments());
186 llvm::append_range(callValues, outlinedValues);
187 auto call = rewriter.
create<func::CallOp>(loc, outlinedFunc, callValues);
196 rewriter.
clone(*originalTerminator, bvm);
197 rewriter.
eraseOp(originalTerminator);
202 for (
auto it : llvm::zip(outlinedValues, outlinedFuncBlockArgs.take_back(
203 outlinedValues.size()))) {
204 Value orig = std::get<0>(it);
205 Value repl = std::get<1>(it);
215 return outlinedFunc->isProperAncestor(opOperand.
getOwner());
223 func::FuncOp *thenFn, StringRef thenFnName,
224 func::FuncOp *elseFn, StringRef elseFnName) {
227 FailureOr<func::FuncOp> outlinedFuncOpOrFailure;
228 if (thenFn && !ifOp.getThenRegion().empty()) {
230 rewriter, loc, ifOp.getThenRegion(), thenFnName);
231 if (failed(outlinedFuncOpOrFailure))
233 *thenFn = *outlinedFuncOpOrFailure;
235 if (elseFn && !ifOp.getElseRegion().empty()) {
237 rewriter, loc, ifOp.getElseRegion(), elseFnName);
238 if (failed(outlinedFuncOpOrFailure))
240 *elseFn = *outlinedFuncOpOrFailure;
247 assert(rootOp !=
nullptr &&
"Root operation must not be a nullptr.");
248 bool rootEnclosesPloops =
false;
250 for (
Block &block : region.getBlocks()) {
253 rootEnclosesPloops |= enclosesPloops;
254 if (
auto ploop = dyn_cast<scf::ParallelOp>(op)) {
255 rootEnclosesPloops =
true;
259 result.push_back(ploop);
264 return rootEnclosesPloops;
272 assert(divisor > 0 &&
"expected positive divisor");
273 assert(dividend.
getType().
isIndex() &&
"expected index-typed value");
275 Value divisorMinusOneCst =
276 builder.
create<arith::ConstantIndexOp>(loc, divisor - 1);
277 Value divisorCst = builder.
create<arith::ConstantIndexOp>(loc, divisor);
278 Value sum = builder.
create<arith::AddIOp>(loc, dividend, divisorMinusOneCst);
279 return builder.
create<arith::DivUIOp>(loc, sum, divisorCst);
288 assert(dividend.
getType().
isIndex() &&
"expected index-typed value");
290 Value cstOne = builder.
create<arith::ConstantIndexOp>(loc, 1);
291 Value divisorMinusOne = builder.
create<arith::SubIOp>(loc, divisor, cstOne);
292 Value sum = builder.
create<arith::AddIOp>(loc, dividend, divisorMinusOne);
293 return builder.
create<arith::DivUIOp>(loc, sum, divisor);
303 if (!lbCstOp.has_value() || !ubCstOp.has_value() || !stepCstOp.has_value())
307 int64_t lbCst = lbCstOp.value();
308 int64_t ubCst = ubCstOp.value();
309 int64_t stepCst = stepCstOp.value();
310 assert(lbCst >= 0 && ubCst >= 0 && stepCst > 0 &&
311 "expected positive loop bounds and step");
312 return llvm::divideCeilSigned(ubCst - lbCst, stepCst);
320 Block *loopBodyBlock,
Value forOpIV, uint64_t unrollFactor,
338 for (
unsigned i = 1; i < unrollFactor; i++) {
342 operandMap.
map(iterArgs, lastYielded);
347 Value ivUnroll = ivRemapFn(i, forOpIV, builder);
348 operandMap.
map(forOpIV, ivUnroll);
352 for (
auto it = loopBodyBlock->
begin(); it != std::next(srcBlockEnd); it++) {
354 annotateFn(i, clonedOp, builder);
358 for (
unsigned i = 0, e = lastYielded.size(); i < e; i++)
359 lastYielded[i] = operandMap.
lookup(yieldedValues[i]);
364 for (
auto it = loopBodyBlock->
begin(); it != std::next(srcBlockEnd); it++)
365 annotateFn(0, &*it, builder);
373 scf::ForOp forOp, uint64_t unrollFactor,
375 assert(unrollFactor > 0 &&
"expected positive unroll factor");
378 if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
385 auto loc = forOp.getLoc();
386 Value step = forOp.getStep();
387 Value upperBoundUnrolled;
389 bool generateEpilogueLoop =
true;
392 if (constTripCount) {
397 if (unrollFactor == 1) {
398 if (*constTripCount == 1 &&
399 failed(forOp.promoteIfSingleIteration(rewriter)))
404 int64_t tripCountEvenMultiple =
405 *constTripCount - (*constTripCount % unrollFactor);
406 int64_t upperBoundUnrolledCst = lbCst + tripCountEvenMultiple * stepCst;
407 int64_t stepUnrolledCst = stepCst * unrollFactor;
410 generateEpilogueLoop = upperBoundUnrolledCst < ubCst;
411 if (generateEpilogueLoop)
413 loc, upperBoundUnrolledCst);
415 upperBoundUnrolled = forOp.getUpperBound();
418 stepUnrolled = stepCst == stepUnrolledCst
421 loc, stepUnrolledCst);
426 auto lowerBound = forOp.getLowerBound();
427 auto upperBound = forOp.getUpperBound();
429 boundsBuilder.
create<arith::SubIOp>(loc, upperBound, lowerBound);
431 Value unrollFactorCst =
434 boundsBuilder.
create<arith::RemSIOp>(loc, tripCount, unrollFactorCst);
436 Value tripCountEvenMultiple =
437 boundsBuilder.
create<arith::SubIOp>(loc, tripCount, tripCountRem);
439 upperBoundUnrolled = boundsBuilder.
create<arith::AddIOp>(
441 boundsBuilder.
create<arith::MulIOp>(loc, tripCountEvenMultiple, step));
444 boundsBuilder.
create<arith::MulIOp>(loc, step, unrollFactorCst);
448 if (generateEpilogueLoop) {
449 OpBuilder epilogueBuilder(forOp->getContext());
452 auto epilogueForOp = cast<scf::ForOp>(epilogueBuilder.
clone(*forOp));
453 epilogueForOp.setLowerBound(upperBoundUnrolled);
456 auto results = forOp.getResults();
457 auto epilogueResults = epilogueForOp.getResults();
459 for (
auto e : llvm::zip(results, epilogueResults)) {
460 std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
462 epilogueForOp->setOperands(epilogueForOp.getNumControlOperands(),
463 epilogueForOp.getInitArgs().size(), results);
464 (void)epilogueForOp.promoteIfSingleIteration(rewriter);
468 forOp.setUpperBound(upperBoundUnrolled);
469 forOp.setStep(stepUnrolled);
471 auto iterArgs =
ValueRange(forOp.getRegionIterArgs());
472 auto yieldedValues = forOp.getBody()->getTerminator()->getOperands();
475 forOp.getBody(), forOp.getInductionVar(), unrollFactor,
478 auto stride = b.create<arith::MulIOp>(
479 loc, step, b.create<arith::ConstantIndexOp>(loc, i));
480 return b.create<arith::AddIOp>(loc, iv, stride);
482 annotateFn, iterArgs, yieldedValues);
484 (void)forOp.promoteIfSingleIteration(rewriter);
491 auto walkResult = forOp.walk([&](scf::ForOp innerForOp) {
492 if (!forOp.isDefinedOutsideOfLoop(innerForOp.getLowerBound()) ||
493 !forOp.isDefinedOutsideOfLoop(innerForOp.getUpperBound()) ||
494 !forOp.isDefinedOutsideOfLoop(innerForOp.getStep()))
499 return !walkResult.wasInterrupted();
504 uint64_t unrollJamFactor) {
505 assert(unrollJamFactor > 0 &&
"unroll jam factor should be positive");
507 if (unrollJamFactor == 1)
513 LDBG(
"failed to unroll and jam: inner bounds are not invariant");
518 if (forOp->getNumResults() > 0) {
519 LDBG(
"failed to unroll and jam: unsupported loop with results");
526 if (!tripCount.has_value()) {
528 LDBG(
"failed to unroll and jam: trip count could not be determined");
531 if (unrollJamFactor > *tripCount) {
532 LDBG(
"unroll and jam factor is greater than trip count, set factor to trip "
534 unrollJamFactor = *tripCount;
535 }
else if (*tripCount % unrollJamFactor != 0) {
536 LDBG(
"failed to unroll and jam: unsupported trip count that is not a "
537 "multiple of unroll jam factor");
542 if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
552 forOp.walk([&](scf::ForOp innerForOp) { innerLoops.push_back(innerForOp); });
563 for (scf::ForOp oldForOp : innerLoops) {
565 ValueRange oldIterOperands = oldForOp.getInits();
566 ValueRange oldIterArgs = oldForOp.getRegionIterArgs();
568 cast<scf::YieldOp>(oldForOp.getBody()->getTerminator()).getOperands();
571 for (
unsigned i = unrollJamFactor - 1; i >= 1; --i) {
572 dupIterOperands.append(oldIterOperands.begin(), oldIterOperands.end());
573 dupYieldOperands.append(oldYieldOperands.begin(), oldYieldOperands.end());
577 bool forOpReplaced = oldForOp == forOp;
578 scf::ForOp newForOp =
579 cast<scf::ForOp>(*oldForOp.replaceWithAdditionalYields(
580 rewriter, dupIterOperands,
false,
582 return dupYieldOperands;
584 newInnerLoops.push_back(newForOp);
589 ValueRange newIterArgs = newForOp.getRegionIterArgs();
590 unsigned oldNumIterArgs = oldIterArgs.size();
591 ValueRange newResults = newForOp.getResults();
592 unsigned oldNumResults = newResults.size() / unrollJamFactor;
593 assert(oldNumIterArgs == oldNumResults &&
594 "oldNumIterArgs must be the same as oldNumResults");
595 for (
unsigned i = unrollJamFactor - 1; i >= 1; --i) {
596 for (
unsigned j = 0;
j < oldNumIterArgs; ++
j) {
600 operandMaps[i - 1].map(newIterArgs[
j],
601 newIterArgs[i * oldNumIterArgs +
j]);
602 operandMaps[i - 1].map(newResults[
j],
603 newResults[i * oldNumResults +
j]);
610 int64_t step = forOp.getConstantStep()->getSExtValue();
612 forOp.getLoc(), forOp.getStep(),
614 forOp.getLoc(), rewriter.
getIndexAttr(unrollJamFactor)));
615 forOp.setStep(newStep);
616 auto forOpIV = forOp.getInductionVar();
619 for (
unsigned i = unrollJamFactor - 1; i >= 1; --i) {
620 for (
auto &subBlock : subBlocks) {
623 OpBuilder builder(subBlock.first->getBlock(), std::next(subBlock.second));
632 builder.
createOrFold<arith::AddIOp>(forOp.getLoc(), forOpIV, ivTag);
633 operandMaps[i - 1].map(forOpIV, ivUnroll);
636 for (
auto it = subBlock.first; it != std::next(subBlock.second); ++it)
637 builder.
clone(*it, operandMaps[i - 1]);
640 for (
auto newForOp : newInnerLoops) {
641 unsigned oldNumIterOperands =
642 newForOp.getNumRegionIterArgs() / unrollJamFactor;
643 unsigned numControlOperands = newForOp.getNumControlOperands();
644 auto yieldOp = cast<scf::YieldOp>(newForOp.getBody()->getTerminator());
645 unsigned oldNumYieldOperands = yieldOp.getNumOperands() / unrollJamFactor;
646 assert(oldNumIterOperands == oldNumYieldOperands &&
647 "oldNumIterOperands must be the same as oldNumYieldOperands");
648 for (
unsigned j = 0;
j < oldNumIterOperands; ++
j) {
652 newForOp.setOperand(numControlOperands + i * oldNumIterOperands +
j,
653 operandMaps[i - 1].lookupOrDefault(
654 newForOp.getOperand(numControlOperands +
j)));
656 i * oldNumYieldOperands +
j,
657 operandMaps[i - 1].lookupOrDefault(yieldOp.getOperand(
j)));
663 (void)forOp.promoteIfSingleIteration(rewriter);
673 bool isZeroBased =
false;
675 isZeroBased = lbCst.value() == 0;
677 bool isStepOne =
false;
679 isStepOne = stepCst.value() == 1;
683 "expected matching types");
688 if (isZeroBased && isStepOne)
689 return {lb, ub, step};
699 newUpperBound = rewriter.
createOrFold<arith::CeilDivSIOp>(
707 return {newLowerBound, newUpperBound, newStep};
713 Value denormalizedIv;
718 Value scaled = normalizedIv;
720 Value origStepValue =
722 scaled = rewriter.
create<arith::MulIOp>(loc, normalizedIv, origStepValue);
725 denormalizedIv = scaled;
728 denormalizedIv = rewriter.
create<arith::AddIOp>(loc, scaled, origLbValue);
738 assert(!values.empty() &&
"unexpected empty list");
739 std::optional<Value> productOf;
740 for (
auto v : values) {
742 if (vOne && vOne.value() == 1)
746 rewriter.
create<arith::MulIOp>(loc, productOf.value(), v).getResult();
752 .
create<arith::ConstantOp>(
753 loc, rewriter.
getOneAttr(values.front().getType()))
756 return productOf.value();
773 llvm::BitVector isUbOne(ubs.size());
776 if (ubCst && ubCst.value() == 1)
781 unsigned numLeadingOneUbs = 0;
783 if (!isUbOne.test(index)) {
786 delinearizedIvs[index] = rewriter.
create<arith::ConstantOp>(
791 Value previous = linearizedIv;
792 for (
unsigned i = numLeadingOneUbs, e = ubs.size(); i < e; ++i) {
793 unsigned idx = ubs.size() - (i - numLeadingOneUbs) - 1;
794 if (i != numLeadingOneUbs && !isUbOne.test(idx + 1)) {
795 previous = rewriter.
create<arith::DivSIOp>(loc, previous, ubs[idx + 1]);
796 preservedUsers.insert(previous.getDefiningOp());
800 if (!isUbOne.test(idx)) {
801 iv = rewriter.
create<arith::RemSIOp>(loc, previous, ubs[idx]);
804 iv = rewriter.
create<arith::ConstantOp>(
808 delinearizedIvs[idx] = iv;
810 return {delinearizedIvs, preservedUsers};
815 if (loops.size() < 2)
818 scf::ForOp innermost = loops.back();
819 scf::ForOp outermost = loops.front();
823 for (
auto loop : loops) {
826 Value lb = loop.getLowerBound();
827 Value ub = loop.getUpperBound();
828 Value step = loop.getStep();
834 newLoopRange.offset));
838 newLoopRange.stride));
842 loop.getInductionVar(), lb, step);
851 loops, [](
auto loop) {
return loop.getUpperBound(); });
853 outermost.setUpperBound(upperBound);
857 rewriter, loc, outermost.getInductionVar(), upperBounds);
861 for (
int i = loops.size() - 1; i > 0; --i) {
862 auto outerLoop = loops[i - 1];
863 auto innerLoop = loops[i];
865 Operation *innerTerminator = innerLoop.getBody()->getTerminator();
866 auto yieldedVals = llvm::to_vector(innerTerminator->
getOperands());
867 rewriter.
eraseOp(innerTerminator);
870 innerBlockArgs.push_back(delinearizeIvs[i]);
871 llvm::append_range(innerBlockArgs, outerLoop.getRegionIterArgs());
874 rewriter.
replaceOp(innerLoop, yieldedVals);
883 IRRewriter rewriter(loops.front().getContext());
888 LogicalResult result(failure());
898 for (
unsigned i = 0, e = loops.size(); i < e; ++i) {
899 operandsDefinedAbove[i] = i;
900 for (
unsigned j = 0;
j < i; ++
j) {
902 loops[i].getUpperBound(),
905 operandsDefinedAbove[i] =
j;
916 iterArgChainStart[0] = 0;
917 for (
unsigned i = 1, e = loops.size(); i < e; ++i) {
919 iterArgChainStart[i] = i;
920 auto outerloop = loops[i - 1];
921 auto innerLoop = loops[i];
922 if (outerloop.getNumRegionIterArgs() != innerLoop.getNumRegionIterArgs()) {
925 if (!llvm::equal(outerloop.getRegionIterArgs(), innerLoop.getInitArgs())) {
928 auto outerloopTerminator = outerloop.getBody()->getTerminator();
929 if (!llvm::equal(outerloopTerminator->getOperands(),
930 innerLoop.getResults())) {
933 iterArgChainStart[i] = iterArgChainStart[i - 1];
939 for (
unsigned end = loops.size(); end > 0; --end) {
941 for (; start < end - 1; ++start) {
943 *std::max_element(std::next(operandsDefinedAbove.begin(), start),
944 std::next(operandsDefinedAbove.begin(), end));
947 if (iterArgChainStart[end - 1] > start)
956 if (start != end - 1)
964 ArrayRef<std::vector<unsigned>> combinedDimensions) {
970 auto sortedDimensions = llvm::to_vector<3>(combinedDimensions);
971 for (
auto &dims : sortedDimensions)
976 for (
unsigned i = 0, e = loops.getNumLoops(); i < e; ++i) {
979 Value lb = loops.getLowerBound()[i];
980 Value ub = loops.getUpperBound()[i];
981 Value step = loops.getStep()[i];
984 rewriter, loops.getLoc(), newLoopRange.size));
995 for (
auto &sortedDimension : sortedDimensions) {
997 for (
auto idx : sortedDimension) {
998 newUpperBound = rewriter.
create<arith::MulIOp>(
999 loc, newUpperBound, normalizedUpperBounds[idx]);
1001 lowerBounds.push_back(cst0);
1002 steps.push_back(cst1);
1003 upperBounds.push_back(newUpperBound);
1012 auto newPloop = rewriter.
create<scf::ParallelOp>(
1013 loc, lowerBounds, upperBounds, steps,
1015 for (
unsigned i = 0, e = combinedDimensions.size(); i < e; ++i) {
1016 Value previous = ploopIVs[i];
1017 unsigned numberCombinedDimensions = combinedDimensions[i].size();
1019 for (
unsigned j = numberCombinedDimensions - 1;
j > 0; --
j) {
1020 unsigned idx = combinedDimensions[i][
j];
1023 Value iv = insideBuilder.create<arith::RemSIOp>(
1024 loc, previous, normalizedUpperBounds[idx]);
1030 previous = insideBuilder.create<arith::DivSIOp>(
1031 loc, previous, normalizedUpperBounds[idx]);
1035 unsigned idx = combinedDimensions[i][0];
1037 previous, loops.getRegion());
1042 loops.getBody()->back().
erase();
1043 newPloop.getBody()->getOperations().splice(
1045 loops.getBody()->getOperations());
1058 return op != inner.getOperation();
1061 LogicalResult status = success();
1063 for (
auto &op : outer.getBody()->without_terminator()) {
1065 if (&op == inner.getOperation())
1068 if (forwardSlice.count(&op) > 0) {
1073 if (isa<scf::ForOp>(op))
1086 toHoist.push_back(&op);
1088 auto *outerForOp = outer.getOperation();
1089 for (
auto *op : toHoist)
1099 LogicalResult status = success();
1100 const Loops &interTile = tileLoops.first;
1101 const Loops &intraTile = tileLoops.second;
1102 auto size = interTile.size();
1103 assert(size == intraTile.size());
1106 for (
unsigned s = 1; s < size; ++s)
1107 status = succeeded(status) ?
hoistOpsBetween(intraTile[0], intraTile[s])
1109 for (
unsigned s = 1; s < size; ++s)
1110 status = succeeded(status) ?
hoistOpsBetween(interTile[0], interTile[s])
1119 template <
typename T>
1123 for (
unsigned i = 0; i < maxLoops; ++i) {
1124 forOps.push_back(rootForOp);
1126 if (body.
begin() != std::prev(body.
end(), 2))
1129 rootForOp = dyn_cast<T>(&body.
front());
1137 auto originalStep = forOp.getStep();
1138 auto iv = forOp.getInductionVar();
1141 forOp.setStep(b.
create<arith::MulIOp>(forOp.getLoc(), originalStep, factor));
1144 for (
auto t : targets) {
1146 auto begin = t.getBody()->begin();
1147 auto nOps = t.getBody()->getOperations().size();
1151 Value stepped = b.
create<arith::AddIOp>(t.getLoc(), iv, forOp.getStep());
1153 b.
create<arith::MinSIOp>(t.getLoc(), forOp.getUpperBound(), stepped);
1156 auto newForOp = b.
create<scf::ForOp>(t.getLoc(), iv, ub, originalStep);
1157 newForOp.getBody()->getOperations().splice(
1158 newForOp.getBody()->getOperations().begin(),
1159 t.getBody()->getOperations(), begin, std::next(begin, nOps - 1));
1161 newForOp.getRegion());
1163 innerLoops.push_back(newForOp);
1171 template <
typename SizeType>
1173 scf::ForOp target) {
1179 assert(res.size() == 1 &&
"Expected 1 inner forOp");
1188 for (
auto it : llvm::zip(forOps, sizes)) {
1189 auto step =
stripmineSink(std::get<0>(it), std::get<1>(it), currentTargets);
1190 res.push_back(step);
1191 currentTargets = step;
1197 scf::ForOp target) {
1200 assert(loops.size() == 1);
1201 res.push_back(loops[0]);
1210 forOps.reserve(sizes.size());
1212 if (forOps.size() < sizes.size())
1213 sizes = sizes.take_front(forOps.size());
1228 forOps.reserve(sizes.size());
1230 if (forOps.size() < sizes.size())
1231 sizes = sizes.take_front(forOps.size());
1238 tileSizes.reserve(sizes.size());
1239 for (
unsigned i = 0, e = sizes.size(); i < e; ++i) {
1240 assert(sizes[i] > 0 &&
"expected strictly positive size for strip-mining");
1242 auto forOp = forOps[i];
1244 auto loc = forOp.getLoc();
1245 Value diff = builder.
create<arith::SubIOp>(loc, forOp.getUpperBound(),
1246 forOp.getLowerBound());
1248 Value iterationsPerBlock =
1250 tileSizes.push_back(iterationsPerBlock);
1254 auto intraTile =
tile(forOps, tileSizes, forOps.back());
1255 TileLoops tileLoops = std::make_pair(forOps, intraTile);
1266 scf::ForallOp source,
1268 unsigned numTargetOuts = target.getNumResults();
1269 unsigned numSourceOuts = source.getNumResults();
1273 llvm::append_range(fusedOuts, target.getOutputs());
1274 llvm::append_range(fusedOuts, source.getOutputs());
1278 scf::ForallOp fusedLoop = rewriter.
create<scf::ForallOp>(
1279 source.getLoc(), source.getMixedLowerBound(), source.getMixedUpperBound(),
1280 source.getMixedStep(), fusedOuts, source.getMapping());
1284 mapping.
map(target.getInductionVars(), fusedLoop.getInductionVars());
1285 mapping.map(source.getInductionVars(), fusedLoop.getInductionVars());
1288 mapping.map(target.getRegionIterArgs(),
1289 fusedLoop.getRegionIterArgs().take_front(numTargetOuts));
1290 mapping.map(source.getRegionIterArgs(),
1291 fusedLoop.getRegionIterArgs().take_back(numSourceOuts));
1295 for (
Operation &op : target.getBody()->without_terminator())
1296 rewriter.
clone(op, mapping);
1297 for (
Operation &op : source.getBody()->without_terminator())
1298 rewriter.
clone(op, mapping);
1301 scf::InParallelOp targetTerm = target.getTerminator();
1302 scf::InParallelOp sourceTerm = source.getTerminator();
1303 scf::InParallelOp fusedTerm = fusedLoop.getTerminator();
1305 for (
Operation &op : targetTerm.getYieldingOps())
1306 rewriter.
clone(op, mapping);
1307 for (
Operation &op : sourceTerm.getYieldingOps())
1308 rewriter.
clone(op, mapping);
1311 rewriter.
replaceOp(target, fusedLoop.getResults().take_front(numTargetOuts));
1312 rewriter.
replaceOp(source, fusedLoop.getResults().take_back(numSourceOuts));
1320 unsigned numTargetOuts = target.getNumResults();
1321 unsigned numSourceOuts = source.getNumResults();
1325 llvm::append_range(fusedInitArgs, target.getInitArgs());
1326 llvm::append_range(fusedInitArgs, source.getInitArgs());
1331 scf::ForOp fusedLoop = rewriter.
create<scf::ForOp>(
1332 source.getLoc(), source.getLowerBound(), source.getUpperBound(),
1333 source.getStep(), fusedInitArgs);
1337 mapping.
map(target.getInductionVar(), fusedLoop.getInductionVar());
1338 mapping.map(target.getRegionIterArgs(),
1339 fusedLoop.getRegionIterArgs().take_front(numTargetOuts));
1340 mapping.map(source.getInductionVar(), fusedLoop.getInductionVar());
1341 mapping.map(source.getRegionIterArgs(),
1342 fusedLoop.getRegionIterArgs().take_back(numSourceOuts));
1346 for (
Operation &op : target.getBody()->without_terminator())
1347 rewriter.
clone(op, mapping);
1348 for (
Operation &op : source.getBody()->without_terminator())
1349 rewriter.
clone(op, mapping);
1353 for (
Value operand : target.getBody()->getTerminator()->getOperands())
1354 yieldResults.push_back(mapping.lookupOrDefault(operand));
1355 for (
Value operand : source.getBody()->getTerminator()->getOperands())
1356 yieldResults.push_back(mapping.lookupOrDefault(operand));
1357 if (!yieldResults.empty())
1358 rewriter.
create<scf::YieldOp>(source.getLoc(), yieldResults);
1361 rewriter.
replaceOp(target, fusedLoop.getResults().take_front(numTargetOuts));
1362 rewriter.
replaceOp(source, fusedLoop.getResults().take_back(numSourceOuts));
static std::optional< int64_t > getConstantTripCount(scf::ForOp forOp)
Returns the trip count of forOp if its' low bound, high bound and step are constants,...
static LogicalResult tryIsolateBands(const TileLoops &tileLoops)
static void getPerfectlyNestedLoopsImpl(SmallVectorImpl< T > &forOps, T rootForOp, unsigned maxLoops=std::numeric_limits< unsigned >::max())
Collect perfectly nested loops starting from rootForOps.
static LogicalResult hoistOpsBetween(scf::ForOp outer, scf::ForOp inner)
static void generateUnrolledLoop(Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor, function_ref< Value(unsigned, Value, OpBuilder)> ivRemapFn, function_ref< void(unsigned, Operation *, OpBuilder)> annotateFn, ValueRange iterArgs, ValueRange yieldedValues)
Generates unrolled copies of scf::ForOp 'loopBodyBlock', with associated 'forOpIV' by 'unrollFactor',...
static Loops stripmineSink(scf::ForOp forOp, Value factor, ArrayRef< scf::ForOp > targets)
static std::pair< SmallVector< Value >, SmallPtrSet< Operation *, 2 > > delinearizeInductionVariable(RewriterBase &rewriter, Location loc, Value linearizedIv, ArrayRef< Value > ubs)
For each original loop, the value of the induction variable can be obtained by dividing the induction...
static Value ceilDivPositive(OpBuilder &builder, Location loc, Value dividend, int64_t divisor)
static Value getProductOfIntsOrIndexes(RewriterBase &rewriter, Location loc, ArrayRef< Value > values)
Helper function to multiply a sequence of values.
static bool areInnerBoundsInvariant(scf::ForOp forOp)
Check if bounds of all inner loops are defined outside of forOp and return false if not.
static llvm::ManagedStatic< PassManagerOptions > options
static Value max(ImplicitLocOpBuilder &builder, Value value, Value bound)
This class represents an argument of a Block.
Block represents an ordered list of Operations.
OpListType::iterator iterator
unsigned getNumArguments()
Operation * getTerminator()
Get the terminator operation of this block.
BlockArgListType getArguments()
IntegerAttr getIndexAttr(int64_t value)
TypedAttr getZeroAttr(Type type)
MLIRContext * getContext() const
TypedAttr getOneAttr(Type type)
This is a utility class for mapping one set of IR entities to another.
auto lookup(T from) const
Lookup a mapped value within the map.
void map(Value from, Value to)
Inserts a new mapping for 'from' to 'to'.
This class coordinates rewriting a piece of IR outside of a pattern rewrite, providing a way to keep ...
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
RAII guard to reset the insertion point of the builder when destroyed.
This class helps build Operations.
Operation * clone(Operation &op, IRMapping &mapper)
Creates a deep copy of the specified operation, remapping any operands that use values outside of the...
void setInsertionPointToStart(Block *block)
Sets the insertion point to the start of the specified block.
void setInsertionPoint(Block *block, Block::iterator insertPoint)
Set the insertion point to the specified location.
static OpBuilder atBlockTerminator(Block *block, Listener *listener=nullptr)
Create a builder and set the insertion point to before the block terminator.
void setInsertionPointToEnd(Block *block)
Sets the insertion point to the end of the specified block.
Block * createBlock(Region *parent, Region::iterator insertPt={}, TypeRange argTypes=std::nullopt, ArrayRef< Location > locs=std::nullopt)
Add new block with 'argTypes' arguments and set the insertion point to the end of it.
void createOrFold(SmallVectorImpl< Value > &results, Location location, Args &&...args)
Create an operation of specific op type at the current insertion point, and immediately try to fold i...
Operation * create(const OperationState &state)
Creates an operation given the fields represented as an OperationState.
void setInsertionPointAfter(Operation *op)
Sets the insertion point to the node after the specified operation, which will cause subsequent inser...
This class represents a single result from folding an operation.
This class represents an operand of an operation.
This is a value defined by a result of an operation.
Operation is the basic unit of execution within MLIR.
Operation * clone(IRMapping &mapper, CloneOptions options=CloneOptions::all())
Create a deep copy of this operation, remapping any operands that use values outside of the operation...
OpResult getResult(unsigned idx)
Get the 'idx'th result of this operation.
unsigned getNumRegions()
Returns the number of regions held by this operation.
MutableArrayRef< Region > getRegions()
Returns the regions held by this operation.
operand_type_range getOperandTypes()
result_type_range getResultTypes()
operand_range getOperands()
Returns an iterator on the underlying Value's.
void moveBefore(Operation *existingOp)
Unlink this operation from its current block and insert it right before existingOp which may be in th...
void setOperands(ValueRange operands)
Replace the current operands of this operation with the ones provided in 'operands'.
void erase()
Remove this operation from its parent block and delete it.
This class contains a list of basic blocks and a link to the parent operation it is attached to.
BlockArgListType getArguments()
ParentT getParentOfType()
Find the first parent operation of the given type, or nullptr if there is no ancestor operation.
bool hasOneBlock()
Return true if this region has exactly one block.
This class coordinates the application of a rewrite on a set of IR, providing a way for clients to tr...
virtual void replaceOp(Operation *op, ValueRange newValues)
Replace the results of the given (original) operation with the specified list of values (replacements...
void mergeBlocks(Block *source, Block *dest, ValueRange argValues=std::nullopt)
Inline the operations of block 'source' into the end of block 'dest'.
virtual void eraseOp(Operation *op)
This method erases an operation that is known to have no uses.
void replaceAllUsesExcept(Value from, Value to, Operation *exceptedUser)
Find uses of from and replace them with to except if the user is exceptedUser.
void modifyOpInPlace(Operation *root, CallableT &&callable)
This method is a utility wrapper around an in-place modification of an operation.
virtual void inlineBlockBefore(Block *source, Block *dest, Block::iterator before, ValueRange argValues=std::nullopt)
Inline the operations of block 'source' into block 'dest' before the given position.
This class provides an abstraction over the various different ranges of value types.
Instances of the Type class are uniqued, have an immutable identifier and an optional mutable compone...
This class provides an abstraction over the different types of ranges over Values.
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
bool use_empty() const
Returns true if this value has no uses.
void replaceUsesWithIf(Value newValue, function_ref< bool(OpOperand &)> shouldReplace)
Replace all uses of 'this' value with 'newValue' if the given callback returns true.
Type getType() const
Return the type of this value.
Operation * getDefiningOp() const
If this value is the result of an operation, return the operation that defines it.
static WalkResult advance()
static WalkResult interrupt()
Specialization of arith.constant op that returns an integer of index type.
Operation * getOwner() const
Return the owner of this operand.
SmallVector< SmallVector< AffineForOp, 8 >, 8 > tile(ArrayRef< AffineForOp > forOps, ArrayRef< uint64_t > sizes, ArrayRef< AffineForOp > targets)
Performs tiling fo imperfectly nested loops (with interchange) by strip-mining the forOps by sizes an...
constexpr void enumerate(std::tuple< Tys... > &tuple, CallbackT &&callback)
Include the generated interface declarations.
void getPerfectlyNestedLoops(SmallVectorImpl< scf::ForOp > &nestedLoops, scf::ForOp root)
Get perfectly nested sequence of loops starting at root of loop nest (the first op being another Affi...
LogicalResult loopUnrollByFactor(scf::ForOp forOp, uint64_t unrollFactor, function_ref< void(unsigned, Operation *, OpBuilder)> annotateFn=nullptr)
Unrolls this for operation by the specified unroll factor.
bool isConstantIntValue(OpFoldResult ofr, int64_t value)
Return true if ofr is constant integer equal to value.
LogicalResult outlineIfOp(RewriterBase &b, scf::IfOp ifOp, func::FuncOp *thenFn, StringRef thenFnName, func::FuncOp *elseFn, StringRef elseFnName)
Outline the then and/or else regions of ifOp as follows:
void replaceAllUsesInRegionWith(Value orig, Value replacement, Region ®ion)
Replace all uses of orig within the given region with replacement.
SmallVector< scf::ForOp > replaceLoopNestWithNewYields(RewriterBase &rewriter, MutableArrayRef< scf::ForOp > loopNest, ValueRange newIterOperands, const NewYieldValuesFn &newYieldValuesFn, bool replaceIterOperandsUsesInLoop=true)
Update a perfectly nested loop nest to yield new values from the innermost loop and propagating it up...
std::optional< int64_t > getConstantIntValue(OpFoldResult ofr)
If ofr is a constant integer or an IntegerAttr, return the integer.
Type getType(OpFoldResult ofr)
Returns the int type of the integer in ofr.
LogicalResult coalescePerfectlyNestedSCFForLoops(scf::ForOp op)
Walk an affine.for to find a band to coalesce.
std::pair< Loops, Loops > TileLoops
Value getValueOrCreateConstantIntOp(OpBuilder &b, Location loc, OpFoldResult ofr)
Converts an OpFoldResult to a Value.
void collapseParallelLoops(RewriterBase &rewriter, scf::ParallelOp loops, ArrayRef< std::vector< unsigned >> combinedDimensions)
Take the ParallelLoop and for each set of dimension indices, combine them into a single dimension.
bool isMemoryEffectFree(Operation *op)
Returns true if the given operation is free of memory effects.
std::function< SmallVector< Value >(OpBuilder &b, Location loc, ArrayRef< BlockArgument > newBbArgs)> NewYieldValuesFn
A function that returns the additional yielded values during replaceWithAdditionalYields.
Loops tilePerfectlyNested(scf::ForOp rootForOp, ArrayRef< Value > sizes)
Tile a nest of scf::ForOp loops rooted at rootForOp with the given (parametric) sizes.
LogicalResult loopUnrollJamByFactor(scf::ForOp forOp, uint64_t unrollFactor)
Unrolls and jams this scf.for operation by the specified unroll factor.
bool getInnermostParallelLoops(Operation *rootOp, SmallVectorImpl< scf::ParallelOp > &result)
Get a list of innermost parallel loops contained in rootOp.
void getUsedValuesDefinedAbove(Region ®ion, Region &limit, SetVector< Value > &values)
Fill values with a list of values defined at the ancestors of the limit region and used within region...
SmallVector< Loops, 8 > tile(ArrayRef< scf::ForOp > forOps, ArrayRef< Value > sizes, ArrayRef< scf::ForOp > targets)
Performs tiling fo imperfectly nested loops (with interchange) by strip-mining the forOps by sizes an...
auto get(MLIRContext *context, Ts &&...params)
Helper method that injects context only if needed, this helps unify some of the attribute constructio...
FailureOr< func::FuncOp > outlineSingleBlockRegion(RewriterBase &rewriter, Location loc, Region ®ion, StringRef funcName, func::CallOp *callOp=nullptr)
Outline a region with a single block into a new FuncOp.
bool areValuesDefinedAbove(Range values, Region &limit)
Check if all values in the provided range are defined above the limit region.
void denormalizeInductionVariable(RewriterBase &rewriter, Location loc, Value normalizedIv, OpFoldResult origLb, OpFoldResult origStep)
Get back the original induction variable values after loop normalization.
scf::ForallOp fuseIndependentSiblingForallLoops(scf::ForallOp target, scf::ForallOp source, RewriterBase &rewriter)
Given two scf.forall loops, target and source, fuses target into source.
LogicalResult coalesceLoops(MutableArrayRef< scf::ForOp > loops)
Replace a perfect nest of "for" loops with a single linearized loop.
scf::ForOp fuseIndependentSiblingForLoops(scf::ForOp target, scf::ForOp source, RewriterBase &rewriter)
Given two scf.for loops, target and source, fuses target into source.
TileLoops extractFixedOuterLoops(scf::ForOp rootFOrOp, ArrayRef< int64_t > sizes)
Range emitNormalizedLoopBounds(RewriterBase &rewriter, Location loc, OpFoldResult lb, OpFoldResult ub, OpFoldResult step)
Materialize bounds and step of a zero-based and unit-step loop derived by normalizing the specified b...
void getForwardSlice(Operation *op, SetVector< Operation * > *forwardSlice, const ForwardSliceOptions &options={})
Fills forwardSlice with the computed forward slice (i.e.
SmallVector< std::pair< Block::iterator, Block::iterator > > subBlocks
Represents a range (offset, size, and stride) where each element of the triple may be dynamic or stat...
Eliminates variable at the specified position using Fourier-Motzkin variable elimination.