#include "llvm/ADT/MapVector.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/DebugLog.h"
#include "llvm/Support/raw_ostream.h"

#define DEBUG_TYPE "loop-utils"

using namespace affine;
using namespace presburger;
using llvm::SmallMapVector;
  auto lbMap = forOp.getLowerBoundMap();
  auto lb = AffineApplyOp::create(b, forOp.getLoc(), lbMap,
                                  forOp.getLowerBoundOperands());

  int64_t step = forOp.getStepAsInt();
  for (unsigned i = 0, e = tripCountMap.getNumResults(); i < e; i++) {
    auto tripCountExpr = tripCountMap.getResult(i);
    bumpExprs[i] = (tripCountExpr - tripCountExpr % unrollFactor) * step;
    bumpValues[i] =
        AffineApplyOp::create(b, forOp.getLoc(), bumpMap, tripCountOperands);
  }

  for (unsigned i = 0, e = bumpExprs.size(); i < e; i++)
    newUbExprs[i] = b.getAffineDimExpr(0) + b.getAffineDimExpr(i + 1);

  cleanupLbOperands.clear();
  cleanupLbOperands.push_back(lb);
  cleanupLbOperands.append(bumpValues.begin(), bumpValues.end());

  // Remove any affine.applys that became dead from the simplification above.
  for (auto v : bumpValues)
    if (v.use_empty())
      v.getDefiningOp()->erase();
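  // Worked example (hypothetical numbers, shown as a sketch): for
  // `affine.for %i = 0 to 37 step 2` unrolled by a factor of 4, the trip
  // count expression yields 19, the bump expression is (19 - 19 mod 4) * 2 =
  // 32, and the cleanup loop's lower bound becomes lb + 32: the unrolled loop
  // covers the first 16 iterations and the cleanup loop the remaining 3.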
  auto iterOperands = forOp.getInits();
  auto iterArgs = forOp.getRegionIterArgs();
  for (auto e : llvm::zip(iterOperands, iterArgs))
    std::get<1>(e).replaceAllUsesWith(std::get<0>(e));

  auto outerResults = forOp.getResults();
  auto innerResults = forOp.getBody()->getTerminator()->getOperands();
  for (auto e : llvm::zip(outerResults, innerResults))
    std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
  if (!tripCount || *tripCount != 1)
    return failure();

  if (forOp.getLowerBoundMap().getNumResults() != 1)
    return failure();

  // Replace all uses of the induction variable with its single-iteration
  // value.
  auto iv = forOp.getInductionVar();
  auto *parentBlock = forOp->getBlock();
  if (!iv.use_empty()) {
    if (forOp.hasConstantLowerBound()) {
      auto func = forOp->getParentOfType<FunctionOpInterface>();
      OpBuilder builder(forOp->getContext());
      if (func)
        builder.setInsertionPointToStart(&func.getFunctionBody().front());
      else
        builder.setInsertionPoint(forOp);
      auto constOp = arith::ConstantIndexOp::create(
          builder, forOp.getLoc(), forOp.getConstantLowerBound());
      iv.replaceAllUsesWith(constOp);
    } else {
      auto lbOperands = forOp.getLowerBoundOperands();
      auto lbMap = forOp.getLowerBoundMap();
      OpBuilder builder(forOp);
      if (lbMap == builder.getDimIdentityMap()) {
        // The induction variable is the loop's single lower-bound operand.
        iv.replaceAllUsesWith(lbOperands[0]);
      } else {
        auto affineApplyOp =
            AffineApplyOp::create(builder, forOp.getLoc(), lbMap, lbOperands);
        iv.replaceAllUsesWith(affineApplyOp);
      }
    }
  }

  // Move the loop body operations, except for its terminator, to the loop's
  // containing block.
  forOp.getBody()->back().erase();
  parentBlock->getOperations().splice(Block::iterator(forOp),
                                      forOp.getBody()->getOperations());
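  // Effect sketch (hypothetical IR, assuming a unit trip count):
  //   %r = affine.for %i = 0 to 1 iter_args(%a = %init) -> (f32) { ... }
  // is promoted by substituting %i with its lower bound, forwarding %init
  // into uses of %a, replacing %r with the yielded value, and splicing the
  // body into the parent block before erasing the now-empty loop.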
    unsigned offset, AffineForOp srcForOp, OpBuilder b) {
  auto lbOperands = srcForOp.getLowerBoundOperands();
  auto ubOperands = srcForOp.getUpperBoundOperands();

  auto loopChunk =
      AffineForOp::create(b, srcForOp.getLoc(), lbOperands, lbMap, ubOperands,
                          ubMap, srcForOp.getStepAsInt());
  auto loopChunkIV = loopChunk.getInductionVar();
  auto srcIV = srcForOp.getInductionVar();

  for (const auto &it : llvm::drop_begin(opGroupQueue, offset)) {
    uint64_t shift = it.first;
    auto ops = it.second;
    if (!srcIV.use_empty() && shift != 0) {
      auto ivRemap = AffineApplyOp::create(
          bodyBuilder, srcForOp.getLoc(),
          bodyBuilder.getSingleDimShiftAffineMap(
              -static_cast<int64_t>(srcForOp.getStepAsInt() * shift)),
          loopChunkIV);
      operandMap.map(srcIV, ivRemap);
    } else {
      operandMap.map(srcIV, loopChunkIV);
    }
    for (auto *op : ops)
      bodyBuilder.clone(*op, operandMap);
  }
  if (succeeded(promoteIfSingleIteration(loopChunk)))
    return AffineForOp();
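  // Remapping sketch (assumed values): with step 2 and shift 3, operations in
  // the shifted group see the IV through getSingleDimShiftAffineMap(-6), so
  // an access a[%iv] in the source loop body reads as a[%iv - 6] inside the
  // generated loop chunk.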
    bool unrollPrologueEpilogue) {
  assert(forOp.getBody()->getOperations().size() == shifts.size() &&
         "too few/many shifts");
  if (forOp.getBody()->begin() == std::prev(forOp.getBody()->end()))
    return success();

  if (!mayBeConstTripCount) {
    LLVM_DEBUG(forOp.emitRemark("non-constant trip count loop not handled"));
    return success();
  }
  uint64_t tripCount = *mayBeConstTripCount;

  assert(isOpwiseShiftValid(forOp, shifts) &&
         "shifts will lead to an invalid transformation\n");

  int64_t step = forOp.getStepAsInt();

  unsigned numChildOps = shifts.size();

  // Do a linear-time (counting) sort for the shifts.
  uint64_t maxShift = *llvm::max_element(shifts);
  if (maxShift >= numChildOps) {
    // Large shifts are not the typical use case.
    forOp.emitWarning("not shifting because shifts are unrealistically large");
    return success();
  }

  std::vector<std::vector<Operation *>> sortedOpGroups(maxShift + 1);
  unsigned pos = 0;
  for (auto &op : forOp.getBody()->without_terminator()) {
    auto shift = shifts[pos++];
    sortedOpGroups[shift].push_back(&op);
  }

  AffineForOp prologue, epilogue;

  std::vector<std::pair<uint64_t, ArrayRef<Operation *>>> opGroupQueue;

  auto origLbMap = forOp.getLowerBoundMap();
  uint64_t lbShift = 0;
  OpBuilder b(forOp);
  for (uint64_t d = 0, e = sortedOpGroups.size(); d < e; ++d) {
    // If nothing is shifted by d, continue.
    if (sortedOpGroups[d].empty())
      continue;
    if (!opGroupQueue.empty()) {
      assert(d > 0 &&
             "Queue expected to be empty when the first block is found");
      AffineForOp res;
      if (lbShift + tripCount * step < d * step) {
        res = generateShiftedLoop(
            b.getShiftedAffineMap(origLbMap, lbShift),
            b.getShiftedAffineMap(origLbMap, lbShift + tripCount * step),
            opGroupQueue, /*offset=*/0, forOp, b);
        // The entire loop for the queued op groups was generated; empty the
        // queue.
        opGroupQueue.clear();
        lbShift += tripCount * step;
      } else {
        res = generateShiftedLoop(b.getShiftedAffineMap(origLbMap, lbShift),
                                  b.getShiftedAffineMap(origLbMap, d * step),
                                  opGroupQueue, /*offset=*/0, forOp, b);
        lbShift = d * step;
      }

      if (res) {
        // Simplify/canonicalize the affine.for.
        RewritePatternSet patterns(res.getContext());
        AffineForOp::getCanonicalizationPatterns(patterns, res.getContext());
        GreedyRewriteConfig config;
        config.setStrictness(GreedyRewriteStrictness::ExistingAndNewOps);
        bool erased;
        (void)applyOpPatternsGreedily(res.getOperation(), std::move(patterns),
                                      config, /*changed=*/nullptr, &erased);

        if (!erased && !prologue)
          prologue = res;
      }
    }
    // Augment the list of operations that get into the current open interval.
    opGroupQueue.emplace_back(d, sortedOpGroups[d]);
  }

  // The operation groups left in the queue now need to be processed (1 or
  // more) and their loops completed.
  for (unsigned i = 0, e = opGroupQueue.size(); i < e; ++i) {
    uint64_t ubShift = (opGroupQueue[i].first + tripCount) * step;
    epilogue = generateShiftedLoop(b.getShiftedAffineMap(origLbMap, lbShift),
                                   b.getShiftedAffineMap(origLbMap, ubShift),
                                   opGroupQueue, /*offset=*/i, forOp, b);
    lbShift = ubShift;
  }

  if (unrollPrologueEpilogue && prologue)
    (void)loopUnrollFull(prologue);
  if (unrollPrologueEpilogue && !epilogue && epilogue != prologue)
    (void)loopUnrollFull(epilogue);
  if (input.size() <= 1)
    return success();

  if (failed(getIndexSet(ops, &cst))) {
    LDBG() << "Index set computation failed!";
    return failure();
  }
  if (!cst.isHyperRectangular(0, input.size())) {
    LDBG() << "Non-hyperrectangular nests not supported for tiling!";
    return failure();
  }

template <typename t>
static LogicalResult performPreTilingChecks(MutableArrayRef<AffineForOp> input,
                                            ArrayRef<t> tileSizes) {
  assert(input.size() == tileSizes.size() && "Too few/many tile sizes");

  if (llvm::any_of(input,
                   [](AffineForOp op) { return op.getNumResults() > 0; })) {
    LDBG() << "Cannot tile nest where a loop has yield values";
    return failure();
  }

  // Check if the loops are perfectly nested.
  if (!isPerfectlyNested(input)) {
    LDBG() << "input loops not perfectly nested";
    return failure();
  }

  auto &ops = src.getBody()->getOperations();
  dest.getBody()->getOperations().splice(loc, ops, ops.begin(),
                                         std::prev(ops.end()));
    AffineForOp rootAffineForOp, unsigned width,
    MutableArrayRef<AffineForOp> tiledLoops) {
  Location loc = rootAffineForOp.getLoc();

  Operation *topLoop = rootAffineForOp.getOperation();
  AffineForOp innermostPointLoop;

  // Add intra-tile (or point) loops.
  for (unsigned i = 0; i < width; i++) {
    OpBuilder b(topLoop);
    // Loop bounds will be set later.
    AffineForOp pointLoop = AffineForOp::create(b, loc, 0, 0);
    pointLoop.getBody()->getOperations().splice(
        pointLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
        topLoop);
    tiledLoops[2 * width - 1 - i] = pointLoop;
    topLoop = pointLoop.getOperation();
    if (i == 0)
      innermostPointLoop = pointLoop;
  }

  // Add inter-tile (or tile-space) loops.
  for (unsigned i = width; i < 2 * width; i++) {
    OpBuilder b(topLoop);
    AffineForOp tileSpaceLoop = AffineForOp::create(b, loc, 0, 0);
    tileSpaceLoop.getBody()->getOperations().splice(
        tileSpaceLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
        topLoop);
    tiledLoops[2 * width - i - 1] = tileSpaceLoop;
    topLoop = tileSpaceLoop.getOperation();
  }
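  // Resulting arrangement (sketch) for width = 2: `tiledLoops` holds
  // [inter-tile loop 0, inter-tile loop 1, intra-tile (point) loop 0,
  // intra-tile loop 1], with the point loops innermost; all bounds are still
  // empty here and are set afterwards by the bound-construction routines.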
    AffineForOp newInterTileLoop, AffineForOp newIntraTileLoop,
    Value tileSize) {

  assert(origLoop.hasConstantLowerBound() &&
         "expected input loops to have constant lower bound.");

  lbOperands.push_back(newInterTileLoop.getInductionVar());
  ubOperands.push_back(newInterTileLoop.getInductionVar());

  lbOperands.push_back(tileSize);
  ubOperands.push_back(tileSize);

  lbBoundExprs.push_back(
      ((lbLoopIvExpr - origLowerBoundExpr) * lbTileParameter) +
      origLowerBoundExpr);

  ubBoundExprs.push_back(
      ((ubLoopIvExpr - origLowerBoundExpr) * ubTileParameter) +
      (ubTileParameter * origLoopStep) + origLowerBoundExpr);

  ubBoundExprs.append(origUbMap.getResults().begin(),
                      origUbMap.getResults().end());

  newIntraTileLoop.setLowerBound(lbOperands, lbMap);

  newIntraTileLoop.setUpperBound(ubOperands, ubMap);

  newIntraTileLoop.setStep(origLoop.getStepAsInt());
    AffineForOp newLoop, Value tileSize) {
  OperandRange newLbOperands = origLoop.getLowerBoundOperands();

  newLoop.setLowerBound(newLbOperands, origLoop.getLowerBoundMap());

  assert(origLoop.hasConstantLowerBound() &&
         "expected input loops to have constant lower bound.");

  ubOperands.push_back(tileSize);

  int64_t origUpperBound;

  if (origLoop.hasConstantUpperBound()) {
    origUpperBound = origLoop.getConstantUpperBound();

    boundExprs.push_back(
        origLowerBoundExpr +
        (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter));

  boundExprs.push_back(
      origLowerBoundExpr +
      (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter));

  newLoop.setUpperBound(ubOperands, ubMap);

  newLoop.setStep(origLoop.getStepAsInt());
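  // Net effect (sketch): for an original loop from lb to ub with a parametric
  // tile size T (the SSA value `tileSize`), the inter-tile loop runs from lb
  // to lb + ceildiv(ub - lb, T), i.e. once per tile, while keeping the
  // original loop's step.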
  assert(!origLoops.empty() && "expected at least one loop in band");
  assert(origLoops.size() == tileSizes.size() &&
         "expected tiling parameter for each loop in band.");

  OpBuilder b(origLoops[0].getOperation());
  unsigned width = origLoops.size();

  // Set bounds for tile-space loops.
  for (unsigned i = 0; i < width; ++i) {
    setInterTileBoundsParametric(b, origLoops[i], newLoops[i], tileSizes[i]);
  }

  // Set bounds for intra-tile loops.
  for (unsigned i = 0; i < width; ++i) {
    setIntraTileBoundsParametric(b, origLoops[i], newLoops[i],
                                 newLoops[i + width], tileSizes[i]);
  }

  assert(!origLoops.empty());
  assert(origLoops.size() == tileSizes.size());

  OpBuilder b(origLoops[0].getOperation());
  unsigned width = origLoops.size();

  // Bounds for tile-space loops.
  for (unsigned i = 0; i < width; i++) {
    OperandRange newLbOperands = origLoops[i].getLowerBoundOperands();
    OperandRange newUbOperands = origLoops[i].getUpperBoundOperands();
    newLoops[i].setLowerBound(newLbOperands, origLoops[i].getLowerBoundMap());
    newLoops[i].setUpperBound(newUbOperands, origLoops[i].getUpperBoundMap());
    // If the step size of the original loop is x and the tile size is y, the
    // tile-space loop's step becomes x * y.
    newLoops[i].setStep(tileSizes[i] * origLoops[i].getStepAsInt());
  }
  // Bounds for intra-tile loops.
  for (unsigned i = 0; i < width; i++) {
    uint64_t largestDiv = getLargestDivisorOfTripCount(origLoops[i]);
    std::optional<uint64_t> mayBeConstantCount =
        getConstantTripCount(origLoops[i]);
    // The lower bound is just the tile-space loop.
    newLoops[width + i].setLowerBound(newLoops[i].getInductionVar(), lbMap);
    // The step size of an intra-tile loop is the original loop's step size.
    newLoops[width + i].setStep(origLoops[i].getStepAsInt());

    // If the trip count is less than the tile size, the upper bound is the
    // lower bound + trip count * step.
    if (mayBeConstantCount && *mayBeConstantCount < tileSizes[i]) {
      AffineMap ubMap = b.getSingleDimShiftAffineMap(
          *mayBeConstantCount * origLoops[i].getStepAsInt());
      newLoops[width + i].setUpperBound(newLoops[i].getInductionVar(), ubMap);
    } else if (largestDiv % tileSizes[i] != 0) {
      // The upper bound needs a min with the original upper bound; the
      // operands are the original operands with the tile-space IV appended.
      ubOperands.push_back(newLoops[i].getInductionVar());

      boundExprs.push_back(dim + tileSizes[i] * origLoops[i].getStepAsInt());
      boundExprs.append(origUbMap.getResults().begin(),
                        origUbMap.getResults().end());

      newLoops[width + i].setUpperBound(ubOperands, ubMap);
    } else {
      // No need for the min expression.
      auto ubMap = AffineMap::get(
          1, 0, dim + tileSizes[i] * origLoops[i].getStepAsInt());
      newLoops[width + i].setUpperBound(newLoops[i].getInductionVar(), ubMap);
    }
  }
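  // Shape of the result (sketch, tile size 32, unit step):
  //   affine.for %i = 0 to %N step 32 {            // tile-space loop
  //     affine.for %ii = %i to min(%i + 32, %N) {  // intra-tile loop
  //       ...
  // The `min` upper bound is emitted only when 32 does not evenly divide the
  // trip count; otherwise the plain bound %i + 32 suffices.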
  AffineForOp rootAffineForOp = origLoops[0];

  // Note that width is at least one since the band isn't empty.
  unsigned width = input.size();

  for (unsigned i = 0; i < width; i++)
    origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar());

  // Erase the old loop nest.
  rootAffineForOp.erase();

  if (tiledNest)
    *tiledNest = std::move(tiledLoops);

  AffineForOp rootAffineForOp = origLoops[0];
  unsigned width = input.size();

  for (unsigned i = 0; i < width; i++)
    origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar());

  // Erase the old loop nest.
  rootAffineForOp.erase();

  if (tiledNest)
    *tiledNest = std::move(tiledLoops);
    nestedLoops.push_back(root);
    Block &body = root.getRegion().front();
    if (body.begin() != std::prev(body.end(), 2))
      return;

    root = dyn_cast<AffineForOp>(&body.front());
    if (!root)
      return;

  if (mayBeConstantTripCount.has_value()) {
    uint64_t tripCount = *mayBeConstantTripCount;

    uint64_t unrollFactor) {
  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
  if (mayBeConstantTripCount.has_value() &&
      *mayBeConstantTripCount < unrollFactor)
    return loopUnrollByFactor(forOp, *mayBeConstantTripCount);
  return loopUnrollByFactor(forOp, unrollFactor);
    Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor,

  if (!annotateFn)
    annotateFn = defaultAnnotateFn;

  // Iterate 'unrollFactor - 1' more times; the first iteration is the
  // existing loop body.
  for (unsigned i = 1; i < unrollFactor; i++) {
    IRMapping operandMap;

    // Prepare the operand map.
    operandMap.map(iterArgs, lastYielded);

    // If the induction variable is used, create a remapping to the value for
    // this unrolled instance.
    Value ivUnroll = ivRemapFn(i, forOpIV, builder);
    operandMap.map(forOpIV, ivUnroll);

    // Clone the original body of 'forOp'.
    for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd);
         it++) {
      Operation *clonedOp = builder.clone(*it, operandMap);
      annotateFn(i, clonedOp, builder);
    }

    // Update yielded values.
    for (unsigned i = 0, e = lastYielded.size(); i < e; i++) {
      Operation *defOp = yieldedValues[i].getDefiningOp();
      if (defOp && defOp->getBlock() == loopBodyBlock)
        lastYielded[i] = operandMap.lookup(yieldedValues[i]);
    }
  }

  // Annotate the ops of the original loop body.
  for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++)
    annotateFn(0, &*it, builder);
    uint64_t unrollFactor) {
  // Insert the cleanup loop right after 'forOp'.
  OpBuilder builder(forOp->getBlock(), std::next(Block::iterator(forOp)));
  auto cleanupForOp = cast<AffineForOp>(builder.clone(*forOp));

  // Update uses of 'forOp' results: 'cleanupForOp' should use 'forOp' result
  // values as its iter operands.
  auto results = forOp.getResults();
  auto cleanupResults = cleanupForOp.getResults();
  auto cleanupIterOperands = cleanupForOp.getInits();

  for (auto e : llvm::zip(results, cleanupResults, cleanupIterOperands)) {
    std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
    cleanupForOp->replaceUsesOfWith(std::get<2>(e), std::get<0>(e));
  }

  cleanupForOp.setLowerBound(cleanupOperands, cleanupMap);

  // Adjust the upper bound of the original loop; it is the same as the lower
  // bound of the cleanup loop.
  forOp.setUpperBound(cleanupOperands, cleanupMap);
LogicalResult mlir::affine::loopUnrollByFactor(
    AffineForOp forOp, uint64_t unrollFactor,
    function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn,
    bool cleanUpUnroll) {
  assert(unrollFactor > 0 && "unroll factor should be positive");

  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
  if (unrollFactor == 1) {
    if (mayBeConstantTripCount && *mayBeConstantTripCount == 1 &&
        failed(promoteIfSingleIteration(forOp)))
      return failure();
    return success();
  }

  // Nothing in the loop body other than the terminator.
  if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
    return success();

  // If the trip count is lower than the unroll factor, no unrolled body.
  if (mayBeConstantTripCount && *mayBeConstantTripCount < unrollFactor) {
    if (cleanUpUnroll) {
      // Unroll the cleanup loop if cleanUpUnroll is specified.
      return loopUnrollFull(forOp);
    }
    return failure();
  }

  // Loops where the lower bound is a max expression or the upper bound is
  // a min expression and the trip count doesn't divide the unroll factor
  // can't be unrolled since the lower bound of the cleanup loop in such cases
  // cannot be expressed as an affine function or a max over affine functions.
  if (forOp.getLowerBoundMap().getNumResults() != 1 ||
      forOp.getUpperBoundMap().getNumResults() != 1)
    return failure();
  if (failed(generateCleanupLoopForUnroll(forOp, unrollFactor)))
    assert(false && "cleanup loop lower bound map for single result lower "
                    "and upper bound maps can always be determined");

  ValueRange iterArgs(forOp.getRegionIterArgs());
  auto yieldedValues = forOp.getBody()->getTerminator()->getOperands();

  // Scale the step of the loop being unrolled by the unroll factor.
  int64_t step = forOp.getStepAsInt();
  forOp.setStep(step * unrollFactor);
  generateUnrolledLoop(
      forOp.getBody(), forOp.getInductionVar(), unrollFactor,
      [&](unsigned i, Value iv, OpBuilder b) {
        // iv' = iv + i * step
        auto d0 = b.getAffineDimExpr(0);
        auto bumpMap = AffineMap::get(1, 0, d0 + i * step);
        return AffineApplyOp::create(b, forOp.getLoc(), bumpMap, iv);
      },
      /*annotateFn=*/annotateFn, iterArgs, yieldedValues);
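  // Unrolling sketch (assumed IR): applied with unrollFactor = 4 to
  //   affine.for %i = 0 to 128 step 1 { "use"(%i) }
  // the step becomes 4 and the body is cloned three more times with %i
  // remapped through the bump maps above to %i + 1, %i + 2, and %i + 3;
  // loop-carried values are chained through the duplicated bodies.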
    uint64_t unrollJamFactor) {
  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
  if (mayBeConstantTripCount.has_value() &&
      *mayBeConstantTripCount < unrollJamFactor)
    return loopUnrollJamByFactor(forOp, *mayBeConstantTripCount);
  return loopUnrollJamByFactor(forOp, unrollJamFactor);
}

/// Check if all control operands of all loops are defined outside of 'forOp'
/// and return false if not.
static bool areInnerBoundsInvariant(AffineForOp forOp) {
  auto walkResult = forOp.walk([&](AffineForOp aForOp) {
    for (auto controlOperand : aForOp.getControlOperands()) {
      if (!forOp.isDefinedOutsideOfLoop(controlOperand))
        return WalkResult::interrupt();
    }
    return WalkResult::advance();
  });
  return !walkResult.wasInterrupted();
}
    uint64_t unrollJamFactor) {
  assert(unrollJamFactor > 0 && "unroll jam factor should be positive");

  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
  if (unrollJamFactor == 1) {
    if (mayBeConstantTripCount && *mayBeConstantTripCount == 1 &&
        failed(promoteIfSingleIteration(forOp)))
      return failure();
    return success();
  }

  // Nothing in the loop body other than the terminator.
  if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
    return success();

  // If the trip count is lower than the unroll jam factor, no unroll jam.
  if (mayBeConstantTripCount && *mayBeConstantTripCount < unrollJamFactor) {
    LDBG() << "[failed] trip count < unroll-jam factor";
    return failure();
  }

  // Collect loops with iter_args.
  forOp.walk([&](AffineForOp aForOp) {
    if (aForOp.getNumIterOperands() > 0)
      loopsWithIterArgs.push_back(aForOp);
  });

  // Get supported reductions to be used for creating reduction ops at the
  // end.
  if (forOp.getNumIterOperands() > 0)
    getSupportedReductions(forOp, reductions);

  if (forOp.getLowerBoundMap().getNumResults() != 1 ||
      forOp.getUpperBoundMap().getNumResults() != 1)
    return failure();
  if (failed(generateCleanupLoopForUnroll(forOp, unrollJamFactor)))
    assert(false && "cleanup loop lower bound map for single result lower "
                    "and upper bound maps can always be determined");
  for (AffineForOp oldForOp : loopsWithIterArgs) {
    SmallVector<Value> dupIterOperands, dupYieldOperands;
    ValueRange oldIterOperands = oldForOp.getInits();
    ValueRange oldIterArgs = oldForOp.getRegionIterArgs();
    auto oldYieldOperands =
        cast<AffineYieldOp>(oldForOp.getBody()->getTerminator()).getOperands();
    // Gather the duplicated iter operands and yield operands; they are fixed
    // up after cloning the sub-blocks.
    for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
      dupIterOperands.append(oldIterOperands.begin(), oldIterOperands.end());
      dupYieldOperands.append(oldYieldOperands.begin(), oldYieldOperands.end());
    }
    // Create a new loop with additional iterOperands, iter_args, and yield
    // operands. This new loop will take the loop body of the original loop.
    bool forOpReplaced = oldForOp == forOp;
    AffineForOp newForOp =
        cast<AffineForOp>(*oldForOp.replaceWithAdditionalYields(
            rewriter, dupIterOperands, /*replaceInitOperandUsesInLoop=*/false,
            [&](OpBuilder &b, Location loc, ArrayRef<BlockArgument> newBbArgs) {
              return dupYieldOperands;
            }));
    newLoopsWithIterArgs.push_back(newForOp);
    // 'forOp' has been replaced with a new loop.
    if (forOpReplaced)
      forOp = newForOp;

    ValueRange newIterArgs = newForOp.getRegionIterArgs();
    unsigned oldNumIterArgs = oldIterArgs.size();
    ValueRange newResults = newForOp.getResults();
    unsigned oldNumResults = newResults.size() / unrollJamFactor;
    assert(oldNumIterArgs == oldNumResults &&
           "oldNumIterArgs must be the same as oldNumResults");
    for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
      for (unsigned j = 0; j < oldNumIterArgs; ++j) {
        // 'newForOp' has 'unrollJamFactor - 1' new sets of iter_args and
        // results; map the old ones to those in the i-th new set.
        operandMaps[i - 1].map(newIterArgs[j],
                               newIterArgs[i * oldNumIterArgs + j]);
        operandMaps[i - 1].map(newResults[j],
                               newResults[i * oldNumResults + j]);
      }
    }
  }
  // Scale the step of the loop being unroll-jammed by the unroll-jam factor.
  int64_t step = forOp.getStepAsInt();
  forOp.setStep(step * unrollJamFactor);

  auto forOpIV = forOp.getInductionVar();
  // Unroll and jam (appends unrollJamFactor - 1 additional copies).
  for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
    for (auto &subBlock : subBlocks) {
      // Builder to insert unroll-jammed bodies, right at the end of the
      // sub-block.
      OpBuilder builder(subBlock.first->getBlock(), std::next(subBlock.second));

      // If the induction variable is used, create a remapping to the value
      // for this unrolled instance: iv' = iv + i * step.
      auto ivUnroll =
          AffineApplyOp::create(builder, forOp.getLoc(), bumpMap, forOpIV);
      operandMaps[i - 1].map(forOpIV, ivUnroll);

      // Clone the sub-block being unroll-jammed.
      for (auto it = subBlock.first; it != std::next(subBlock.second); ++it)
        builder.clone(*it, operandMaps[i - 1]);
    }
    // Fix iter operands and yield op operands of newly created loops.
    for (auto newForOp : newLoopsWithIterArgs) {
      unsigned oldNumIterOperands =
          newForOp.getNumIterOperands() / unrollJamFactor;
      unsigned numControlOperands = newForOp.getNumControlOperands();
      auto yieldOp = cast<AffineYieldOp>(newForOp.getBody()->getTerminator());
      unsigned oldNumYieldOperands = yieldOp.getNumOperands() / unrollJamFactor;
      assert(oldNumIterOperands == oldNumYieldOperands &&
             "oldNumIterOperands must be the same as oldNumYieldOperands");
      for (unsigned j = 0; j < oldNumIterOperands; ++j) {
        // The i-th duplication of an old iter operand or yield op operand
        // needs to be replaced with a mapped value from 'operandMaps[i - 1]'
        // if such a mapped value exists.
        newForOp.setOperand(numControlOperands + i * oldNumIterOperands + j,
                            operandMaps[i - 1].lookupOrDefault(
                                newForOp.getOperand(numControlOperands + j)));
        yieldOp.setOperand(
            i * oldNumYieldOperands + j,
            operandMaps[i - 1].lookupOrDefault(yieldOp.getOperand(j)));
      }
    }
  }
  if (forOp.getNumResults() > 0) {
    // Create reduction ops to combine every 'unrollJamFactor' related results
    // into one value.
    rewriter.setInsertionPointAfter(forOp);
    auto loc = forOp.getLoc();
    unsigned oldNumResults = forOp.getNumResults() / unrollJamFactor;
    for (LoopReduction &reduction : reductions) {
      unsigned pos = reduction.iterArgPosition;
      Value lhs = forOp.getResult(pos);
      Value rhs;
      SmallPtrSet<Operation *, 4> newOps;
      for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
        rhs = forOp.getResult(i * oldNumResults + pos);
        // Create ops based on the reduction type.
        lhs = arith::getReductionOp(reduction.kind, rewriter, loc, lhs, rhs);
        Operation *op = lhs.getDefiningOp();
        assert(op && "Reduction op should have been created");
        newOps.insert(op);
      }
      // Replace all uses except those in the newly created reduction ops.
      forOp.getResult(pos).replaceAllUsesExcept(lhs, newOps);
    }
  }
  assert(&*forOpA.getBody()->begin() == forOpB.getOperation());
  auto &forOpABody = forOpA.getBody()->getOperations();
  auto &forOpBBody = forOpB.getBody()->getOperations();

  // 1) Splice forOpA's non-terminator operations (which is just forOpB) just
  // before forOpA (in forOpA's parent's block); this leaves forOpA's body
  // containing only the terminator.
  forOpA->getBlock()->getOperations().splice(Block::iterator(forOpA),
                                             forOpABody, forOpABody.begin(),
                                             std::prev(forOpABody.end()));
  // 2) Splice forOpB's non-terminator operations into the beginning of
  // forOpA's body.
  forOpABody.splice(forOpABody.begin(), forOpBBody, forOpBBody.begin(),
                    std::prev(forOpBBody.end()));
  // 3) Splice forOpA into the beginning of forOpB's body.
  forOpBBody.splice(forOpBBody.begin(), forOpA->getBlock()->getOperations(),
                    Block::iterator(forOpA));
  unsigned maxLoopDepth = loops.size();
  SmallVector<unsigned, 4> loopPermMapInv;
  loopPermMapInv.resize(maxLoopDepth);
  for (unsigned i = 0; i < maxLoopDepth; ++i)
    loopPermMapInv[loopPermMap[i]] = i;

  // Check each dependence component against the permutation to see if the
  // desired loop interchange would make the dependence vector
  // lexicographically negative.
  for (const auto &depComps : depCompsVec) {
    assert(depComps.size() >= maxLoopDepth);
    // Check if the first non-zero dependence component is positive; this
    // iterates through the loops in the permuted order.
    for (unsigned j = 0; j < maxLoopDepth; ++j) {
      unsigned permIndex = loopPermMapInv[j];
      assert(depComps[permIndex].lb);
      int64_t depCompLb = *depComps[permIndex].lb;
      if (depCompLb > 0)
        break;
      if (depCompLb < 0)
        return false;
    }
  }
  return true;
  assert(loopPermMap.size() == loops.size() && "invalid loop perm map");
  unsigned maxLoopDepth = loops.size();
  if (maxLoopDepth == 1)
    return true;

  // We cannot interchange loops with iter args.
  if (llvm::any_of(loops, [](AffineForOp loop) {
        return loop.getNumIterOperands() > 0;
      }))
    return false;

  std::vector<SmallVector<DependenceComponent, 2>> depCompsVec;
[[maybe_unused]] bool mlir::affine::isPerfectlyNested(
    ArrayRef<AffineForOp> loops) {
  assert(!loops.empty() && "no loops provided");

  // We already know that the block can't be empty.
  auto hasTwoElements = [](Block *block) {
    auto secondOpIt = std::next(block->begin());
    return secondOpIt != block->end() && &*secondOpIt == &block->back();
  };

  auto enclosingLoop = loops.front();
  for (auto loop : loops.drop_front()) {
    auto parentForOp = dyn_cast<AffineForOp>(loop->getParentOp());
    // parentForOp's body should be just this loop and the terminator.
    if (parentForOp != enclosingLoop || !hasTwoElements(parentForOp.getBody()))
      return false;
    enclosingLoop = loop;
  }
  return true;
}
  assert(input.size() == permMap.size() && "invalid permutation map size");
  // Check whether the permutation spec is valid: sort it and check that it is
  // an iota sequence.
  SmallVector<unsigned, 4> checkPermMap(permMap);
  llvm::sort(checkPermMap);
  if (llvm::any_of(llvm::enumerate(checkPermMap),
                   [](const auto &en) { return en.value() != en.index(); }))
    assert(false && "invalid permutation map");

  // Nothing to do.
  if (input.size() < 2)
    return 0;

  // 'invPermMap': {new position, old position}.
  SmallVector<std::pair<unsigned, unsigned>, 4> invPermMap;
  for (unsigned i = 0, e = input.size(); i < e; ++i)
    invPermMap.push_back({permMap[i], i});
  llvm::sort(invPermMap);

  // Move the innermost loop body to the loop that would be the innermost in
  // the permuted nest (only if the innermost loop is being permuted).
  if (permMap.back() != input.size() - 1) {
    Block *destBody = ((AffineForOp)input[invPermMap.back().second]).getBody();
    Block *srcBody = ((AffineForOp)input.back()).getBody();
    destBody->getOperations().splice(destBody->begin(),
                                     srcBody->getOperations(), srcBody->begin(),
                                     std::prev(srcBody->end()));
  }

  for (int i = input.size() - 1; i >= 0; --i) {
    // If this has to become the outermost loop after permutation, insert it
    // into the parent block of the original root.
    if (permMap[i] == 0) {
      auto *parentBlock = input[0]->getBlock();
      parentBlock->getOperations().splice(Block::iterator(input[0]),
                                          input[i]->getBlock()->getOperations(),
                                          Block::iterator(input[i]));
      continue;
    }

    // If the parent in the permuted order is the same as in the original,
    // nothing to do.
    unsigned parentPosInInput = invPermMap[permMap[i] - 1].second;
    if (i > 0 && static_cast<unsigned>(i - 1) == parentPosInInput)
      continue;

    // Move input[i] to its surrounding loop in the transformed nest.
    auto *destBody = ((AffineForOp)input[parentPosInInput]).getBody();
    destBody->getOperations().splice(destBody->begin(),
                                     input[i]->getBlock()->getOperations(),
                                     Block::iterator(input[i]));
  }

  return invPermMap[0].second;
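  // Example (hypothetical): for input = [%i, %j, %k] and permMap = {1, 2, 0},
  // loop %k moves to depth 0, %i to depth 1, and %j to depth 2, so the
  // permuted nest is %k -> %i -> %j; invPermMap sorts to {0,2},{1,0},{2,1}
  // and the function returns 2, the index of the new root in the original
  // input.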
  if (loops.size() < 2)
    return forOp;

  // Gather dependence components for dependences between all ops in the loop
  // nest rooted at 'loops[0]', at loop depths in range [1, maxLoopDepth].
  unsigned maxLoopDepth = loops.size();
  std::vector<SmallVector<DependenceComponent, 2>> depCompsVec;
  getDependenceComponents(loops[0], maxLoopDepth, &depCompsVec);

  // Mark loops as either parallel or sequential.
  for (auto &depComps : depCompsVec) {
    assert(depComps.size() >= maxLoopDepth);
    for (unsigned j = 0; j < maxLoopDepth; ++j) {
      DependenceComponent &depComp = depComps[j];
      assert(depComp.lb.has_value() && depComp.ub.has_value());
      if (*depComp.lb != 0 || *depComp.ub != 0)
        isParallelLoop[j] = false;
    }
  }

  // Compute the permutation: parallel loops first, sequential loops inside.
  unsigned nextSequentialLoop = numParallelLoops;
  unsigned nextParallelLoop = 0;
  for (unsigned i = 0; i < maxLoopDepth; ++i) {
    if (isParallelLoop[i])
      loopPermMap[i] = nextParallelLoop++;
    else
      loopPermMap[i] = nextSequentialLoop++;
  }

  unsigned loopNestRootIndex = permuteLoops(loops, loopPermMap);
  return loops[loopNestRootIndex];
    int64_t offset = 0) {
  auto bounds = llvm::to_vector<4>(map->getResults());

  operands->insert(operands->begin() + map->getNumDims(), iv);
  auto originalStep = forOp.getStepAsInt();
  auto scaledStep = originalStep * factor;
  forOp.setStep(scaledStep);

  // Lower-bound map creation.
  auto lbMap = forOp.getLowerBoundMap();
  SmallVector<Value, 4> lbOperands(forOp.getLowerBoundOperands());
  augmentMapAndBounds(b, forOp.getInductionVar(), &lbMap, &lbOperands);

  // Upper-bound map creation.
  auto ubMap = forOp.getUpperBoundMap();
  SmallVector<Value, 4> ubOperands(forOp.getUpperBoundOperands());
  augmentMapAndBounds(b, forOp.getInductionVar(), &ubMap, &ubOperands,
                      /*offset=*/scaledStep);

  auto iv = forOp.getInductionVar();
  SmallVector<AffineForOp, 8> innerLoops;
  for (auto t : targets) {
    // Insert the new forOp before the terminator of 't'.
    auto b = OpBuilder::atBlockTerminator(t.getBody());
    auto newForOp = AffineForOp::create(b, t.getLoc(), lbOperands, lbMap,
                                        ubOperands, ubMap, originalStep);
    auto begin = t.getBody()->begin();
    // Skip the terminator and 'newForOp', which is just before the
    // terminator.
    auto nOps = t.getBody()->getOperations().size() - 2;
    newForOp.getBody()->getOperations().splice(
        newForOp.getBody()->getOperations().begin(),
        t.getBody()->getOperations(), begin, std::next(begin, nOps));
    replaceAllUsesInRegionWith(iv, newForOp.getInductionVar(),
                               newForOp.getRegion());
    innerLoops.push_back(newForOp);
  }
template <typename SizeType>
static AffineForOp stripmineSink(AffineForOp forOp, SizeType factor,
                                 AffineForOp target) {
  auto res = stripmineSink(forOp, factor, ArrayRef<AffineForOp>(target));
  assert(res.size() == 1 && "Expected 1 inner forOp");
  return res[0];
}

  for (auto it : llvm::zip(forOps, sizes)) {
    auto step = stripmineSink(std::get<0>(it), std::get<1>(it), currentTargets);
    res.push_back(step);
    currentTargets = step;
  }

    AffineForOp target) {
  res.push_back(llvm::getSingleElement(loops));
  if (loops.size() < 2)
    return success();

  AffineForOp innermost = loops.back();
  AffineForOp outermost = loops.front();

  for (AffineForOp loop : loops) {
    // We only work on normalized loops.
    if (loop.getStepAsInt() != 1 || !loop.hasConstantLowerBound() ||
        loop.getConstantLowerBound() != 0)
      return failure();
  }

  // 1. Store the upper bound of the outermost loop in a variable.
  Value prev;
  if (!llvm::hasSingleElement(origUbMap.getResults()))
    prev = AffineMinOp::create(builder, loc, origUbMap, ubOperands);
  else
    prev = AffineApplyOp::create(builder, loc, origUbMap, ubOperands);
  upperBoundSymbols.push_back(prev);

  // 2. Emit code computing the upper bound of the coalesced loop as the
  // product of the upper bounds of all the loops.
  for (AffineForOp loop : loops.drop_front()) {
    ub = loop.getUpperBound();
    Value upperBound;
    // If the upper bound map has more than one result, take their minimum.
    if (!llvm::hasSingleElement(origUbMap.getResults()))
      upperBound = AffineMinOp::create(builder, loc, origUbMap, ubOperands);
    else
      upperBound = AffineApplyOp::create(builder, loc, origUbMap, ubOperands);
    upperBoundSymbols.push_back(upperBound);

    SmallVector<Value, 4> operands;
    operands.push_back(prev);
    operands.push_back(upperBound);
    // Maintain the running product of the upper bounds.
    prev = AffineApplyOp::create(
        builder, loc,
        AffineMap::get(/*dimCount=*/1, /*symbolCount=*/1,
                       builder.getAffineDimExpr(0) *
                           builder.getAffineSymbolExpr(0)),
        operands);
  }
  // Set the upper bound of the coalesced loop.
  outermost.setUpperBound(prev, newUbMap);

  // 3. Remap the induction variables of the original loops from the coalesced
  // IV using floordiv/mod by the running products of the inner upper bounds.
  Value previous = outermost.getInductionVar();
  for (unsigned idx = loops.size(); idx > 0; --idx) {
    if (idx != loops.size()) {
      SmallVector<Value, 4> operands;
      operands.push_back(previous);
      operands.push_back(upperBoundSymbols[idx]);
      previous = AffineApplyOp::create(
          builder, loc,
          AffineMap::get(/*dimCount=*/1, /*symbolCount=*/1,
                         builder.getAffineDimExpr(0).floorDiv(
                             builder.getAffineSymbolExpr(0))),
          operands);
    }
    // Value of the induction variable of this original loop after coalescing.
    Value inductionVariable;
    if (idx == 1) {
      inductionVariable = previous;
    } else {
      SmallVector<Value, 4> applyOperands;
      applyOperands.push_back(previous);
      applyOperands.push_back(upperBoundSymbols[idx - 1]);
      inductionVariable = AffineApplyOp::create(
          builder, loc,
          AffineMap::get(/*dimCount=*/1, /*symbolCount=*/1,
                         builder.getAffineDimExpr(0) %
                             builder.getAffineSymbolExpr(0)),
          applyOperands);
    }
    replaceAllUsesInRegionWith(loops[idx - 1].getInductionVar(),
                               inductionVariable, loops.back().getRegion());
  }

  // 4. Move the operations from the innermost loop to just above the
  // second-outermost loop, then erase the extra terminator and the
  // second-outermost loop.
  AffineForOp secondOutermostLoop = loops[1];
  innermost.getBody()->back().erase();
  outermost.getBody()->getOperations().splice(
      Block::iterator(secondOutermostLoop.getOperation()),
      innermost.getBody()->getOperations());
  secondOutermostLoop.erase();
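  // Coalescing sketch (hypothetical IR): a normalized 2-d nest
  //   affine.for %i = 0 to %N { affine.for %j = 0 to %M { ... } }
  // becomes a single loop over [0, %N * %M) whose IV %iv recovers the
  // original induction variables as %i = %iv floordiv %M and
  // %j = %iv mod %M through the affine.apply ops built above.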
  assert(processorId.size() == numProcessors.size());
  if (processorId.empty())
    return;

  OpBuilder b(forOp);
  Location loc(forOp.getLoc());
  AffineExpr lhs, rhs;
  bindSymbols(forOp.getContext(), lhs, rhs);
  auto mulMap = AffineMap::get(0, 2, lhs * rhs);
  auto addMap = AffineMap::get(0, 2, lhs + rhs);

  Value linearIndex = processorId.front();
  for (unsigned i = 1, e = processorId.size(); i < e; ++i) {
    auto mulApplyOp = AffineApplyOp::create(
        b, loc, mulMap, ValueRange{linearIndex, numProcessors[i]});
    linearIndex = AffineApplyOp::create(
        b, loc, addMap, ValueRange{mulApplyOp, processorId[i]});
  }

  auto mulApplyOp = AffineApplyOp::create(
      b, loc, mulMap, ValueRange{linearIndex, forOp.getStep()});
  Value lb = AffineApplyOp::create(
      b, loc, addMap, ValueRange{mulApplyOp, forOp.getLowerBound()});
  forOp.setLowerBound(lb);

  Value step = forOp.getStep();
  for (auto numProcs : numProcessors)
    step = AffineApplyOp::create(b, loc, mulMap, ValueRange{numProcs, step});
  forOp.setStep(step);
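  // Mapping sketch: with a linearized processor id p out of P total
  // processors, the loop is rewritten as
  //   for %i = lb + p * step to ub step (step * P)
  // which distributes iterations cyclically across the processor grid.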
    Block **copyPlacementBlock,
    Block::iterator *copyInPlacementStart,
    Block::iterator *copyOutPlacementStart) {
  const auto *cst = region.getConstraints();
  SmallVector<Value, 4> symbols;
  cst->getValues(cst->getNumDimVars(), cst->getNumDimAndSymbolVars(), &symbols);

  SmallVector<Operation *, 4> enclosingAffineOps;
  getEnclosingAffineOps(*block.begin(), &enclosingAffineOps);
  // Walk up the loop parents till we find an IV on which this region is
  // symbolic/variant.
  auto it = enclosingAffineOps.rbegin();
  AffineForOp lastInvariantFor;
  for (auto e = enclosingAffineOps.rend(); it != e; ++it) {
    Operation *enclosingOp = *it;
    // We can't hoist past the definition of the memref being copied.
    Value memref = region.memref;
    if (!memref.getParentRegion()->isAncestor(enclosingOp->getParentRegion())) {
      LDBG() << "memref definition will end up not dominating hoist location";
      break;
    }

    auto affineFor = dyn_cast<AffineForOp>(enclosingOp);
    if (!affineFor)
      break;
    if (llvm::is_contained(symbols, affineFor.getInductionVar()))
      break;
    lastInvariantFor = affineFor;
  }

  if (it != enclosingAffineOps.rbegin()) {
    *copyInPlacementStart = Block::iterator(lastInvariantFor);
    *copyOutPlacementStart = std::next(*copyInPlacementStart);
    *copyPlacementBlock = lastInvariantFor->getBlock();
  } else {
    *copyInPlacementStart = begin;
    *copyOutPlacementStart = end;
    *copyPlacementBlock = &block;
  }
  if (bufferShape.size() <= 1)
    return;

  int64_t numEltPerStride = 1;
  int64_t stride = 1;
  for (int d = bufferShape.size() - 1; d >= 1; d--) {
    int64_t dimSize = cast<MemRefType>(region.memref.getType()).getDimSize(d);
    stride *= dimSize;
    numEltPerStride *= bufferShape[d];
    // A stride is needed only if the region's extent along this dimension is
    // shorter than the memref's *and* the extent along the next major
    // dimension is greater than one.
    if (bufferShape[d] < dimSize && bufferShape[d - 1] > 1) {
      strideInfos->push_back({stride, numEltPerStride});
    }
  }
  assert(llvm::all_of(lbMaps, [&](AffineMap lbMap) {
    return lbMap.getNumInputs() == lbOperands.size();
  }));
  assert(llvm::all_of(ubMaps, [&](AffineMap ubMap) {
    return ubMap.getNumInputs() == ubOperands.size();
  }));

  unsigned rank = cast<MemRefType>(memref.getType()).getRank();
  // A copy nest can't be generated for 0-ranked memrefs.
  assert(rank != 0 && "non-zero rank memref expected");
  assert(lbMaps.size() == rank && "wrong number of lb maps");
  assert(ubMaps.size() == rank && "wrong number of ub maps");

  AffineForOp copyNestRoot;

  for (unsigned d = 0; d < rank; ++d) {
    auto forOp = createCanonicalizedAffineForOp(b, loc, lbOperands, lbMaps[d],
                                                ubOperands, ubMaps[d]);
    if (d == 0)
      copyNestRoot = forOp;

    auto fastBufOffsetMap =
        AffineMap::get(lbOperands.size(), 0, fastBufOffsets[d]);
    auto offset = AffineApplyOp::create(b, loc, fastBufOffsetMap, lbOperands);

    fastBufMapOperands.push_back(offset);
    fastBufMapOperands.push_back(forOp.getInductionVar());
    mayBeDeadApplys.push_back(offset);

    // Subscript for the slow memref being copied.
    memIndices.push_back(forOp.getInductionVar());
  }

  // Drop any dead affine.applys.
  for (auto applyOp : mayBeDeadApplys)
    if (applyOp.use_empty())
      applyOp.erase();

  if (!isCopyOut) {
    // Copy in.
    auto load = AffineLoadOp::create(b, loc, memref, memIndices);
    AffineStoreOp::create(b, loc, load, fastMemRef, fastBufMap,
                          fastBufMapOperands);
    return copyNestRoot;
  }

  // Copy out.
  auto load =
      AffineLoadOp::create(b, loc, fastMemRef, fastBufMap, fastBufMapOperands);
  AffineStoreOp::create(b, loc, load, memref, memIndices);
  return copyNestRoot;
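// Shape of the generated nest (sketch, rank-2 copy-in):
//   affine.for %k0 = lb0 to ub0 {
//     affine.for %k1 = lb1 to ub1 {
//       %v = affine.load %memref[%k0, %k1]
//       affine.store %v, %fastMemRef[%k0 - offset0, %k1 - offset1]
//     }
//   }
// The copy-out direction swaps the load and store operands, as above.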
  auto f = begin->getParentOfType<FunctionOpInterface>();
  OpBuilder topBuilder(f.getFunctionBody());

  // The last operation in the copy range (exclusive end).
  Operation *lastCopyOp = end->getPrevNode();

  bool isCopyOutAtEndOfBlock = (end == copyOutPlacementStart);

  // Copies for read regions are inserted at 'begin'.
  OpBuilder prologue(copyPlacementBlock, copyInPlacementStart);
  // Copies for write regions are inserted at 'end'.
  OpBuilder epilogue(copyPlacementBlock, copyOutPlacementStart);

  auto loc = region.loc;
  auto memref = region.memref;
  auto memRefType = cast<MemRefType>(memref.getType());

  if (!memRefType.getLayout().isIdentity()) {
    LDBG() << "Non-identity layout map not yet supported";
    return failure();
  }

  // Indices to use for copying.
  unsigned rank = memRefType.getRank();
  if (rank == 0) {
    LDBG() << "Non-zero ranked memrefs supported";
    return failure();
  }

  // Compute the extents of the buffer.
  std::optional<int64_t> numElements =
      region.getConstantBoundingSizeAndShape(&fastBufferShape, &lbs);
  if (!numElements) {
    LDBG() << "Non-constant region size not supported";
    return failure();
  }

  LDBG() << "Max lower bound for memref region start not supported";

  if (*numElements == 0) {
    LDBG() << "Nothing to copy";
    return success();
  }
  for (unsigned i = 0; i < rank; ++i) {
    region.getLowerAndUpperBound(i, lbMaps[i], ubMaps[i]);
    if (lbMaps[i].getNumResults() == 0 || ubMaps[i].getNumResults() == 0) {
      LDBG() << "Missing lower or upper bound for region along dimension: "
             << i;
      return failure();
    }
  }

  // Index start offsets for the fast buffer, relative to the original.
  SmallVector<AffineExpr, 4> fastBufOffsets;
  fastBufOffsets.reserve(rank);
  for (unsigned d = 0; d < rank; d++) {
    assert(lbs[d].getNumSymbols() == cst->getNumCols() - rank - 1 &&
           "incorrect bound size");

    // Set the copy start location for this dimension in the lower memory
    // space memref.
    if (lbs[d].isSingleConstant()) {
      auto indexVal = lbs[d].getSingleConstantResult();
      if (indexVal == 0) {
        memIndices.push_back(zeroIndex);
      } else {
        memIndices.push_back(
            arith::ConstantIndexOp::create(top, loc, indexVal).getResult());
      }
    } else {
      // The coordinate for the start location is just the lower bound along
      // the corresponding dimension of the memory region.
      SmallVector<AffineExpr, 4> symReplacements(lbs[d].getNumSymbols());
      for (unsigned i = 0, e = lbs[d].getNumSymbols(); i < e; ++i)
        symReplacements[i] = top.getAffineDimExpr(i);
      lbs[d] = lbs[d].replaceDimsAndSymbols(
          /*dimReplacements=*/{}, symReplacements, lbs[d].getNumSymbols(),
          /*numResultSyms=*/0);
      memIndices.push_back(
          AffineApplyOp::create(b, loc, lbs[d], regionSymbols));
    }
    // The fast buffer is copied into at location zero; addressing is
    // relative.
    bufIndices.push_back(zeroIndex);

    // Record the offsets since they are needed to remap the memory accesses
    // of the original memref further below.
    fastBufOffsets.push_back(lbs[d].getResult(0));
  }
  bool existingBuf = fastBufferMap.count(memref) > 0;
  if (!existingBuf) {
    auto fastMemRefType =
        MemRefType::get(fastBufferShape, memRefType.getElementType(),
                        fastBufferLayout, copyOptions.fastMemorySpace);

    // Create the fast memory space buffer just before the 'affine.for'.
    fastMemRef =
        memref::AllocOp::create(prologue, loc, fastMemRefType).getResult();
    // Record it.
    fastBufferMap[memref] = fastMemRef;
    // fastMemRefType is a constant shaped memref.
    auto maySizeInBytes = getIntOrFloatMemRefSizeInBytes(fastMemRefType);
    if (sizeInBytes)
      *sizeInBytes = maySizeInBytes.value_or(0);

    LLVM_DEBUG(emitRemarkForBlock(*block)
               << "Creating fast buffer of type " << fastMemRefType);
  } else {
    // Reuse the one already created.
    fastMemRef = fastBufferMap[memref];
  }

  Value dmaStride;
  Value numEltPerDmaStride;
  if (copyOptions.generateDma) {
    SmallVector<StrideInfo, 4> dmaStrideInfos;
    getMultiLevelStrides(region, fastBufferShape, &dmaStrideInfos);

    // TODO: use all stride levels once DmaStartOp is extended for multi-level
    // strides.
    if (dmaStrideInfos.size() > 1) {
      LDBG() << "Only up to one level of stride supported";
      return failure();
    }

    if (!dmaStrideInfos.empty()) {
      dmaStride =
          arith::ConstantIndexOp::create(top, loc, dmaStrideInfos[0].stride);
      numEltPerDmaStride = arith::ConstantIndexOp::create(
          top, loc, dmaStrideInfos[0].numEltPerStride);
    }
  }

    auto copyNest =
        generatePointWiseCopy(loc, memref, fastMemRef, lbMaps,
                              /*lbOperands=*/regionSymbols, ubMaps,
                              /*ubOperands=*/regionSymbols, fastBufOffsets,
                              region.isWrite(), b);

    // Record this so that we can skip it from yet another copy.
    copyNests.insert(copyNest);

    // Since new ops are being appended (for copy-outs), adjust the end to
    // mark the end of the block range being processed if necessary.
    if (region.isWrite() && isCopyOutAtEndOfBlock)
      *nEnd = Block::iterator(copyNest.getOperation());

    auto tagMemRef = memref::AllocOp::create(prologue, loc, tagMemRefType);

    if (!region.isWrite()) {
      // DMA non-blocking read from the original buffer to the fast buffer.
      AffineDmaStartOp::create(b, loc, memref, memAffineMap, memIndices,
                               fastMemRef, bufAffineMap, bufIndices, tagMemRef,
                               tagAffineMap, tagIndices, numElementsSSA,
                               dmaStride, numEltPerDmaStride);
    } else {
      // DMA non-blocking write from the fast buffer to the original memref.
      auto op = AffineDmaStartOp::create(
          b, loc, fastMemRef, bufAffineMap, bufIndices, memref, memAffineMap,
          memIndices, tagMemRef, tagAffineMap, tagIndices, numElementsSSA,
          dmaStride, numEltPerDmaStride);
      if (isCopyOutAtEndOfBlock)
        *nEnd = Block::iterator(op.getOperation());
    }

    // Generate dealloc for the tag.
    auto tagDeallocOp = memref::DeallocOp::create(epilogue, loc, tagMemRef);
    if (*nEnd == end && isCopyOutAtEndOfBlock)
      *nEnd = Block::iterator(tagDeallocOp.getOperation());

  // Generate dealloc for the buffer.
  if (!existingBuf) {
    auto bufDeallocOp = memref::DeallocOp::create(epilogue, loc, fastMemRef);
    if (!copyOptions.generateDma && *nEnd == end && isCopyOutAtEndOfBlock)
      *nEnd = Block::iterator(bufDeallocOp.getOperation());
  }
  // Replace all uses of the old memref with the faster one while remapping
  // access indices (subtracting out the lower bound offsets for each
  // dimension).
  SmallVector<AffineExpr, 4> remapExprs;
  remapExprs.reserve(rank);
  for (unsigned i = 0; i < rank; i++) {
    // The starting operands of indexRemap will be regionSymbols (the symbols
    // on which the memref region is parametric); those corresponding to the
    // memref's original indices follow.
    auto dimExpr = b.getAffineDimExpr(regionSymbols.size() + i);
    remapExprs.push_back(dimExpr - fastBufOffsets[i]);
  }
  auto indexRemap = AffineMap::get(regionSymbols.size() + rank, 0, remapExprs,
                                   b.getContext());

  // Record the begin since it may be invalidated by memref replacement.
  Block::iterator prevOfBegin;
  bool isBeginAtStartOfBlock = (begin == block->begin());
  if (!isBeginAtStartOfBlock)
    prevOfBegin = std::prev(begin);

  auto userFilterFn = [&](Operation *user) {
    auto *ancestorOp = block->findAncestorOpInBlock(*user);
    return ancestorOp && !ancestorOp->isBeforeInBlock(&*begin) &&
           !lastCopyOp->isBeforeInBlock(ancestorOp);
  };

  *nBegin = isBeginAtStartOfBlock ? block->begin() : std::next(prevOfBegin);
  unsigned rank;
  if (auto loadOp = dyn_cast<AffineLoadOp>(op)) {
    rank = loadOp.getMemRefType().getRank();
    region->memref = loadOp.getMemRef();
  } else if (auto storeOp = dyn_cast<AffineStoreOp>(op)) {
    rank = storeOp.getMemRefType().getRank();
    region->memref = storeOp.getMemRef();
  } else {
    assert(false && "expected load or store op");
    return false;
  }
  auto memRefType = cast<MemRefType>(region->memref.getType());
  if (!memRefType.hasStaticShape())
    return false;

  // Just get the first numParamLoopIVs IVs, on which the memref region is
  // parametric.
  SmallVector<AffineForOp, 4> ivs;
  getAffineForIVs(*op, &ivs);
  ivs.resize(numParamLoopIVs);
  SmallVector<Value, 4> symbols;
  extractForInductionVars(ivs, &symbols);
  regionCst->setValues(rank, rank + numParamLoopIVs, symbols);

  // Memref dim sizes provide the bounds.
  for (unsigned d = 0; d < rank; d++) {
    auto dimSize = memRefType.getDimSize(d);
    assert(dimSize > 0 && "filtered dynamic shapes above");
    regionCst->addBound(BoundType::LB, d, 0);
    regionCst->addBound(BoundType::UB, d, dimSize - 1);
  }
  return true;
    std::optional<Value> filterMemRef,
    DenseSet<Operation *> &copyNests) {
  if (begin == end)
    return success();

  assert(begin->getBlock() == std::prev(end)->getBlock() &&
         "Inconsistent block begin/end args");
  assert(end != end->getBlock()->end() && "end can't be the block terminator");

  Block *block = begin->getBlock();

  // Copies will be generated for this depth, i.e., symbolic in all loops
  // surrounding this block range.
  unsigned copyDepth = getNestingDepth(&*begin);

  LDBG() << "Generating copies at depth " << copyDepth;
  LDBG() << "from begin: "
         << OpWithFlags(&*begin, OpPrintingFlags().skipRegions());
  LDBG() << "to inclusive end: "
         << OpWithFlags(&*std::prev(end), OpPrintingFlags().skipRegions());

  // List of memory regions to copy for. We use a map vector to have a
  // guaranteed iteration order for writing test cases.
  SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4> readRegions;
  SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4> writeRegions;
    Value memref;
    MemRefType memrefType;

    if (auto loadOp = dyn_cast<AffineLoadOp>(opInst)) {
      memref = loadOp.getMemRef();
      memrefType = loadOp.getMemRefType();
    } else if (auto storeOp = dyn_cast<AffineStoreOp>(opInst)) {
      memref = storeOp.getMemRef();
      memrefType = storeOp.getMemRefType();
    }

    // Skip this memref if it isn't the one being filtered on or if it lives
    // in a memory space we aren't copying from.
    if ((filterMemRef.has_value() && filterMemRef != memref) ||
        (isa_and_nonnull<IntegerAttr>(memrefType.getMemorySpace()) &&
         memrefType.getMemorySpaceAsInt() != copyOptions.slowMemorySpace))
      return;

    LDBG() << "memref definition is inside of the depth at "
           << "which copy-in/copy-out would happen";

    // Compute the MemRefRegion accessed.
    auto region = std::make_unique<MemRefRegion>(opInst->getLoc());
    if (failed(region->compute(opInst, copyDepth, /*sliceState=*/nullptr,
                               /*addMemRefDimBounds=*/false))) {
      LDBG() << "Error obtaining memory region: semi-affine maps?";
      LDBG() << "over-approximating to the entire memref";
      if (!getFullMemRefAsRegion(opInst, copyDepth, region.get())) {
        LDBG() << "non-constant memref sizes not yet supported";
        error = true;
        return;
      }
    }
    auto updateRegion =
        [&](const SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4>
                &targetRegions) {
          const auto *const it = targetRegions.find(region->memref);
          if (it == targetRegions.end())
            return false;

          // Perform a union with the existing region.
          if (failed(it->second->unionBoundingBox(*region))) {
            LDBG() << "Memory region bounding box failed; "
                   << "over-approximating to the entire memref";
            // If the union fails, we over-approximate.
            if (!getFullMemRefAsRegion(opInst, copyDepth, region.get())) {
              LDBG() << "non-constant memref sizes not yet supported";
              error = true;
              return true;
            }
            it->second->getConstraints()->clearAndCopyFrom(
                *region->getConstraints());
          } else {
            // Union was computed and stored in 'it->second': copy to
            // 'region'.
            region->getConstraints()->clearAndCopyFrom(
                *it->second->getConstraints());
          }
          return true;
        };

    bool existsInRead = updateRegion(readRegions);

    bool existsInWrite = updateRegion(writeRegions);

    // Finally, add it to the region list.
    if (region->isWrite() && !existsInWrite) {
      writeRegions[region->memref] = std::move(region);
    } else if (!region->isWrite() && !existsInRead) {
      readRegions[region->memref] = std::move(region);
    }

  LDBG() << "copy generation failed for one or more memref's in this block";
  uint64_t totalCopyBuffersSizeInBytes = 0;
  bool ret = true;
  auto processRegions =
      [&](const SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4>
              &regions) {
        for (const auto &regionEntry : regions) {
          // For each region, hoist copy in/out past all hoistable
          // 'affine.for's.
          Block::iterator copyInPlacementStart, copyOutPlacementStart;
          Block *copyPlacementBlock;
          findHighestBlockForPlacement(
              *regionEntry.second, *block, begin, end, &copyPlacementBlock,
              &copyInPlacementStart, &copyOutPlacementStart);

          uint64_t sizeInBytes;
          Block::iterator nBegin, nEnd;
          LogicalResult iRet = generateCopy(
              *regionEntry.second, block, begin, end, copyPlacementBlock,
              copyInPlacementStart, copyOutPlacementStart, copyOptions,
              fastBufferMap, copyNests, &sizeInBytes, &nBegin, &nEnd);
          if (succeeded(iRet)) {
            // 'begin' and 'end' could have been invalidated and need to be
            // updated.
            begin = nBegin;
            end = nEnd;
            totalCopyBuffersSizeInBytes += sizeInBytes;
          }
          ret = ret & succeeded(iRet);
        }
      };
  processRegions(readRegions);
  processRegions(writeRegions);

  if (!ret) {
    LDBG() << "copy generation failed for one or more memref's in this block";
    return failure();
  }

  // For a debug build, print the total size of the fast buffers.
  AffineForOp forOp;
  if (llvm::DebugFlag && (forOp = dyn_cast<AffineForOp>(&*begin))) {
    LLVM_DEBUG(forOp.emitRemark()
               << llvm::divideCeil(totalCopyBuffersSizeInBytes, 1024)
               << " KiB of copy buffers in fast memory space for this block");
  }

  if (totalCopyBuffersSizeInBytes > copyOptions.fastMemCapacityBytes) {
    block->getParentOp()->emitWarning(
        "total size of all copy buffers' for this block exceeds fast memory "
        "capacity");
  }

  return affineDataCopyGenerate(forOp.getBody()->begin(),
                                std::prev(forOp.getBody()->end()), copyOptions,
                                filterMemRef, copyNests);
  Block *block = analyzedOp->getBlock();
  auto begin = analyzedOp->getIterator();
  auto end = std::next(begin);
  DenseMap<Value, Value> fastBufferMap;
  DenseSet<Operation *> copyNests;

  auto err = generateCopy(memrefRegion, block, begin, end, block, begin, end,
                          copyOptions, fastBufferMap, copyNests,
                          &result.sizeInBytes, &begin, &end);
  if (failed(err))
    return err;

  const auto &en = fastBufferMap.find(memrefRegion.memref);
  // In some cases (e.g. empty loops), no copy generation would have happened.
  if (en == fastBufferMap.end())
    return failure();
  result.alloc = en->second.getDefiningOp();
  assert(result.alloc && "fast buffer expected to be locally allocated");
  assert(copyNests.size() <= 1 && "At most one copy nest is expected.");
  result.copyNest = copyNests.empty() ? nullptr : *copyNests.begin();
  assert(currLoopDepth <= depthToLoops.size() && "Unexpected currLoopDepth");
  if (currLoopDepth == depthToLoops.size())
    depthToLoops.emplace_back();

  for (auto &op : *block) {
    if (auto forOp = dyn_cast<AffineForOp>(op)) {
      depthToLoops[currLoopDepth].push_back(forOp);
      gatherLoopsInBlock(forOp.getBody(), currLoopDepth + 1, depthToLoops);
    }
  }

  for (auto &block : func)
    gatherLoopsInBlock(&block, /*currLoopDepth=*/0, depthToLoops);

  // Remove the last loop level from the output since it's empty.
  if (!depthToLoops.empty()) {
    assert(depthToLoops.back().empty() && "Last loop level is not empty?");
    depthToLoops.pop_back();
  }
  return AffineForOp::create(b, loc, lowerOperands, lbMap, upperOperands,
                             ubMap, step);
  auto *context = loops[0].getContext();

  FlatAffineValueConstraints cst;
  SmallVector<Operation *, 8> ops;
  llvm::append_range(ops, loops);
  (void)getIndexSet(ops, &cst);

  // Construct the constraint set representing the guard for full tiles: the
  // lower bound (and upper bound) corresponding to the full tile should be
  // larger (resp. smaller) than any other lower (resp. upper) bound.
  SmallVector<DynamicAPInt, 8> fullTileLb, fullTileUb;
  for (auto loop : loops) {
    // TODO: Non-unit stride is not an issue to generalize to.
    assert(loop.getStepAsInt() == 1 && "point loop step expected to be one");

    unsigned fullTileLbPos, fullTileUbPos;
    if (!((IntegerRelation)cst)
             .getConstantBoundOnDimSize(0, /*lb=*/nullptr,
                                        /*boundFloorDivisor=*/nullptr,
                                        /*ub=*/nullptr, &fullTileLbPos,
                                        &fullTileUbPos)) {
      LDBG() << "Can't get constant diff pair for a loop";
      return nullptr;
    }

    auto fLb = cst.getInequality(fullTileLbPos);
    auto fUb = cst.getInequality(fullTileUbPos);
    fullTileLb.assign(fLb.begin(), fLb.end());
    fullTileUb.assign(fUb.begin(), fUb.end());

    // The full tile lower bound should be >= any other lower bound.
    for (auto lbIndex : lbIndices)
      for (unsigned i = 0, e = cst.getNumCols(); i < e; ++i)
        cst.atIneq(lbIndex, i) = fullTileLb[i] - cst.atIneq(lbIndex, i);

    // The full tile upper bound should be <= any other upper bound.
    for (auto ubIndex : ubIndices)
      for (unsigned i = 0, e = cst.getNumCols(); i < e; ++i)
        cst.atIneq(ubIndex, i) -= fullTileUb[i];
  }

  return AffineIfOp::create(b, loops[0].getLoc(), ifCondSet, setOperands,
                            /*withElseRegion=*/true);
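// Semantics sketch: after the rewrites above, each retained inequality
// asserts that the full tile's bound is at least as tight as every other
// bound, so the constructed integer set is satisfied exactly when the tile is
// complete; the affine.if built from it selects the constant-trip-count (full
// tile) version in its then-region and the original (partial tile) version in
// its else-region.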
static LogicalResult
createFullTiles(MutableArrayRef<AffineForOp> inputNest,
                SmallVectorImpl<AffineForOp> &fullTileLoops, OpBuilder b) {
  fullTileLoops.reserve(inputNest.size());

  // For each loop in the original nest, identify a lower/upper bound pair
  // such that their difference is a constant.
  FlatAffineValueConstraints cst;
  for (auto loop : inputNest) {
    // TODO: straightforward to generalize to a non-unit stride.
    if (loop.getStepAsInt() != 1) {
      LDBG() << "[tile separation] non-unit stride not implemented";
      return failure();
    }

    unsigned lbPos, ubPos;
    if (!((IntegerRelation)cst)
             .getConstantBoundOnDimSize(/*pos=*/0, /*lb=*/nullptr,
                                        /*boundFloorDivisor=*/nullptr,
                                        /*ub=*/nullptr, &lbPos, &ubPos) ||
        lbPos == ubPos) {
      LDBG() << "[tile separation] Can't get constant diff / "
             << "equalities not yet handled";
      return failure();
    }

    AffineValueMap lbVmap, ubVmap;
    cst.getIneqAsAffineValueMap(/*pos=*/0, lbPos, lbVmap, b.getContext());
    cst.getIneqAsAffineValueMap(/*pos=*/0, ubPos, ubVmap, b.getContext());
    AffineForOp fullTileLoop = createCanonicalizedAffineForOp(
        b, loop.getLoc(), lbVmap.getOperands(), lbVmap.getAffineMap(),
        ubVmap.getOperands(), ubVmap.getAffineMap());
    b = OpBuilder::atBlockTerminator(fullTileLoop.getBody());
    fullTileLoops.push_back(fullTileLoop);
  }

  // Add the body of the full tile loop nest.
  IRMapping operandMap;
  for (const auto &loopEn : llvm::enumerate(inputNest))
    operandMap.map(loopEn.value().getInductionVar(),
                   fullTileLoops[loopEn.index()].getInductionVar());
  b = OpBuilder::atBlockTerminator(fullTileLoops.back().getBody());
  for (auto &op : inputNest.back().getBody()->without_terminator())
    b.clone(op, operandMap);
  return success();
}
  if (inputNest.empty())
    return success();

  // Check that the supplied for ops are all successively nested.
  auto firstLoop = inputNest[0];
  auto prevLoop = firstLoop;
  for (auto loop : inputNest.drop_front(1)) {
    assert(loop->getParentOp() == prevLoop && "input not contiguously nested");
    prevLoop = loop;
  }

  SmallVector<AffineForOp, 4> fullTileLoops;
  OpBuilder b(firstLoop);
  if (failed(createFullTiles(inputNest, fullTileLoops, b))) {
    if (!fullTileLoops.empty())
      fullTileLoops.front().erase();
    return failure();
  }

  // Separate full and partial tiles.
  AffineIfOp ifOp = createSeparationCondition(inputNest, b);
  if (!ifOp) {
    fullTileLoops.front().erase();
    LDBG() << "All tiles are full tiles, or failure creating "
           << "separation condition";
    return failure();
  }

  // Move the full tile into the then block.
  Block *thenBlock = ifOp.getThenBlock();
  AffineForOp outermostFullTileLoop = fullTileLoops[0];
  thenBlock->getOperations().splice(
      std::prev(thenBlock->end()),
      outermostFullTileLoop->getBlock()->getOperations(),
      Block::iterator(outermostFullTileLoop));

  // Move the partial tile into the else block. The partial tile is the same
  // as the original loop nest.
  Block *elseBlock = ifOp.getElseBlock();
  elseBlock->getOperations().splice(std::prev(elseBlock->end()),
                                    firstLoop->getBlock()->getOperations(),
                                    Block::iterator(firstLoop));

  if (fullTileNest)
    *fullTileNest = std::move(fullTileLoops);
  LogicalResult result(failure());

  if (loops.size() <= 1)
    return success();

  // Look for a band of loops that can be coalesced, i.e. perfectly nested
  // loops with bounds defined above some loop.
  // 1. For each loop, find above which parent loop its operands are defined.
  SmallVector<unsigned> operandsDefinedAbove(loops.size());
  for (unsigned i = 0, e = loops.size(); i < e; ++i) {
    operandsDefinedAbove[i] = i;
    for (unsigned j = 0; j < i; ++j) {
      if (areValuesDefinedAbove(loops[i].getOperands(), loops[j].getRegion())) {
        operandsDefinedAbove[i] = j;
        break;
      }
    }
  }

  // 2. Identify bands of loops such that the operands of all of them are
  // defined above the first loop in the band. Traverse the nest bottom-up so
  // that modifications don't invalidate the inner loops.
  for (unsigned end = loops.size(); end > 0; --end) {
    unsigned start = 0;
    for (; start < end - 1; ++start) {
      auto maxPos =
          *std::max_element(std::next(operandsDefinedAbove.begin(), start),
                            std::next(operandsDefinedAbove.begin(), end));
      if (maxPos > start)
        continue;
      assert(maxPos == start &&
             "expected loop bounds to be known at the start of the band");
      break;
    }
    // If a band was found and transformed, keep looking at the loops above
    // the last transformed one.
    if (start != end - 1)
      break;
  }
  while (auto loopOp = currentOp->getParentOfType<LoopLikeOpInterface>()) {
    if (!loopOp.isDefinedOutsideOfLoop(operand.get()))
      break;
static LogicalResult performPreTilingChecks(MutableArrayRef< AffineForOp > input, ArrayRef< t > tileSizes)
Check if the input nest is supported for tiling and whether tiling would be legal or not.
static void constructParametricallyTiledIndexSetHyperRect(MutableArrayRef< AffineForOp > origLoops, MutableArrayRef< AffineForOp > newLoops, ArrayRef< Value > tileSizes)
Constructs and sets new loop bounds after tiling for the case of hyper-rectangular index sets,...
static InFlightDiagnostic emitRemarkForBlock(Block &block)
static void constructTiledLoopNest(MutableArrayRef< AffineForOp > origLoops, AffineForOp rootAffineForOp, unsigned width, MutableArrayRef< AffineForOp > tiledLoops)
Constructs tiled loop nest, without setting the loop bounds and move the body of the original loop ne...
static bool getFullMemRefAsRegion(Operation *op, unsigned numParamLoopIVs, MemRefRegion *region)
Construct the memref region to just include the entire memref.
static SmallVector< AffineForOp, 8 > stripmineSink(AffineForOp forOp, uint64_t factor, ArrayRef< AffineForOp > targets)
static LogicalResult checkIfHyperRectangular(MutableArrayRef< AffineForOp > input)
Checks whether a loop nest is hyper-rectangular or not.
static void findHighestBlockForPlacement(const MemRefRegion ®ion, Block &block, Block::iterator &begin, Block::iterator &end, Block **copyPlacementBlock, Block::iterator *copyInPlacementStart, Block::iterator *copyOutPlacementStart)
Given a memref region, determine the lowest depth at which transfers can be placed for it,...
static AffineForOp generateShiftedLoop(AffineMap lbMap, AffineMap ubMap, const std::vector< std::pair< uint64_t, ArrayRef< Operation * >>> &opGroupQueue, unsigned offset, AffineForOp srcForOp, OpBuilder b)
Generates an affine.for op with the specified lower and upper bounds while generating the right IV re...
static void moveLoopBodyImpl(AffineForOp src, AffineForOp dest, Block::iterator loc)
Move the loop body of AffineForOp 'src' from 'src' into the specified location in destination's body,...
static void setInterTileBoundsParametric(OpBuilder &b, AffineForOp origLoop, AffineForOp newLoop, Value tileSize)
Set lower and upper bounds of inter-tile loops for parametric tiling.
static void setIntraTileBoundsParametric(OpBuilder &b, AffineForOp origLoop, AffineForOp newInterTileLoop, AffineForOp newIntraTileLoop, Value tileSize)
Set lower and upper bounds of intra-tile loops for parametric tiling.
static void gatherLoopsInBlock(Block *block, unsigned currLoopDepth, std::vector< SmallVector< AffineForOp, 2 >> &depthToLoops)
Gathers all AffineForOps in 'block' at 'currLoopDepth' in 'depthToLoops'.
static LogicalResult generateCopy(const MemRefRegion ®ion, Block *block, Block::iterator begin, Block::iterator end, Block *copyPlacementBlock, Block::iterator copyInPlacementStart, Block::iterator copyOutPlacementStart, const AffineCopyOptions ©Options, DenseMap< Value, Value > &fastBufferMap, DenseSet< Operation * > ©Nests, uint64_t *sizeInBytes, Block::iterator *nBegin, Block::iterator *nEnd)
Creates a buffer in the faster memory space for the specified memref region (memref has to be non-zer...
static LogicalResult createFullTiles(MutableArrayRef< AffineForOp > inputNest, SmallVectorImpl< AffineForOp > &fullTileLoops, OpBuilder b)
Create the full tile loop nest (along with its body).
static void getMultiLevelStrides(const MemRefRegion ®ion, ArrayRef< int64_t > bufferShape, SmallVectorImpl< StrideInfo > *strideInfos)
Returns striding information for a copy/transfer of this region with potentially multiple striding le...
static void constructTiledIndexSetHyperRect(MutableArrayRef< AffineForOp > origLoops, MutableArrayRef< AffineForOp > newLoops, ArrayRef< unsigned > tileSizes)
Constructs and sets new loop bounds after tiling for the case of hyper-rectangular index sets,...
static LogicalResult generateCleanupLoopForUnroll(AffineForOp forOp, uint64_t unrollFactor)
Helper to generate cleanup loop for unroll or unroll-and-jam when the trip count is not a multiple of...
static bool areInnerBoundsInvariant(AffineForOp forOp)
Check if all control operands of all loops are defined outside of forOp and return false if not.
static bool checkLoopInterchangeDependences(const std::vector< SmallVector< DependenceComponent, 2 >> &depCompsVec, ArrayRef< AffineForOp > loops, ArrayRef< unsigned > loopPermMap)
static void moveLoopBody(AffineForOp src, AffineForOp dest)
Move the loop body of AffineForOp 'src' from 'src' to the start of dest body.
static AffineIfOp createSeparationCondition(MutableArrayRef< AffineForOp > loops, OpBuilder b)
Creates an AffineIfOp that encodes the conditional to choose between the constant trip count version ...
static void getCleanupLoopLowerBound(AffineForOp forOp, unsigned unrollFactor, AffineMap &cleanupLbMap, SmallVectorImpl< Value > &cleanupLbOperands)
Computes the cleanup loop lower bound of the loop being unrolled with the specified unroll factor; th...
static void augmentMapAndBounds(OpBuilder &b, Value iv, AffineMap *map, SmallVector< Value, 4 > *operands, int64_t offset=0)
static AffineForOp generatePointWiseCopy(Location loc, Value memref, Value fastMemRef, ArrayRef< AffineMap > lbMaps, ArrayRef< Value > lbOperands, ArrayRef< AffineMap > ubMaps, ArrayRef< Value > ubOperands, ArrayRef< AffineExpr > fastBufOffsets, bool isCopyOut, OpBuilder b)
Generates a point-wise copy from/to a non-zero ranked ‘memref’ to/from ‘fastMemRef’ and returns the o...
static void replaceIterArgsAndYieldResults(AffineForOp forOp)
Helper to replace uses of loop carried values (iter_args) and loop yield values while promoting singl...
static Value max(ImplicitLocOpBuilder &builder, Value value, Value bound)
Base type for affine expression.
AffineExpr floorDiv(uint64_t v) const
A multi-dimensional affine map Affine map's are immutable like Type's, and they are uniqued.
static AffineMap get(MLIRContext *context)
Returns a zero result affine map with no dimensions or symbols: () -> ().
unsigned getNumSymbols() const
unsigned getNumDims() const
ArrayRef< AffineExpr > getResults() const
unsigned getNumResults() const
unsigned getNumInputs() const
AffineExpr getResult(unsigned idx) const
Block represents an ordered list of Operations.
OpListType::iterator iterator
Operation * findAncestorOpInBlock(Operation &op)
Returns 'op' if 'op' lies in this block, or otherwise finds the ancestor operation of 'op' that lies ...
Region * getParent() const
Provide a 'getParent' method for ilist_node_with_parent methods.
RetT walk(FnT &&callback)
Walk all nested operations, blocks (including this block) or regions, depending on the type of callba...
Operation * getTerminator()
Get the terminator operation of this block.
OpListType & getOperations()
Operation * getParentOp()
Returns the closest surrounding operation that contains this block.
AffineMap getSingleDimShiftAffineMap(int64_t shift)
Returns a map that shifts its (single) input dimension by 'shift'.
AffineMap getShiftedAffineMap(AffineMap map, int64_t shift)
Returns an affine map that is a translation (shift) of all result expressions in 'map' by 'shift'.
AffineMap getDimIdentityMap()
AffineMap getMultiDimIdentityMap(unsigned rank)
AffineExpr getAffineSymbolExpr(unsigned position)
AffineExpr getAffineConstantExpr(int64_t constant)
AffineExpr getAffineDimExpr(unsigned position)
MLIRContext * getContext() const
IntegerSet getAsIntegerSet(MLIRContext *context) const
Returns the constraint system as an integer set.
void getValues(unsigned start, unsigned end, SmallVectorImpl< Value > *values) const
Returns the Values associated with variables in range [start, end).
This class allows control over how the GreedyPatternRewriteDriver works.
GreedyRewriteConfig & setStrictness(GreedyRewriteStrictness mode)
This is a utility class for mapping one set of IR entities to another.
auto lookup(T from) const
Lookup a mapped value within the map.
void map(Value from, Value to)
Inserts a new mapping for 'from' to 'to'.
IRValueT get() const
Return the current value being used by this operand.
This class coordinates rewriting a piece of IR outside of a pattern rewrite, providing a way to keep ...
This class represents a diagnostic that is inflight and set to be reported.
An integer set representing a conjunction of one or more affine equalities and inequalities.
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
This class helps build Operations.
Operation * clone(Operation &op, IRMapping &mapper)
Creates a deep copy of the specified operation, remapping any operands that use values outside of the...
void setInsertionPointToStart(Block *block)
Sets the insertion point to the start of the specified block.
static OpBuilder atBlockTerminator(Block *block, Listener *listener=nullptr)
Create a builder and set the insertion point to before the block terminator.
void setInsertionPointAfter(Operation *op)
Sets the insertion point to the node after the specified operation, which will cause subsequent inser...
This class represents an operand of an operation.
Set of flags used to control the behavior of the various IR print methods (e.g.
A wrapper class that allows for printing an operation with a set of flags, useful to act as a "stream...
This class implements the operand iterators for the Operation class.
Operation is the basic unit of execution within MLIR.
bool isBeforeInBlock(Operation *other)
Given an operation 'other' that is within the same parent block, return whether the current operation...
Operation * clone(IRMapping &mapper, CloneOptions options=CloneOptions::all())
Create a deep copy of this operation, remapping any operands that use values outside of the operation...
Location getLoc()
The source location the operation was defined or derived from.
Block * getBlock()
Returns the operation block that contains this operation.
OpTy getParentOfType()
Return the closest surrounding parent operation that is of type 'OpTy'.
void setOperands(ValueRange operands)
Replace the current operands of this operation with the ones provided in 'operands'.
Region * getParentRegion()
Returns the region to which the instruction belongs.
InFlightDiagnostic emitRemark(const Twine &message={})
Emit a remark about this operation, reporting up to any diagnostic handlers that may be listening.
bool isAncestor(Region *other)
Return true if this region is ancestor of the other region.
ParentT getParentOfType()
Find the first parent operation of the given type, or nullptr if there is no ancestor operation.
This class provides an abstraction over the different types of ranges over Values.
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
bool use_empty() const
Returns true if this value has no uses.
Type getType() const
Return the type of this value.
Operation * getDefiningOp() const
If this value is the result of an operation, return the operation that defines it.
Region * getParentRegion()
Return the Region in which this Value is defined.
static WalkResult advance()
AffineBound represents a lower or upper bound in the for operation.
Value getOperand(unsigned idx)
operand_range getOperands()
unsigned getNumOperands()
static AffineDmaStartOp create(OpBuilder &builder, Location location, Value srcMemRef, AffineMap srcMap, ValueRange srcIndices, Value destMemRef, AffineMap dstMap, ValueRange destIndices, Value tagMemRef, AffineMap tagMap, ValueRange tagIndices, Value numElements, Value stride=nullptr, Value elementsPerStride=nullptr)
static AffineDmaWaitOp create(OpBuilder &builder, Location location, Value tagMemRef, AffineMap tagMap, ValueRange tagIndices, Value numElements)
An AffineValueMap is an affine map plus its ML value operands and results for analysis purposes.
ArrayRef< Value > getOperands() const
AffineMap getAffineMap() const
FlatAffineValueConstraints is an extension of FlatLinearValueConstraints with helper functions for Af...
static ConstantIndexOp create(OpBuilder &builder, Location location, int64_t value)
Operation * getOwner() const
Return the owner of this operand.
An IntegerRelation represents the set of points from a PresburgerSpace that satisfy a list of affine ...
void removeIndependentConstraints(unsigned pos, unsigned num)
Removes constraints that are independent of (i.e., do not have a coefficient) variables in the range ...
void removeTrivialRedundancy()
Removes duplicate constraints, trivially true constraints, and constraints that can be detected as re...
ArrayRef< DynamicAPInt > getInequality(unsigned idx) const
DynamicAPInt atIneq(unsigned i, unsigned j) const
Returns the value at the specified inequality row and column.
bool isHyperRectangular(unsigned pos, unsigned num) const
Returns true if the set can be trivially detected as being hyper-rectangular on the specified contigu...
unsigned getNumVars() const
void setDimSymbolSeparation(unsigned newSymbolCount)
Changes the partition between dimensions and symbols.
unsigned getNumDimAndSymbolVars() const
unsigned getNumCols() const
Returns the number of columns in the constraint system.
void getLowerAndUpperBoundIndices(unsigned pos, SmallVectorImpl< unsigned > *lbIndices, SmallVectorImpl< unsigned > *ubIndices, SmallVectorImpl< unsigned > *eqIndices=nullptr, unsigned offset=0, unsigned num=0) const
Gather positions of all lower and upper bounds of the variable at pos, and optionally any equalities ...
void removeVar(VarKind kind, unsigned pos)
Removes variables of the specified kind with the specified pos (or within the specified range) from t...
bool isParallelLoop(Operation &op)
std::optional< uint64_t > getConstantTripCount(AffineForOp forOp)
Returns the trip count of the loop if it's a constant, std::nullopt otherwise.
void getDependenceComponents(AffineForOp forOp, unsigned maxLoopDepth, std::vector< SmallVector< DependenceComponent, 2 >> *depCompsVec)
Returns in 'depCompsVec', dependence components for dependences between all load and store ops in loo...
LogicalResult coalesceLoops(MutableArrayRef< AffineForOp > loops)
Replace a perfect nest of "for" loops with a single linearized loop.
LogicalResult loopUnrollFull(AffineForOp forOp)
Unrolls this for operation completely if the trip count is known to be constant.
LogicalResult promoteIfSingleIteration(AffineForOp forOp)
Promotes the loop body of a AffineForOp to its containing block if the loop was known to have a singl...
LogicalResult affineDataCopyGenerate(Block::iterator begin, Block::iterator end, const AffineCopyOptions ©Options, std::optional< Value > filterMemRef, DenseSet< Operation * > ©Nests)
Performs explicit copying for the contiguous sequence of operations in the block iterator range [‘beg...
LogicalResult loopUnrollJamUpToFactor(AffineForOp forOp, uint64_t unrollJamFactor)
Unrolls and jams this loop by the specified factor or by the trip count (if constant),...
void extractForInductionVars(ArrayRef< AffineForOp > forInsts, SmallVectorImpl< Value > *ivs)
Extracts the induction variables from a list of AffineForOps and places them in the output argument i...
LogicalResult loopUnrollByFactor(AffineForOp forOp, uint64_t unrollFactor, function_ref< void(unsigned, Operation *, OpBuilder)> annotateFn=nullptr, bool cleanUpUnroll=false)
Unrolls this for operation by the specified unroll factor.
void getEnclosingAffineOps(Operation &op, SmallVectorImpl< Operation * > *ops)
Populates 'ops' with affine operations enclosing op ordered from outermost to innermost while stoppin...
void gatherLoops(func::FuncOp func, std::vector< SmallVector< AffineForOp, 2 >> &depthToLoops)
Gathers all AffineForOps in 'func.func' grouped by loop depth.
LogicalResult getIndexSet(MutableArrayRef< Operation * > ops, FlatAffineValueConstraints *domain)
Builds a system of constraints with dimensional variables corresponding to the loop IVs of the forOps...
AffineForOp createCanonicalizedAffineForOp(OpBuilder b, Location loc, ValueRange lbOperands, AffineMap lbMap, ValueRange ubOperands, AffineMap ubMap, int64_t step=1)
Creates an AffineForOp while ensuring that the lower and upper bounds are canonicalized,...
void getPerfectlyNestedLoops(SmallVectorImpl< AffineForOp > &nestedLoops, AffineForOp root)
Get perfectly nested sequence of loops starting at root of loop nest (the first op being another Affi...
LogicalResult affineForOpBodySkew(AffineForOp forOp, ArrayRef< uint64_t > shifts, bool unrollPrologueEpilogue=false)
Skew the operations in an affine.for's body with the specified operation-wise shifts.
void getTripCountMapAndOperands(AffineForOp forOp, AffineMap *map, SmallVectorImpl< Value > *operands)
Returns the trip count of the loop as an affine map with its corresponding operands if the latter is ...
bool isValidLoopInterchangePermutation(ArrayRef< AffineForOp > loops, ArrayRef< unsigned > loopPermMap)
Checks if the loop interchange permutation 'loopPermMap', of the perfectly nested sequence of loops i...
void getSupportedReductions(AffineForOp forOp, SmallVectorImpl< LoopReduction > &supportedReductions)
Populate supportedReductions with descriptors of the supported reductions.
LogicalResult generateCopyForMemRegion(const MemRefRegion &memrefRegion, Operation *analyzedOp, const AffineCopyOptions ©Options, CopyGenerateResult &result)
generateCopyForMemRegion is similar to affineDataCopyGenerate, but works with a single memref region.
void canonicalizeMapAndOperands(AffineMap *map, SmallVectorImpl< Value > *operands)
Modifies both map and operands in-place so as to: (1) drop duplicate operands, (2) drop unused dims and symbols from map, (3) promote valid symbols to symbolic operands where they appeared as dimensional operands, and (4) propagate constant operands and drop them.
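A minimal sketch of the usual call pattern (the helper name and the `b`/`loc` parameters are illustrative, not from this file): canonicalize the pair in place before materializing an affine.apply, mirroring the creation style used in the source above.

static Value makeCanonicalApply(OpBuilder &b, Location loc, AffineMap map,
                                SmallVector<Value> operands) {
  // Drops duplicate/unused operands and folds constants before op creation.
  affine::canonicalizeMapAndOperands(&map, &operands);
  return affine::AffineApplyOp::create(b, loc, map, operands).getResult();
}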
LogicalResult loopUnrollUpToFactor(AffineForOp forOp, uint64_t unrollFactor)
Unrolls this loop by the specified unroll factor or its trip count, whichever is lower.
unsigned permuteLoops(ArrayRef< AffineForOp > inputNest, ArrayRef< unsigned > permMap)
Performs a loop permutation on a perfectly nested loop nest inputNest (where the contained loops appear from outer to inner) as specified by the permutation permMap: loop 'i' in inputNest is mapped to location 'permMap[i]', with positions numbered from outermost (0) to innermost; returns the position in inputNest of the AffineForOp that becomes the new outermost loop.
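Worked sketch of the permMap convention (`root` is an assumed three-deep perfect nest): permMap[i] gives the new position of loop i, so {2, 0, 1} turns (i, j, k) into (j, k, i).

SmallVector<affine::AffineForOp, 4> nest;
affine::getPerfectlyNestedLoops(nest, root);
if (nest.size() == 3 &&
    affine::isValidLoopInterchangePermutation(nest, {2, 0, 1})) {
  // Returns the index in `nest` of the loop that is now outermost (here j).
  unsigned newOutermostIdx = affine::permuteLoops(nest, {2, 0, 1});
  (void)newOutermostIdx;
}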
LogicalResult loopUnrollJamByFactor(AffineForOp forOp, uint64_t unrollJamFactor)
Unrolls and jams this loop by the specified factor.
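Usage sketch (assuming `forOp` as before), matching the LLVM_DEBUG style used elsewhere in this file:

if (failed(affine::loopUnrollJamByFactor(forOp, /*unrollJamFactor=*/2)))
  LLVM_DEBUG(forOp.emitRemark("unroll-jam by 2 failed"));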
LogicalResult tilePerfectlyNestedParametric(MutableArrayRef< AffineForOp > input, ArrayRef< Value > tileSizes, SmallVectorImpl< AffineForOp > *tiledNest=nullptr)
Tiles the specified band of perfectly nested loops, creating tile-space loops and intra-tile loops, using SSA values as tiling parameters.
void fullyComposeAffineMapAndOperands(AffineMap *map, SmallVectorImpl< Value > *operands, bool composeAffineMin=false)
Given an affine map map and its input operands, this method composes into map, maps of AffineApplyOps whose results are the values in operands, iteratively until no more of operands are the result of an AffineApplyOp.
void canonicalizeSetAndOperands(IntegerSet *set, SmallVectorImpl< Value > *operands)
Canonicalizes an integer set the same way canonicalizeMapAndOperands does for affine maps.
bool isPerfectlyNested(ArrayRef< AffineForOp > loops)
Returns true if loops is a perfectly nested loop nest, where loops appear in it from outermost to innermost.
void getAffineForIVs(Operation &op, SmallVectorImpl< AffineForOp > *loops)
Populates 'loops' with IVs of the affine.for ops surrounding 'op', ordered from the outermost 'affine.for' operation to the innermost one.
uint64_t getLargestDivisorOfTripCount(AffineForOp forOp)
Returns the greatest known integral divisor of the trip count.
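One way to exploit this (a sketch; the power-of-two cap of 8 is an arbitrary choice): gcd with the cap yields a factor that provably divides the trip count, so unrolling by it needs no cleanup loop.

#include <numeric> // std::gcd

uint64_t divisor = affine::getLargestDivisorOfTripCount(forOp);
uint64_t factor = std::gcd(divisor, uint64_t(8));
if (factor > 1)
  (void)affine::loopUnrollByFactor(forOp, factor);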
std::optional< uint64_t > getIntOrFloatMemRefSizeInBytes(MemRefType memRefType)
Returns the size of a memref with element type int or float in bytes if it's statically shaped, std::nullopt otherwise.
int64_t numEnclosingInvariantLoops(OpOperand &operand)
Count the number of loops surrounding operand such that operand could be hoisted above.
unsigned getNestingDepth(Operation *op)
Returns the nesting depth of this operation, i.e., the number of loops surrounding this operation.
void mapLoopToProcessorIds(scf::ForOp forOp, ArrayRef< Value > processorId, ArrayRef< Value > numProcessors)
Maps forOp for execution on a parallel grid of virtual processorIds of size given by numProcessors.
bool isOpwiseShiftValid(AffineForOp forOp, ArrayRef< uint64_t > shifts)
Checks whether SSA dominance would be violated if a for op's body operations are shifted by the specified shifts.
SmallVector< SmallVector< AffineForOp, 8 >, 8 > tile(ArrayRef< AffineForOp > forOps, ArrayRef< uint64_t > sizes, ArrayRef< AffineForOp > targets)
Performs tiling of imperfectly nested loops (with interchange) by strip-mining the forOps by sizes and sinking them, in their order of occurrence in forOps, under each of the targets.
AffineForOp sinkSequentialLoops(AffineForOp forOp)
Sinks all sequential loops to the innermost levels (while preserving relative order among them) and moves all parallel loops to the outermost (while again preserving relative order among them).
LogicalResult tilePerfectlyNested(MutableArrayRef< AffineForOp > input, ArrayRef< unsigned > tileSizes, SmallVectorImpl< AffineForOp > *tiledNest=nullptr)
Tiles the specified band of perfectly nested loops creating tile-space loops and intra-tile loops.
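Sketch (assuming a two-deep perfect band rooted at `root`): tile by 32x32 and recover the resulting four-loop nest, tile-space loops first.

SmallVector<affine::AffineForOp, 4> band, tiledNest;
affine::getPerfectlyNestedLoops(band, root);
if (band.size() == 2 &&
    succeeded(affine::tilePerfectlyNested(band, /*tileSizes=*/{32, 32},
                                          &tiledNest)))
  root = tiledNest.front(); // the new outermost (tile-space) loop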
void interchangeLoops(AffineForOp forOpA, AffineForOp forOpB)
Performs loop interchange on 'forOpA' and 'forOpB'.
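A minimal legality-checked use (`outer`/`inner` are assumed adjacent loops in a perfect nest):

SmallVector<affine::AffineForOp, 2> pair = {outer, inner};
if (affine::isValidLoopInterchangePermutation(pair, /*loopPermMap=*/{1, 0}))
  affine::interchangeLoops(outer, inner);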
LogicalResult replaceAllMemRefUsesWith(Value oldMemRef, Value newMemRef, ArrayRef< Value > extraIndices={}, AffineMap indexRemap=AffineMap(), ArrayRef< Value > extraOperands={}, ArrayRef< Value > symbolOperands={}, llvm::function_ref< bool(Operation *)> userFilterFn=nullptr, bool allowNonDereferencingOps=false, bool replaceInDeallocOp=false)
Replaces all "dereferencing" uses of oldMemRef with newMemRef while optionally remapping the old memref's indices using the supplied affine map, indexRemap.
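Sketch of the simplest form (helper name illustrative; all optional arguments left at their defaults): reroute every dereferencing use verbatim, with no extra indices and no index remapping.

static LogicalResult swapMemRef(Value oldMemRef, Value newMemRef) {
  return affine::replaceAllMemRefUsesWith(oldMemRef, newMemRef);
}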
LogicalResult coalescePerfectlyNestedAffineLoops(AffineForOp op)
Walk an affine.for to find a band to coalesce.
LogicalResult separateFullTiles(MutableArrayRef< AffineForOp > nest, SmallVectorImpl< AffineForOp > *fullTileNest=nullptr)
Separates full tiles from partial tiles for a perfect nest nest by generating a conditional guard that selects between the full tile version and the partial tile version using an AffineIfOp.
Value getReductionOp(AtomicRMWKind op, OpBuilder &builder, Location loc, Value lhs, Value rhs)
Returns the value obtained by applying the reduction operation kind associated with a binary AtomicRMWKind op to lhs and rhs.
constexpr void enumerate(std::tuple< Tys... > &tuple, CallbackT &&callback)
llvm::TypeSize divideCeil(llvm::TypeSize numerator, uint64_t denominator)
Divides the known min value of the numerator by the denominator and rounds the result up to the next integer value.
AffineMap simplifyAffineMap(AffineMap map)
Simplifies an affine map by simplifying its underlying AffineExpr results.
void replaceAllUsesInRegionWith(Value orig, Value replacement, Region ®ion)
Replace all uses of orig within the given region with replacement.
AffineMap removeDuplicateExprs(AffineMap map)
Returns a map with the same dimension and symbol count as map, but whose results are the unique affine expressions of map.
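Worked sketch: (d0, d1) -> (d0, d1, d0) deduplicates to (d0, d1) -> (d0, d1).

MLIRContext ctx;
AffineExpr d0, d1;
bindDims(&ctx, d0, d1);
AffineMap map =
    AffineMap::get(/*dimCount=*/2, /*symbolCount=*/0, {d0, d1, d0}, &ctx);
AffineMap unique = removeDuplicateExprs(map); // (d0, d1) -> (d0, d1)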
void generateUnrolledLoop(Block *loopBodyBlock, Value iv, uint64_t unrollFactor, function_ref< Value(unsigned, Value, OpBuilder)> ivRemapFn, function_ref< void(unsigned, Operation *, OpBuilder)> annotateFn, ValueRange iterArgs, ValueRange yieldedValues, IRMapping *clonedToSrcOpsMap=nullptr)
Generate unrolled copies of an scf loop's 'loopBodyBlock', with 'iterArgs' and 'yieldedValues' as the block's loop-carried arguments and yielded values; 'ivRemapFn' remaps the induction variable for each unrolled iteration, and 'annotateFn', if provided, annotates the operations in each unrolled body.
LogicalResult applyOpPatternsGreedily(ArrayRef< Operation * > ops, const FrozenRewritePatternSet &patterns, GreedyRewriteConfig config=GreedyRewriteConfig(), bool *changed=nullptr, bool *allErased=nullptr)
Rewrite the specified ops by repeatedly applying the highest benefit patterns in a greedy worklist driven manner until a fixpoint is reached.
const FrozenRewritePatternSet & patterns
void bindSymbols(MLIRContext *ctx, AffineExprTy &...exprs)
Bind a list of AffineExpr references to SymbolExpr at positions: [0 .. sizeof...(exprs)].
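Usage sketch: bind two symbols and build ()[s0, s1] -> (s0 + s1) without spelling out positions by hand.

MLIRContext ctx;
AffineExpr s0, s1;
bindSymbols(&ctx, s0, s1);
AffineMap sum = AffineMap::get(/*dimCount=*/0, /*symbolCount=*/2, s0 + s1);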
auto get(MLIRContext *context, Ts &&...params)
Helper method that injects context only if needed; this helps unify some of the attribute construction methods.
bool areValuesDefinedAbove(Range values, Region &limit)
Check if all values in the provided range are defined above the limit region.
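A hedged sketch of a common use (helper name illustrative): the operands alone do not block hoisting `op` above `forOp` if they all dominate the loop's region.

static bool operandsDefinedAboveLoop(Operation *op,
                                     affine::AffineForOp forOp) {
  return areValuesDefinedAbove(op->getOperands(), forOp.getRegion());
}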
@ ExistingAndNewOps
Only pre-existing and newly created ops are processed.
SmallVector< std::pair< Block::iterator, Block::iterator > > subBlocks
Explicit copy / DMA generation options for mlir::affineDataCopyGenerate.
Result for calling generateCopyForMemRegion.
std::optional< int64_t > ub
std::optional< int64_t > lb
A description of a (parallelizable) reduction in an affine loop.
A region of a memref's data space; this is typically constructed by analyzing load/store ops on this memref and the index space of accesses.
std::optional< int64_t > getConstantBoundingSizeAndShape(SmallVectorImpl< int64_t > *shape=nullptr, SmallVectorImpl< AffineMap > *lbs=nullptr) const
Returns a constant upper bound on the number of elements in this region if bounded by a known constant (always possible for static shapes), std::nullopt otherwise.
FlatAffineValueConstraints * getConstraints()
void getLowerAndUpperBound(unsigned pos, AffineMap &lbMap, AffineMap &ubMap) const
Gets the lower and upper bound map for the dimensional variable at pos.
Value memref
Memref that this region corresponds to.
Location loc
If there is more than one load/store op associated with the region, the location information would correspond to one of those ops.
Eliminates variable at the specified position using Fourier-Motzkin variable elimination.