#include "llvm/ADT/MapVector.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/DebugLog.h"
#include "llvm/Support/raw_ostream.h"

#define DEBUG_TYPE "loop-utils"

using namespace affine;
using namespace presburger;
using llvm::SmallMapVector;
auto lbMap = forOp.getLowerBoundMap();
auto lb = AffineApplyOp::create(b, forOp.getLoc(), lbMap,
                                forOp.getLowerBoundOperands());

int64_t step = forOp.getStepAsInt();
for (unsigned i = 0, e = tripCountMap.getNumResults(); i < e; i++) {
  auto tripCountExpr = tripCountMap.getResult(i);
  bumpExprs[i] = (tripCountExpr - tripCountExpr % unrollFactor) * step;

    AffineApplyOp::create(b, forOp.getLoc(), bumpMap, tripCountOperands);

for (unsigned i = 0, e = bumpExprs.size(); i < e; i++)

cleanupLbOperands.clear();
cleanupLbOperands.push_back(lb);
cleanupLbOperands.append(bumpValues.begin(), bumpValues.end());

for (auto v : bumpValues)
    v.getDefiningOp()->erase();
auto iterOperands = forOp.getInits();
auto iterArgs = forOp.getRegionIterArgs();
for (auto e : llvm::zip(iterOperands, iterArgs))
  std::get<1>(e).replaceAllUsesWith(std::get<0>(e));

auto outerResults = forOp.getResults();
auto innerResults = forOp.getBody()->getTerminator()->getOperands();
for (auto e : llvm::zip(outerResults, innerResults))
  std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
if (!tripCount || *tripCount != 1)

if (forOp.getLowerBoundMap().getNumResults() != 1)

auto iv = forOp.getInductionVar();
auto *parentBlock = forOp->getBlock();
if (!iv.use_empty()) {
  if (forOp.hasConstantLowerBound()) {
    auto func = forOp->getParentOfType<FunctionOpInterface>();
      builder.setInsertionPointToStart(&func.getFunctionBody().front());
      builder.setInsertionPoint(forOp);
        builder, forOp.getLoc(), forOp.getConstantLowerBound());
    iv.replaceAllUsesWith(constOp);
    auto lbOperands = forOp.getLowerBoundOperands();
    auto lbMap = forOp.getLowerBoundMap();
      iv.replaceAllUsesWith(lbOperands[0]);
          AffineApplyOp::create(builder, forOp.getLoc(), lbMap, lbOperands);
      iv.replaceAllUsesWith(affineApplyOp);

forOp.getBody()->back().erase();
                                forOp.getBody()->getOperations());
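
// Illustrative sketch (not part of the original file): how a caller could use
// promoteIfSingleIteration, declared in mlir/Dialect/Affine/LoopUtils.h and
// implemented above. `simplifyTrivialLoops` is a hypothetical helper written
// for this example; it relies on the declarations this file already includes.
static void simplifyTrivialLoops(func::FuncOp func) {
  func.walk([](AffineForOp forOp) {
    // Succeeds (and erases the loop) only when the trip count is statically 1.
    (void)affine::promoteIfSingleIteration(forOp);
  });
}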
                                         unsigned offset, AffineForOp srcForOp,
                                         OpBuilder b) {
auto lbOperands = srcForOp.getLowerBoundOperands();
auto ubOperands = srcForOp.getUpperBoundOperands();

    AffineForOp::create(b, srcForOp.getLoc(), lbOperands, lbMap, ubOperands,
                        ubMap, srcForOp.getStepAsInt());
auto loopChunkIV = loopChunk.getInductionVar();
auto srcIV = srcForOp.getInductionVar();

for (const auto &it : llvm::drop_begin(opGroupQueue, offset)) {
  uint64_t shift = it.first;
  auto ops = it.second;
  if (!srcIV.use_empty() && shift != 0) {
    auto ivRemap = AffineApplyOp::create(
        bodyBuilder, srcForOp.getLoc(),
        bodyBuilder.getSingleDimShiftAffineMap(
            -static_cast<int64_t>(srcForOp.getStepAsInt() * shift)),
    operandMap.map(srcIV, ivRemap);
    operandMap.map(srcIV, loopChunkIV);
      bodyBuilder.clone(*op, operandMap);
return AffineForOp();
                                bool unrollPrologueEpilogue) {
assert(forOp.getBody()->getOperations().size() == shifts.size() &&
       "too few/many shifts");
if (forOp.getBody()->begin() == std::prev(forOp.getBody()->end()))

if (!mayBeConstTripCount) {
  LLVM_DEBUG(forOp.emitRemark("non-constant trip count loop not handled"));
uint64_t tripCount = *mayBeConstTripCount;

       "shifts will lead to an invalid transformation\n");

int64_t step = forOp.getStepAsInt();

unsigned numChildOps = shifts.size();

uint64_t maxShift = *llvm::max_element(shifts);
if (maxShift >= numChildOps) {
  forOp.emitWarning("not shifting because shifts are unrealistically large");

std::vector<std::vector<Operation *>> sortedOpGroups(maxShift + 1);
for (auto &op : forOp.getBody()->without_terminator()) {
  auto shift = shifts[pos++];
  sortedOpGroups[shift].push_back(&op);

AffineForOp prologue, epilogue;

std::vector<std::pair<uint64_t, ArrayRef<Operation *>>> opGroupQueue;

auto origLbMap = forOp.getLowerBoundMap();
uint64_t lbShift = 0;
for (uint64_t d = 0, e = sortedOpGroups.size(); d < e; ++d) {
  if (sortedOpGroups[d].empty())
  if (!opGroupQueue.empty()) {
           "Queue expected to be empty when the first block is found");
    if (lbShift + tripCount * step < d * step) {
          opGroupQueue, 0, forOp, b);
      opGroupQueue.clear();
      lbShift += tripCount * step;
          opGroupQueue, 0, forOp, b);
      AffineForOp::getCanonicalizationPatterns(patterns, res.getContext());
          res.getOperation(), std::move(patterns),
      if (!erased && !prologue)
  opGroupQueue.emplace_back(d, sortedOpGroups[d]);

for (unsigned i = 0, e = opGroupQueue.size(); i < e; ++i) {
  uint64_t ubShift = (opGroupQueue[i].first + tripCount) * step;
      opGroupQueue, i, forOp, b);

if (unrollPrologueEpilogue && prologue)
if (unrollPrologueEpilogue && !epilogue && epilogue != prologue)
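
// Illustrative sketch (not part of the original file): software-pipelining a
// loop body with affineForOpBodySkew, declared in
// mlir/Dialect/Affine/LoopUtils.h. The shift vector has one entry per body
// operation (the terminator's entry is present but unused); delaying the
// second op by one iteration is an arbitrary example choice.
static LogicalResult pipelineByOneIteration(AffineForOp forOp) {
  SmallVector<uint64_t> shifts(forOp.getBody()->getOperations().size(), 0);
  if (shifts.size() < 3) // need at least two real ops plus the terminator
    return failure();
  shifts[1] = 1;
  // Reject shifts that would break SSA dominance in the skewed body.
  if (!affine::isOpwiseShiftValid(forOp, shifts))
    return failure();
  return affine::affineForOpBodySkew(forOp, shifts);
}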
if (input.size() <= 1)

  LDBG() << "Index set computation failed!";

  LDBG() << "Non-hyperrectangular nests not supported for tiling!";

template <typename t>
assert(input.size() == tileSizes.size() && "Too few/many tile sizes");

if (llvm::any_of(input,
                 [](AffineForOp op) { return op.getNumResults() > 0; })) {
  LDBG() << "Cannot tile nest where a loop has yield values";

  LDBG() << "input loops not perfectly nested";
auto &ops = src.getBody()->getOperations();
dest.getBody()->getOperations().splice(loc, ops, ops.begin(),
                                       std::prev(ops.end()));
                                   AffineForOp rootAffineForOp, unsigned width,
Location loc = rootAffineForOp.getLoc();

Operation *topLoop = rootAffineForOp.getOperation();
AffineForOp innermostPointLoop;

for (unsigned i = 0; i < width; i++) {
  AffineForOp pointLoop = AffineForOp::create(b, loc, 0, 0);
  pointLoop.getBody()->getOperations().splice(
  tiledLoops[2 * width - 1 - i] = pointLoop;
  topLoop = pointLoop.getOperation();
    innermostPointLoop = pointLoop;

for (unsigned i = width; i < 2 * width; i++) {
  AffineForOp tileSpaceLoop = AffineForOp::create(b, loc, 0, 0);
  tileSpaceLoop.getBody()->getOperations().splice(
  tiledLoops[2 * width - i - 1] = tileSpaceLoop;
  topLoop = tileSpaceLoop.getOperation();
                                        AffineForOp newInterTileLoop,
                                        AffineForOp newIntraTileLoop,

assert(origLoop.hasConstantLowerBound() &&
       "expected input loops to have constant lower bound.");

lbOperands.push_back(newInterTileLoop.getInductionVar());
ubOperands.push_back(newInterTileLoop.getInductionVar());

lbOperands.push_back(tileSize);
ubOperands.push_back(tileSize);

lbBoundExprs.push_back(
    ((lbLoopIvExpr - origLowerBoundExpr) * lbTileParameter) +

ubBoundExprs.push_back(
    ((ubLoopIvExpr - origLowerBoundExpr) * ubTileParameter) +
    (ubTileParameter * origLoopStep) + origLowerBoundExpr);

ubBoundExprs.append(origUbMap.getResults().begin(),

newIntraTileLoop.setLowerBound(lbOperands, lbMap);

newIntraTileLoop.setUpperBound(ubOperands, ubMap);

newIntraTileLoop.setStep(origLoop.getStepAsInt());
                                        AffineForOp newLoop, Value tileSize) {
OperandRange newLbOperands = origLoop.getLowerBoundOperands();

newLoop.setLowerBound(newLbOperands, origLoop.getLowerBoundMap());

assert(origLoop.hasConstantLowerBound() &&
       "expected input loops to have constant lower bound.");

ubOperands.push_back(tileSize);

int64_t origUpperBound;

if (origLoop.hasConstantUpperBound()) {
  origUpperBound = origLoop.getConstantUpperBound();

  boundExprs.push_back(
      (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter));

  boundExprs.push_back(
      (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter));

newLoop.setUpperBound(ubOperands, ubMap);

newLoop.setStep(origLoop.getStepAsInt());
assert(!origLoops.empty() && "expected at least one loop in band");
assert(origLoops.size() == tileSizes.size() &&
       "expected tiling parameter for each loop in band.");

OpBuilder b(origLoops[0].getOperation());
unsigned width = origLoops.size();

for (unsigned i = 0; i < width; ++i) {

for (unsigned i = 0; i < width; ++i) {
                               newLoops[i + width], tileSizes[i]);
assert(!origLoops.empty());
assert(origLoops.size() == tileSizes.size());

OpBuilder b(origLoops[0].getOperation());
unsigned width = origLoops.size();

for (unsigned i = 0; i < width; i++) {
  OperandRange newLbOperands = origLoops[i].getLowerBoundOperands();
  OperandRange newUbOperands = origLoops[i].getUpperBoundOperands();
  newLoops[i].setLowerBound(newLbOperands, origLoops[i].getLowerBoundMap());
  newLoops[i].setUpperBound(newUbOperands, origLoops[i].getUpperBoundMap());
  newLoops[i].setStep(tileSizes[i] * origLoops[i].getStepAsInt());

for (unsigned i = 0; i < width; i++) {
  std::optional<uint64_t> mayBeConstantCount =
  newLoops[width + i].setLowerBound(newLoops[i].getInductionVar(), lbMap);
  newLoops[width + i].setStep(origLoops[i].getStepAsInt());
  if (mayBeConstantCount && *mayBeConstantCount < tileSizes[i]) {
        *mayBeConstantCount * origLoops[i].getStepAsInt());
    newLoops[width + i].setUpperBound(newLoops[i].getInductionVar(), ubMap);
  } else if (largestDiv % tileSizes[i] != 0) {
    ubOperands.push_back(newLoops[i].getInductionVar());
    boundExprs.push_back(dim + tileSizes[i] * origLoops[i].getStepAsInt());
    boundExprs.append(origUbMap.getResults().begin(),
    newLoops[width + i].setUpperBound(ubOperands, ubMap);
        1, 0, dim + tileSizes[i] * origLoops[i].getStepAsInt());
    newLoops[width + i].setUpperBound(newLoops[i].getInductionVar(), ubMap);
AffineForOp rootAffineForOp = origLoops[0];

unsigned width = input.size();

for (unsigned i = 0; i < width; i++)
  origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar());

rootAffineForOp.erase();

  *tiledNest = std::move(tiledLoops);
AffineForOp rootAffineForOp = origLoops[0];
unsigned width = input.size();

for (unsigned i = 0; i < width; i++)
  origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar());

rootAffineForOp.erase();

  *tiledNest = std::move(tiledLoops);
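
// Illustrative sketch (not part of the original file): tiling a band with
// tilePerfectlyNested, declared in mlir/Dialect/Affine/LoopUtils.h. The band
// is gathered with getPerfectlyNestedLoops; the tile size of 32 per loop is
// an arbitrary example value.
static LogicalResult tileBandBy32(AffineForOp rootForOp) {
  SmallVector<AffineForOp, 6> band;
  affine::getPerfectlyNestedLoops(band, rootForOp);
  // One tile size per loop in the band.
  SmallVector<unsigned, 6> tileSizes(band.size(), 32);
  SmallVector<AffineForOp, 12> tiledNest;
  return affine::tilePerfectlyNested(band, tileSizes, &tiledNest);
}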
nestedLoops.push_back(root);

if (body.begin() != std::prev(body.end(), 2))

root = dyn_cast<AffineForOp>(&body.front());
if (mayBeConstantTripCount.has_value()) {
  uint64_t tripCount = *mayBeConstantTripCount;

                                     uint64_t unrollFactor) {
if (mayBeConstantTripCount.has_value() &&
    *mayBeConstantTripCount < unrollFactor)
    Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor,

  annotateFn = defaultAnnotateFn;

for (unsigned i = 1; i < unrollFactor; i++) {

  operandMap.map(iterArgs, lastYielded);

  Value ivUnroll = ivRemapFn(i, forOpIV, builder);
  operandMap.map(forOpIV, ivUnroll);

  for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++) {
      annotateFn(i, clonedOp, builder);

  for (unsigned i = 0, e = lastYielded.size(); i < e; i++) {
    Operation *defOp = yieldedValues[i].getDefiningOp();
    if (defOp && defOp->getBlock() == loopBodyBlock)
      lastYielded[i] = operandMap.lookup(yieldedValues[i]);

for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++)
  annotateFn(0, &*it, builder);
                                                   uint64_t unrollFactor) {
auto cleanupForOp = cast<AffineForOp>(builder.clone(*forOp));

auto results = forOp.getResults();
auto cleanupResults = cleanupForOp.getResults();
auto cleanupIterOperands = cleanupForOp.getInits();

for (auto e : llvm::zip(results, cleanupResults, cleanupIterOperands)) {
  std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
  cleanupForOp->replaceUsesOfWith(std::get<2>(e), std::get<0>(e));

cleanupForOp.setLowerBound(cleanupOperands, cleanupMap);

forOp.setUpperBound(cleanupOperands, cleanupMap);
    AffineForOp forOp, uint64_t unrollFactor,
    bool cleanUpUnroll) {
assert(unrollFactor > 0 && "unroll factor should be positive");

if (unrollFactor == 1) {

if (llvm::hasSingleElement(forOp.getBody()->getOperations()))

if (mayBeConstantTripCount && *mayBeConstantTripCount < unrollFactor) {
  if (cleanUpUnroll) {

if (forOp.getLowerBoundMap().getNumResults() != 1 ||
    forOp.getUpperBoundMap().getNumResults() != 1)
  assert(false && "cleanup loop lower bound map for single result lower "
                  "and upper bound maps can always be determined");

ValueRange iterArgs(forOp.getRegionIterArgs());
auto yieldedValues = forOp.getBody()->getTerminator()->getOperands();

int64_t step = forOp.getStepAsInt();
forOp.setStep(step * unrollFactor);
    forOp.getBody(), forOp.getInductionVar(), unrollFactor,
      auto d0 = b.getAffineDimExpr(0);
      auto bumpMap = AffineMap::get(1, 0, d0 + i * step);
      return AffineApplyOp::create(b, forOp.getLoc(), bumpMap, iv);
    iterArgs, yieldedValues);
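
// Illustrative sketch (not part of the original file): unrolling by a factor
// of 4 with loopUnrollByFactor, declared in mlir/Dialect/Affine/LoopUtils.h,
// and tagging each unrolled copy via the annotateFn callback. The attribute
// name "unroll_iteration" is made up for this example.
static LogicalResult unrollBy4Annotated(AffineForOp forOp) {
  return affine::loopUnrollByFactor(
      forOp, /*unrollFactor=*/4,
      [](unsigned iteration, Operation *op, OpBuilder b) {
        // Record which unrolled iteration this cloned op came from.
        op->setAttr("unroll_iteration", b.getI32IntegerAttr(iteration));
      },
      /*cleanUpUnroll=*/true);
}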
                                            uint64_t unrollJamFactor) {
if (mayBeConstantTripCount.has_value() &&
    *mayBeConstantTripCount < unrollJamFactor)

auto walkResult = forOp.walk([&](AffineForOp aForOp) {
  for (auto controlOperand : aForOp.getControlOperands()) {
    if (!forOp.isDefinedOutsideOfLoop(controlOperand))
      return WalkResult::interrupt();
return !walkResult.wasInterrupted();
                                          uint64_t unrollJamFactor) {
assert(unrollJamFactor > 0 && "unroll jam factor should be positive");

if (unrollJamFactor == 1) {

if (llvm::hasSingleElement(forOp.getBody()->getOperations()))

if (mayBeConstantTripCount && *mayBeConstantTripCount < unrollJamFactor) {
  LDBG() << "[failed] trip count < unroll-jam factor";

forOp.walk([&](AffineForOp aForOp) {
  if (aForOp.getNumIterOperands() > 0)
    loopsWithIterArgs.push_back(aForOp);

if (forOp.getNumIterOperands() > 0)

if (forOp.getLowerBoundMap().getNumResults() != 1 ||
    forOp.getUpperBoundMap().getNumResults() != 1)
  assert(false && "cleanup loop lower bound map for single result lower "
                  "and upper bound maps can always be determined");

for (AffineForOp oldForOp : loopsWithIterArgs) {
  ValueRange oldIterOperands = oldForOp.getInits();
  ValueRange oldIterArgs = oldForOp.getRegionIterArgs();
      cast<AffineYieldOp>(oldForOp.getBody()->getTerminator()).getOperands();
  for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
    dupIterOperands.append(oldIterOperands.begin(), oldIterOperands.end());
    dupYieldOperands.append(oldYieldOperands.begin(), oldYieldOperands.end());

  bool forOpReplaced = oldForOp == forOp;
  AffineForOp newForOp =
      cast<AffineForOp>(*oldForOp.replaceWithAdditionalYields(
          rewriter, dupIterOperands, false,
            return dupYieldOperands;
  newLoopsWithIterArgs.push_back(newForOp);

  ValueRange newIterArgs = newForOp.getRegionIterArgs();
  unsigned oldNumIterArgs = oldIterArgs.size();
  ValueRange newResults = newForOp.getResults();
  unsigned oldNumResults = newResults.size() / unrollJamFactor;
  assert(oldNumIterArgs == oldNumResults &&
         "oldNumIterArgs must be the same as oldNumResults");
  for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
    for (unsigned j = 0; j < oldNumIterArgs; ++j) {
      operandMaps[i - 1].map(newIterArgs[j],
                             newIterArgs[i * oldNumIterArgs + j]);
      operandMaps[i - 1].map(newResults[j],
                             newResults[i * oldNumResults + j]);

int64_t step = forOp.getStepAsInt();
forOp.setStep(step * unrollJamFactor);

auto forOpIV = forOp.getInductionVar();

for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
  for (auto &subBlock : subBlocks) {
    OpBuilder builder(subBlock.first->getBlock(), std::next(subBlock.second));
        AffineApplyOp::create(builder, forOp.getLoc(), bumpMap, forOpIV);
    operandMaps[i - 1].map(forOpIV, ivUnroll);

    for (auto it = subBlock.first; it != std::next(subBlock.second); ++it)
      builder.clone(*it, operandMaps[i - 1]);

  for (auto newForOp : newLoopsWithIterArgs) {
    unsigned oldNumIterOperands =
        newForOp.getNumIterOperands() / unrollJamFactor;
    unsigned numControlOperands = newForOp.getNumControlOperands();
    auto yieldOp = cast<AffineYieldOp>(newForOp.getBody()->getTerminator());
    unsigned oldNumYieldOperands = yieldOp.getNumOperands() / unrollJamFactor;
    assert(oldNumIterOperands == oldNumYieldOperands &&
           "oldNumIterOperands must be the same as oldNumYieldOperands");
    for (unsigned j = 0; j < oldNumIterOperands; ++j) {
      newForOp.setOperand(numControlOperands + i * oldNumIterOperands + j,
                          operandMaps[i - 1].lookupOrDefault(
                              newForOp.getOperand(numControlOperands + j)));
          i * oldNumYieldOperands + j,
          operandMaps[i - 1].lookupOrDefault(yieldOp.getOperand(j)));

if (forOp.getNumResults() > 0) {

  auto loc = forOp.getLoc();
  unsigned oldNumResults = forOp.getNumResults() / unrollJamFactor;
    unsigned pos = reduction.iterArgPosition;
    Value lhs = forOp.getResult(pos);

    for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
      rhs = forOp.getResult(i * oldNumResults + pos);

    assert(op && "Reduction op should have been created");

    forOp.getResult(pos).replaceAllUsesExcept(lhs, newOps);
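
// Illustrative sketch (not part of the original file): the effect of
// loopUnrollJamByFactor(forOp, 2) on a reduction loop, shown as before/after
// affine IR with values renamed for readability. The iter_arg is duplicated,
// the body is jammed, and the partial accumulators are combined after the
// loop, as implemented above.
//
//   %r = affine.for %i = 0 to 128 iter_args(%acc = %cst) -> (f32) {
//     %v = affine.load %A[%i] : memref<128xf32>
//     %s = arith.addf %acc, %v : f32
//     affine.yield %s : f32
//   }
//
// becomes (step doubled, iter_args duplicated, body cloned with a bumped IV):
//
//   %r:2 = affine.for %i = 0 to 128 step 2
//       iter_args(%acc0 = %cst, %acc1 = %cst) -> (f32, f32) {
//     %v0 = affine.load %A[%i] : memref<128xf32>
//     %s0 = arith.addf %acc0, %v0 : f32
//     %i1 = affine.apply affine_map<(d0) -> (d0 + 1)>(%i)
//     %v1 = affine.load %A[%i1] : memref<128xf32>
//     %s1 = arith.addf %acc1, %v1 : f32
//     affine.yield %s0, %s1 : f32, f32
//   }
//   %sum = arith.addf %r#0, %r#1 : f32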
assert(&*forOpA.getBody()->begin() == forOpB.getOperation());
auto &forOpABody = forOpA.getBody()->getOperations();
auto &forOpBBody = forOpB.getBody()->getOperations();

    forOpABody, forOpABody.begin(),
    std::prev(forOpABody.end()));
forOpABody.splice(forOpABody.begin(), forOpBBody, forOpBBody.begin(),
                  std::prev(forOpBBody.end()));
forOpBBody.splice(forOpBBody.begin(), forOpA->getBlock()->getOperations(),
unsigned maxLoopDepth = loops.size();

loopPermMapInv.resize(maxLoopDepth);
for (unsigned i = 0; i < maxLoopDepth; ++i)
  loopPermMapInv[loopPermMap[i]] = i;

for (const auto &depComps : depCompsVec) {
  assert(depComps.size() >= maxLoopDepth);

  for (unsigned j = 0; j < maxLoopDepth; ++j) {
    unsigned permIndex = loopPermMapInv[j];
    assert(depComps[permIndex].lb);
    int64_t depCompLb = *depComps[permIndex].lb;

assert(loopPermMap.size() == loops.size() && "invalid loop perm map");
unsigned maxLoopDepth = loops.size();
if (maxLoopDepth == 1)

if (llvm::any_of(loops, [](AffineForOp loop) {
      return loop.getNumIterOperands() > 0;

std::vector<SmallVector<DependenceComponent, 2>> depCompsVec;
bool LLVM_ATTRIBUTE_UNUSED
assert(!loops.empty() && "no loops provided");

auto hasTwoElements = [](Block *block) {
  auto secondOpIt = std::next(block->begin());
  return secondOpIt != block->end() && &*secondOpIt == &block->back();

auto enclosingLoop = loops.front();
for (auto loop : loops.drop_front()) {
  auto parentForOp = dyn_cast<AffineForOp>(loop->getParentOp());
  if (parentForOp != enclosingLoop || !hasTwoElements(parentForOp.getBody()))
  enclosingLoop = loop;
assert(input.size() == permMap.size() && "invalid permutation map size");

llvm::sort(checkPermMap);
                 [](const auto &en) { return en.value() != en.index(); }))
  assert(false && "invalid permutation map");

if (input.size() < 2)

for (unsigned i = 0, e = input.size(); i < e; ++i)
  invPermMap.push_back({permMap[i], i});
llvm::sort(invPermMap);

if (permMap.back() != input.size() - 1) {
  Block *destBody = ((AffineForOp)input[invPermMap.back().second]).getBody();
  Block *srcBody = ((AffineForOp)input.back()).getBody();
                                   std::prev(srcBody->end()));

for (int i = input.size() - 1; i >= 0; --i) {

  if (permMap[i] == 0) {

    auto *parentBlock = input[0]->getBlock();
                                     input[i]->getBlock()->getOperations(),

  unsigned parentPosInInput = invPermMap[permMap[i] - 1].second;
  if (i > 0 && static_cast<unsigned>(i - 1) == parentPosInInput)

  auto *destBody = ((AffineForOp)input[parentPosInInput]).getBody();
  destBody->getOperations().splice(destBody->begin(),
                                   input[i]->getBlock()->getOperations(),

return invPermMap[0].second;
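
// Illustrative sketch (not part of the original file): interchanging the two
// outermost loops of a nest with permuteLoops, declared in
// mlir/Dialect/Affine/LoopUtils.h, after checking legality against the
// nest's dependences.
static LogicalResult interchangeIfLegal(AffineForOp rootForOp) {
  SmallVector<AffineForOp, 4> nest;
  affine::getPerfectlyNestedLoops(nest, rootForOp);
  if (nest.size() < 2)
    return failure();
  ArrayRef<AffineForOp> outerTwo = ArrayRef<AffineForOp>(nest).take_front(2);
  // permMap[i] is the new position of the i-th original loop; {1, 0} swaps
  // the two loops.
  SmallVector<unsigned, 2> permMap = {1, 0};
  if (!affine::isValidLoopInterchangePermutation(outerTwo, permMap))
    return failure();
  (void)affine::permuteLoops(outerTwo, permMap);
  return success();
}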
if (loops.size() < 2)

unsigned maxLoopDepth = loops.size();
std::vector<SmallVector<DependenceComponent, 2>> depCompsVec;

for (auto &depComps : depCompsVec) {
  assert(depComps.size() >= maxLoopDepth);
  for (unsigned j = 0; j < maxLoopDepth; ++j) {
    assert(depComp.lb.has_value() && depComp.ub.has_value());
    if (*depComp.lb != 0 || *depComp.ub != 0)

unsigned nextSequentialLoop = numParallelLoops;
unsigned nextParallelLoop = 0;
for (unsigned i = 0; i < maxLoopDepth; ++i) {
    loopPermMap[i] = nextParallelLoop++;
    loopPermMap[i] = nextSequentialLoop++;

unsigned loopNestRootIndex = permuteLoops(loops, loopPermMap);
return loops[loopNestRootIndex];
                                int64_t offset = 0) {
auto bounds = llvm::to_vector<4>(map->getResults());

operands->insert(operands->begin() + map->getNumDims(), iv);
auto originalStep = forOp.getStepAsInt();
auto scaledStep = originalStep * factor;
forOp.setStep(scaledStep);

auto lbMap = forOp.getLowerBoundMap();

auto ubMap = forOp.getUpperBoundMap();

auto iv = forOp.getInductionVar();

for (auto t : targets) {
  auto newForOp = AffineForOp::create(b, t.getLoc(), lbOperands, lbMap,
                                      ubOperands, ubMap, originalStep);
  auto begin = t.getBody()->begin();
  auto nOps = t.getBody()->getOperations().size() - 2;
  newForOp.getBody()->getOperations().splice(
      newForOp.getBody()->getOperations().begin(),
      t.getBody()->getOperations(), begin, std::next(begin, nOps));
                               newForOp.getRegion());
  innerLoops.push_back(newForOp);
template <typename SizeType>
                                  AffineForOp target) {

assert(res.size() == 1 && "Expected 1 inner forOp");

for (auto it : llvm::zip(forOps, sizes)) {
  auto step = stripmineSink(std::get<0>(it), std::get<1>(it), currentTargets);
  res.push_back(step);
  currentTargets = step;

                                          AffineForOp target) {
  res.push_back(llvm::getSingleElement(loops));
if (loops.size() < 2)

AffineForOp innermost = loops.back();
AffineForOp outermost = loops.front();

for (AffineForOp loop : loops) {
  if (loop.getStepAsInt() != 1 || !loop.hasConstantLowerBound() ||
      loop.getConstantLowerBound() != 0)

if (!llvm::hasSingleElement(origUbMap.getResults()))
  prev = AffineMinOp::create(builder, loc, origUbMap, ubOperands);
  prev = AffineApplyOp::create(builder, loc, origUbMap, ubOperands);
upperBoundSymbols.push_back(prev);

for (AffineForOp loop : loops.drop_front()) {
  ub = loop.getUpperBound();
  if (!llvm::hasSingleElement(origUbMap.getResults()))
    upperBound = AffineMinOp::create(builder, loc, origUbMap, ubOperands);
    upperBound = AffineApplyOp::create(builder, loc, origUbMap, ubOperands);
  upperBoundSymbols.push_back(upperBound);

  operands.push_back(prev);
  operands.push_back(upperBound);

  prev = AffineApplyOp::create(

outermost.setUpperBound(prev, newUbMap);

Value previous = outermost.getInductionVar();
for (unsigned idx = loops.size(); idx > 0; --idx) {
  if (idx != loops.size()) {
    operands.push_back(previous);
    operands.push_back(upperBoundSymbols[idx]);
    previous = AffineApplyOp::create(builder, loc,

  Value inductionVariable;
    inductionVariable = previous;
    applyOperands.push_back(previous);
    applyOperands.push_back(upperBoundSymbols[idx - 1]);
    inductionVariable = AffineApplyOp::create(
                               inductionVariable, loops.back().getRegion());

AffineForOp secondOutermostLoop = loops[1];
innermost.getBody()->back().erase();
outermost.getBody()->getOperations().splice(
    innermost.getBody()->getOperations());
secondOutermostLoop.erase();
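
// Illustrative sketch (not part of the original file): collapsing a perfect
// nest into a single loop with coalesceLoops, declared in
// mlir/Dialect/Affine/LoopUtils.h. The utility requires step-1, zero-based
// loops; the coalesced IV is de-linearized back into the original IVs, i.e.
// for trip counts N0 and N1 the single loop runs over [0, N0*N1) with
// i0 = iv floordiv N1 and i1 = iv mod N1.
static LogicalResult collapseNest(AffineForOp rootForOp) {
  SmallVector<AffineForOp, 4> nest;
  affine::getPerfectlyNestedLoops(nest, rootForOp);
  return affine::coalesceLoops(nest);
}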
assert(processorId.size() == numProcessors.size());
if (processorId.empty())

Value linearIndex = processorId.front();
for (unsigned i = 1, e = processorId.size(); i < e; ++i) {
  auto mulApplyOp = AffineApplyOp::create(
      b, loc, mulMap, ValueRange{linearIndex, numProcessors[i]});
  linearIndex = AffineApplyOp::create(b, loc, addMap,

auto mulApplyOp = AffineApplyOp::create(
    b, loc, mulMap, ValueRange{linearIndex, forOp.getStep()});
Value lb = AffineApplyOp::create(
    b, loc, addMap, ValueRange{mulApplyOp, forOp.getLowerBound()});
forOp.setLowerBound(lb);

Value step = forOp.getStep();
for (auto numProcs : numProcessors)
  step = AffineApplyOp::create(b, loc, mulMap, ValueRange{numProcs, step});
forOp.setStep(step);
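
// Illustrative sketch (not part of the original file): distributing an
// scf.for over a 1-d grid of virtual processors with mapLoopToProcessorIds.
// After the call the loop starts at lb + procId * step and advances by
// numProcs * step, so each processor executes every numProcs-th iteration.
// `procId` and `numProcs` are assumed to be index-typed values (e.g. results
// of gpu.block_id / gpu.grid_dim).
static void distributeOverGrid(scf::ForOp forOp, Value procId,
                               Value numProcs) {
  affine::mapLoopToProcessorIds(forOp, {procId}, {numProcs});
}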
                                         Block **copyPlacementBlock,

cst->getValues(cst->getNumDimVars(), cst->getNumDimAndSymbolVars(), &symbols);

auto it = enclosingAffineOps.rbegin();
AffineForOp lastInvariantFor;
for (auto e = enclosingAffineOps.rend(); it != e; ++it) {
    LDBG() << "memref definition will end up not dominating hoist location";

  auto affineFor = dyn_cast<AffineForOp>(enclosingOp);
  if (llvm::is_contained(symbols, affineFor.getInductionVar()))
  lastInvariantFor = affineFor;

if (it != enclosingAffineOps.rbegin()) {
  *copyOutPlacementStart = std::next(*copyInPlacementStart);
  *copyPlacementBlock = lastInvariantFor->getBlock();
  *copyInPlacementStart = begin;
  *copyOutPlacementStart = end;
  *copyPlacementBlock = &block;
if (bufferShape.size() <= 1)

int64_t numEltPerStride = 1;

for (int d = bufferShape.size() - 1; d >= 1; d--) {
  int64_t dimSize = cast<MemRefType>(region.memref.getType()).getDimSize(d);
  numEltPerStride *= bufferShape[d];

  if (bufferShape[d] < dimSize && bufferShape[d - 1] > 1) {
    strideInfos->push_back({stride, numEltPerStride});
assert(llvm::all_of(lbMaps, [&](AffineMap lbMap) {
assert(llvm::all_of(ubMaps, [&](AffineMap ubMap) {

unsigned rank = cast<MemRefType>(memref.getType()).getRank();

assert(rank != 0 && "non-zero rank memref expected");
assert(lbMaps.size() == rank && "wrong number of lb maps");
assert(ubMaps.size() == rank && "wrong number of ub maps");

AffineForOp copyNestRoot;

for (unsigned d = 0; d < rank; ++d) {
                                    ubOperands, ubMaps[d]);
    copyNestRoot = forOp;

  auto fastBufOffsetMap =
  auto offset = AffineApplyOp::create(b, loc, fastBufOffsetMap, lbOperands);

  fastBufMapOperands.push_back(offset);
  fastBufMapOperands.push_back(forOp.getInductionVar());
  mayBeDeadApplys.push_back(offset);

  memIndices.push_back(forOp.getInductionVar());

for (auto applyOp : mayBeDeadApplys)
  if (applyOp.use_empty())

  auto load = AffineLoadOp::create(b, loc, memref, memIndices);
  AffineStoreOp::create(b, loc, load, fastMemRef, fastBufMap,
                        fastBufMapOperands);
  return copyNestRoot;

    AffineLoadOp::create(b, loc, fastMemRef, fastBufMap, fastBufMapOperands);
AffineStoreOp::create(b, loc, load, memref, memIndices);
return copyNestRoot;
auto f = begin->getParentOfType<FunctionOpInterface>();
OpBuilder topBuilder(f.getFunctionBody());

Operation *lastCopyOp = end->getPrevNode();

bool isCopyOutAtEndOfBlock = (end == copyOutPlacementStart);

OpBuilder prologue(copyPlacementBlock, copyInPlacementStart);

OpBuilder epilogue(copyPlacementBlock, copyOutPlacementStart);

auto loc = region.loc;
auto memref = region.memref;
auto memRefType = cast<MemRefType>(memref.getType());

if (!memRefType.getLayout().isIdentity()) {
  LDBG() << "Non-identity layout map not yet supported";

unsigned rank = memRefType.getRank();
  LDBG() << "Non-zero ranked memrefs supported";

std::optional<int64_t> numElements =
  LDBG() << "Non-constant region size not supported";

  LDBG() << "Max lower bound for memref region start not supported";

if (*numElements == 0) {
  LDBG() << "Nothing to copy";

for (unsigned i = 0; i < rank; ++i) {
  if (lbMaps[i].getNumResults() == 0 || ubMaps[i].getNumResults() == 0) {
    LDBG() << "Missing lower or upper bound for region along dimension: "

fastBufOffsets.reserve(rank);
for (unsigned d = 0; d < rank; d++) {
  assert(lbs[d].getNumSymbols() == cst->getNumCols() - rank - 1 &&
         "incorrect bound size");

  if (lbs[d].isSingleConstant()) {
    auto indexVal = lbs[d].getSingleConstantResult();
    if (indexVal == 0) {
      memIndices.push_back(zeroIndex);
      memIndices.push_back(

    for (unsigned i = 0, e = lbs[d].getNumSymbols(); i < e; ++i)
      symReplacements[i] = top.getAffineDimExpr(i);
    lbs[d] = lbs[d].replaceDimsAndSymbols(
        {}, symReplacements, lbs[d].getNumSymbols(),
    memIndices.push_back(
        AffineApplyOp::create(b, loc, lbs[d], regionSymbols));

  bufIndices.push_back(zeroIndex);

  fastBufOffsets.push_back(lbs[d].getResult(0));

bool existingBuf = fastBufferMap.count(memref) > 0;

  auto fastMemRefType =
      memref::AllocOp::create(prologue, loc, fastMemRefType).getResult();

  fastBufferMap[memref] = fastMemRef;

  *sizeInBytes = maySizeInBytes.value_or(0);

       << "Creating fast buffer of type " << fastMemRefType

  fastMemRef = fastBufferMap[memref];

Value numEltPerDmaStride;

  if (dmaStrideInfos.size() > 1) {
    LDBG() << "Only up to one level of stride supported";

  if (!dmaStrideInfos.empty()) {
        top, loc, dmaStrideInfos[0].numEltPerStride);

                              regionSymbols, ubMaps,
                              regionSymbols, fastBufOffsets,

  copyNests.insert(copyNest);

  if (region.isWrite() && isCopyOutAtEndOfBlock)

  auto tagMemRef = memref::AllocOp::create(prologue, loc, tagMemRefType);

        fastMemRef, bufAffineMap, bufIndices, tagMemRef,
        tagAffineMap, tagIndices, numElementsSSA,
        dmaStride, numEltPerDmaStride);

        b, loc, fastMemRef, bufAffineMap, bufIndices, memref, memAffineMap,
        memIndices, tagMemRef, tagAffineMap, tagIndices, numElementsSSA,
        dmaStride, numEltPerDmaStride);

  if (isCopyOutAtEndOfBlock)

  auto tagDeallocOp = memref::DeallocOp::create(epilogue, loc, tagMemRef);
  if (*nEnd == end && isCopyOutAtEndOfBlock)

  auto bufDeallocOp = memref::DeallocOp::create(epilogue, loc, fastMemRef);

  if (!copyOptions.generateDma && *nEnd == end && isCopyOutAtEndOfBlock)

remapExprs.reserve(rank);
for (unsigned i = 0; i < rank; i++) {

  remapExprs.push_back(dimExpr - fastBufOffsets[i]);

auto indexRemap = AffineMap::get(regionSymbols.size() + rank, 0, remapExprs,

bool isBeginAtStartOfBlock = (begin == block->begin());
if (!isBeginAtStartOfBlock)
  prevOfBegin = std::prev(begin);

auto userFilterFn = [&](Operation *user) {

*nBegin = isBeginAtStartOfBlock ? block->begin() : std::next(prevOfBegin);
if (auto loadOp = dyn_cast<AffineLoadOp>(op)) {
  rank = loadOp.getMemRefType().getRank();
  region->memref = loadOp.getMemRef();
} else if (auto storeOp = dyn_cast<AffineStoreOp>(op)) {
  rank = storeOp.getMemRefType().getRank();
  region->memref = storeOp.getMemRef();
  assert(false && "expected load or store op");

auto memRefType = cast<MemRefType>(region->memref.getType());
if (!memRefType.hasStaticShape())

ivs.resize(numParamLoopIVs);

regionCst->setValues(rank, rank + numParamLoopIVs, symbols);

for (unsigned d = 0; d < rank; d++) {
  auto dimSize = memRefType.getDimSize(d);
  assert(dimSize > 0 && "filtered dynamic shapes above");
  regionCst->addBound(BoundType::LB, d, 0);
  regionCst->addBound(BoundType::UB, d, dimSize - 1);
    std::optional<Value> filterMemRef,

assert(begin->getBlock() == std::prev(end)->getBlock() &&
       "Inconsistent block begin/end args");
assert(end != end->getBlock()->end() && "end can't be the block terminator");

Block *block = begin->getBlock();

LDBG() << "Generating copies at depth " << copyDepth;
LDBG() << "from begin: "
LDBG() << "to inclusive end: "

SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4> readRegions;
SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4> writeRegions;

  MemRefType memrefType;
  if (auto loadOp = dyn_cast<AffineLoadOp>(opInst)) {
    memref = loadOp.getMemRef();
    memrefType = loadOp.getMemRefType();
  } else if (auto storeOp = dyn_cast<AffineStoreOp>(opInst)) {
    memref = storeOp.getMemRef();
    memrefType = storeOp.getMemRefType();

  if ((filterMemRef.has_value() && filterMemRef != memref) ||
      (isa_and_nonnull<IntegerAttr>(memrefType.getMemorySpace()) &&

    LDBG() << "memref definition is inside of the depth at "
           << "which copy-in/copy-out would happen";

  auto region = std::make_unique<MemRefRegion>(opInst->getLoc());
  if (failed(region->compute(opInst, copyDepth, nullptr,
    LDBG() << "Error obtaining memory region: semi-affine maps?";
    LDBG() << "over-approximating to the entire memref";
    if (!getFullMemRefAsRegion(opInst, copyDepth, region.get())) {
      LDBG() << "non-constant memref sizes not yet supported";

      [&](const SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4>
        const auto *const it = targetRegions.find(region->memref);
        if (it == targetRegions.end())
        if (failed(it->second->unionBoundingBox(*region))) {
          LDBG() << "Memory region bounding box failed; "
                 << "over-approximating to the entire memref";
            LDBG() << "non-constant memref sizes not yet supported";
          it->second->getConstraints()->clearAndCopyFrom(
              *region->getConstraints());
          region->getConstraints()->clearAndCopyFrom(
              *it->second->getConstraints());

  bool existsInRead = updateRegion(readRegions);

  bool existsInWrite = updateRegion(writeRegions);

  if (region->isWrite() && !existsInWrite) {
    writeRegions[region->memref] = std::move(region);
  } else if (!region->isWrite() && !existsInRead) {
    readRegions[region->memref] = std::move(region);

  LDBG() << "copy generation failed for one or more memref's in this block";

uint64_t totalCopyBuffersSizeInBytes = 0;

auto processRegions =
    [&](const SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4>
      for (const auto &regionEntry : regions) {

        Block *copyPlacementBlock;
            *regionEntry.second, *block, begin, end, &copyPlacementBlock,
            &copyInPlacementStart, &copyOutPlacementStart);

        uint64_t sizeInBytes;
            *regionEntry.second, block, begin, end, copyPlacementBlock,
            copyInPlacementStart, copyOutPlacementStart, copyOptions,
            fastBufferMap, copyNests, &sizeInBytes, &nBegin, &nEnd);
        if (succeeded(iRet)) {
          totalCopyBuffersSizeInBytes += sizeInBytes;
        ret = ret & succeeded(iRet);
processRegions(readRegions);
processRegions(writeRegions);

  LDBG() << "copy generation failed for one or more memref's in this block";

if (llvm::DebugFlag && (forOp = dyn_cast<AffineForOp>(&*begin))) {
  LLVM_DEBUG(forOp.emitRemark()
             << " KiB of copy buffers in fast memory space for this block");

if (totalCopyBuffersSizeInBytes > copyOptions.fastMemCapacityBytes) {
  block->getParentOp()->emitWarning(
      "total size of all copy buffers' for this block exceeds fast memory "

                              std::prev(forOp.getBody()->end()), copyOptions,
                              filterMemRef, copyNests);
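
// Illustrative sketch (not part of the original file): generating pointwise
// copies into a faster memory space for everything a loop touches, using the
// AffineForOp convenience overload of affineDataCopyGenerate declared in
// mlir/Dialect/Affine/LoopUtils.h. The memory-space numbers and capacity are
// example values.
static LogicalResult copyLoopDataToScratchpad(AffineForOp forOp) {
  AffineCopyOptions options;
  options.generateDma = false; // emit affine.load/store copy loops, not DMAs
  options.slowMemorySpace = 0; // memory space of the original memrefs
  options.fastMemorySpace = 1; // memory space for the fast buffers
  options.tagMemorySpace = 0;  // only used when generating DMAs
  options.fastMemCapacityBytes = 32 * 1024;
  DenseSet<Operation *> copyNests;
  return affine::affineDataCopyGenerate(forOp, options,
                                        /*filterMemRef=*/std::nullopt,
                                        copyNests);
}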
auto begin = analyzedOp->getIterator();
auto end = std::next(begin);

auto err = generateCopy(memrefRegion, block, begin, end, block, begin, end,
                        copyOptions, fastBufferMap, copyNests,

const auto &en = fastBufferMap.find(memrefRegion.memref);

if (en == fastBufferMap.end())

result.alloc = en->second.getDefiningOp();
assert(result.alloc && "fast buffer expected to be locally allocated");
assert(copyNests.size() <= 1 && "At most one copy nest is expected.");
result.copyNest = copyNests.empty() ? nullptr : *copyNests.begin();
assert(currLoopDepth <= depthToLoops.size() && "Unexpected currLoopDepth");
if (currLoopDepth == depthToLoops.size())
  depthToLoops.emplace_back();

for (auto &op : *block) {
  if (auto forOp = dyn_cast<AffineForOp>(op)) {
    depthToLoops[currLoopDepth].push_back(forOp);

for (auto &block : func)

if (!depthToLoops.empty()) {
  assert(depthToLoops.back().empty() && "Last loop level is not empty?");
  depthToLoops.pop_back();
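
// Illustrative sketch (not part of the original file): using gatherLoops,
// declared in mlir/Dialect/Affine/LoopUtils.h, to visit all affine.for ops in
// a function grouped by nesting depth, here to unroll only the deepest level.
static void unrollDeepestLoops(func::FuncOp func) {
  std::vector<SmallVector<AffineForOp, 2>> depthToLoops;
  affine::gatherLoops(func, depthToLoops);
  if (depthToLoops.empty())
    return;
  // The last recorded level holds the most deeply nested loops.
  for (AffineForOp forOp : depthToLoops.back())
    (void)affine::loopUnrollByFactor(forOp, /*unrollFactor=*/4);
}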
return AffineForOp::create(b, loc, lowerOperands, lbMap, upperOperands, ubMap,
auto *context = loops[0].getContext();

llvm::append_range(ops, loops);

for (auto loop : loops) {

  assert(loop.getStepAsInt() == 1 && "point loop step expected to be one");

  unsigned fullTileLbPos, fullTileUbPos;
          .getConstantBoundOnDimSize(0, nullptr,
                                     nullptr, &fullTileLbPos,
    LDBG() << "Can't get constant diff pair for a loop";

  fullTileLb.assign(fLb.begin(), fLb.end());
  fullTileUb.assign(fUb.begin(), fUb.end());

  for (auto lbIndex : lbIndices)
    for (unsigned i = 0, e = cst.getNumCols(); i < e; ++i)
      cst.atIneq(lbIndex, i) = fullTileLb[i] - cst.atIneq(lbIndex, i);

  for (auto ubIndex : ubIndices)
    for (unsigned i = 0, e = cst.getNumCols(); i < e; ++i)
      cst.atIneq(ubIndex, i) -= fullTileUb[i];

return AffineIfOp::create(b, loops[0].getLoc(), ifCondSet, setOperands,
static LogicalResult
  fullTileLoops.reserve(inputNest.size());

for (auto loop : inputNest) {

  if (loop.getStepAsInt() != 1) {
    LDBG() << "[tile separation] non-unit stride not implemented";

  unsigned lbPos, ubPos;
          .getConstantBoundOnDimSize(0, nullptr,
                                     nullptr, &lbPos, &ubPos) ||
    LDBG() << "[tile separation] Can't get constant diff / "
           << "equalities not yet handled";

  cst.getIneqAsAffineValueMap(0, lbPos, lbVmap, b.getContext());
  cst.getIneqAsAffineValueMap(0, ubPos, ubVmap, b.getContext());

  fullTileLoops.push_back(fullTileLoop);

  operandMap.map(loopEn.value().getInductionVar(),
                 fullTileLoops[loopEn.index()].getInductionVar());

for (auto &op : inputNest.back().getBody()->without_terminator())
  b.clone(op, operandMap);
if (inputNest.empty())

auto firstLoop = inputNest[0];

auto prevLoop = firstLoop;
for (auto loop : inputNest.drop_front(1)) {
  assert(loop->getParentOp() == prevLoop && "input not contiguously nested");

  if (!fullTileLoops.empty())
    fullTileLoops.front().erase();

  fullTileLoops.front().erase();
  LDBG() << "All tiles are full tiles, or failure creating "
         << "separation condition";

Block *thenBlock = ifOp.getThenBlock();
AffineForOp outermostFullTileLoop = fullTileLoops[0];
    std::prev(thenBlock->end()),
    outermostFullTileLoop->getBlock()->getOperations(),

Block *elseBlock = ifOp.getElseBlock();
    firstLoop->getBlock()->getOperations(),

*fullTileNest = std::move(fullTileLoops);
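
// Illustrative sketch (not part of the original file): separating full tiles
// from partial ones after tiling, using separateFullTiles, which is assumed
// here to be declared in mlir/Dialect/Affine/LoopUtils.h alongside the
// utilities above. The full-tile copy of the nest ends up guarded by the
// affine.if built in createSeparationCondition.
static LogicalResult tileAndSeparate(MutableArrayRef<AffineForOp> band) {
  SmallVector<AffineForOp, 6> fullTileNest;
  return affine::separateFullTiles(band, &fullTileNest);
}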
LogicalResult result(failure());

if (loops.size() <= 1)

for (unsigned i = 0, e = loops.size(); i < e; ++i) {
  operandsDefinedAbove[i] = i;
  for (unsigned j = 0; j < i; ++j) {
      operandsDefinedAbove[i] = j;

for (unsigned end = loops.size(); end > 0; --end) {
  for (; start < end - 1; ++start) {
        *std::max_element(std::next(operandsDefinedAbove.begin(), start),
                          std::next(operandsDefinedAbove.begin(), end));
    assert(maxPos == start &&
           "expected loop bounds to be known at the start of the band");

  if (start != end - 1)

while (auto loopOp = currentOp->getParentOfType<LoopLikeOpInterface>()) {
  if (!loopOp.isDefinedOutsideOfLoop(operand.get()))
static LogicalResult performPreTilingChecks(MutableArrayRef< AffineForOp > input, ArrayRef< t > tileSizes)
Check if the input nest is supported for tiling and whether tiling would be legal or not.
static void constructParametricallyTiledIndexSetHyperRect(MutableArrayRef< AffineForOp > origLoops, MutableArrayRef< AffineForOp > newLoops, ArrayRef< Value > tileSizes)
Constructs and sets new loop bounds after tiling for the case of hyper-rectangular index sets,...
static void constructTiledLoopNest(MutableArrayRef< AffineForOp > origLoops, AffineForOp rootAffineForOp, unsigned width, MutableArrayRef< AffineForOp > tiledLoops)
Constructs tiled loop nest, without setting the loop bounds and move the body of the original loop ne...
static bool getFullMemRefAsRegion(Operation *op, unsigned numParamLoopIVs, MemRefRegion *region)
Construct the memref region to just include the entire memref.
static SmallVector< AffineForOp, 8 > stripmineSink(AffineForOp forOp, uint64_t factor, ArrayRef< AffineForOp > targets)
static InFlightDiagnostic LLVM_ATTRIBUTE_UNUSED emitRemarkForBlock(Block &block)
static LogicalResult checkIfHyperRectangular(MutableArrayRef< AffineForOp > input)
Checks whether a loop nest is hyper-rectangular or not.
static void findHighestBlockForPlacement(const MemRefRegion ®ion, Block &block, Block::iterator &begin, Block::iterator &end, Block **copyPlacementBlock, Block::iterator *copyInPlacementStart, Block::iterator *copyOutPlacementStart)
Given a memref region, determine the lowest depth at which transfers can be placed for it,...
static AffineForOp generateShiftedLoop(AffineMap lbMap, AffineMap ubMap, const std::vector< std::pair< uint64_t, ArrayRef< Operation * >>> &opGroupQueue, unsigned offset, AffineForOp srcForOp, OpBuilder b)
Generates an affine.for op with the specified lower and upper bounds while generating the right IV re...
static void moveLoopBodyImpl(AffineForOp src, AffineForOp dest, Block::iterator loc)
Move the loop body of AffineForOp 'src' from 'src' into the specified location in destination's body,...
static void setInterTileBoundsParametric(OpBuilder &b, AffineForOp origLoop, AffineForOp newLoop, Value tileSize)
Set lower and upper bounds of inter-tile loops for parametric tiling.
static void setIntraTileBoundsParametric(OpBuilder &b, AffineForOp origLoop, AffineForOp newInterTileLoop, AffineForOp newIntraTileLoop, Value tileSize)
Set lower and upper bounds of intra-tile loops for parametric tiling.
static void gatherLoopsInBlock(Block *block, unsigned currLoopDepth, std::vector< SmallVector< AffineForOp, 2 >> &depthToLoops)
Gathers all AffineForOps in 'block' at 'currLoopDepth' in 'depthToLoops'.
static void generateUnrolledLoop(Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor, function_ref< Value(unsigned, Value, OpBuilder)> ivRemapFn, function_ref< void(unsigned, Operation *, OpBuilder)> annotateFn, ValueRange iterArgs, ValueRange yieldedValues)
Generates unrolled copies of AffineForOp 'loopBodyBlock', with associated 'forOpIV' by 'unrollFactor'...
static LogicalResult generateCopy(const MemRefRegion ®ion, Block *block, Block::iterator begin, Block::iterator end, Block *copyPlacementBlock, Block::iterator copyInPlacementStart, Block::iterator copyOutPlacementStart, const AffineCopyOptions ©Options, DenseMap< Value, Value > &fastBufferMap, DenseSet< Operation * > ©Nests, uint64_t *sizeInBytes, Block::iterator *nBegin, Block::iterator *nEnd)
Creates a buffer in the faster memory space for the specified memref region (memref has to be non-zer...
static LogicalResult createFullTiles(MutableArrayRef< AffineForOp > inputNest, SmallVectorImpl< AffineForOp > &fullTileLoops, OpBuilder b)
Create the full tile loop nest (along with its body).
static void getMultiLevelStrides(const MemRefRegion ®ion, ArrayRef< int64_t > bufferShape, SmallVectorImpl< StrideInfo > *strideInfos)
Returns striding information for a copy/transfer of this region with potentially multiple striding le...
static void constructTiledIndexSetHyperRect(MutableArrayRef< AffineForOp > origLoops, MutableArrayRef< AffineForOp > newLoops, ArrayRef< unsigned > tileSizes)
Constructs and sets new loop bounds after tiling for the case of hyper-rectangular index sets,...
static LogicalResult generateCleanupLoopForUnroll(AffineForOp forOp, uint64_t unrollFactor)
Helper to generate cleanup loop for unroll or unroll-and-jam when the trip count is not a multiple of...
static bool areInnerBoundsInvariant(AffineForOp forOp)
Check if all control operands of all loops are defined outside of forOp and return false if not.
static bool checkLoopInterchangeDependences(const std::vector< SmallVector< DependenceComponent, 2 >> &depCompsVec, ArrayRef< AffineForOp > loops, ArrayRef< unsigned > loopPermMap)
static void moveLoopBody(AffineForOp src, AffineForOp dest)
Move the loop body of AffineForOp 'src' from 'src' to the start of dest body.
static AffineIfOp createSeparationCondition(MutableArrayRef< AffineForOp > loops, OpBuilder b)
Creates an AffineIfOp that encodes the conditional to choose between the constant trip count version ...
static void getCleanupLoopLowerBound(AffineForOp forOp, unsigned unrollFactor, AffineMap &cleanupLbMap, SmallVectorImpl< Value > &cleanupLbOperands)
Computes the cleanup loop lower bound of the loop being unrolled with the specified unroll factor; th...
static void augmentMapAndBounds(OpBuilder &b, Value iv, AffineMap *map, SmallVector< Value, 4 > *operands, int64_t offset=0)
static AffineForOp generatePointWiseCopy(Location loc, Value memref, Value fastMemRef, ArrayRef< AffineMap > lbMaps, ArrayRef< Value > lbOperands, ArrayRef< AffineMap > ubMaps, ArrayRef< Value > ubOperands, ArrayRef< AffineExpr > fastBufOffsets, bool isCopyOut, OpBuilder b)
Generates a point-wise copy from/to a non-zero ranked ‘memref’ to/from ‘fastMemRef’ and returns the o...
static void replaceIterArgsAndYieldResults(AffineForOp forOp)
Helper to replace uses of loop carried values (iter_args) and loop yield values while promoting singl...
static Value max(ImplicitLocOpBuilder &builder, Value value, Value bound)
Base type for affine expression.
AffineExpr floorDiv(uint64_t v) const
A multi-dimensional affine map Affine map's are immutable like Type's, and they are uniqued.
static AffineMap get(MLIRContext *context)
Returns a zero result affine map with no dimensions or symbols: () -> ().
unsigned getNumSymbols() const
unsigned getNumDims() const
ArrayRef< AffineExpr > getResults() const
unsigned getNumResults() const
unsigned getNumInputs() const
AffineExpr getResult(unsigned idx) const
Block represents an ordered list of Operations.
OpListType::iterator iterator
Operation * findAncestorOpInBlock(Operation &op)
Returns 'op' if 'op' lies in this block, or otherwise finds the ancestor operation of 'op' that lies ...
Region * getParent() const
Provide a 'getParent' method for ilist_node_with_parent methods.
RetT walk(FnT &&callback)
Walk all nested operations, blocks (including this block) or regions, depending on the type of callba...
Operation * getTerminator()
Get the terminator operation of this block.
OpListType & getOperations()
Operation * getParentOp()
Returns the closest surrounding operation that contains this block.
AffineMap getSingleDimShiftAffineMap(int64_t shift)
Returns a map that shifts its (single) input dimension by 'shift'.
AffineMap getShiftedAffineMap(AffineMap map, int64_t shift)
Returns an affine map that is a translation (shift) of all result expressions in 'map' by 'shift'.
AffineMap getDimIdentityMap()
AffineMap getMultiDimIdentityMap(unsigned rank)
AffineExpr getAffineSymbolExpr(unsigned position)
AffineExpr getAffineConstantExpr(int64_t constant)
AffineExpr getAffineDimExpr(unsigned position)
MLIRContext * getContext() const
IntegerSet getAsIntegerSet(MLIRContext *context) const
Returns the constraint system as an integer set.
void getValues(unsigned start, unsigned end, SmallVectorImpl< Value > *values) const
Returns the Values associated with variables in range [start, end).
This class allows control over how the GreedyPatternRewriteDriver works.
GreedyRewriteConfig & setStrictness(GreedyRewriteStrictness mode)
This is a utility class for mapping one set of IR entities to another.
auto lookup(T from) const
Lookup a mapped value within the map.
void map(Value from, Value to)
Inserts a new mapping for 'from' to 'to'.
IRValueT get() const
Return the current value being used by this operand.
This class coordinates rewriting a piece of IR outside of a pattern rewrite, providing a way to keep ...
This class represents a diagnostic that is inflight and set to be reported.
An integer set representing a conjunction of one or more affine equalities and inequalities.
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
This class helps build Operations.
Operation * clone(Operation &op, IRMapping &mapper)
Creates a deep copy of the specified operation, remapping any operands that use values outside of the...
void setInsertionPointToStart(Block *block)
Sets the insertion point to the start of the specified block.
static OpBuilder atBlockTerminator(Block *block, Listener *listener=nullptr)
Create a builder and set the insertion point to before the block terminator.
void setInsertionPointAfter(Operation *op)
Sets the insertion point to the node after the specified operation, which will cause subsequent inser...
This class represents an operand of an operation.
Set of flags used to control the behavior of the various IR print methods (e.g.
A wrapper class that allows for printing an operation with a set of flags, useful to act as a "stream...
This class implements the operand iterators for the Operation class.
Operation is the basic unit of execution within MLIR.
bool isBeforeInBlock(Operation *other)
Given an operation 'other' that is within the same parent block, return whether the current operation...
Operation * clone(IRMapping &mapper, CloneOptions options=CloneOptions::all())
Create a deep copy of this operation, remapping any operands that use values outside of the operation...
Location getLoc()
The source location the operation was defined or derived from.
Block * getBlock()
Returns the operation block that contains this operation.
OpTy getParentOfType()
Return the closest surrounding parent operation that is of type 'OpTy'.
void setOperands(ValueRange operands)
Replace the current operands of this operation with the ones provided in 'operands'.
Region * getParentRegion()
Returns the region to which the instruction belongs.
InFlightDiagnostic emitRemark(const Twine &message={})
Emit a remark about this operation, reporting up to any diagnostic handlers that may be listening.
bool isAncestor(Region *other)
Return true if this region is ancestor of the other region.
ParentT getParentOfType()
Find the first parent operation of the given type, or nullptr if there is no ancestor operation.
This class provides an abstraction over the different types of ranges over Values.
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
bool use_empty() const
Returns true if this value has no uses.
Type getType() const
Return the type of this value.
Operation * getDefiningOp() const
If this value is the result of an operation, return the operation that defines it.
Region * getParentRegion()
Return the Region in which this Value is defined.
static WalkResult advance()
AffineBound represents a lower or upper bound in the for operation.
Value getOperand(unsigned idx)
operand_range getOperands()
unsigned getNumOperands()
static AffineDmaStartOp create(OpBuilder &builder, Location location, Value srcMemRef, AffineMap srcMap, ValueRange srcIndices, Value destMemRef, AffineMap dstMap, ValueRange destIndices, Value tagMemRef, AffineMap tagMap, ValueRange tagIndices, Value numElements, Value stride=nullptr, Value elementsPerStride=nullptr)
static AffineDmaWaitOp create(OpBuilder &builder, Location location, Value tagMemRef, AffineMap tagMap, ValueRange tagIndices, Value numElements)
An AffineValueMap is an affine map plus its ML value operands and results for analysis purposes.
ArrayRef< Value > getOperands() const
AffineMap getAffineMap() const
FlatAffineValueConstraints is an extension of FlatLinearValueConstraints with helper functions for Af...
static ConstantIndexOp create(OpBuilder &builder, Location location, int64_t value)
Operation * getOwner() const
Return the owner of this operand.
An IntegerRelation represents the set of points from a PresburgerSpace that satisfy a list of affine ...
void removeIndependentConstraints(unsigned pos, unsigned num)
Removes constraints that are independent of (i.e., do not have a coefficient) variables in the range ...
void removeTrivialRedundancy()
Removes duplicate constraints, trivially true constraints, and constraints that can be detected as re...
ArrayRef< DynamicAPInt > getInequality(unsigned idx) const
DynamicAPInt atIneq(unsigned i, unsigned j) const
Returns the value at the specified inequality row and column.
bool isHyperRectangular(unsigned pos, unsigned num) const
Returns true if the set can be trivially detected as being hyper-rectangular on the specified contigu...
unsigned getNumVars() const
void setDimSymbolSeparation(unsigned newSymbolCount)
Changes the partition between dimensions and symbols.
unsigned getNumDimAndSymbolVars() const
unsigned getNumCols() const
Returns the number of columns in the constraint system.
void getLowerAndUpperBoundIndices(unsigned pos, SmallVectorImpl< unsigned > *lbIndices, SmallVectorImpl< unsigned > *ubIndices, SmallVectorImpl< unsigned > *eqIndices=nullptr, unsigned offset=0, unsigned num=0) const
Gather positions of all lower and upper bounds of the variable at pos, and optionally any equalities ...
void removeVar(VarKind kind, unsigned pos)
Removes variables of the specified kind with the specified pos (or within the specified range) from t...
bool isParallelLoop(Operation &op)
std::optional< uint64_t > getConstantTripCount(AffineForOp forOp)
Returns the trip count of the loop if it's a constant, std::nullopt otherwise.
void getDependenceComponents(AffineForOp forOp, unsigned maxLoopDepth, std::vector< SmallVector< DependenceComponent, 2 >> *depCompsVec)
Returns in 'depCompsVec', dependence components for dependences between all load and store ops in loo...
LogicalResult coalesceLoops(MutableArrayRef< AffineForOp > loops)
Replace a perfect nest of "for" loops with a single linearized loop.
LogicalResult loopUnrollFull(AffineForOp forOp)
Unrolls this for operation completely if the trip count is known to be constant.
LogicalResult promoteIfSingleIteration(AffineForOp forOp)
Promotes the loop body of a AffineForOp to its containing block if the loop was known to have a singl...
LogicalResult affineDataCopyGenerate(Block::iterator begin, Block::iterator end, const AffineCopyOptions ©Options, std::optional< Value > filterMemRef, DenseSet< Operation * > ©Nests)
Performs explicit copying for the contiguous sequence of operations in the block iterator range [‘beg...
LogicalResult loopUnrollJamUpToFactor(AffineForOp forOp, uint64_t unrollJamFactor)
Unrolls and jams this loop by the specified factor or by the trip count (if constant),...
void extractForInductionVars(ArrayRef< AffineForOp > forInsts, SmallVectorImpl< Value > *ivs)
Extracts the induction variables from a list of AffineForOps and places them in the output argument i...
LogicalResult loopUnrollByFactor(AffineForOp forOp, uint64_t unrollFactor, function_ref< void(unsigned, Operation *, OpBuilder)> annotateFn=nullptr, bool cleanUpUnroll=false)
Unrolls this for operation by the specified unroll factor.
void getEnclosingAffineOps(Operation &op, SmallVectorImpl< Operation * > *ops)
Populates 'ops' with affine operations enclosing op ordered from outermost to innermost while stoppin...
void gatherLoops(func::FuncOp func, std::vector< SmallVector< AffineForOp, 2 >> &depthToLoops)
Gathers all AffineForOps in 'func.func' grouped by loop depth.
bool LLVM_ATTRIBUTE_UNUSED isPerfectlyNested(ArrayRef< AffineForOp > loops)
Returns true if loops is a perfectly nested loop nest, where loops appear in it from outermost to inn...
LogicalResult getIndexSet(MutableArrayRef< Operation * > ops, FlatAffineValueConstraints *domain)
Builds a system of constraints with dimensional variables corresponding to the loop IVs of the forOps...
AffineForOp createCanonicalizedAffineForOp(OpBuilder b, Location loc, ValueRange lbOperands, AffineMap lbMap, ValueRange ubOperands, AffineMap ubMap, int64_t step=1)
Creates an AffineForOp while ensuring that the lower and upper bounds are canonicalized,...
void getPerfectlyNestedLoops(SmallVectorImpl< AffineForOp > &nestedLoops, AffineForOp root)
Get perfectly nested sequence of loops starting at root of loop nest (the first op being another Affi...
LogicalResult affineForOpBodySkew(AffineForOp forOp, ArrayRef< uint64_t > shifts, bool unrollPrologueEpilogue=false)
Skew the operations in an affine.for's body with the specified operation-wise shifts.
void getTripCountMapAndOperands(AffineForOp forOp, AffineMap *map, SmallVectorImpl< Value > *operands)
Returns the trip count of the loop as an affine map with its corresponding operands if the latter is ...
bool isValidLoopInterchangePermutation(ArrayRef< AffineForOp > loops, ArrayRef< unsigned > loopPermMap)
Checks if the loop interchange permutation 'loopPermMap', of the perfectly nested sequence of loops in 'loops', would violate dependences.
void getSupportedReductions(AffineForOp forOp, SmallVectorImpl< LoopReduction > &supportedReductions)
Populate supportedReductions with descriptors of the supported reductions.
LogicalResult generateCopyForMemRegion(const MemRefRegion &memrefRegion, Operation *analyzedOp, const AffineCopyOptions ©Options, CopyGenerateResult &result)
generateCopyForMemRegion is similar to affineDataCopyGenerate, but works with a single memref region.
void canonicalizeMapAndOperands(AffineMap *map, SmallVectorImpl< Value > *operands)
Modifies both map and operands in-place so as to canonicalize them: duplicate operands are dropped, unused dims and symbols are removed from the map, and constant operands are folded in.
LogicalResult loopUnrollUpToFactor(AffineForOp forOp, uint64_t unrollFactor)
Unrolls this loop by the specified unroll factor or its trip count, whichever is lower.
unsigned permuteLoops(ArrayRef< AffineForOp > inputNest, ArrayRef< unsigned > permMap)
Performs a loop permutation on a perfectly nested loop nest inputNest (where the contained loops appear from outer to inner) as specified by the permutation permMap.
LogicalResult loopUnrollJamByFactor(AffineForOp forOp, uint64_t unrollJamFactor)
Unrolls and jams this loop by the specified factor.
LogicalResult tilePerfectlyNestedParametric(MutableArrayRef< AffineForOp > input, ArrayRef< Value > tileSizes, SmallVectorImpl< AffineForOp > *tiledNest=nullptr)
Tiles the specified band of perfectly nested loops creating tile-space loops and intra-tile loops, using SSA values as tiling parameters.
void fullyComposeAffineMapAndOperands(AffineMap *map, SmallVectorImpl< Value > *operands, bool composeAffineMin=false)
Given an affine map map and its input operands, this method composes into map, maps of AffineApplyOps whose results are the values in operands, iteratively until no more of operands are the result of an AffineApplyOp.
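The two utilities are commonly chained; a sketch over a caller-provided map and operand list:

    static void composeAndCanonicalize(AffineMap &map,
                                       SmallVectorImpl<Value> &operands) {
      // Fold producing affine.apply ops into the map, then clean up operands.
      affine::fullyComposeAffineMapAndOperands(&map, &operands);
      affine::canonicalizeMapAndOperands(&map, &operands);
    }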
void canonicalizeSetAndOperands(IntegerSet *set, SmallVectorImpl< Value > *operands)
Canonicalizes an integer set the same way canonicalizeMapAndOperands does for affine maps.
void getAffineForIVs(Operation &op, SmallVectorImpl< AffineForOp > *loops)
Populates 'loops' with IVs of the affine.for ops surrounding 'op' ordered from the outermost 'affine.for' operation to the innermost one.
uint64_t getLargestDivisorOfTripCount(AffineForOp forOp)
Returns the greatest known integral divisor of the trip count.
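One use is picking an unroll factor that leaves no cleanup loop; a heuristic sketch capping the factor at 8 (requires <numeric> for std::gcd):

    uint64_t divisor = affine::getLargestDivisorOfTripCount(forOp);
    uint64_t factor = std::gcd(divisor, uint64_t(8));
    if (factor > 1)
      (void)affine::loopUnrollByFactor(forOp, factor); // divides evenly, no cleanup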
std::optional< uint64_t > getIntOrFloatMemRefSizeInBytes(MemRefType memRefType)
Returns the size of a memref with element type int or float in bytes if it's statically shaped, std::nullopt otherwise.
int64_t numEnclosingInvariantLoops(OpOperand &operand)
Count the number of loops surrounding operand such that operand could be hoisted above.
unsigned getNestingDepth(Operation *op)
Returns the nesting depth of this operation, i.e., the number of loops surrounding this operation.
void mapLoopToProcessorIds(scf::ForOp forOp, ArrayRef< Value > processorId, ArrayRef< Value > numProcessors)
Maps forOp for execution on a parallel grid of virtual processorIds of size given by numProcessors.
bool isOpwiseShiftValid(AffineForOp forOp, ArrayRef< uint64_t > shifts)
Checks whether SSA dominance would be violated if a for op's body operations are shifted by the specified shifts.
SmallVector< SmallVector< AffineForOp, 8 >, 8 > tile(ArrayRef< AffineForOp > forOps, ArrayRef< uint64_t > sizes, ArrayRef< AffineForOp > targets)
Performs tiling of imperfectly nested loops (with interchange) by strip-mining the forOps by sizes and sinking them, in their order of occurrence in forOps, under each of the targets.
AffineForOp sinkSequentialLoops(AffineForOp forOp)
LogicalResult tilePerfectlyNested(MutableArrayRef< AffineForOp > input, ArrayRef< unsigned > tileSizes, SmallVectorImpl< AffineForOp > *tiledNest=nullptr)
Tiles the specified band of perfectly nested loops creating tile-space loops and intra-tile loops.
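A sketch that tiles a whole band with a uniform size of 32, assuming rootForOp is the band's root loop:

    SmallVector<affine::AffineForOp, 4> band;
    affine::getPerfectlyNestedLoops(band, rootForOp);
    SmallVector<unsigned, 4> tileSizes(band.size(), 32);
    SmallVector<affine::AffineForOp, 8> tiledNest;
    if (succeeded(affine::tilePerfectlyNested(band, tileSizes, &tiledNest))) {
      // tiledNest holds the tile-space loops followed by the intra-tile loops.
    }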
void interchangeLoops(AffineForOp forOpA, AffineForOp forOpB)
Performs loop interchange on 'forOpA' and 'forOpB'.
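A sketch that checks dependences before swapping an adjacent pair, assuming outer immediately encloses inner:

    affine::AffineForOp nest[] = {outer, inner};
    if (affine::isValidLoopInterchangePermutation(nest, /*loopPermMap=*/{1, 0}))
      affine::interchangeLoops(outer, inner);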
LogicalResult replaceAllMemRefUsesWith(Value oldMemRef, Value newMemRef, ArrayRef< Value > extraIndices={}, AffineMap indexRemap=AffineMap(), ArrayRef< Value > extraOperands={}, ArrayRef< Value > symbolOperands={}, llvm::function_ref< bool(Operation *)> userFilterFn=nullptr, bool allowNonDereferencingOps=false, bool replaceInDeallocOp=false)
Replaces all "dereferencing" uses of oldMemRef with newMemRef while optionally remapping the old memref's indices using the supplied affine map, indexRemap.
LogicalResult coalescePerfectlyNestedAffineLoops(AffineForOp op)
Walk an affine.for to find a band to coalesce.
LogicalResult separateFullTiles(MutableArrayRef< AffineForOp > nest, SmallVectorImpl< AffineForOp > *fullTileNest=nullptr)
Separates full tiles from partial tiles for a perfect nest nest by generating a conditional guard that selects between the full tile version and the partial tile version of the nest.
Value getReductionOp(AtomicRMWKind op, OpBuilder &builder, Location loc, Value lhs, Value rhs)
Returns the value obtained by applying the reduction operation kind associated with a binary AtomicRMWKind op to lhs and rhs.
constexpr void enumerate(std::tuple< Tys... > &tuple, CallbackT &&callback)
llvm::TypeSize divideCeil(llvm::TypeSize numerator, uint64_t denominator)
Divides the known min value of the numerator by the denominator and rounds the result up to the next integer.
Include the generated interface declarations.
AffineMap simplifyAffineMap(AffineMap map)
Simplifies an affine map by simplifying its underlying AffineExpr results.
void replaceAllUsesInRegionWith(Value orig, Value replacement, Region ®ion)
Replace all uses of orig within the given region with replacement.
AffineMap removeDuplicateExprs(AffineMap map)
Returns a map with the same dimension and symbol count as map, but whose results are the unique affine expressions of map.
LogicalResult applyOpPatternsGreedily(ArrayRef< Operation * > ops, const FrozenRewritePatternSet &patterns, GreedyRewriteConfig config=GreedyRewriteConfig(), bool *changed=nullptr, bool *allErased=nullptr)
Rewrite the specified ops by repeatedly applying the highest benefit patterns in a greedy worklist driven manner until a fixpoint is reached.
const FrozenRewritePatternSet & patterns
void bindSymbols(MLIRContext *ctx, AffineExprTy &...exprs)
Bind a list of AffineExpr references to SymbolExpr at positions [0 .. sizeof...(exprs)].
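For example, building the map (s0, s1) -> (s0 ceildiv s1) without manual getAffineSymbolExpr calls, assuming ctx is the pass's MLIRContext:

    AffineExpr s0, s1;
    bindSymbols(ctx, s0, s1);
    AffineMap map = AffineMap::get(/*dimCount=*/0, /*symbolCount=*/2,
                                   s0.ceilDiv(s1));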
auto get(MLIRContext *context, Ts &&...params)
Helper method that injects context only if needed, this helps unify some of the attribute construction methods.
bool areValuesDefinedAbove(Range values, Region &limit)
Check if all values in the provided range are defined above the limit region.
@ ExistingAndNewOps
Only pre-existing and newly created ops are processed.
SmallVector< std::pair< Block::iterator, Block::iterator > > subBlocks
Explicit copy / DMA generation options for mlir::affineDataCopyGenerate.
Result for calling generateCopyForMemRegion.
std::optional< int64_t > ub
std::optional< int64_t > lb
A description of a (parallelizable) reduction in an affine loop.
A region of a memref's data space; this is typically constructed by analyzing load/store op's on this memref.
std::optional< int64_t > getConstantBoundingSizeAndShape(SmallVectorImpl< int64_t > *shape=nullptr, SmallVectorImpl< AffineMap > *lbs=nullptr) const
Returns a constant upper bound on the number of elements in this region if bounded by a known constant (always possible for static shapes), std::nullopt otherwise.
FlatAffineValueConstraints * getConstraints()
void getLowerAndUpperBound(unsigned pos, AffineMap &lbMap, AffineMap &ubMap) const
Gets the lower and upper bound map for the dimensional variable at pos.
Value memref
Memref that this region corresponds to.
Location loc
If there is more than one load/store op associated with the region, the location information would correspond to one of those op's.
Eliminates variable at the specified position using Fourier-Motzkin variable elimination.