#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

#define DEBUG_TYPE "loop-utils"

using namespace affine;
using namespace presburger;
using llvm::SmallMapVector;
auto lbMap = forOp.getLowerBoundMap();
auto lb = b.create<AffineApplyOp>(forOp.getLoc(), lbMap,
                                  forOp.getLowerBoundOperands());

int64_t step = forOp.getStepAsInt();
for (unsigned i = 0, e = tripCountMap.getNumResults(); i < e; i++) {
  auto tripCountExpr = tripCountMap.getResult(i);
  bumpExprs[i] = (tripCountExpr - tripCountExpr % unrollFactor) * step;
}

b.create<AffineApplyOp>(forOp.getLoc(), bumpMap, tripCountOperands);

for (unsigned i = 0, e = bumpExprs.size(); i < e; i++)

cleanupLbOperands.clear();
cleanupLbOperands.push_back(lb);
cleanupLbOperands.append(bumpValues.begin(), bumpValues.end());

for (auto v : bumpValues)
  if (v.use_empty())
    v.getDefiningOp()->erase();
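// Illustrative note (not part of the original source): with trip-count
// expression N, step s, and unrollFactor 4, the bump expression above is
// (N - N mod 4) * s, so the cleanup loop's lower bound is
//   %cleanup_lb = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%lb, %bump)
// i.e. the first iteration left over after the unrolled portion.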
auto iterOperands = forOp.getInits();
auto iterArgs = forOp.getRegionIterArgs();
for (auto e : llvm::zip(iterOperands, iterArgs))
  std::get<1>(e).replaceAllUsesWith(std::get<0>(e));

auto outerResults = forOp.getResults();
auto innerResults = forOp.getBody()->getTerminator()->getOperands();
for (auto e : llvm::zip(outerResults, innerResults))
  std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
if (!tripCount || *tripCount != 1)

if (forOp.getLowerBoundMap().getNumResults() != 1)

auto iv = forOp.getInductionVar();
auto *parentBlock = forOp->getBlock();
if (!iv.use_empty()) {
  if (forOp.hasConstantLowerBound()) {
    OpBuilder topBuilder(forOp->getParentOfType<func::FuncOp>().getBody());
    auto constOp = topBuilder.create<arith::ConstantIndexOp>(
        forOp.getLoc(), forOp.getConstantLowerBound());
    iv.replaceAllUsesWith(constOp);
  } else {
    auto lbOperands = forOp.getLowerBoundOperands();
    auto lbMap = forOp.getLowerBoundMap();

    iv.replaceAllUsesWith(lbOperands[0]);

    builder.create<AffineApplyOp>(forOp.getLoc(), lbMap, lbOperands);

forOp.getBody()->back().erase();
parentBlock->getOperations().splice(Block::iterator(forOp),
                                    forOp.getBody()->getOperations());
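// Illustrative note (not part of the original source): a single-iteration
// loop such as
//   affine.for %i = 5 to 6 { "use"(%i) : (index) -> () }
// is promoted by replacing %i with a constant index 5 and splicing the body
// into the parent block, after which the affine.for itself can be erased.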
unsigned offset, AffineForOp srcForOp, OpBuilder b) {
auto lbOperands = srcForOp.getLowerBoundOperands();
auto ubOperands = srcForOp.getUpperBoundOperands();

auto loopChunk =
    b.create<AffineForOp>(srcForOp.getLoc(), lbOperands, lbMap, ubOperands,
                          ubMap, srcForOp.getStepAsInt());
auto loopChunkIV = loopChunk.getInductionVar();
auto srcIV = srcForOp.getInductionVar();

for (const auto &it : llvm::drop_begin(opGroupQueue, offset)) {
  uint64_t shift = it.first;
  auto ops = it.second;

  if (!srcIV.use_empty() && shift != 0) {
    auto ivRemap = bodyBuilder.create<AffineApplyOp>(
        srcForOp.getLoc(),
        bodyBuilder.getSingleDimShiftAffineMap(
            -static_cast<int64_t>(srcForOp.getStepAsInt() * shift)),
        loopChunkIV);
    operandMap.map(srcIV, ivRemap);
  } else {
    operandMap.map(srcIV, loopChunkIV);
  }
  for (auto *op : ops)
    bodyBuilder.clone(*op, operandMap);
}
return AffineForOp();
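// Illustrative note (not part of the original source): for shift == 2 and
// step == 1, the remap above is
//   %iv_src = affine.apply affine_map<(d0) -> (d0 - 2)>(%loop_chunk_iv)
// so cloned ops still see the source-loop iteration they were shifted from.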
bool unrollPrologueEpilogue) {
assert(forOp.getBody()->getOperations().size() == shifts.size() &&
       "too few/many shifts");
if (forOp.getBody()->begin() == std::prev(forOp.getBody()->end()))

if (!mayBeConstTripCount) {
  LLVM_DEBUG(forOp.emitRemark("non-constant trip count loop not handled"));

uint64_t tripCount = *mayBeConstTripCount;

       "shifts will lead to an invalid transformation\n");

int64_t step = forOp.getStepAsInt();

unsigned numChildOps = shifts.size();

uint64_t maxShift = *llvm::max_element(shifts);
if (maxShift >= numChildOps) {
  forOp.emitWarning("not shifting because shifts are unrealistically large");

std::vector<std::vector<Operation *>> sortedOpGroups(maxShift + 1);

for (auto &op : forOp.getBody()->without_terminator()) {
  auto shift = shifts[pos++];
  sortedOpGroups[shift].push_back(&op);
}

AffineForOp prologue, epilogue;

std::vector<std::pair<uint64_t, ArrayRef<Operation *>>> opGroupQueue;

auto origLbMap = forOp.getLowerBoundMap();
uint64_t lbShift = 0;

for (uint64_t d = 0, e = sortedOpGroups.size(); d < e; ++d) {
  if (sortedOpGroups[d].empty())
    continue;
  if (!opGroupQueue.empty()) {
    assert(d >= 1 &&
           "Queue expected to be empty when the first block is found");
    if (lbShift + tripCount * step < d * step) {
          opGroupQueue, 0, forOp, b);
      opGroupQueue.clear();
      lbShift += tripCount * step;
    } else {
          opGroupQueue, 0, forOp, b);
    }

    AffineForOp::getCanonicalizationPatterns(patterns, res.getContext());

        config, nullptr, &erased);
    if (!erased && !prologue)
  }
  opGroupQueue.emplace_back(d, sortedOpGroups[d]);
}

for (unsigned i = 0, e = opGroupQueue.size(); i < e; ++i) {
  uint64_t ubShift = (opGroupQueue[i].first + tripCount) * step;
      opGroupQueue, i, forOp, b);
}

if (unrollPrologueEpilogue && prologue)
if (unrollPrologueEpilogue && !epilogue && epilogue != prologue)
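// Illustrative note (not part of the original source): skewing a two-op body
// with shifts {0, 1} on
//   affine.for %i = 0 to 8 { %v = "a"(%i) ; "b"(%v) }
// yields a prologue running only "a", a steady-state loop where "b" consumes
// the value "a" produced one iteration earlier, and an epilogue running the
// final "b"; unrollPrologueEpilogue optionally fully unrolls those two loops.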
if (input.size() <= 1)

LLVM_DEBUG(llvm::dbgs() << "Index set computation failed!\n");

LLVM_DEBUG(llvm::dbgs()
           << "Non-hyperrectangular nests not supported for tiling!\n");
template <typename t>

assert(input.size() == tileSizes.size() && "Too few/many tile sizes");

if (llvm::any_of(input,
                 [](AffineForOp op) { return op.getNumResults() > 0; })) {
  LLVM_DEBUG(llvm::dbgs()
             << "Cannot tile nest where a loop has yield values\n");

LLVM_DEBUG(llvm::dbgs() << "input loops not perfectly nested");
auto &ops = src.getBody()->getOperations();
dest.getBody()->getOperations().splice(loc, ops, ops.begin(),
                                       std::prev(ops.end()));
AffineForOp rootAffineForOp, unsigned width,

Location loc = rootAffineForOp.getLoc();

Operation *topLoop = rootAffineForOp.getOperation();
AffineForOp innermostPointLoop;

for (unsigned i = 0; i < width; i++) {
  AffineForOp pointLoop = b.create<AffineForOp>(loc, 0, 0);
  pointLoop.getBody()->getOperations().splice(
      pointLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
      topLoop);
  tiledLoops[2 * width - 1 - i] = pointLoop;
  topLoop = pointLoop.getOperation();
  if (i == 0)
    innermostPointLoop = pointLoop;
}

for (unsigned i = width; i < 2 * width; i++) {
  AffineForOp tileSpaceLoop = b.create<AffineForOp>(loc, 0, 0);
  tileSpaceLoop.getBody()->getOperations().splice(
      tileSpaceLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
      topLoop);
  tiledLoops[2 * width - i - 1] = tileSpaceLoop;
  topLoop = tileSpaceLoop.getOperation();
}
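// Illustrative note (not part of the original source): for width == 2 the
// nest built above is, outermost to innermost,
//   tile-space loop 0 -> tile-space loop 1 -> point loop 0 -> point loop 1
// i.e. 2 * width loops whose bounds are still unset at this point; the
// original body ends up inside the innermost point loop.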
AffineForOp newInterTileLoop, AffineForOp newIntraTileLoop,

assert(origLoop.hasConstantLowerBound() &&
       "expected input loops to have constant lower bound.");

lbOperands.push_back(newInterTileLoop.getInductionVar());
ubOperands.push_back(newInterTileLoop.getInductionVar());

lbOperands.push_back(tileSize);
ubOperands.push_back(tileSize);

lbBoundExprs.push_back(
    ((lbLoopIvExpr - origLowerBoundExpr) * lbTileParameter) +
    origLowerBoundExpr);

ubBoundExprs.push_back(
    ((ubLoopIvExpr - origLowerBoundExpr) * ubTileParameter) +
    (ubTileParameter * origLoopStep) + origLowerBoundExpr);

ubBoundExprs.append(origUbMap.getResults().begin(),
                    origUbMap.getResults().end());

newIntraTileLoop.setLowerBound(lbOperands, lbMap);

newIntraTileLoop.setUpperBound(ubOperands, ubMap);

newIntraTileLoop.setStep(origLoop.getStepAsInt());

AffineForOp newLoop, Value tileSize) {
OperandRange newLbOperands = origLoop.getLowerBoundOperands();

newLoop.setLowerBound(newLbOperands, origLoop.getLowerBoundMap());

assert(origLoop.hasConstantLowerBound() &&
       "expected input loops to have constant lower bound.");

ubOperands.push_back(tileSize);

int64_t origUpperBound;

if (origLoop.hasConstantUpperBound()) {
  origUpperBound = origLoop.getConstantUpperBound();

boundExprs.push_back(
    origLowerBoundExpr +
    (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter));

boundExprs.push_back(
    origLowerBoundExpr +
    (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter));

newLoop.setUpperBound(ubOperands, ubMap);

newLoop.setStep(origLoop.getStepAsInt());
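// Illustrative note (not part of the original source): with original bounds
// [c, U), step s, and a tile-size SSA value %ts, the inter-tile loop runs
// over [c, c + ceildiv(U - c, %ts)) and, for inter-tile IV %iv, the
// intra-tile loop runs over
//   [c + (%iv - c) * %ts, c + (%iv - c) * %ts + %ts * s)
// so each inter-tile iteration covers exactly one parametric tile.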
assert(!origLoops.empty() && "expected at least one loop in band");
assert(origLoops.size() == tileSizes.size() &&
       "expected tiling parameter for each loop in band.");

OpBuilder b(origLoops[0].getOperation());
unsigned width = origLoops.size();

for (unsigned i = 0; i < width; ++i) {

for (unsigned i = 0; i < width; ++i) {
  setIntraTileBoundsParametric(b, origLoops[i], newLoops[i],
                               newLoops[i + width], tileSizes[i]);
assert(!origLoops.empty());
assert(origLoops.size() == tileSizes.size());

OpBuilder b(origLoops[0].getOperation());
unsigned width = origLoops.size();

for (unsigned i = 0; i < width; i++) {
  OperandRange newLbOperands = origLoops[i].getLowerBoundOperands();
  OperandRange newUbOperands = origLoops[i].getUpperBoundOperands();
  newLoops[i].setLowerBound(newLbOperands, origLoops[i].getLowerBoundMap());
  newLoops[i].setUpperBound(newUbOperands, origLoops[i].getUpperBoundMap());
  newLoops[i].setStep(tileSizes[i] * origLoops[i].getStepAsInt());
}

for (unsigned i = 0; i < width; i++) {
  std::optional<uint64_t> mayBeConstantCount =

  newLoops[width + i].setLowerBound(newLoops[i].getInductionVar(), lbMap);
  newLoops[width + i].setStep(origLoops[i].getStepAsInt());

  if (mayBeConstantCount && *mayBeConstantCount < tileSizes[i]) {
    // Trip count is less than the tile size: upper bound is lower bound +
    // trip count * step.
    AffineMap ubMap = b.getSingleDimShiftAffineMap(
        *mayBeConstantCount * origLoops[i].getStepAsInt());
    newLoops[width + i].setUpperBound(newLoops[i].getInductionVar(), ubMap);
  } else if (largestDiv % tileSizes[i] != 0) {

    ubOperands.push_back(newLoops[i].getInductionVar());

    boundExprs.push_back(dim + tileSizes[i] * origLoops[i].getStepAsInt());
    boundExprs.append(origUbMap.getResults().begin(),
                      origUbMap.getResults().end());

    newLoops[width + i].setUpperBound(ubOperands, ubMap);
  } else {
    auto ubMap = AffineMap::get(
        1, 0, dim + tileSizes[i] * origLoops[i].getStepAsInt());
    newLoops[width + i].setUpperBound(newLoops[i].getInductionVar(), ubMap);
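// Illustrative note (not part of the original source): tiling
//   affine.for %i = 0 to %N
// by 32 makes the tile loop "affine.for %it = 0 to %N step 32"; when 32 does
// not divide the trip count, the intra-tile loop gets a min-capped bound:
//   affine.for %ii = #lb(%it) to min affine_map<(d0)[s0] -> (d0 + 32, s0)>(%it)[%N]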
AffineForOp rootAffineForOp = origLoops[0];

unsigned width = input.size();

for (unsigned i = 0; i < width; i++)
  origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar());

rootAffineForOp.erase();

if (tiledNest)
  *tiledNest = std::move(tiledLoops);

AffineForOp rootAffineForOp = origLoops[0];
unsigned width = input.size();

for (unsigned i = 0; i < width; i++)
  origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar());

rootAffineForOp.erase();

if (tiledNest)
  *tiledNest = std::move(tiledLoops);
nestedLoops.push_back(root);

if (body.begin() != std::prev(body.end(), 2))

root = dyn_cast<AffineForOp>(&body.front());

for (AffineForOp forOp : f.getOps<AffineForOp>()) {

  bands->push_back(band);
if (mayBeConstantTripCount.has_value()) {
  uint64_t tripCount = *mayBeConstantTripCount;

uint64_t unrollFactor) {

if (mayBeConstantTripCount.has_value() &&
    *mayBeConstantTripCount < unrollFactor)
Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor,

for (unsigned i = 1; i < unrollFactor; i++) {

  operandMap.map(iterArgs, lastYielded);

  Value ivUnroll = ivRemapFn(i, forOpIV, builder);
  operandMap.map(forOpIV, ivUnroll);

  for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++) {
    Operation *clonedOp = builder.clone(*it, operandMap);
    annotateFn(i, clonedOp, builder);

  for (unsigned i = 0, e = lastYielded.size(); i < e; i++) {
    Operation *defOp = yieldedValues[i].getDefiningOp();
    if (defOp && defOp->getBlock() == loopBodyBlock)
      lastYielded[i] = operandMap.lookup(yieldedValues[i]);

for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++)
  annotateFn(0, &*it, builder);
uint64_t unrollFactor) {

auto cleanupForOp = cast<AffineForOp>(builder.clone(*forOp));

auto results = forOp.getResults();
auto cleanupResults = cleanupForOp.getResults();
auto cleanupIterOperands = cleanupForOp.getInits();

for (auto e : llvm::zip(results, cleanupResults, cleanupIterOperands)) {
  std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
  cleanupForOp->replaceUsesOfWith(std::get<2>(e), std::get<0>(e));
}

cleanupForOp.setLowerBound(cleanupOperands, cleanupMap);

forOp.setUpperBound(cleanupOperands, cleanupMap);
AffineForOp forOp, uint64_t unrollFactor,
bool cleanUpUnroll) {
assert(unrollFactor > 0 && "unroll factor should be positive");

if (unrollFactor == 1) {
  if (mayBeConstantTripCount && *mayBeConstantTripCount == 1 &&

if (llvm::hasSingleElement(forOp.getBody()->getOperations()))

if (mayBeConstantTripCount && *mayBeConstantTripCount < unrollFactor) {
  if (cleanUpUnroll) {

if (forOp.getLowerBoundMap().getNumResults() != 1 ||
    forOp.getUpperBoundMap().getNumResults() != 1)

assert(false && "cleanup loop lower bound map for single result lower "
                "and upper bound maps can always be determined");

ValueRange iterArgs(forOp.getRegionIterArgs());
auto yieldedValues = forOp.getBody()->getTerminator()->getOperands();

int64_t step = forOp.getStepAsInt();
forOp.setStep(step * unrollFactor);

generateUnrolledLoop(
    forOp.getBody(), forOp.getInductionVar(), unrollFactor,
    [&](unsigned i, Value iv, OpBuilder b) {
      // iv' = iv + i * step
      auto d0 = b.getAffineDimExpr(0);
      auto bumpMap = AffineMap::get(1, 0, d0 + i * step);
      return b.create<AffineApplyOp>(forOp.getLoc(), bumpMap, iv);
    },
    annotateFn, iterArgs, yieldedValues);
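// Illustrative note (not part of the original source): unrolling by 4 with
// original step 2 sets the step to 8 and remaps the IV of copy i as
//   %iv1 = affine.apply affine_map<(d0) -> (d0 + 2)>(%iv)
//   %iv2 = affine.apply affine_map<(d0) -> (d0 + 4)>(%iv)
//   %iv3 = affine.apply affine_map<(d0) -> (d0 + 6)>(%iv)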
uint64_t unrollJamFactor) {

if (mayBeConstantTripCount.has_value() &&
    *mayBeConstantTripCount < unrollJamFactor)

auto walkResult = forOp.walk([&](AffineForOp aForOp) {
  for (auto controlOperand : aForOp.getControlOperands()) {
    if (!forOp.isDefinedOutsideOfLoop(controlOperand))
      return WalkResult::interrupt();
  }
  return WalkResult::advance();
});
return !walkResult.wasInterrupted();
uint64_t unrollJamFactor) {
assert(unrollJamFactor > 0 && "unroll jam factor should be positive");

if (unrollJamFactor == 1) {
  if (mayBeConstantTripCount && *mayBeConstantTripCount == 1 &&

if (llvm::hasSingleElement(forOp.getBody()->getOperations()))

if (mayBeConstantTripCount && *mayBeConstantTripCount < unrollJamFactor) {
  LLVM_DEBUG(llvm::dbgs() << "[failed] trip count < unroll-jam factor\n");

forOp.walk([&](AffineForOp aForOp) {
  if (aForOp.getNumIterOperands() > 0)
    loopsWithIterArgs.push_back(aForOp);
});

if (forOp.getNumIterOperands() > 0)

if (forOp.getLowerBoundMap().getNumResults() != 1 ||
    forOp.getUpperBoundMap().getNumResults() != 1)

assert(false && "cleanup loop lower bound map for single result lower "
                "and upper bound maps can always be determined");
for (AffineForOp oldForOp : loopsWithIterArgs) {

  ValueRange oldIterOperands = oldForOp.getInits();
  ValueRange oldIterArgs = oldForOp.getRegionIterArgs();
  ValueRange oldYieldOperands =
      cast<AffineYieldOp>(oldForOp.getBody()->getTerminator()).getOperands();

  for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
    dupIterOperands.append(oldIterOperands.begin(), oldIterOperands.end());
    dupYieldOperands.append(oldYieldOperands.begin(), oldYieldOperands.end());
  }

  bool forOpReplaced = oldForOp == forOp;
  AffineForOp newForOp =
      cast<AffineForOp>(*oldForOp.replaceWithAdditionalYields(
          rewriter, dupIterOperands, /*replaceInitOperandUsesInLoop=*/false,
          [&](OpBuilder &b, Location loc, ArrayRef<BlockArgument> newBbArgs) {
            return dupYieldOperands;
          }));
  newLoopsWithIterArgs.push_back(newForOp);

  ValueRange newIterArgs = newForOp.getRegionIterArgs();
  unsigned oldNumIterArgs = oldIterArgs.size();
  ValueRange newResults = newForOp.getResults();
  unsigned oldNumResults = newResults.size() / unrollJamFactor;
  assert(oldNumIterArgs == oldNumResults &&
         "oldNumIterArgs must be the same as oldNumResults");
  for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
    for (unsigned j = 0; j < oldNumIterArgs; ++j) {
      operandMaps[i - 1].map(newIterArgs[j],
                             newIterArgs[i * oldNumIterArgs + j]);
      operandMaps[i - 1].map(newResults[j],
                             newResults[i * oldNumResults + j]);
    }
  }
}
int64_t step = forOp.getStepAsInt();
forOp.setStep(step * unrollJamFactor);

auto forOpIV = forOp.getInductionVar();

for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
  for (auto &subBlock : subBlocks) {
    OpBuilder builder(subBlock.first->getBlock(), std::next(subBlock.second));

    auto ivUnroll =
        builder.create<AffineApplyOp>(forOp.getLoc(), bumpMap, forOpIV);
    operandMaps[i - 1].map(forOpIV, ivUnroll);

    for (auto it = subBlock.first; it != std::next(subBlock.second); ++it)
      builder.clone(*it, operandMaps[i - 1]);
  }
}
for (auto newForOp : newLoopsWithIterArgs) {
  unsigned oldNumIterOperands =
      newForOp.getNumIterOperands() / unrollJamFactor;
  unsigned numControlOperands = newForOp.getNumControlOperands();
  auto yieldOp = cast<AffineYieldOp>(newForOp.getBody()->getTerminator());
  unsigned oldNumYieldOperands = yieldOp.getNumOperands() / unrollJamFactor;
  assert(oldNumIterOperands == oldNumYieldOperands &&
         "oldNumIterOperands must be the same as oldNumYieldOperands");
  for (unsigned j = 0; j < oldNumIterOperands; ++j) {
    for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
      newForOp.setOperand(numControlOperands + i * oldNumIterOperands + j,
                          operandMaps[i - 1].lookupOrDefault(
                              newForOp.getOperand(numControlOperands + j)));
      yieldOp.setOperand(
          i * oldNumYieldOperands + j,
          operandMaps[i - 1].lookupOrDefault(yieldOp.getOperand(j)));
    }
  }
}
if (forOp.getNumResults() > 0) {

  auto loc = forOp.getLoc();
  unsigned oldNumResults = forOp.getNumResults() / unrollJamFactor;

    unsigned pos = reduction.iterArgPosition;
    Value lhs = forOp.getResult(pos);

    for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
      rhs = forOp.getResult(i * oldNumResults + pos);

    assert(op && "Reduction op should have been created");

    forOp.getResult(pos).replaceAllUsesExcept(lhs, newOps);
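// Illustrative note (not part of the original source): unroll-jamming
//   affine.for %i = 0 to 8 iter_args(%acc = %cst) -> (f32)
// by 2 produces a step-2 loop carrying two accumulators; the epilogue above
// then combines the two partial results with the reduction op (e.g. one
// arith.addf) and redirects all other uses of the original result to it.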
assert(&*forOpA.getBody()->begin() == forOpB.getOperation());
auto &forOpABody = forOpA.getBody()->getOperations();
auto &forOpBBody = forOpB.getBody()->getOperations();

// 1) Splice forOpA's non-terminator operations (which is just forOpB) right
// before forOpA, in forOpA's parent block.
forOpA->getBlock()->getOperations().splice(Block::iterator(forOpA),
                                           forOpABody, forOpABody.begin(),
                                           std::prev(forOpABody.end()));
// 2) Splice forOpB's non-terminator operations into the start of forOpA's
// body.
forOpABody.splice(forOpABody.begin(), forOpBBody, forOpBBody.begin(),
                  std::prev(forOpBBody.end()));
// 3) Splice forOpA into the start of forOpB's body.
forOpBBody.splice(forOpBBody.begin(), forOpA->getBlock()->getOperations(),
                  Block::iterator(forOpA));
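// Illustrative note (not part of the original source): the three splices
// rotate
//   affine.for %i { affine.for %j { body } }
// into
//   affine.for %j { affine.for %i { body } }
// in place, without cloning any of the body operations.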
unsigned maxLoopDepth = loops.size();

loopPermMapInv.resize(maxLoopDepth);
for (unsigned i = 0; i < maxLoopDepth; ++i)
  loopPermMapInv[loopPermMap[i]] = i;

for (const auto &depComps : depCompsVec) {
  assert(depComps.size() >= maxLoopDepth);

  for (unsigned j = 0; j < maxLoopDepth; ++j) {
    unsigned permIndex = loopPermMapInv[j];
    assert(depComps[permIndex].lb);
    int64_t depCompLb = *depComps[permIndex].lb;
assert(loopPermMap.size() == loops.size());
unsigned maxLoopDepth = loops.size();
std::vector<SmallVector<DependenceComponent, 2>> depCompsVec;

bool LLVM_ATTRIBUTE_UNUSED mlir::affine::isPerfectlyNested(
    ArrayRef<AffineForOp> loops) {
  assert(!loops.empty() && "no loops provided");

  auto hasTwoElements = [](Block *block) {
    auto secondOpIt = std::next(block->begin());
    return secondOpIt != block->end() && &*secondOpIt == &block->back();
  };

  auto enclosingLoop = loops.front();
  for (auto loop : loops.drop_front()) {
    auto parentForOp = dyn_cast<AffineForOp>(loop->getParentOp());
    if (parentForOp != enclosingLoop || !hasTwoElements(parentForOp.getBody()))
      return false;
    enclosingLoop = loop;
  }
assert(input.size() == permMap.size() && "invalid permutation map size");

llvm::sort(checkPermMap);
if (llvm::any_of(llvm::enumerate(checkPermMap),
                 [](const auto &en) { return en.value() != en.index(); }))
  assert(false && "invalid permutation map");

if (input.size() < 2)

for (unsigned i = 0, e = input.size(); i < e; ++i)
  invPermMap.push_back({permMap[i], i});
llvm::sort(invPermMap);

if (permMap.back() != input.size() - 1) {
  Block *destBody = ((AffineForOp)input[invPermMap.back().second]).getBody();
  Block *srcBody = ((AffineForOp)input.back()).getBody();
  destBody->getOperations().splice(destBody->begin(),
                                   srcBody->getOperations(), srcBody->begin(),
                                   std::prev(srcBody->end()));
}

for (int i = input.size() - 1; i >= 0; --i) {
  if (permMap[i] == 0) {
    // This loop becomes the new outermost loop.
    auto *parentBlock = input[0]->getBlock();
    parentBlock->getOperations().splice(Block::iterator(input[0]),
                                        input[i]->getBlock()->getOperations(),
                                        Block::iterator(input[i]));
    continue;
  }

  unsigned parentPosInInput = invPermMap[permMap[i] - 1].second;
  if (i > 0 && static_cast<unsigned>(i - 1) == parentPosInInput)
    continue;

  auto *destBody = ((AffineForOp)input[parentPosInInput]).getBody();
  destBody->getOperations().splice(destBody->begin(),
                                   input[i]->getBlock()->getOperations(),
                                   Block::iterator(input[i]));
}

return invPermMap[0].second;
if (loops.size() < 2)

unsigned maxLoopDepth = loops.size();
std::vector<SmallVector<DependenceComponent, 2>> depCompsVec;

for (auto &depComps : depCompsVec) {
  assert(depComps.size() >= maxLoopDepth);
  for (unsigned j = 0; j < maxLoopDepth; ++j) {
    DependenceComponent &depComp = depComps[j];
    assert(depComp.lb.has_value() && depComp.ub.has_value());
    if (*depComp.lb != 0 || *depComp.ub != 0)

unsigned nextSequentialLoop = numParallelLoops;
unsigned nextParallelLoop = 0;
for (unsigned i = 0; i < maxLoopDepth; ++i) {

    loopPermMap[i] = nextParallelLoop++;

    loopPermMap[i] = nextSequentialLoop++;

unsigned loopNestRootIndex = permuteLoops(loops, loopPermMap);
return loops[loopNestRootIndex];
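// Illustrative note (not part of the original source): for a 3-d nest where
// only the middle loop carries a dependence, the permutation computed above
// is loopPermMap = {0, 2, 1}: parallel loops keep the outer positions and
// the sequential loop is sunk innermost.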
int64_t offset = 0) {
auto bounds = llvm::to_vector<4>(map->getResults());

operands->insert(operands->begin() + map->getNumDims(), iv);
auto originalStep = forOp.getStepAsInt();
auto scaledStep = originalStep * factor;
forOp.setStep(scaledStep);

auto lbMap = forOp.getLowerBoundMap();

auto ubMap = forOp.getUpperBoundMap();

auto iv = forOp.getInductionVar();

for (auto t : targets) {

  auto newForOp = b.create<AffineForOp>(t.getLoc(), lbOperands, lbMap,
                                        ubOperands, ubMap, originalStep);
  auto begin = t.getBody()->begin();
  // Skip the terminator and `newForOp`, which sits just before it.
  auto nOps = t.getBody()->getOperations().size() - 2;
  newForOp.getBody()->getOperations().splice(
      newForOp.getBody()->getOperations().begin(),
      t.getBody()->getOperations(), begin, std::next(begin, nOps));
  replaceAllUsesInRegionWith(iv, newForOp.getInductionVar(),
                             newForOp.getRegion());
  innerLoops.push_back(newForOp);
}
template <typename SizeType>
                          AffineForOp target) {

assert(res.size() == 1 && "Expected 1 inner forOp");

for (auto it : llvm::zip(forOps, sizes)) {
  auto step = stripmineSink(std::get<0>(it), std::get<1>(it), currentTargets);
  res.push_back(step);
  currentTargets = step;
}

                          AffineForOp target) {

assert(loops.size() == 1);
res.push_back(loops[0]);
if (loops.size() < 2)

AffineForOp innermost = loops.back();
AffineForOp outermost = loops.front();

for (AffineForOp loop : loops) {
  if (loop.getStepAsInt() != 1 || !loop.hasConstantLowerBound() ||
      loop.getConstantLowerBound() != 0)
    return failure();
}

if (!llvm::hasSingleElement(origUbMap.getResults()))
  prev = builder.create<AffineMinOp>(loc, origUbMap, ubOperands);
else
  prev = builder.create<AffineApplyOp>(loc, origUbMap, ubOperands);
upperBoundSymbols.push_back(prev);

for (AffineForOp loop : loops.drop_front()) {
  ub = loop.getUpperBound();

  if (!llvm::hasSingleElement(origUbMap.getResults()))
    upperBound = builder.create<AffineMinOp>(loc, origUbMap, ubOperands);
  else
    upperBound = builder.create<AffineApplyOp>(loc, origUbMap, ubOperands);
  upperBoundSymbols.push_back(upperBound);

  operands.push_back(prev);
  operands.push_back(upperBound);

  prev = builder.create<AffineApplyOp>(

outermost.setUpperBound(prev, newUbMap);

Value previous = outermost.getInductionVar();
for (unsigned idx = loops.size(); idx > 0; --idx) {
  if (idx != loops.size()) {
    operands.push_back(previous);
    operands.push_back(upperBoundSymbols[idx]);
    previous = builder.create<AffineApplyOp>(

  Value inductionVariable;
  if (idx == loops.size()) {
    inductionVariable = previous;
  } else {
    applyOperands.push_back(previous);
    applyOperands.push_back(upperBoundSymbols[idx - 1]);
    inductionVariable = builder.create<AffineApplyOp>(

  replaceAllUsesInRegionWith(loops[idx - 1].getInductionVar(),
                             inductionVariable, loops.back().getRegion());
}

AffineForOp secondOutermostLoop = loops[1];
innermost.getBody()->back().erase();
outermost.getBody()->getOperations().splice(
    Block::iterator(secondOutermostLoop.getOperation()),
    innermost.getBody()->getOperations());
secondOutermostLoop.erase();
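// Illustrative note (not part of the original source): coalescing two
// normalized loops with upper bounds %ub0 and %ub1 yields one loop over
// [0, %ub0 * %ub1); the original IVs are recovered from the linear IV as
//   %i = affine.apply affine_map<(d0)[s0] -> (d0 floordiv s0)>(%iv)[%ub1]
//   %j = affine.apply affine_map<(d0)[s0] -> (d0 mod s0)>(%iv)[%ub1]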
assert(processorId.size() == numProcessors.size());
if (processorId.empty())
  return;

Value linearIndex = processorId.front();
for (unsigned i = 1, e = processorId.size(); i < e; ++i) {
  auto mulApplyOp = b.create<AffineApplyOp>(
      loc, mulMap, ValueRange{linearIndex, numProcessors[i]});
  linearIndex = b.create<AffineApplyOp>(
      loc, addMap, ValueRange{mulApplyOp, processorId[i]});
}

auto mulApplyOp = b.create<AffineApplyOp>(
    loc, mulMap, ValueRange{linearIndex, forOp.getStep()});
Value lb = b.create<AffineApplyOp>(
    loc, addMap, ValueRange{mulApplyOp, forOp.getLowerBound()});
forOp.setLowerBound(lb);

Value step = forOp.getStep();
for (auto numProcs : numProcessors)
  step = b.create<AffineApplyOp>(loc, mulMap, ValueRange{numProcs, step});
forOp.setStep(step);
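// Illustrative note (not part of the original source): for a 2-d grid
// (%bx, %tx) with sizes (%nbx, %ntx), the linear id is %bx * %ntx + %tx, and
// the loop is rewritten as
//   lb' = lb + linearId * step,   step' = step * %nbx * %ntx
// so each virtual processor executes every (nbx * ntx)-th iteration.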
Block **copyPlacementBlock,

cst->getValues(cst->getNumDimVars(), cst->getNumDimAndSymbolVars(), &symbols);

auto it = enclosingFors.rbegin();
for (auto e = enclosingFors.rend(); it != e; ++it) {
  // Stop at the first enclosing loop whose IV the region depends on.
  if (llvm::is_contained(symbols, it->getInductionVar()))
    break;
}

if (it != enclosingFors.rbegin()) {
  auto lastInvariantIV = *std::prev(it);
  *copyInPlacementStart = Block::iterator(lastInvariantIV.getOperation());
  *copyOutPlacementStart = std::next(*copyInPlacementStart);
  *copyPlacementBlock = lastInvariantIV->getBlock();
} else {
  *copyInPlacementStart = begin;
  *copyOutPlacementStart = end;
  *copyPlacementBlock = &block;
}
if (bufferShape.size() <= 1)
  return;

int64_t numEltPerStride = 1;

for (int d = bufferShape.size() - 1; d >= 1; d--) {
  int64_t dimSize = cast<MemRefType>(region.memref.getType()).getDimSize(d);

  numEltPerStride *= bufferShape[d];
  // A stride is needed only if the region has a shorter extent than the
  // memref along this dimension and a greater-than-one extent along the
  // next outer one.
  if (bufferShape[d] < dimSize && bufferShape[d - 1] > 1) {
    strideInfos->push_back({stride, numEltPerStride});
assert(llvm::all_of(lbMaps, [&](AffineMap lbMap) {

assert(llvm::all_of(ubMaps, [&](AffineMap ubMap) {

unsigned rank = cast<MemRefType>(memref.getType()).getRank();
assert(lbMaps.size() == rank && "wrong number of lb maps");
assert(ubMaps.size() == rank && "wrong number of ub maps");

AffineForOp copyNestRoot;

for (unsigned d = 0; d < rank; ++d) {
  auto forOp = createCanonicalizedAffineForOp(b, loc, lbOperands, lbMaps[d],
                                              ubOperands, ubMaps[d]);
  if (d == 0)
    copyNestRoot = forOp;

  auto fastBufOffsetMap =

  auto offset = b.create<AffineApplyOp>(loc, fastBufOffsetMap, lbOperands);

  fastBufMapOperands.push_back(offset);
  fastBufMapOperands.push_back(forOp.getInductionVar());
  mayBeDeadApplys.push_back(offset);

  memIndices.push_back(forOp.getInductionVar());
}

for (auto applyOp : mayBeDeadApplys)
  if (applyOp.use_empty())
    applyOp.erase();

if (!isCopyOut) {
  auto load = b.create<AffineLoadOp>(loc, memref, memIndices);
  b.create<AffineStoreOp>(loc, load, fastMemRef, fastBufMap,
                          fastBufMapOperands);
  return copyNestRoot;
}

auto load =
    b.create<AffineLoadOp>(loc, fastMemRef, fastBufMap, fastBufMapOperands);
b.create<AffineStoreOp>(loc, load, memref, memIndices);
return copyNestRoot;
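// Illustrative note (not part of the original source): for a 2-d region the
// generated copy-in nest looks like
//   affine.for %i = max #lb0(...) to min #ub0(...) {
//     affine.for %j = max #lb1(...) to min #ub1(...) {
//       %v = affine.load %memref[%i, %j]
//       affine.store %v, %fastBuf[%i - offset0, %j - offset1]
//     }
//   }
// with the offsets taken from fastBufOffsets; the copy-out case swaps the
// load and store directions.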
func::FuncOp f = begin->getParentOfType<func::FuncOp>();

Value zeroIndex = topBuilder.create<arith::ConstantIndexOp>(f.getLoc(), 0);

bool isCopyOutAtEndOfBlock = (end == copyOutPlacementStart);

OpBuilder prologue(copyPlacementBlock, copyInPlacementStart);

OpBuilder epilogue(copyPlacementBlock, copyOutPlacementStart);

auto loc = region.loc;
auto memref = region.memref;
auto memRefType = cast<MemRefType>(memref.getType());

if (!memRefType.getLayout().isIdentity()) {
  LLVM_DEBUG(llvm::dbgs() << "Non-identity layout map not yet supported\n");
unsigned rank = memRefType.getRank();

std::vector<SmallVector<int64_t, 4>> lbs;

    &fastBufferShape, &lbs, &lbDivisors);

  LLVM_DEBUG(llvm::dbgs() << "Non-constant region size not supported\n");

if (*numElements == 0) {
  LLVM_DEBUG(llvm::dbgs() << "Nothing to copy\n");

for (unsigned i = 0; i < rank; ++i)

fastBufOffsets.reserve(rank);
for (unsigned d = 0; d < rank; d++) {
  assert(lbs[d].size() == cst->getNumCols() - rank && "incorrect bound size");

  // The region's lower bound along dimension d: a linear combination of the
  // symbols, floor-divided by the bound's divisor.
  AffineExpr offset = top.getAffineConstantExpr(0);
  for (unsigned j = 0, e = cst->getNumCols() - rank - 1; j < e; j++)
    offset = offset + lbs[d][j] * top.getAffineDimExpr(j);
  assert(lbDivisors[d] > 0);
  offset =
      (offset + lbs[d][cst->getNumCols() - 1 - rank]).floorDiv(lbDivisors[d]);

  if (auto caf = dyn_cast<AffineConstantExpr>(offset)) {
    auto indexVal = caf.getValue();
    if (indexVal == 0) {
      memIndices.push_back(zeroIndex);
    } else {
      memIndices.push_back(
          top.create<arith::ConstantIndexOp>(loc, indexVal).getResult());
    }
  } else {

    memIndices.push_back(b.create<AffineApplyOp>(loc, map, regionSymbols));
  }

  bufIndices.push_back(zeroIndex);

  fastBufOffsets.push_back(offset);
}
bool existingBuf = fastBufferMap.count(memref) > 0;

auto fastMemRefType =

  fastMemRef =
      prologue.create<memref::AllocOp>(loc, fastMemRefType).getResult();

  fastBufferMap[memref] = fastMemRef;

  *sizeInBytes = maySizeInBytes.value_or(0);

  LLVM_DEBUG(llvm::dbgs() << "Creating fast buffer of type " << fastMemRefType

  fastMemRef = fastBufferMap[memref];

auto numElementsSSA = top.create<arith::ConstantIndexOp>(loc, *numElements);

Value numEltPerDmaStride;

if (dmaStrideInfos.size() > 1) {
  LLVM_DEBUG(llvm::dbgs() << "Only up to one level of stride supported\n");

if (!dmaStrideInfos.empty()) {
  dmaStride =
      top.create<arith::ConstantIndexOp>(loc, dmaStrideInfos[0].stride);
  numEltPerDmaStride = top.create<arith::ConstantIndexOp>(
      loc, dmaStrideInfos[0].numEltPerStride);
}
auto postDomFilter = std::prev(end);

    regionSymbols, ubMaps,
    regionSymbols, fastBufOffsets,

copyNests.insert(copyNest);

if (region.isWrite() && isCopyOutAtEndOfBlock)

auto tagMemRef = prologue.create<memref::AllocOp>(loc, tagMemRefType);

    fastMemRef, bufAffineMap, bufIndices,
    tagMemRef, tagAffineMap, tagIndices,
    numElementsSSA, dmaStride, numEltPerDmaStride);

    loc, fastMemRef, bufAffineMap, bufIndices, memref, memAffineMap,
    memIndices, tagMemRef, tagAffineMap, tagIndices, numElementsSSA,
    dmaStride, numEltPerDmaStride);

if (isCopyOutAtEndOfBlock)

auto tagDeallocOp = epilogue.create<memref::DeallocOp>(loc, tagMemRef);
if (*nEnd == end && isCopyOutAtEndOfBlock)

auto bufDeallocOp = epilogue.create<memref::DeallocOp>(loc, fastMemRef);

if (!copyOptions.generateDma && *nEnd == end && isCopyOutAtEndOfBlock)
remapExprs.reserve(rank);
for (unsigned i = 0; i < rank; i++) {

  remapExprs.push_back(dimExpr - fastBufOffsets[i]);
}

auto indexRemap = AffineMap::get(regionSymbols.size() + rank, 0, remapExprs,

bool isBeginAtStartOfBlock = (begin == block->begin());
if (!isBeginAtStartOfBlock)
  prevOfBegin = std::prev(begin);

*nBegin = isBeginAtStartOfBlock ? block->begin() : std::next(prevOfBegin);
if (auto loadOp = dyn_cast<AffineLoadOp>(op)) {
  rank = loadOp.getMemRefType().getRank();
  region->memref = loadOp.getMemRef();
} else if (auto storeOp = dyn_cast<AffineStoreOp>(op)) {
  rank = storeOp.getMemRefType().getRank();
  region->memref = storeOp.getMemRef();
} else {
  assert(false && "expected load or store op");
}

auto memRefType = cast<MemRefType>(region->memref.getType());
if (!memRefType.hasStaticShape())
  return false;

ivs.resize(numParamLoopIVs);

regionCst->setValues(rank, rank + numParamLoopIVs, symbols);

for (unsigned d = 0; d < rank; d++) {
  auto dimSize = memRefType.getDimSize(d);
  assert(dimSize > 0 && "filtered dynamic shapes above");
  regionCst->addBound(BoundType::LB, d, 0);
  regionCst->addBound(BoundType::UB, d, dimSize - 1);
}
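// Illustrative note (not part of the original source): for a
// memref<100x50xf32>, the constraints built above are 0 <= d0 <= 99 and
// 0 <= d1 <= 49, i.e. the whole memref is conservatively treated as the
// accessed region.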
std::optional<Value> filterMemRef,

assert(begin->getBlock() == std::prev(end)->getBlock() &&
       "Inconsistent block begin/end args");
assert(end != end->getBlock()->end() && "end can't be the block terminator");

Block *block = begin->getBlock();

LLVM_DEBUG(llvm::dbgs() << "Generating copies at depth " << copyDepth
                        << "\n");
LLVM_DEBUG(llvm::dbgs() << "from begin: " << *begin << "\n");
LLVM_DEBUG(llvm::dbgs() << "to inclusive end: " << *std::prev(end) << "\n");
SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4> readRegions;
SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4> writeRegions;

if (auto loadOp = dyn_cast<AffineLoadOp>(opInst)) {
  if ((filterMemRef.has_value() && filterMemRef != loadOp.getMemRef()) ||
      (loadOp.getMemRefType().getMemorySpaceAsInt() !=

} else if (auto storeOp = dyn_cast<AffineStoreOp>(opInst)) {
  if ((filterMemRef.has_value() && filterMemRef != storeOp.getMemRef()) ||
      storeOp.getMemRefType().getMemorySpaceAsInt() !=

auto region = std::make_unique<MemRefRegion>(opInst->getLoc());
if (failed(region->compute(opInst, copyDepth, nullptr,

  LLVM_DEBUG(llvm::dbgs()
             << "Error obtaining memory region: semi-affine maps?\n");
  LLVM_DEBUG(llvm::dbgs() << "over-approximating to the entire memref\n");

      opInst->emitError("non-constant memref sizes not yet supported"));

[&](const SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4>
        &targetRegions) -> bool {
  const auto *const it = targetRegions.find(region->memref);
  if (it == targetRegions.end())
    return false;

  if (failed(it->second->unionBoundingBox(*region))) {
    LLVM_DEBUG(llvm::dbgs()
               << "Memory region bounding box failed; "
                  "over-approximating to the entire memref\n");

        "non-constant memref sizes not yet supported"));

    it->second->getConstraints()->clearAndCopyFrom(
        *region->getConstraints());
  } else {
    region->getConstraints()->clearAndCopyFrom(
        *it->second->getConstraints());
  }

};

bool existsInRead = updateRegion(readRegions);

bool existsInWrite = updateRegion(writeRegions);

if (region->isWrite() && !existsInWrite) {
  writeRegions[region->memref] = std::move(region);
} else if (!region->isWrite() && !existsInRead) {
  readRegions[region->memref] = std::move(region);
}

LLVM_DEBUG(begin->emitError(
    "copy generation failed for one or more memref's in this block\n"));
uint64_t totalCopyBuffersSizeInBytes = 0;

auto processRegions =
    [&](const SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4>
            &regions) {
      for (const auto &regionEntry : regions) {

        Block *copyPlacementBlock;
        findHighestBlockForPlacement(
            *regionEntry.second, *block, begin, end, &copyPlacementBlock,
            &copyInPlacementStart, &copyOutPlacementStart);

        uint64_t sizeInBytes;
        LogicalResult iRet = generateCopy(
            *regionEntry.second, block, begin, end, copyPlacementBlock,
            copyInPlacementStart, copyOutPlacementStart, copyOptions,
            fastBufferMap, copyNests, &sizeInBytes, &nBegin, &nEnd);
        if (succeeded(iRet)) {

          totalCopyBuffersSizeInBytes += sizeInBytes;
        }
        ret = ret & succeeded(iRet);
      }
    };
processRegions(readRegions);
processRegions(writeRegions);

LLVM_DEBUG(begin->emitError(
    "copy generation failed for one or more memref's in this block\n"));
if (llvm::DebugFlag && (forOp = dyn_cast<AffineForOp>(&*begin))) {
  LLVM_DEBUG(forOp.emitRemark()
             << " KiB of copy buffers in fast memory space for this block");
}

    "total size of all copy buffers' for this block exceeds fast memory "

    std::prev(forOp.getBody()->end()), copyOptions,
    filterMemRef, copyNests);
auto begin = analyzedOp->getIterator();
auto end = std::next(begin);

auto err = generateCopy(memrefRegion, block, begin, end, block, begin, end,
                        copyOptions, fastBufferMap, copyNests,

const auto &en = fastBufferMap.find(memrefRegion.memref);

if (en == fastBufferMap.end())
  return failure();

result.alloc = en->second.getDefiningOp();
assert(result.alloc && "fast buffer expected to be locally allocated");
assert(copyNests.size() <= 1 && "At most one copy nest is expected.");
result.copyNest = copyNests.empty() ? nullptr : *copyNests.begin();
assert(currLoopDepth <= depthToLoops.size() && "Unexpected currLoopDepth");
if (currLoopDepth == depthToLoops.size())
  depthToLoops.emplace_back();

for (auto &op : *block) {
  if (auto forOp = dyn_cast<AffineForOp>(op)) {
    depthToLoops[currLoopDepth].push_back(forOp);

for (auto &block : func)

if (!depthToLoops.empty()) {
  assert(depthToLoops.back().empty() && "Last loop level is not empty?");
  depthToLoops.pop_back();
}
return b.create<AffineForOp>(loc, lowerOperands, lbMap, upperOperands, ubMap,

auto *context = loops[0].getContext();

llvm::append_range(ops, loops);

for (auto loop : loops) {

  assert(loop.getStepAsInt() == 1 && "point loop step expected to be one");

unsigned fullTileLbPos, fullTileUbPos;
    nullptr, &fullTileLbPos,
    &fullTileUbPos)) {
  LLVM_DEBUG(llvm::dbgs() << "Can't get constant diff pair for a loop\n");

fullTileLb.assign(fLb.begin(), fLb.end());
fullTileUb.assign(fUb.begin(), fUb.end());

for (auto lbIndex : lbIndices)
  for (unsigned i = 0, e = cst.getNumCols(); i < e; ++i)
    cst.atIneq(lbIndex, i) = fullTileLb[i] - cst.atIneq(lbIndex, i);

for (auto ubIndex : ubIndices)
  for (unsigned i = 0, e = cst.getNumCols(); i < e; ++i)
    cst.atIneq(ubIndex, i) -= fullTileUb[i];
return b.create<AffineIfOp>(loops[0].getLoc(), ifCondSet, setOperands,
                            /*withElseRegion=*/true);

static LogicalResult
createFullTiles(MutableArrayRef<AffineForOp> inputNest,
                SmallVectorImpl<AffineForOp> &fullTileLoops, OpBuilder b) {

fullTileLoops.reserve(inputNest.size());

for (auto loop : inputNest) {
  if (loop.getStepAsInt() != 1) {
    LLVM_DEBUG(llvm::dbgs()
               << "[tile separation] non-unit stride not implemented\n");

  unsigned lbPos, ubPos;
      nullptr, &lbPos, &ubPos) ||

    LLVM_DEBUG(llvm::dbgs() << "[tile separation] Can't get constant diff / "
                            "equalities not yet handled\n");

  fullTileLoops.push_back(fullTileLoop);
}

operandMap.map(loopEn.value().getInductionVar(),
               fullTileLoops[loopEn.index()].getInductionVar());

for (auto &op : inputNest.back().getBody()->without_terminator())
  b.clone(op, operandMap);
if (inputNest.empty())
  return success();

auto firstLoop = inputNest[0];

auto prevLoop = firstLoop;
for (auto loop : inputNest.drop_front(1)) {
  assert(loop->getParentOp() == prevLoop && "input not contiguously nested");
  prevLoop = loop;
}

if (!fullTileLoops.empty())
  fullTileLoops.front().erase();

fullTileLoops.front().erase();
LLVM_DEBUG(llvm::dbgs() << "All tiles are full tiles, or failure creating "
                        "separation condition\n");

Block *thenBlock = ifOp.getThenBlock();
AffineForOp outermostFullTileLoop = fullTileLoops[0];
thenBlock->getOperations().splice(
    std::prev(thenBlock->end()),
    outermostFullTileLoop->getBlock()->getOperations(),
    Block::iterator(outermostFullTileLoop));

Block *elseBlock = ifOp.getElseBlock();
elseBlock->getOperations().splice(std::prev(elseBlock->end()),
                                  firstLoop->getBlock()->getOperations(),
                                  Block::iterator(firstLoop));

if (fullTileNest)
  *fullTileNest = std::move(fullTileLoops);
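// Illustrative note (not part of the original source): after separation the
// nest has the shape
//   affine.if #is_full_tile(...) {
//     ... full-tile nest: constant trip counts, no min/max bounds ...
//   } else {
//     ... original nest handling the partial tiles ...
//   }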
LogicalResult result(failure());

if (loops.size() <= 1)

for (unsigned i = 0, e = loops.size(); i < e; ++i) {
  operandsDefinedAbove[i] = i;
  for (unsigned j = 0; j < i; ++j) {

    operandsDefinedAbove[i] = j;

  }
}

for (unsigned end = loops.size(); end > 0; --end) {
  unsigned start = 0;
  for (; start < end - 1; ++start) {
    auto maxPos =
        *std::max_element(std::next(operandsDefinedAbove.begin(), start),
                          std::next(operandsDefinedAbove.begin(), end));
    if (maxPos > start)
      continue;
    assert(maxPos == start &&
           "expected loop bounds to be known at the start of the band");

  }
  if (start != end - 1)
Static helpers defined in this file:

static LogicalResult performPreTilingChecks(MutableArrayRef<AffineForOp> input, ArrayRef<t> tileSizes)
    Check if the input nest is supported for tiling and whether tiling would be legal or not.
static void constructParametricallyTiledIndexSetHyperRect(MutableArrayRef<AffineForOp> origLoops, MutableArrayRef<AffineForOp> newLoops, ArrayRef<Value> tileSizes)
    Constructs and sets new loop bounds after tiling for the case of hyper-rectangular index sets,...
static void constructTiledLoopNest(MutableArrayRef<AffineForOp> origLoops, AffineForOp rootAffineForOp, unsigned width, MutableArrayRef<AffineForOp> tiledLoops)
    Constructs tiled loop nest, without setting the loop bounds, and moves the body of the original loop ne...
static bool getFullMemRefAsRegion(Operation *op, unsigned numParamLoopIVs, MemRefRegion *region)
    Construct the memref region to just include the entire memref.
static SmallVector<AffineForOp, 8> stripmineSink(AffineForOp forOp, uint64_t factor, ArrayRef<AffineForOp> targets)
static InFlightDiagnostic LLVM_ATTRIBUTE_UNUSED emitRemarkForBlock(Block &block)
static LogicalResult checkIfHyperRectangular(MutableArrayRef<AffineForOp> input)
    Checks whether a loop nest is hyper-rectangular or not.
static void findHighestBlockForPlacement(const MemRefRegion &region, Block &block, Block::iterator &begin, Block::iterator &end, Block **copyPlacementBlock, Block::iterator *copyInPlacementStart, Block::iterator *copyOutPlacementStart)
    Given a memref region, determine the lowest depth at which transfers can be placed for it,...
static AffineForOp generateShiftedLoop(AffineMap lbMap, AffineMap ubMap, const std::vector<std::pair<uint64_t, ArrayRef<Operation *>>> &opGroupQueue, unsigned offset, AffineForOp srcForOp, OpBuilder b)
    Generates an affine.for op with the specified lower and upper bounds while generating the right IV re...
static void moveLoopBodyImpl(AffineForOp src, AffineForOp dest, Block::iterator loc)
    Move the loop body of AffineForOp 'src' from 'src' into the specified location in destination's body,...
static void setInterTileBoundsParametric(OpBuilder &b, AffineForOp origLoop, AffineForOp newLoop, Value tileSize)
    Set lower and upper bounds of inter-tile loops for parametric tiling.
static void setIntraTileBoundsParametric(OpBuilder &b, AffineForOp origLoop, AffineForOp newInterTileLoop, AffineForOp newIntraTileLoop, Value tileSize)
    Set lower and upper bounds of intra-tile loops for parametric tiling.
static void gatherLoopsInBlock(Block *block, unsigned currLoopDepth, std::vector<SmallVector<AffineForOp, 2>> &depthToLoops)
    Gathers all AffineForOps in 'block' at 'currLoopDepth' in 'depthToLoops'.
static void generateUnrolledLoop(Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor, function_ref<Value(unsigned, Value, OpBuilder)> ivRemapFn, function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn, ValueRange iterArgs, ValueRange yieldedValues)
    Generates unrolled copies of AffineForOp 'loopBodyBlock', with associated 'forOpIV' by 'unrollFactor'...
static LogicalResult generateCopy(const MemRefRegion &region, Block *block, Block::iterator begin, Block::iterator end, Block *copyPlacementBlock, Block::iterator copyInPlacementStart, Block::iterator copyOutPlacementStart, const AffineCopyOptions &copyOptions, DenseMap<Value, Value> &fastBufferMap, DenseSet<Operation *> &copyNests, uint64_t *sizeInBytes, Block::iterator *nBegin, Block::iterator *nEnd)
    Creates a buffer in the faster memory space for the specified memref region; generates a copy from th...
static LogicalResult createFullTiles(MutableArrayRef<AffineForOp> inputNest, SmallVectorImpl<AffineForOp> &fullTileLoops, OpBuilder b)
    Create the full tile loop nest (along with its body).
static void getMultiLevelStrides(const MemRefRegion &region, ArrayRef<int64_t> bufferShape, SmallVectorImpl<StrideInfo> *strideInfos)
    Returns striding information for a copy/transfer of this region with potentially multiple striding le...
static void constructTiledIndexSetHyperRect(MutableArrayRef<AffineForOp> origLoops, MutableArrayRef<AffineForOp> newLoops, ArrayRef<unsigned> tileSizes)
    Constructs and sets new loop bounds after tiling for the case of hyper-rectangular index sets,...
static LogicalResult generateCleanupLoopForUnroll(AffineForOp forOp, uint64_t unrollFactor)
    Helper to generate cleanup loop for unroll or unroll-and-jam when the trip count is not a multiple of...
static bool areInnerBoundsInvariant(AffineForOp forOp)
    Check if all control operands of all loops are defined outside of forOp and return false if not.
static bool checkLoopInterchangeDependences(const std::vector<SmallVector<DependenceComponent, 2>> &depCompsVec, ArrayRef<AffineForOp> loops, ArrayRef<unsigned> loopPermMap)
static void moveLoopBody(AffineForOp src, AffineForOp dest)
    Move the loop body of AffineForOp 'src' from 'src' to the start of dest body.
static AffineIfOp createSeparationCondition(MutableArrayRef<AffineForOp> loops, OpBuilder b)
    Creates an AffineIfOp that encodes the conditional to choose between the constant trip count version...
static void getCleanupLoopLowerBound(AffineForOp forOp, unsigned unrollFactor, AffineMap &cleanupLbMap, SmallVectorImpl<Value> &cleanupLbOperands)
    Computes the cleanup loop lower bound of the loop being unrolled with the specified unroll factor; th...
static void augmentMapAndBounds(OpBuilder &b, Value iv, AffineMap *map, SmallVector<Value, 4> *operands, int64_t offset=0)
static AffineForOp generatePointWiseCopy(Location loc, Value memref, Value fastMemRef, ArrayRef<AffineMap> lbMaps, ArrayRef<Value> lbOperands, ArrayRef<AffineMap> ubMaps, ArrayRef<Value> ubOperands, ArrayRef<AffineExpr> fastBufOffsets, bool isCopyOut, OpBuilder b)
    Generates a point-wise copy from/to 'memref' to/from 'fastMemRef' and returns the outermost AffineFor...
static void replaceIterArgsAndYieldResults(AffineForOp forOp)
    Helper to replace uses of loop carried values (iter_args) and loop yield values while promoting singl...