#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

#define DEBUG_TYPE "loop-utils"

using namespace mlir;
using namespace affine;
using namespace presburger;
using llvm::SmallMapVector;
// getCleanupLoopLowerBound: computes the cleanup loop lower bound of the loop
// being unrolled with the specified unroll factor; this bound is also the
// upper bound of the main part of the unrolled loop.
auto lbMap = forOp.getLowerBoundMap();
auto lb = b.create<AffineApplyOp>(forOp.getLoc(), lbMap,
                                  forOp.getLowerBoundOperands());

// For each result of the trip count map, the bump is
// (tripCount - tripCount % unrollFactor) * step.
int64_t step = forOp.getStepAsInt();
for (unsigned i = 0, e = tripCountMap.getNumResults(); i < e; i++) {
  auto tripCountExpr = tripCountMap.getResult(i);
  bumpExprs[i] = (tripCountExpr - tripCountExpr % unrollFactor) * step;
}

b.create<AffineApplyOp>(forOp.getLoc(), bumpMap, tripCountOperands);

for (unsigned i = 0, e = bumpExprs.size(); i < e; i++)

// The cleanup loop's lower bound is the original lower bound plus the bumps.
cleanupLbOperands.clear();
cleanupLbOperands.push_back(lb);
cleanupLbOperands.append(bumpValues.begin(), bumpValues.end());

// Erase the now-dead bump values.
for (auto v : bumpValues)
  if (v.use_empty())
    v.getDefiningOp()->erase();
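// Worked example (illustrative values, not from this file): with lb = 0,
// step = 2, tripCount = 103 and unrollFactor = 4, the bump is
// (103 - 103 % 4) * 2 = 200, so the unrolled loop covers IVs 0..198 and the
// cleanup loop starts at IV 200, covering the 103 % 4 = 3 leftover
// iterations (IVs 200, 202, 204).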
// replaceIterArgsAndYieldResults: replaces uses of loop-carried values
// (iter_args) and loop yield values while promoting single-iteration
// affine.for ops.
auto iterOperands = forOp.getInits();
auto iterArgs = forOp.getRegionIterArgs();
for (auto e : llvm::zip(iterOperands, iterArgs))
  std::get<1>(e).replaceAllUsesWith(std::get<0>(e));

auto outerResults = forOp.getResults();
auto innerResults = forOp.getBody()->getTerminator()->getOperands();
for (auto e : llvm::zip(outerResults, innerResults))
  std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
// promoteIfSingleIteration: promotes the loop body of an affine.for op to its
// containing block if the loop was known to have a single iteration.
if (!tripCount || *tripCount != 1)

if (forOp.getLowerBoundMap().getNumResults() != 1)

// Replace all uses of the IV with its single iteration value.
auto iv = forOp.getInductionVar();
auto *parentBlock = forOp->getBlock();
if (!iv.use_empty()) {
  if (forOp.hasConstantLowerBound()) {
    OpBuilder topBuilder(forOp->getParentOfType<func::FuncOp>().getBody());
    auto constOp = topBuilder.create<arith::ConstantIndexOp>(
        forOp.getLoc(), forOp.getConstantLowerBound());
    iv.replaceAllUsesWith(constOp);
  } else {
    auto lbOperands = forOp.getLowerBoundOperands();
    auto lbMap = forOp.getLowerBoundMap();
      iv.replaceAllUsesWith(lbOperands[0]);
      builder.create<AffineApplyOp>(forOp.getLoc(), lbMap, lbOperands);
  }
}

// Move the loop body operations, except for its terminator, to the loop's
// containing block.
forOp.getBody()->back().erase();
parentBlock->getOperations().splice(Block::iterator(forOp),
                                    forOp.getBody()->getOperations());
forOp.erase();
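// Sketch of the effect (assuming standard affine dialect syntax): a
// single-iteration loop such as
//   affine.for %i = 5 to 6 { "use"(%i) : (index) -> () }
// becomes, after promotion,
//   %c5 = arith.constant 5 : index
//   "use"(%c5) : (index) -> ()
// with the body spliced into the parent block and the loop erased.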
/// Generates an affine.for op with the specified lower and upper bounds while
/// generating the right IV remappings for the shifted operations.
static AffineForOp generateShiftedLoop(
    AffineMap lbMap, AffineMap ubMap,
    const std::vector<std::pair<uint64_t, ArrayRef<Operation *>>> &opGroupQueue,
    unsigned offset, AffineForOp srcForOp, OpBuilder b) {
  auto lbOperands = srcForOp.getLowerBoundOperands();
  auto ubOperands = srcForOp.getUpperBoundOperands();

  auto loopChunk =
      b.create<AffineForOp>(srcForOp.getLoc(), lbOperands, lbMap, ubOperands,
                            ubMap, srcForOp.getStepAsInt());
  auto loopChunkIV = loopChunk.getInductionVar();
  auto srcIV = srcForOp.getInductionVar();

  for (const auto &it : llvm::drop_begin(opGroupQueue, offset)) {
    uint64_t shift = it.first;
    auto ops = it.second;
    // All 'same shift' operations get added with their operands being
    // remapped to results of cloned operations, and their IV remapped.
    if (!srcIV.use_empty() && shift != 0) {
      auto ivRemap = bodyBuilder.create<AffineApplyOp>(
          srcForOp.getLoc(),
          bodyBuilder.getSingleDimShiftAffineMap(
              -static_cast<int64_t>(srcForOp.getStepAsInt() * shift)),
          loopChunkIV);
      operandMap.map(srcIV, ivRemap);
    } else {
      operandMap.map(srcIV, loopChunkIV);
    }
    for (auto *op : ops)
      bodyBuilder.clone(*op, operandMap);
  }
  return AffineForOp();
/// Skews the operations in an affine.for's body with the specified
/// operation-wise shifts.
LogicalResult affine::affineForOpBodySkew(AffineForOp forOp,
                                          ArrayRef<uint64_t> shifts,
                                          bool unrollPrologueEpilogue) {
  assert(forOp.getBody()->getOperations().size() == shifts.size() &&
         "too few/many shifts");
  if (forOp.getBody()->begin() == std::prev(forOp.getBody()->end()))

  if (!mayBeConstTripCount) {
    LLVM_DEBUG(forOp.emitRemark("non-constant trip count loop not handled"));
  }
  uint64_t tripCount = *mayBeConstTripCount;

  assert(isOpwiseShiftValid(forOp, shifts) &&
         "shifts will lead to an invalid transformation\n");

  int64_t step = forOp.getStepAsInt();

  unsigned numChildOps = shifts.size();

  // Do a linear-time (counting) sort of the shifts.
  uint64_t maxShift = *std::max_element(shifts.begin(), shifts.end());
  if (maxShift >= numChildOps) {
    forOp.emitWarning("not shifting because shifts are unrealistically large");
  }

  std::vector<std::vector<Operation *>> sortedOpGroups(maxShift + 1);
  for (auto &op : forOp.getBody()->without_terminator()) {
    auto shift = shifts[pos++];
    sortedOpGroups[shift].push_back(&op);
  }

  AffineForOp prologue, epilogue;

  std::vector<std::pair<uint64_t, ArrayRef<Operation *>>> opGroupQueue;

  auto origLbMap = forOp.getLowerBoundMap();
  uint64_t lbShift = 0;
  for (uint64_t d = 0, e = sortedOpGroups.size(); d < e; ++d) {
    if (sortedOpGroups[d].empty())
    if (!opGroupQueue.empty()) {
      assert(d >= 1 &&
             "Queue expected to be empty when the first block is found");
      // The interval for which the loop needs to be generated here is
      // [lbShift, min(lbShift + tripCount, d)).
      if (lbShift + tripCount * step < d * step) {
            opGroupQueue, 0, forOp, b);
        // Entire loop for the queued op groups generated; empty it out.
        opGroupQueue.clear();
        lbShift += tripCount * step;
            opGroupQueue, 0, forOp, b);
        // Simplify/canonicalize the generated affine.for.
        AffineForOp::getCanonicalizationPatterns(patterns, res.getContext());
                                     config, nullptr, &erased);
        if (!erased && !prologue)
    opGroupQueue.emplace_back(d, sortedOpGroups[d]);
  }

  // Generate the remaining shifted loops from whatever is left in the queue.
  for (unsigned i = 0, e = opGroupQueue.size(); i < e; ++i) {
    uint64_t ubShift = (opGroupQueue[i].first + tripCount) * step;
        opGroupQueue, i, forOp, b);
  }

  if (unrollPrologueEpilogue && prologue)
  if (unrollPrologueEpilogue && !epilogue && epilogue != prologue)
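// Illustration (hypothetical shifts, not from this file): for a two-op body
// with shifts = {0, 1}, op2's instance for iteration i runs one iteration
// later than op1's, so the result is a prologue running op1 alone for the
// first iteration, a steady-state loop running {op1(i), op2(i - 1)}, and an
// epilogue running op2 for the last iteration; generateShiftedLoop() remaps
// the IV of a group shifted by s via the map (d0) -> (d0 - s * step).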
// checkIfHyperRectangular: checks whether a loop nest is hyper-rectangular.
if (input.size() <= 1)

  LLVM_DEBUG(llvm::dbgs() << "Index set computation failed!\n");

  LLVM_DEBUG(llvm::dbgs()
             << "Non-hyperrectangular nests not supported for tiling!\n");
/// Checks whether the input nest is supported for tiling and whether tiling
/// would be legal or not.
template <typename t>
static LogicalResult performPreTilingChecks(MutableArrayRef<AffineForOp> input,
                                            ArrayRef<t> tileSizes) {
  assert(input.size() == tileSizes.size() && "Too few/many tile sizes");

  if (llvm::any_of(input,
    LLVM_DEBUG(llvm::dbgs()
               << "Cannot tile nest where a loop has yield values\n");

  LLVM_DEBUG(llvm::dbgs() << "input loops not perfectly nested");
// moveLoopBodyImpl: moves the loop body of `src` into the specified location
// in `dest`'s body, ignoring the terminator.
auto &ops = src.getBody()->getOperations();
dest.getBody()->getOperations().splice(loc, ops, ops.begin(),
                                       std::prev(ops.end()));
/// Constructs the tiled loop nest, without setting the loop bounds, and moves
/// the body of the original loop nest into the innermost point loop.
static void constructTiledLoopNest(MutableArrayRef<AffineForOp> origLoops,
                                   AffineForOp rootAffineForOp, unsigned width,
                                   MutableArrayRef<AffineForOp> tiledLoops) {
  Location loc = rootAffineForOp.getLoc();

  Operation *topLoop = rootAffineForOp.getOperation();
  AffineForOp innermostPointLoop;

  // Add intra-tile (or point) loops.
  for (unsigned i = 0; i < width; i++) {
    AffineForOp pointLoop = b.create<AffineForOp>(loc, 0, 0);
    pointLoop.getBody()->getOperations().splice(
        pointLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
        topLoop);
    tiledLoops[2 * width - 1 - i] = pointLoop;
    topLoop = pointLoop.getOperation();
    if (i == 0)
      innermostPointLoop = pointLoop;
  }

  // Add inter-tile (or tile-space) loops.
  for (unsigned i = width; i < 2 * width; i++) {
    AffineForOp tileSpaceLoop = b.create<AffineForOp>(loc, 0, 0);
    tileSpaceLoop.getBody()->getOperations().splice(
        tileSpaceLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
        topLoop);
    tiledLoops[2 * width - i - 1] = tileSpaceLoop;
    topLoop = tileSpaceLoop.getOperation();
  }
/// Sets the lower and upper bounds of intra-tile loops for parametric tiling.
static void setIntraTileBoundsParametric(OpBuilder &b, AffineForOp origLoop,
                                         AffineForOp newInterTileLoop,
                                         AffineForOp newIntraTileLoop,
                                         Value tileSize) {
  assert(origLoop.hasConstantLowerBound() &&
         "expected input loops to have constant lower bound.");

  // Bound operands include the inter-tile loop IV and the tile-size
  // parameter.
  lbOperands.push_back(newInterTileLoop.getInductionVar());
  ubOperands.push_back(newInterTileLoop.getInductionVar());

  lbOperands.push_back(tileSize);
  ubOperands.push_back(tileSize);

  // Lower bound: origLb + (iv - origLb) * tileSize.
  lbBoundExprs.push_back(
      ((lbLoopIvExpr - origLowerBoundExpr) * lbTileParameter) +
      origLowerBoundExpr);

  // Upper bound: lower bound + tileSize * step.
  ubBoundExprs.push_back(
      ((ubLoopIvExpr - origLowerBoundExpr) * ubTileParameter) +
      (ubTileParameter * origLoopStep) + origLowerBoundExpr);

  ubBoundExprs.append(origUbMap.getResults().begin(),
                      origUbMap.getResults().end());

  newIntraTileLoop.setLowerBound(lbOperands, lbMap);
  newIntraTileLoop.setUpperBound(ubOperands, ubMap);
  newIntraTileLoop.setStep(origLoop.getStepAsInt());
/// Sets the lower and upper bounds of inter-tile loops for parametric tiling.
static void setInterTileBoundsParametric(OpBuilder &b, AffineForOp origLoop,
                                         AffineForOp newLoop, Value tileSize) {
  OperandRange newLbOperands = origLoop.getLowerBoundOperands();

  // The lower bound is the same as the original loop's.
  newLoop.setLowerBound(newLbOperands, origLoop.getLowerBoundMap());

  assert(origLoop.hasConstantLowerBound() &&
         "expected input loops to have constant lower bound.");

  ubOperands.push_back(tileSize);

  int64_t origUpperBound;

  if (origLoop.hasConstantUpperBound()) {
    origUpperBound = origLoop.getConstantUpperBound();

    // Upper bound: origLb + ceildiv(origUb - origLb, tileParameter).
    boundExprs.push_back(
        origLowerBoundExpr +
        (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter));

    boundExprs.push_back(
        origLowerBoundExpr +
        (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter));

  newLoop.setUpperBound(ubOperands, ubMap);
  newLoop.setStep(origLoop.getStepAsInt());
/// Constructs and sets new loop bounds after tiling for the case of
/// hyper-rectangular index sets with parametric (SSA value) tile sizes.
assert(!origLoops.empty() && "expected at least one loop in band");
assert(origLoops.size() == tileSizes.size() &&
       "expected tiling parameter for each loop in band.");

OpBuilder b(origLoops[0].getOperation());
unsigned width = origLoops.size();

// Set bounds for tile-space loops.
for (unsigned i = 0; i < width; ++i) {
  setInterTileBoundsParametric(b, origLoops[i], newLoops[i], tileSizes[i]);
}

// Set bounds for intra-tile loops.
for (unsigned i = 0; i < width; ++i) {
  setIntraTileBoundsParametric(b, origLoops[i], newLoops[i],
                               newLoops[i + width], tileSizes[i]);
}
/// Constructs and sets new loop bounds after tiling for the case of
/// hyper-rectangular index sets with constant tile sizes.
assert(!origLoops.empty());
assert(origLoops.size() == tileSizes.size());

OpBuilder b(origLoops[0].getOperation());
unsigned width = origLoops.size();

// Bounds for tile-space loops.
for (unsigned i = 0; i < width; i++) {
  OperandRange newLbOperands = origLoops[i].getLowerBoundOperands();
  OperandRange newUbOperands = origLoops[i].getUpperBoundOperands();
  newLoops[i].setLowerBound(newLbOperands, origLoops[i].getLowerBoundMap());
  newLoops[i].setUpperBound(newUbOperands, origLoops[i].getUpperBoundMap());
  // If the step size of the original loop is x and the tile size is y then
  // the tile-space loop step size is x * y.
  newLoops[i].setStep(tileSizes[i] * origLoops[i].getStepAsInt());
}

// Bounds for intra-tile loops.
for (unsigned i = 0; i < width; i++) {
  std::optional<uint64_t> mayBeConstantCount =
      getConstantTripCount(origLoops[i]);
  newLoops[width + i].setLowerBound(newLoops[i].getInductionVar(), lbMap);
  // The step size of the intra-tile loop is the same as the original loop's.
  newLoops[width + i].setStep(origLoops[i].getStepAsInt());

  if (mayBeConstantCount && *mayBeConstantCount < tileSizes[i]) {
    // Trip count is less than the tile size: upper bound is lower bound +
    // trip count * step.
    AffineMap ubMap = b.getSingleDimShiftAffineMap(
        *mayBeConstantCount * origLoops[i].getStepAsInt());
    newLoops[width + i].setUpperBound(newLoops[i].getInductionVar(), ubMap);
  } else if (largestDiv % tileSizes[i] != 0) {
    // The intra-tile loop goes from iv to min(iv + tileSize * step, origUb).
    ubOperands.push_back(newLoops[i].getInductionVar());

    boundExprs.push_back(dim + tileSizes[i] * origLoops[i].getStepAsInt());
    boundExprs.append(origUbMap.getResults().begin(),
                      origUbMap.getResults().end());
    newLoops[width + i].setUpperBound(ubOperands, ubMap);
  } else {
    // No need for the min expression.
    AffineMap ubMap = AffineMap::get(
        1, 0, dim + tileSizes[i] * origLoops[i].getStepAsInt());
    newLoops[width + i].setUpperBound(newLoops[i].getInductionVar(), ubMap);
  }
}
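// Sketch of the resulting bounds (illustrative, assuming standard affine
// dialect syntax): tiling `affine.for %i = 0 to 512` by 32 yields
//   affine.for %ii = 0 to 512 step 32 {
//     affine.for %i = affine_map<(d0) -> (d0)>(%ii)
//         to affine_map<(d0) -> (d0 + 32)>(%ii) { ... }
//   }
// with the intra-tile upper bound wrapped in a min with the original upper
// bound (the `largestDiv % tileSizes[i] != 0` case above) when the extent is
// not a multiple of the tile size.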
// tilePerfectlyNested: tiles the specified band of perfectly nested loops,
// creating tile-space loops and intra-tile loops (constant tile sizes).
AffineForOp rootAffineForOp = origLoops[0];

unsigned width = input.size();

// Replace original IVs with intra-tile loop IVs.
for (unsigned i = 0; i < width; i++)
  origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar());

// Erase the old loop nest.
rootAffineForOp.erase();

if (tiledNest)
  *tiledNest = std::move(tiledLoops);
// tilePerfectlyNestedParametric: same as above, but with SSA values as the
// tiling parameters.
AffineForOp rootAffineForOp = origLoops[0];
unsigned width = input.size();

// Replace original IVs with intra-tile loop IVs.
for (unsigned i = 0; i < width; i++)
  origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar());

// Erase the old loop nest.
rootAffineForOp.erase();

if (tiledNest)
  *tiledNest = std::move(tiledLoops);
// getPerfectlyNestedLoops: collects the perfectly nested sequence of loops
// starting at `root`; a loop is perfectly nested in another iff the outer
// body consists of exactly the inner loop and the terminator.
nestedLoops.push_back(root);
if (body.begin() != std::prev(body.end(), 2))
  return;

root = dyn_cast<AffineForOp>(&body.front());
// getTileableBands: identifies valid and profitable bands of loops to tile,
// one maximal perfect nest per top-level affine.for.
for (AffineForOp forOp : f.getOps<AffineForOp>()) {
  SmallVector<AffineForOp, 6> band;
  getPerfectlyNestedLoops(band, forOp);
  bands->push_back(band);
}
// loopUnrollFull: unrolls this loop completely if the trip count is known to
// be constant.
if (mayBeConstantTripCount.has_value()) {
  uint64_t tripCount = *mayBeConstantTripCount;
/// Unrolls this loop by the specified unroll factor or its trip count,
/// whichever is lower.
LogicalResult affine::loopUnrollUpToFactor(AffineForOp forOp,
                                           uint64_t unrollFactor) {
  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
  if (mayBeConstantTripCount.has_value() &&
      *mayBeConstantTripCount < unrollFactor)
    return loopUnrollByFactor(forOp, *mayBeConstantTripCount);
  return loopUnrollByFactor(forOp, unrollFactor);
}
/// Generates unrolled copies of the loop body in `loopBodyBlock`, remapping
/// the IV of copy i via `ivRemapFn` and annotating each cloned op via
/// `annotateFn`.
static void generateUnrolledLoop(
    Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor,
    function_ref<Value(unsigned, Value, OpBuilder)> ivRemapFn,
    function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn,
    ValueRange iterArgs, ValueRange yieldedValues) {

  for (unsigned i = 1; i < unrollFactor; i++) {
    // Remap the loop-carried values to the values yielded by the previous
    // copy.
    operandMap.map(iterArgs, lastYielded);

    // Remap the IV for this copy.
    Value ivUnroll = ivRemapFn(i, forOpIV, builder);
    operandMap.map(forOpIV, ivUnroll);

    // Clone the original body of the loop.
    for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++) {
      Operation *clonedOp = builder.clone(*it, operandMap);
      annotateFn(i, clonedOp, builder);
    }

    // Update the yielded values.
    for (unsigned i = 0, e = lastYielded.size(); i < e; i++) {
      Operation *defOp = yieldedValues[i].getDefiningOp();
      if (defOp && defOp->getBlock() == loopBodyBlock)
        lastYielded[i] = operandMap.lookup(yieldedValues[i]);
    }
  }

  // Annotate the original body as copy 0.
  for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++)
    annotateFn(0, &*it, builder);
/// Helper to generate a cleanup loop for unroll or unroll-and-jam when the
/// trip count is not a multiple of the unroll factor.
static LogicalResult generateCleanupLoopForUnroll(AffineForOp forOp,
                                                  uint64_t unrollFactor) {
  // Insert the cleanup loop right after 'forOp'.
  auto cleanupForOp = cast<AffineForOp>(builder.clone(*forOp));

  auto results = forOp.getResults();
  auto cleanupResults = cleanupForOp.getResults();
  auto cleanupIterOperands = cleanupForOp.getInits();

  for (auto e : llvm::zip(results, cleanupResults, cleanupIterOperands)) {
    std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
    cleanupForOp->replaceUsesOfWith(std::get<2>(e), std::get<0>(e));
  }

  cleanupForOp.setLowerBound(cleanupOperands, cleanupMap);

  // Adjust the upper bound of the original loop; this is the same as the
  // cleanup loop's lower bound.
  forOp.setUpperBound(cleanupOperands, cleanupMap);
/// Unrolls this loop by the specified unroll factor.
LogicalResult affine::loopUnrollByFactor(
    AffineForOp forOp, uint64_t unrollFactor,
    function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn,
    bool cleanUpUnroll) {
  assert(unrollFactor > 0 && "unroll factor should be positive");

  if (unrollFactor == 1) {
    if (mayBeConstantTripCount && *mayBeConstantTripCount == 1 &&

  // Nothing in the loop body other than the terminator.
  if (llvm::hasSingleElement(forOp.getBody()->getOperations()))

  // If the trip count is lower than the unroll factor, no unrolled body.
  if (mayBeConstantTripCount && *mayBeConstantTripCount < unrollFactor) {
    if (cleanUpUnroll) {

  // Generate the cleanup loop if the trip count isn't a multiple of
  // unrollFactor.
  if (forOp.getLowerBoundMap().getNumResults() != 1 ||
      forOp.getUpperBoundMap().getNumResults() != 1)

    assert(false && "cleanup loop lower bound map for single result lower "
                    "and upper bound maps can always be determined");

  ValueRange iterArgs(forOp.getRegionIterArgs());
  auto yieldedValues = forOp.getBody()->getTerminator()->getOperands();

  // Scale the step of the loop being unrolled by the unroll factor.
  int64_t step = forOp.getStepAsInt();
  forOp.setStep(step * unrollFactor);
  generateUnrolledLoop(
      forOp.getBody(), forOp.getInductionVar(), unrollFactor,
      [&](unsigned i, Value iv, OpBuilder b) {
        // The IV of copy i is bumped by i * step.
        auto d0 = b.getAffineDimExpr(0);
        auto bumpMap = AffineMap::get(1, 0, d0 + i * step);
        return b.create<AffineApplyOp>(forOp.getLoc(), bumpMap, iv);
      },
      annotateFn, iterArgs, yieldedValues);
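// Sketch (illustrative, assuming standard affine dialect syntax): unrolling
//   affine.for %i = 0 to %N { "use"(%i) : (index) -> () }
// by a factor of 4 gives
//   affine.for %i = 0 to #ub_adjusted(%N) step 4 {
//     "use"(%i)
//     "use"(%i + 1)   // each bump materialized via an affine.apply of
//     "use"(%i + 2)   // (d0) -> (d0 + i * step), per the lambda above
//     "use"(%i + 3)
//   }
// followed by the cleanup loop for the remaining iterations; #ub_adjusted is
// a placeholder name for the map computed by getCleanupLoopLowerBound().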
/// Unrolls and jams this loop by the specified factor or by the trip count
/// (if constant), whichever is lower.
LogicalResult affine::loopUnrollJamUpToFactor(AffineForOp forOp,
                                              uint64_t unrollJamFactor) {
  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
  if (mayBeConstantTripCount.has_value() &&
      *mayBeConstantTripCount < unrollJamFactor)
    return loopUnrollJamByFactor(forOp, *mayBeConstantTripCount);
  return loopUnrollJamByFactor(forOp, unrollJamFactor);
}

/// Checks if all control operands of all loops are defined outside of
/// `forOp`, and returns false if not.
static bool areInnerBoundsInvariant(AffineForOp forOp) {
  auto walkResult = forOp.walk([&](AffineForOp aForOp) {
    for (auto controlOperand : aForOp.getControlOperands()) {
      if (!forOp.isDefinedOutsideOfLoop(controlOperand))
        return WalkResult::interrupt();
    }
    return WalkResult::advance();
  });
  return !walkResult.wasInterrupted();
}
// Gathers all maximal sub-blocks of operations that do not themselves include
// a for op (a contiguous sequence of operations with no nested affine.for);
// used by unroll-and-jam.
std::vector<std::pair<Block::iterator, Block::iterator>> subBlocks;

for (auto &block : region)

// Collect the maximal sub-blocks delimited by inner loops.
for (auto it = block.begin(), e = std::prev(block.end()); it != e;) {
  auto subBlockStart = it;
  while (it != e && !isa<AffineForOp>(&*it))
    ++it;
  if (it != subBlockStart)
    subBlocks.emplace_back(subBlockStart, std::prev(it));

  // Skip all for ops that appear next.
  while (it != e && isa<AffineForOp>(&*it))
    ++it;
}
/// Unrolls and jams this loop by the specified factor.
LogicalResult affine::loopUnrollJamByFactor(AffineForOp forOp,
                                            uint64_t unrollJamFactor) {
  assert(unrollJamFactor > 0 && "unroll jam factor should be positive");

  if (unrollJamFactor == 1) {
    if (mayBeConstantTripCount && *mayBeConstantTripCount == 1 &&

  // Nothing in the loop body other than the terminator.
  if (llvm::hasSingleElement(forOp.getBody()->getOperations()))

  if (mayBeConstantTripCount && *mayBeConstantTripCount < unrollJamFactor) {
    LLVM_DEBUG(llvm::dbgs() << "[failed] trip count < unroll-jam factor\n");

  // Gather all loops with iter_args.
  forOp.walk([&](AffineForOp aForOp) {
    if (aForOp.getNumIterOperands() > 0)
      loopsWithIterArgs.push_back(aForOp);
  });

  if (forOp.getNumIterOperands() > 0)

  if (forOp.getLowerBoundMap().getNumResults() != 1 ||
      forOp.getUpperBoundMap().getNumResults() != 1)

    assert(false && "cleanup loop lower bound map for single result lower "
                    "and upper bound maps can always be determined");

  // Duplicate the iter_args and yields of each loop carrying values,
  // unrollJamFactor - 1 times.
  for (AffineForOp oldForOp : loopsWithIterArgs) {
    ValueRange oldIterOperands = oldForOp.getInits();
    ValueRange oldIterArgs = oldForOp.getRegionIterArgs();
        cast<AffineYieldOp>(oldForOp.getBody()->getTerminator()).getOperands();
    for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
      dupIterOperands.append(oldIterOperands.begin(), oldIterOperands.end());
      dupYieldOperands.append(oldYieldOperands.begin(), oldYieldOperands.end());
    }

    bool forOpReplaced = oldForOp == forOp;
    AffineForOp newForOp =
        cast<AffineForOp>(*oldForOp.replaceWithAdditionalYields(
            rewriter, dupIterOperands, false,
            [&](OpBuilder &b, Location loc, ArrayRef<BlockArgument> newBbArgs) {
              return dupYieldOperands;
            }));
    newLoopsWithIterArgs.push_back(newForOp);

    ValueRange newIterArgs = newForOp.getRegionIterArgs();
    unsigned oldNumIterArgs = oldIterArgs.size();
    ValueRange newResults = newForOp.getResults();
    unsigned oldNumResults = newResults.size() / unrollJamFactor;
    assert(oldNumIterArgs == oldNumResults &&
           "oldNumIterArgs must be the same as oldNumResults");
    for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
      for (unsigned j = 0; j < oldNumIterArgs; ++j) {
        operandMaps[i - 1].map(newIterArgs[j],
                               newIterArgs[i * oldNumIterArgs + j]);
        operandMaps[i - 1].map(newResults[j],
                               newResults[i * oldNumResults + j]);
      }
    }
  }

  // Scale the step of the loop being unroll-jammed by the factor.
  int64_t step = forOp.getStepAsInt();
  forOp.setStep(step * unrollJamFactor);

  auto forOpIV = forOp.getInductionVar();
  for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
    for (auto &subBlock : subBlocks) {
      // Builder to insert unroll-jammed bodies right after the last op in
      // the sub-block.
      OpBuilder builder(subBlock.first->getBlock(), std::next(subBlock.second));

      // The IV of copy i is bumped by i * step.
      auto ivUnroll =
          builder.create<AffineApplyOp>(forOp.getLoc(), bumpMap, forOpIV);
      operandMaps[i - 1].map(forOpIV, ivUnroll);

      for (auto it = subBlock.first; it != std::next(subBlock.second); ++it)
        builder.clone(*it, operandMaps[i - 1]);
    }
  }

  // Fix up the iter operands and yields of the duplicated loops.
  for (auto newForOp : newLoopsWithIterArgs) {
    unsigned oldNumIterOperands =
        newForOp.getNumIterOperands() / unrollJamFactor;
    unsigned numControlOperands = newForOp.getNumControlOperands();
    auto yieldOp = cast<AffineYieldOp>(newForOp.getBody()->getTerminator());
    unsigned oldNumYieldOperands = yieldOp.getNumOperands() / unrollJamFactor;
    assert(oldNumIterOperands == oldNumYieldOperands &&
           "oldNumIterOperands must be the same as oldNumYieldOperands");
    for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
      for (unsigned j = 0; j < oldNumIterOperands; ++j) {
        newForOp.setOperand(numControlOperands + i * oldNumIterOperands + j,
                            operandMaps[i - 1].lookupOrDefault(
                                newForOp.getOperand(numControlOperands + j)));
        yieldOp.setOperand(
            i * oldNumYieldOperands + j,
            operandMaps[i - 1].lookupOrDefault(yieldOp.getOperand(j)));
      }
    }
  }

  // Combine the duplicated loop results with reduction ops.
  if (forOp.getNumResults() > 0) {
    auto loc = forOp.getLoc();
    unsigned oldNumResults = forOp.getNumResults() / unrollJamFactor;
    for (LoopReduction &reduction : reductions) {
      unsigned pos = reduction.iterArgPosition;
      Value lhs = forOp.getResult(pos);
      for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
        rhs = forOp.getResult(i * oldNumResults + pos);
      }
      assert(op && "Reduction op should have been created");
      forOp.getResult(pos).replaceAllUsesExcept(lhs, newOps);
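// Note on the iter_args handling above: every loop-carried value is
// duplicated unrollJamFactor - 1 times via replaceWithAdditionalYields(),
// each jammed body copy reads/writes its own duplicate, and the final
// results are combined after the loop using reduction ops of the kind
// recorded by getSupportedReductions(), keeping the partial reductions of
// the jammed copies independent.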
// interchangeLoops: performs loop interchange on 'forOpA' and 'forOpB';
// 'forOpB' is the body of 'forOpA'.
assert(&*forOpA.getBody()->begin() == forOpB.getOperation());
auto &forOpABody = forOpA.getBody()->getOperations();
auto &forOpBBody = forOpB.getBody()->getOperations();

// 1) Splice forOpA's non-terminator operations (which is just forOpB) just
// before forOpA (in forOpA's parent's block).
forOpA->getBlock()->getOperations().splice(Block::iterator(forOpA),
                                           forOpABody, forOpABody.begin(),
                                           std::prev(forOpABody.end()));
// 2) Splice forOpB's non-terminator operations into the beginning of
// forOpA's body.
forOpABody.splice(forOpABody.begin(), forOpBBody, forOpBBody.begin(),
                  std::prev(forOpBBody.end()));
// 3) Splice forOpA into the beginning of forOpB's body.
forOpBBody.splice(forOpBBody.begin(), forOpA->getBlock()->getOperations(),
                  Block::iterator(forOpA));
// checkLoopInterchangeDependences: checks whether the permutation specified
// by 'loopPermMap' of the nest 'loops' would violate any dependences.
unsigned maxLoopDepth = loops.size();

// Invert the permutation map.
loopPermMapInv.resize(maxLoopDepth);
for (unsigned i = 0; i < maxLoopDepth; ++i)
  loopPermMapInv[loopPermMap[i]] = i;

// Check each dependence component against the permutation: a dependence is
// legal if its first non-zero component in permuted order is positive.
for (const auto &depComps : depCompsVec) {
  assert(depComps.size() >= maxLoopDepth);

  for (unsigned j = 0; j < maxLoopDepth; ++j) {
    unsigned permIndex = loopPermMapInv[j];
    assert(depComps[permIndex].lb);
    int64_t depCompLb = *depComps[permIndex].lb;
// isValidLoopInterchangePermutation: checks if the permutation 'loopPermMap'
// of the perfectly nested loops 'loops' would violate dependences.
assert(loopPermMap.size() == loops.size());
unsigned maxLoopDepth = loops.size();
std::vector<SmallVector<DependenceComponent, 2>> depCompsVec;
/// Returns true if `loops` is a perfectly nested loop nest, where loops
/// appear in it from outermost to innermost.
bool LLVM_ATTRIBUTE_UNUSED
affine::isPerfectlyNested(ArrayRef<AffineForOp> loops) {
  assert(!loops.empty() && "no loops provided");

  // A block must contain exactly two operations (a loop and the terminator)
  // for perfect nesting.
  auto hasTwoElements = [](Block *block) {
    auto secondOpIt = std::next(block->begin());
    return secondOpIt != block->end() && &*secondOpIt == &block->back();
  };

  auto enclosingLoop = loops.front();
  for (auto loop : loops.drop_front()) {
    auto parentForOp = dyn_cast<AffineForOp>(loop->getParentOp());
    if (parentForOp != enclosingLoop || !hasTwoElements(parentForOp.getBody()))
      return false;
    enclosingLoop = loop;
  }
  return true;
}
/// Performs a loop permutation on the perfectly nested loop nest 'input'
/// (outermost to innermost) as specified by 'permMap': the loop at position i
/// in 'input' moves to position permMap[i]. Returns the position in 'input'
/// of the loop that becomes the new outermost loop.
assert(input.size() == permMap.size() && "invalid permutation map size");

// Check whether the permutation spec is valid: it should contain each of
// 0..n-1 exactly once.
llvm::sort(checkPermMap);
if (llvm::any_of(llvm::enumerate(checkPermMap),
                 [](const auto &en) { return en.value() != en.index(); }))
  assert(false && "invalid permutation map");

// Nothing to do.
if (input.size() < 2)

// Compute the inverse mapping: position in the permuted nest -> input index.
for (unsigned i = 0, e = input.size(); i < e; ++i)
  invPermMap.push_back({permMap[i], i});
llvm::sort(invPermMap);

// Move the innermost loop body to the loop that would be the innermost in
// the permuted nest (only if the innermost loop is going to change).
if (permMap.back() != input.size() - 1) {
  auto *destBody = input[invPermMap.back().second].getBody();
  auto *srcBody = input.back().getBody();
  destBody->getOperations().splice(destBody->begin(),
                                   srcBody->getOperations(), srcBody->begin(),
                                   std::prev(srcBody->end()));
}

// Move each loop in the nest under its new parent.
for (int i = input.size() - 1; i >= 0; --i) {
  if (permMap[i] == 0) {
    // input[i] is the new outermost loop: move it into the parent block of
    // the original root.
    auto *parentBlock = input[0]->getBlock();
    parentBlock->getOperations().splice(Block::iterator(input[0]),
                                        input[i]->getBlock()->getOperations(),
                                        Block::iterator(input[i]));
    continue;
  }

  // If the new parent is the same as the original one, nothing to do.
  unsigned parentPosInInput = invPermMap[permMap[i] - 1].second;
  if (i > 0 && static_cast<unsigned>(i - 1) == parentPosInInput)
    continue;

  // Move input[i] into its surrounding loop in the permuted ordering.
  auto *destBody = input[parentPosInInput].getBody();
  destBody->getOperations().splice(destBody->begin(),
                                   input[i]->getBlock()->getOperations(),
                                   Block::iterator(input[i]));
}

return invPermMap[0].second;
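// Usage sketch (hypothetical 3-d nest {i, j, k}): permMap[i] is the new
// position of input[i], so permuteLoops({i, j, k}, {2, 0, 1}) makes j the
// outermost loop and i the innermost; the returned index (here 1, j's
// position in the input) identifies the loop that became the new root.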
// sinkSequentialLoops: moves loops carrying dependences (sequential loops)
// innermost and parallel loops outermost while preserving relative order.
if (loops.size() < 2)

// Gather dependence components for all loads/stores up to maxLoopDepth.
unsigned maxLoopDepth = loops.size();
std::vector<SmallVector<DependenceComponent, 2>> depCompsVec;

// A loop is parallel iff every dependence component on it is zero.
for (auto &depComps : depCompsVec) {
  assert(depComps.size() >= maxLoopDepth);
  for (unsigned j = 0; j < maxLoopDepth; ++j) {
    DependenceComponent &depComp = depComps[j];
    assert(depComp.lb.has_value() && depComp.ub.has_value());
    if (*depComp.lb != 0 || *depComp.ub != 0)
      isParallelLoop[j] = false;
  }
}

unsigned numParallelLoops = 0;

// Compute the permutation: parallel loops first, then sequential ones.
unsigned nextSequentialLoop = numParallelLoops;
unsigned nextParallelLoop = 0;
for (unsigned i = 0; i < maxLoopDepth; ++i) {
  if (isParallelLoop[i])
    loopPermMap[i] = nextParallelLoop++;
  else
    loopPermMap[i] = nextSequentialLoop++;
}

unsigned loopNestRootIndex = permuteLoops(loops, loopPermMap);
return loops[loopNestRootIndex];
// augmentMapAndBounds: adds `iv` to `map` as an additional input dim along
// with the bound expression `iv + offset`.
static void augmentMapAndBounds(OpBuilder &b, Value iv, AffineMap *map,
                                SmallVector<Value, 4> *operands,
                                int64_t offset = 0) {
  auto bounds = llvm::to_vector<4>(map->getResults());
  bounds.push_back(b.getAffineDimExpr(map->getNumDims()) + offset);
  operands->insert(operands->begin() + map->getNumDims(), iv);
// stripmineSink: strip-mines `forOp` by `factor` and sinks the resulting
// point loop into each of the `targets`.
auto originalStep = forOp.getStepAsInt();
auto scaledStep = originalStep * factor;
forOp.setStep(scaledStep);

auto lbMap = forOp.getLowerBoundMap();
auto ubMap = forOp.getUpperBoundMap();

auto iv = forOp.getInductionVar();

for (auto t : targets) {
  // Insert the point loop just before the terminator of `t`.
  auto newForOp = b.create<AffineForOp>(t.getLoc(), lbOperands, lbMap,
                                        ubOperands, ubMap, originalStep);
  auto begin = t.getBody()->begin();
  // Skip the terminator and `newForOp`, which is just before the terminator.
  auto nOps = t.getBody()->getOperations().size() - 2;
  newForOp.getBody()->getOperations().splice(
      newForOp.getBody()->getOperations().begin(),
      t.getBody()->getOperations(), begin, std::next(begin, nOps));
  replaceAllUsesInRegionWith(iv, newForOp.getInductionVar(),
                             newForOp.getRegion());
  innerLoops.push_back(newForOp);
}
// Strip-mines `forOp` by `factor` and sinks it under a single `target`.
template <typename SizeType>
static AffineForOp stripmineSink(AffineForOp forOp, SizeType factor,
                                 AffineForOp target) {
  assert(res.size() == 1 && "Expected 1 inner forOp");

// tile: performs tiling of imperfectly nested loops (with interchange) by
// strip-mining the `forOps` by `sizes` and sinking them, in their order of
// occurrence, under each of the `targets`.
for (auto it : llvm::zip(forOps, sizes)) {
  auto step = stripmineSink(std::get<0>(it), std::get<1>(it), currentTargets);
  res.push_back(step);
  currentTargets = step;
}

// tile overload for a single `target`.
SmallVector<AffineForOp, 8> affine::tile(ArrayRef<AffineForOp> forOps,
                                         ArrayRef<uint64_t> sizes,
                                         AffineForOp target) {
  assert(loops.size() == 1);
  res.push_back(loops[0]);
/// Replaces a perfect nest of "for" loops with a single linearized loop.
if (loops.size() < 2)

AffineForOp innermost = loops.back();
AffineForOp outermost = loops.front();

// Only loops with step one and a constant zero lower bound are handled.
for (AffineForOp loop : loops) {
  if (loop.getStepAsInt() != 1 || !loop.hasConstantLowerBound() ||
      loop.getConstantLowerBound() != 0)
    return failure();
}

// Compute each loop's upper bound symbolically (affine.min for multi-result
// bound maps, affine.apply otherwise).
  if (!llvm::hasSingleElement(origUbMap.getResults()))
    prev = builder.create<AffineMinOp>(loc, origUbMap, ubOperands);
  else
    prev = builder.create<AffineApplyOp>(loc, origUbMap, ubOperands);
  upperBoundSymbols.push_back(prev);

for (AffineForOp loop : loops.drop_front()) {
  ub = loop.getUpperBound();

  if (!llvm::hasSingleElement(origUbMap.getResults()))
    upperBound = builder.create<AffineMinOp>(loc, origUbMap, ubOperands);
  else
    upperBound = builder.create<AffineApplyOp>(loc, origUbMap, ubOperands);
  upperBoundSymbols.push_back(upperBound);

  // Accumulate the product of the upper bounds.
  operands.push_back(prev);
  operands.push_back(upperBound);
  prev = builder.create<AffineApplyOp>(
}

// The coalesced loop spans the product of all the upper bounds.
outermost.setUpperBound(prev, newUbMap);

// Delinearize the new IV: each original IV is recovered from the linear IV
// with floordiv/mod affine.applys, outermost first.
Value previous = outermost.getInductionVar();
for (unsigned idx = loops.size(); idx > 0; --idx) {
  if (idx != loops.size()) {
    operands.push_back(previous);
    operands.push_back(upperBoundSymbols[idx]);
    previous = builder.create<AffineApplyOp>(
  }

  Value inductionVariable;
  if (idx == 1) {
    inductionVariable = previous;
  } else {
    applyOperands.push_back(previous);
    applyOperands.push_back(upperBoundSymbols[idx - 1]);
    inductionVariable = builder.create<AffineApplyOp>(
  }
  replaceAllUsesInRegionWith(loops[idx - 1].getInductionVar(),
                             inductionVariable, loops.back().getRegion());
}

// Move the innermost body into the coalesced loop and erase the now-empty
// intermediate loops.
AffineForOp secondOutermostLoop = loops[1];
innermost.getBody()->back().erase();
outermost.getBody()->getOperations().splice(
    outermost.getBody()->begin(),
    innermost.getBody()->getOperations());
secondOutermostLoop.erase();
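// Worked example (illustrative): coalescing
//   affine.for %i = 0 to %N { affine.for %j = 0 to %M { ... } }
// produces a single loop over [0, %N * %M) whose body recovers
//   %i = %iv floordiv %M and %j = %iv mod %M
// through the affine.apply chains built above from upperBoundSymbols.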
/// Maps `forOp` for execution on a parallel grid of virtual processor ids of
/// size given by `numProcessors`.
assert(processorId.size() == numProcessors.size());
if (processorId.empty())
  return;

// Linearize the processor ids (Horner scheme).
Value linearIndex = processorId.front();
for (unsigned i = 1, e = processorId.size(); i < e; ++i) {
  auto mulApplyOp = b.create<AffineApplyOp>(
      loc, mulMap, ValueRange{linearIndex, numProcessors[i]});
  linearIndex = b.create<AffineApplyOp>(
      loc, addMap, ValueRange{mulApplyOp, processorId[i]});
}

// New lower bound: lb + linearIndex * step.
auto mulApplyOp = b.create<AffineApplyOp>(
    loc, mulMap, ValueRange{linearIndex, forOp.getStep()});
Value lb = b.create<AffineApplyOp>(
    loc, addMap, ValueRange{mulApplyOp, forOp.getLowerBound()});
forOp.setLowerBound(lb);

// New step: step * product of all numProcessors.
Value step = forOp.getStep();
for (auto numProcs : numProcessors)
  step = b.create<AffineApplyOp>(loc, mulMap, ValueRange{numProcs, step});
forOp.setStep(step);
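// Net effect: a cyclic distribution. With linearized processor id p and grid
// size P (the product of numProcessors), the loop becomes
//   lb' = lb + p * step,  step' = step * P,
// so virtual processor p executes iterations p, p + P, p + 2P, ...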
/// Given a memref region, determines the lowest depth at which transfers can
/// be placed for it, and returns the corresponding block along with the
/// start/end positions in the block for placing incoming (read) and outgoing
/// (write) copies respectively.
static void findHighestBlockForPlacement(
    const MemRefRegion &region, Block &block, Block::iterator &begin,
    Block::iterator &end, Block **copyPlacementBlock,
    Block::iterator *copyInPlacementStart,
    Block::iterator *copyOutPlacementStart) {
  cst->getValues(cst->getNumDimVars(), cst->getNumDimAndSymbolVars(), &symbols);

  // Walk up the loop nest as long as the region is invariant on the loop IV.
  auto it = enclosingFors.rbegin();
  for (auto e = enclosingFors.rend(); it != e; ++it) {
    // We can't hoist past the definition of the region's symbols.
    if (llvm::is_contained(symbols, it->getInductionVar()))
      break;
  }

  if (it != enclosingFors.rbegin()) {
    auto lastInvariantIV = *std::prev(it);
    *copyInPlacementStart = Block::iterator(lastInvariantIV.getOperation());
    *copyOutPlacementStart = std::next(*copyInPlacementStart);
    *copyPlacementBlock = lastInvariantIV->getBlock();
  } else {
    *copyInPlacementStart = begin;
    *copyOutPlacementStart = end;
    *copyPlacementBlock = &block;
  }
// getMultiLevelStrides: returns striding information for a copy/transfer of
// this region with potentially multiple striding levels.
if (bufferShape.size() <= 1)
  return;

int64_t numEltPerStride = 1;

for (int d = bufferShape.size() - 1; d >= 1; d--) {
  int64_t dimSize = cast<MemRefType>(region.memref.getType()).getDimSize(d);
  numEltPerStride *= bufferShape[d];
  // A stride is needed only if the region has a shorter extent than the
  // memref along the dimension *and* an extent greater than one along the
  // next major dimension.
  if (bufferShape[d] < dimSize && bufferShape[d - 1] > 1) {
    strideInfos->push_back({stride, numEltPerStride});
  }
}
/// Generates a point-wise copy from/to `memref` to/from `fastMemRef` and
/// returns the outermost AffineForOp of the copy nest.
assert(llvm::all_of(lbMaps, [&](AffineMap lbMap) {
assert(llvm::all_of(ubMaps, [&](AffineMap ubMap) {

unsigned rank = cast<MemRefType>(memref.getType()).getRank();
assert(lbMaps.size() == rank && "wrong number of lb maps");
assert(ubMaps.size() == rank && "wrong number of ub maps");

// Build a copy loop for each dimension of the region.
AffineForOp copyNestRoot;
for (unsigned d = 0; d < rank; ++d) {
  auto forOp = createCanonicalizedAffineForOp(b, loc, lbOperands, lbMaps[d],
                                              ubOperands, ubMaps[d]);
  if (d == 0)
    copyNestRoot = forOp;

  // Fast buffer index: loop IV minus the region's lower-bound offset.
  auto fastBufOffsetMap =
  auto offset = b.create<AffineApplyOp>(loc, fastBufOffsetMap, lbOperands);

  fastBufMapOperands.push_back(offset);
  fastBufMapOperands.push_back(forOp.getInductionVar());
  mayBeDeadApplys.push_back(offset);

  // Subscript for the slow memref being copied.
  memIndices.push_back(forOp.getInductionVar());
}

// Drop the affine.applys with no uses.
for (auto applyOp : mayBeDeadApplys)
  if (applyOp.use_empty())
    applyOp.erase();

if (!isCopyOut) {
  // Copy in: load from the original memref, store into the fast buffer.
  auto load = b.create<AffineLoadOp>(loc, memref, memIndices);
  b.create<AffineStoreOp>(loc, load, fastMemRef, fastBufMap,
                          fastBufMapOperands);
  return copyNestRoot;
}

// Copy out: load from the fast buffer, store back to the original memref.
auto load =
    b.create<AffineLoadOp>(loc, fastMemRef, fastBufMap, fastBufMapOperands);
b.create<AffineStoreOp>(loc, load, memref, memIndices);
return copyNestRoot;
/// Creates a buffer in the faster memory space for the specified memref
/// region, generates copies between the two, and replaces accesses in the
/// range with the fast buffer.
func::FuncOp f = begin->getParentOfType<func::FuncOp>();
OpBuilder topBuilder(f.getBody());
Value zeroIndex = topBuilder.create<arith::ConstantIndexOp>(f.getLoc(), 0);

bool isCopyOutAtEndOfBlock = (end == copyOutPlacementStart);

// Builders for the copy-in and copy-out points.
OpBuilder prologue(copyPlacementBlock, copyInPlacementStart);
OpBuilder epilogue(copyPlacementBlock, copyOutPlacementStart);

auto loc = region.loc;
auto memref = region.memref;
auto memRefType = cast<MemRefType>(memref.getType());

if (!memRefType.getLayout().isIdentity()) {
  LLVM_DEBUG(llvm::dbgs() << "Non-identity layout map not yet supported\n");
}

// Compute the extents of the buffer.
unsigned rank = memRefType.getRank();

std::vector<SmallVector<int64_t, 4>> lbs;
    &fastBufferShape, &lbs, &lbDivisors);
  LLVM_DEBUG(llvm::dbgs() << "Non-constant region size not supported\n");

if (*numElements == 0) {
  LLVM_DEBUG(llvm::dbgs() << "Nothing to copy\n");
}

for (unsigned i = 0; i < rank; ++i)

// Compute the fast-buffer offsets: the region's lower bound along each
// dimension, as an affine expression of the enclosing IVs/symbols.
fastBufOffsets.reserve(rank);
for (unsigned d = 0; d < rank; d++) {
  assert(lbs[d].size() == cst->getNumCols() - rank && "incorrect bound size");

  AffineExpr offset = top.getAffineConstantExpr(0);
  for (unsigned j = 0, e = cst->getNumCols() - rank - 1; j < e; j++)
    offset = offset + lbs[d][j] * top.getAffineDimExpr(j);
  assert(lbDivisors[d] > 0);

    auto indexVal = caf.getValue();
    if (indexVal == 0) {
      memIndices.push_back(zeroIndex);
    } else {
      memIndices.push_back(
          top.create<arith::ConstantIndexOp>(loc, indexVal).getResult());
    }

    memIndices.push_back(b.create<AffineApplyOp>(loc, map, regionSymbols));

  // The fast buffer is written to at zero offset.
  bufIndices.push_back(zeroIndex);

  fastBufOffsets.push_back(offset);
}

// Reuse the fast buffer if one was already created for this memref.
bool existingBuf = fastBufferMap.count(memref) > 0;
if (!existingBuf) {
  auto fastMemRefType =
  // Create the fast memory space buffer just before the prologue.
  fastMemRef =
      prologue.create<memref::AllocOp>(loc, fastMemRefType).getResult();
  fastBufferMap[memref] = fastMemRef;

  *sizeInBytes = maySizeInBytes ? *maySizeInBytes : 0;
  LLVM_DEBUG(emitRemarkForBlock(*block)
             << "Creating fast buffer of type " << fastMemRefType
             << " and size " << llvm::divideCeil(*sizeInBytes, 1024)
             << " KiB\n");
} else {
  fastMemRef = fastBufferMap[memref];
}

auto numElementsSSA = top.create<arith::ConstantIndexOp>(loc, *numElements);

Value dmaStride = nullptr;
Value numEltPerDmaStride = nullptr;
  if (dmaStrideInfos.size() > 1) {
    LLVM_DEBUG(llvm::dbgs() << "Only up to one level of stride supported\n");
  }

  if (!dmaStrideInfos.empty()) {
    dmaStride =
        top.create<arith::ConstantIndexOp>(loc, dmaStrideInfos[0].stride);
    numEltPerDmaStride = top.create<arith::ConstantIndexOp>(
        loc, dmaStrideInfos[0].numEltPerStride);
  }

auto postDomFilter = std::prev(end);

// Non-DMA case: generate point-wise copy nests.
      regionSymbols, ubMaps,
      regionSymbols, fastBufOffsets,

  copyNests.insert(copyNest);

  if (region.isWrite() && isCopyOutAtEndOfBlock)

// DMA case: allocate a tag buffer and emit dma_start/dma_wait pairs.
auto tagMemRef = prologue.create<memref::AllocOp>(loc, tagMemRefType);

  b.create<AffineDmaStartOp>(loc, memref, memAffineMap, memIndices,
                             fastMemRef, bufAffineMap, bufIndices,
                             tagMemRef, tagAffineMap, tagIndices,
                             numElementsSSA, dmaStride, numEltPerDmaStride);

  b.create<AffineDmaStartOp>(
      loc, fastMemRef, bufAffineMap, bufIndices, memref, memAffineMap,
      memIndices, tagMemRef, tagAffineMap, tagIndices, numElementsSSA,
      dmaStride, numEltPerDmaStride);

  if (isCopyOutAtEndOfBlock)

// Deallocate the tag and fast buffers once done.
auto tagDeallocOp = epilogue.create<memref::DeallocOp>(loc, tagMemRef);
if (*nEnd == end && isCopyOutAtEndOfBlock)

auto bufDeallocOp = epilogue.create<memref::DeallocOp>(loc, fastMemRef);
if (!copyOptions.generateDma && *nEnd == end && isCopyOutAtEndOfBlock)

// Remap all accesses in the range to the fast buffer, shifting indices by
// the region's lower-bound offsets.
remapExprs.reserve(rank);
for (unsigned i = 0; i < rank; i++) {
  remapExprs.push_back(dimExpr - fastBufOffsets[i]);
}
auto indexRemap = AffineMap::get(regionSymbols.size() + rank, 0, remapExprs,

// Record the position of the new begin for the caller.
bool isBeginAtStartOfBlock = (begin == block->begin());
if (!isBeginAtStartOfBlock)
  prevOfBegin = std::prev(begin);

*nBegin = isBeginAtStartOfBlock ? block->begin() : std::next(prevOfBegin);
/// Constructs the memref region to just include the entire memref.
if (auto loadOp = dyn_cast<AffineLoadOp>(op)) {
  rank = loadOp.getMemRefType().getRank();
  region->memref = loadOp.getMemRef();
} else if (auto storeOp = dyn_cast<AffineStoreOp>(op)) {
  rank = storeOp.getMemRefType().getRank();
  region->memref = storeOp.getMemRef();
} else {
  assert(false && "expected load or store op");
}

auto memRefType = cast<MemRefType>(region->memref.getType());
if (!memRefType.hasStaticShape())
  return false;

// Bound the region by the memref's shape: 0 <= d < dimSize along each dim.
ivs.resize(numParamLoopIVs);
regionCst->setValues(rank, rank + numParamLoopIVs, symbols);

for (unsigned d = 0; d < rank; d++) {
  auto dimSize = memRefType.getDimSize(d);
  assert(dimSize > 0 && "filtered dynamic shapes above");
  regionCst->addBound(BoundType::LB, d, 0);
  regionCst->addBound(BoundType::UB, d, dimSize - 1);
}
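// Minimal usage sketch (hypothetical driver code; the option values below
// are assumptions for illustration, not defaults from this file):
//
//   AffineCopyOptions options;
//   options.generateDma = false;      // emit point-wise copy loops, not DMAs
//   options.slowMemorySpace = 0;
//   options.fastMemorySpace = 1;
//   options.tagMemorySpace = 0;
//   options.fastMemCapacityBytes = 32 * 1024;
//   DenseSet<Operation *> copyNests;
//   for (auto forOp : f.getOps<AffineForOp>())
//     (void)affineDataCopyGenerate(forOp, options,
//                                  /*filterMemRef=*/std::nullopt, copyNests);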
/// Performs explicit copying for the contiguous sequence of operations in
/// the block iterator range ['begin', 'end').
LogicalResult affine::affineDataCopyGenerate(
    Block::iterator begin, Block::iterator end,
    const AffineCopyOptions &copyOptions, std::optional<Value> filterMemRef,
    DenseSet<Operation *> &copyNests) {
  assert(begin->getBlock() == std::prev(end)->getBlock() &&
         "Inconsistent block begin/end args");
  assert(end != end->getBlock()->end() && "end can't be the block terminator");

  Block *block = begin->getBlock();

  LLVM_DEBUG(llvm::dbgs() << "Generating copies at depth " << copyDepth
                          << "\n");
  LLVM_DEBUG(llvm::dbgs() << "from begin: " << *begin << "\n");
  LLVM_DEBUG(llvm::dbgs() << "to inclusive end: " << *std::prev(end) << "\n");

  // List of memory regions to copy for. A map vector is used to guarantee a
  // deterministic iteration order.
  SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4> readRegions;
  SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4> writeRegions;

    // Gather regions for each load/store, skipping ops on filtered memrefs
    // and memrefs already in the fast memory space.
    if (auto loadOp = dyn_cast<AffineLoadOp>(opInst)) {
      if ((filterMemRef.has_value() && filterMemRef != loadOp.getMemRef()) ||
          (loadOp.getMemRefType().getMemorySpaceAsInt() !=
    } else if (auto storeOp = dyn_cast<AffineStoreOp>(opInst)) {
      if ((filterMemRef.has_value() && filterMemRef != storeOp.getMemRef()) ||
          storeOp.getMemRefType().getMemorySpaceAsInt() !=

    // Compute the memref region symbolic in 'copyDepth' outer loop IVs.
    auto region = std::make_unique<MemRefRegion>(opInst->getLoc());
    if (failed(region->compute(opInst, copyDepth, nullptr,
      LLVM_DEBUG(llvm::dbgs()
                 << "Error obtaining memory region: semi-affine maps?\n");
      LLVM_DEBUG(llvm::dbgs() << "over-approximating to the entire memref\n");
        opInst->emitError("non-constant memref sizes not yet supported"));

    // Add the region to the existing one for this memref (if any), taking
    // the bounding box of their union.
        [&](const SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4>
          const auto *const it = targetRegions.find(region->memref);
          if (it == targetRegions.end())
            return false;

          if (failed(it->second->unionBoundingBox(*region))) {
            LLVM_DEBUG(llvm::dbgs()
                       << "Memory region bounding box failed; "
                          "over-approximating to the entire memref\n");
                "non-constant memref sizes not yet supported"));

            it->second->getConstraints()->clearAndCopyFrom(
                *region->getConstraints());
          } else {
            region->getConstraints()->clearAndCopyFrom(
                *it->second->getConstraints());
          }
          return true;

    bool existsInRead = updateRegion(readRegions);
    bool existsInWrite = updateRegion(writeRegions);

    if (region->isWrite() && !existsInWrite) {
      writeRegions[region->memref] = std::move(region);
    } else if (!region->isWrite() && !existsInRead) {
      readRegions[region->memref] = std::move(region);
    }

    LLVM_DEBUG(begin->emitError(
        "copy generation failed for one or more memref's in this block\n"));

  uint64_t totalCopyBuffersSizeInBytes = 0;
  auto processRegions =
      [&](const SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4>
        for (const auto &regionEntry : regions) {
          // Find the block and insertion points for the copies.
          Block *copyPlacementBlock;
              *regionEntry.second, *block, begin, end, &copyPlacementBlock,
              &copyInPlacementStart, &copyOutPlacementStart);

          uint64_t sizeInBytes;
              *regionEntry.second, block, begin, end, copyPlacementBlock,
              copyInPlacementStart, copyOutPlacementStart, copyOptions,
              fastBufferMap, copyNests, &sizeInBytes, &nBegin, &nEnd);

          totalCopyBuffersSizeInBytes += sizeInBytes;
        }
  processRegions(readRegions);
  processRegions(writeRegions);

    LLVM_DEBUG(begin->emitError(
        "copy generation failed for one or more memref's in this block\n"));

  if (llvm::DebugFlag && (forOp = dyn_cast<AffineForOp>(&*begin))) {
    LLVM_DEBUG(forOp.emitRemark()
               << llvm::divideCeil(totalCopyBuffersSizeInBytes, 1024)
               << " KiB of copy buffers in fast memory space for this block");
  }

    "total size of all copy buffers' for this block exceeds fast memory "
// Overload: generates copies for the body of `forOp`.
return affineDataCopyGenerate(forOp.getBody()->begin(),
                              std::prev(forOp.getBody()->end()), copyOptions,
                              filterMemRef, copyNests);
// generateCopyForMemRegion: similar to affineDataCopyGenerate, but works
// with a single memref region.
auto begin = analyzedOp->getIterator();
auto end = std::next(begin);

auto err = generateCopy(memrefRegion, block, begin, end, block, begin, end,
                        copyOptions, fastBufferMap, copyNests,

const auto &en = fastBufferMap.find(memrefRegion.memref);
// In some cases (e.g. empty loops), no copy would have been generated.
if (en == fastBufferMap.end())
  return failure();
result.alloc = en->second.getDefiningOp();
assert(result.alloc && "fast buffer expected to be locally allocated");
assert(copyNests.size() <= 1 && "At most one copy nest is expected.");
result.copyNest = copyNests.empty() ? nullptr : *copyNests.begin();
/// Gathers all AffineForOps in 'block' at 'currLoopDepth' in 'depthToLoops'.
assert(currLoopDepth <= depthToLoops.size() && "Unexpected currLoopDepth");
if (currLoopDepth == depthToLoops.size())
  depthToLoops.emplace_back();

for (auto &op : *block) {
  if (auto forOp = dyn_cast<AffineForOp>(op)) {
    depthToLoops[currLoopDepth].push_back(forOp);
/// Gathers all AffineForOps in 'func.func' grouped by loop depth.
for (auto &block : func)

// Remove the last (empty) level.
if (!depthToLoops.empty()) {
  assert(depthToLoops.back().empty() && "Last loop level is not empty?");
  depthToLoops.pop_back();
}
// createCanonicalizedAffineForOp: creates an AffineForOp while ensuring that
// the lower and upper bounds are canonicalized.
return b.create<AffineForOp>(loc, lowerOperands, lbMap, upperOperands, ubMap,
                             step);
/// Creates an AffineIfOp that encodes the conditional to choose between the
/// constant trip count version (full tiles) and an unknown trip count
/// version (partial tiles) of the nest.
auto *context = loops[0].getContext();

llvm::append_range(ops, loops);

for (auto loop : loops) {
  assert(loop.getStepAsInt() == 1 && "point loop step expected to be one");

  unsigned fullTileLbPos, fullTileUbPos;
      nullptr, &fullTileLbPos,
    LLVM_DEBUG(llvm::dbgs() << "Can't get constant diff pair for a loop\n");

  fullTileLb.assign(fLb.begin(), fLb.end());
  fullTileUb.assign(fUb.begin(), fUb.end());

  // Fix up the lower/upper bound inequalities in place to encode the
  // full-tile condition.
  for (auto lbIndex : lbIndices)
    for (unsigned i = 0, e = cst.getNumCols(); i < e; ++i)
      cst.atIneq(lbIndex, i) = fullTileLb[i] - cst.atIneq(lbIndex, i);

  for (auto ubIndex : ubIndices)
    for (unsigned i = 0, e = cst.getNumCols(); i < e; ++i)
      cst.atIneq(ubIndex, i) -= fullTileUb[i];
}

return b.create<AffineIfOp>(loops[0].getLoc(), ifCondSet, setOperands,
                            /*withElseRegion=*/true);
/// Creates the full tile loop nest (along with its body).
fullTileLoops.reserve(inputNest.size());

// For each loop in the original nest, create a constant trip count clone.
for (auto loop : inputNest) {
  // Only unit strides are handled for now.
  if (loop.getStepAsInt() != 1) {
    LLVM_DEBUG(llvm::dbgs()
               << "[tile separation] non-unit stride not implemented\n");
  }

  unsigned lbPos, ubPos;
      nullptr, &lbPos, &ubPos) ||
    LLVM_DEBUG(llvm::dbgs() << "[tile separation] Can't get constant diff / "
                               "equalities not yet handled\n");

  fullTileLoops.push_back(fullTileLoop);
}

// Add the body, with the original IVs remapped to the full-tile loop IVs.
operandMap.map(loopEn.value().getInductionVar(),
               fullTileLoops[loopEn.index()].getInductionVar());

for (auto &op : inputNest.back().getBody()->without_terminator())
  b.clone(op, operandMap);
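// Resulting structure (illustrative): separateFullTiles() below wraps the
// nest in the affine.if built by createSeparationCondition(), placing the
// constant trip count copy in the then-block and the original nest in the
// else-block:
//   affine.if #full_tile_cond(...) { <full-tile nest> }
//   else { <original, partial-tile nest> }
// where #full_tile_cond is a placeholder name for the generated integer set.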
/// Separates full tiles from partial tiles for the perfect nest `inputNest`
/// by generating a conditional guard that selects between the full-tile
/// version and the partial-tile version.
if (inputNest.empty())

// Check that the loops are contiguously nested.
auto firstLoop = inputNest[0];

auto prevLoop = firstLoop;
for (auto loop : inputNest.drop_front(1)) {
  assert(loop->getParentOp() == prevLoop && "input not contiguously nested");
  prevLoop = loop;
}

  if (!fullTileLoops.empty())
    fullTileLoops.front().erase();

  fullTileLoops.front().erase();
  LLVM_DEBUG(llvm::dbgs() << "All tiles are full tiles, or failure creating "
                             "separation condition\n");

// Move the full-tile nest into the then-block of the guard and the original
// (partial-tile) nest into the else-block.
Block *thenBlock = ifOp.getThenBlock();
AffineForOp outermostFullTileLoop = fullTileLoops[0];
thenBlock->getOperations().splice(
    std::prev(thenBlock->end()),
    outermostFullTileLoop->getBlock()->getOperations(),
    Block::iterator(outermostFullTileLoop));

Block *elseBlock = ifOp.getElseBlock();
elseBlock->getOperations().splice(std::prev(elseBlock->end()),
                                  firstLoop->getBlock()->getOperations(),
                                  Block::iterator(firstLoop));

if (fullTileNest)
  *fullTileNest = std::move(fullTileLoops);
static LogicalResult performPreTilingChecks(MutableArrayRef< AffineForOp > input, ArrayRef< t > tileSizes)
Check if the input nest is supported for tiling and whether tiling would be legal or not.
static void constructParametricallyTiledIndexSetHyperRect(MutableArrayRef< AffineForOp > origLoops, MutableArrayRef< AffineForOp > newLoops, ArrayRef< Value > tileSizes)
Constructs and sets new loop bounds after tiling for the case of hyper-rectangular index sets,...
static void constructTiledLoopNest(MutableArrayRef< AffineForOp > origLoops, AffineForOp rootAffineForOp, unsigned width, MutableArrayRef< AffineForOp > tiledLoops)
Constructs tiled loop nest, without setting the loop bounds and move the body of the original loop ne...
static bool getFullMemRefAsRegion(Operation *op, unsigned numParamLoopIVs, MemRefRegion *region)
Construct the memref region to just include the entire memref.
static SmallVector< AffineForOp, 8 > stripmineSink(AffineForOp forOp, uint64_t factor, ArrayRef< AffineForOp > targets)
static InFlightDiagnostic LLVM_ATTRIBUTE_UNUSED emitRemarkForBlock(Block &block)
static LogicalResult checkIfHyperRectangular(MutableArrayRef< AffineForOp > input)
Checks whether a loop nest is hyper-rectangular or not.
static void findHighestBlockForPlacement(const MemRefRegion ®ion, Block &block, Block::iterator &begin, Block::iterator &end, Block **copyPlacementBlock, Block::iterator *copyInPlacementStart, Block::iterator *copyOutPlacementStart)
Given a memref region, determine the lowest depth at which transfers can be placed for it,...
static AffineForOp generateShiftedLoop(AffineMap lbMap, AffineMap ubMap, const std::vector< std::pair< uint64_t, ArrayRef< Operation * >>> &opGroupQueue, unsigned offset, AffineForOp srcForOp, OpBuilder b)
Generates an affine.for op with the specified lower and upper bounds while generating the right IV re...
static void moveLoopBodyImpl(AffineForOp src, AffineForOp dest, Block::iterator loc)
Move the loop body of AffineForOp 'src' from 'src' into the specified location in destination's body,...
static void setInterTileBoundsParametric(OpBuilder &b, AffineForOp origLoop, AffineForOp newLoop, Value tileSize)
Set lower and upper bounds of inter-tile loops for parametric tiling.
static void setIntraTileBoundsParametric(OpBuilder &b, AffineForOp origLoop, AffineForOp newInterTileLoop, AffineForOp newIntraTileLoop, Value tileSize)
Set lower and upper bounds of intra-tile loops for parametric tiling.
static void gatherLoopsInBlock(Block *block, unsigned currLoopDepth, std::vector< SmallVector< AffineForOp, 2 >> &depthToLoops)
Gathers all AffineForOps in 'block' at 'currLoopDepth' in 'depthToLoops'.
static void generateUnrolledLoop(Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor, function_ref< Value(unsigned, Value, OpBuilder)> ivRemapFn, function_ref< void(unsigned, Operation *, OpBuilder)> annotateFn, ValueRange iterArgs, ValueRange yieldedValues)
Generates unrolled copies of AffineForOp 'loopBodyBlock', with associated 'forOpIV' by 'unrollFactor'...
static LogicalResult generateCopy(const MemRefRegion ®ion, Block *block, Block::iterator begin, Block::iterator end, Block *copyPlacementBlock, Block::iterator copyInPlacementStart, Block::iterator copyOutPlacementStart, const AffineCopyOptions ©Options, DenseMap< Value, Value > &fastBufferMap, DenseSet< Operation * > ©Nests, uint64_t *sizeInBytes, Block::iterator *nBegin, Block::iterator *nEnd)
Creates a buffer in the faster memory space for the specified memref region; generates a copy from th...
static LogicalResult createFullTiles(MutableArrayRef< AffineForOp > inputNest, SmallVectorImpl< AffineForOp > &fullTileLoops, OpBuilder b)
Create the full tile loop nest (along with its body).
static void getMultiLevelStrides(const MemRefRegion ®ion, ArrayRef< int64_t > bufferShape, SmallVectorImpl< StrideInfo > *strideInfos)
Returns striding information for a copy/transfer of this region with potentially multiple striding le...
static void constructTiledIndexSetHyperRect(MutableArrayRef< AffineForOp > origLoops, MutableArrayRef< AffineForOp > newLoops, ArrayRef< unsigned > tileSizes)
Constructs and sets new loop bounds after tiling for the case of hyper-rectangular index sets,...
static LogicalResult generateCleanupLoopForUnroll(AffineForOp forOp, uint64_t unrollFactor)
Helper to generate cleanup loop for unroll or unroll-and-jam when the trip count is not a multiple of...
static bool areInnerBoundsInvariant(AffineForOp forOp)
Check if all control operands of all loops are defined outside of forOp and return false if not.
static bool checkLoopInterchangeDependences(const std::vector< SmallVector< DependenceComponent, 2 >> &depCompsVec, ArrayRef< AffineForOp > loops, ArrayRef< unsigned > loopPermMap)
static void moveLoopBody(AffineForOp src, AffineForOp dest)
Move the loop body of AffineForOp 'src' from 'src' to the start of dest body.
static AffineIfOp createSeparationCondition(MutableArrayRef< AffineForOp > loops, OpBuilder b)
Creates an AffineIfOp that encodes the conditional to choose between the constant trip count version ...
static void getCleanupLoopLowerBound(AffineForOp forOp, unsigned unrollFactor, AffineMap &cleanupLbMap, SmallVectorImpl< Value > &cleanupLbOperands)
Computes the cleanup loop lower bound of the loop being unrolled with the specified unroll factor; th...
static void augmentMapAndBounds(OpBuilder &b, Value iv, AffineMap *map, SmallVector< Value, 4 > *operands, int64_t offset=0)
static AffineForOp generatePointWiseCopy(Location loc, Value memref, Value fastMemRef, ArrayRef< AffineMap > lbMaps, ArrayRef< Value > lbOperands, ArrayRef< AffineMap > ubMaps, ArrayRef< Value > ubOperands, ArrayRef< AffineExpr > fastBufOffsets, bool isCopyOut, OpBuilder b)
Generates a point-wise copy from/to ‘memref’ to/from ‘fastMemRef’ and returns the outermost AffineFor...
static void replaceIterArgsAndYieldResults(AffineForOp forOp)
Helper to replace uses of loop carried values (iter_args) and loop yield values while promoting singl...
static Value max(ImplicitLocOpBuilder &builder, Value value, Value bound)
An integer constant appearing in affine expression.
Base type for affine expression.
AffineExpr floorDiv(uint64_t v) const
A multi-dimensional affine map Affine map's are immutable like Type's, and they are uniqued.
static AffineMap get(MLIRContext *context)
Returns a zero result affine map with no dimensions or symbols: () -> ().
unsigned getNumSymbols() const
unsigned getNumDims() const
ArrayRef< AffineExpr > getResults() const
unsigned getNumResults() const
unsigned getNumInputs() const
AffineExpr getResult(unsigned idx) const
Block represents an ordered list of Operations.
OpListType::iterator iterator
Region * getParent() const
Provide a 'getParent' method for ilist_node_with_parent methods.
Operation * getTerminator()
Get the terminator operation of this block.
RetT walk(FnT &&callback)
Walk the operations in this block.
OpListType & getOperations()
Operation * getParentOp()
Returns the closest surrounding operation that contains this block.
AffineMap getSingleDimShiftAffineMap(int64_t shift)
Returns a map that shifts its (single) input dimension by 'shift'.
AffineMap getShiftedAffineMap(AffineMap map, int64_t shift)
Returns an affine map that is a translation (shift) of all result expressions in 'map' by 'shift'.
AffineMap getDimIdentityMap()
AffineMap getMultiDimIdentityMap(unsigned rank)
AffineExpr getAffineSymbolExpr(unsigned position)
AffineExpr getAffineConstantExpr(int64_t constant)
AffineExpr getAffineDimExpr(unsigned position)
MLIRContext * getContext() const
IntegerSet getAsIntegerSet(MLIRContext *context) const
Returns the constraint system as an integer set.
void getValues(unsigned start, unsigned end, SmallVectorImpl< Value > *values) const
Returns the Values associated with variables in range [start, end).
This class allows control over how the GreedyPatternRewriteDriver works.
GreedyRewriteStrictness strictMode
Strict mode can restrict the ops that are added to the worklist during the rewrite.
This is a utility class for mapping one set of IR entities to another.
auto lookup(T from) const
Lookup a mapped value within the map.
void map(Value from, Value to)
Inserts a new mapping for 'from' to 'to'.
This class coordinates rewriting a piece of IR outside of a pattern rewrite, providing a way to keep ...
This class represents a diagnostic that is inflight and set to be reported.
An integer set representing a conjunction of one or more affine equalities and inequalities.
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
This class helps build Operations.
Operation * clone(Operation &op, IRMapping &mapper)
Creates a deep copy of the specified operation, remapping any operands that use values outside of the...
void setInsertionPointToStart(Block *block)
Sets the insertion point to the start of the specified block.
static OpBuilder atBlockTerminator(Block *block, Listener *listener=nullptr)
Create a builder and set the insertion point to before the block terminator.
Operation * create(const OperationState &state)
Creates an operation given the fields represented as an OperationState.
void setInsertionPointAfter(Operation *op)
Sets the insertion point to the node after the specified operation, which will cause subsequent inser...
Operation * getOperation()
Inherit getOperation from OpState.
This class implements the operand iterators for the Operation class.
Operation is the basic unit of execution within MLIR.
InFlightDiagnostic emitWarning(const Twine &message={})
Emit a warning about this operation, reporting up to any diagnostic handlers that may be listening.
Operation * clone(IRMapping &mapper, CloneOptions options=CloneOptions::all())
Create a deep copy of this operation, remapping any operands that use values outside of the operation...
Location getLoc()
The source location the operation was defined or derived from.
InFlightDiagnostic emitError(const Twine &message={})
Emit an error about fatal conditions with this operation, reporting up to any diagnostic handlers tha...
Block * getBlock()
Returns the operation block that contains this operation.
MutableArrayRef< Region > getRegions()
Returns the regions held by this operation.
void replaceAllUsesWith(ValuesT &&values)
Replace all uses of results of this operation with the provided 'values'.
void setOperands(ValueRange operands)
Replace the current operands of this operation with the ones provided in 'operands'.
InFlightDiagnostic emitRemark(const Twine &message={})
Emit a remark about this operation, reporting up to any diagnostic handlers that may be listening.
unsigned getNumResults()
Return the number of results held by this operation.
ParentT getParentOfType()
Find the first parent operation of the given type, or nullptr if there is no ancestor operation.
MLIRContext * getContext() const
This class provides an abstraction over the different types of ranges over Values.
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
bool use_empty() const
Returns true if this value has no uses.
Type getType() const
Return the type of this value.
Operation * getDefiningOp() const
If this value is the result of an operation, return the operation that defines it.
static WalkResult advance()
AffineBound represents a lower or upper bound in the for operation.
Value getOperand(unsigned idx)
operand_range getOperands()
unsigned getNumOperands()
AffineDmaStartOp starts a non-blocking DMA operation that transfers data from a source memref to a de...
AffineDmaWaitOp blocks until the completion of a DMA operation associated with the tag element 'tag[i...
An AffineValueMap is an affine map plus its ML value operands and results for analysis purposes.
ArrayRef< Value > getOperands() const
AffineMap getAffineMap() const
FlatAffineValueConstraints is an extension of FlatLinearValueConstraints with helper functions for Af...
void getIneqAsAffineValueMap(unsigned pos, unsigned ineqPos, AffineValueMap &vmap, MLIRContext *context) const
Returns the bound for the variable at pos from the inequality at ineqPos as a 1-d affine value map (a...
Specialization of arith.constant op that returns an integer of index type.
void removeIndependentConstraints(unsigned pos, unsigned num)
Removes constraints that are independent of (i.e., do not have a coefficient) variables in the range ...
void removeTrivialRedundancy()
Removes duplicate constraints, trivially true constraints, and constraints that can be detected as re...
unsigned getNumSymbolVars() const
bool isHyperRectangular(unsigned pos, unsigned num) const
Returns true if the set can be trivially detected as being hyper-rectangular on the specified contigu...
ArrayRef< MPInt > getInequality(unsigned idx) const
MPInt atIneq(unsigned i, unsigned j) const
Returns the value at the specified inequality row and column.
unsigned getNumVars() const
void setDimSymbolSeparation(unsigned newSymbolCount)
Changes the partition between dimensions and symbols.
unsigned getNumDimAndSymbolVars() const
std::optional< MPInt > getConstantBoundOnDimSize(unsigned pos, SmallVectorImpl< MPInt > *lb=nullptr, MPInt *boundFloorDivisor=nullptr, SmallVectorImpl< MPInt > *ub=nullptr, unsigned *minLbPos=nullptr, unsigned *minUbPos=nullptr) const
Returns the smallest known constant bound for the extent of the specified variable (pos^th),...
unsigned getNumCols() const
Returns the number of columns in the constraint system.
void getLowerAndUpperBoundIndices(unsigned pos, SmallVectorImpl< unsigned > *lbIndices, SmallVectorImpl< unsigned > *ubIndices, SmallVectorImpl< unsigned > *eqIndices=nullptr, unsigned offset=0, unsigned num=0) const
Gather positions of all lower and upper bounds of the variable at pos, and optionally any equalities ...
void removeVar(VarKind kind, unsigned pos)
Removes variables of the specified kind with the specified pos (or within the specified range) from t...
unsigned getNumDimVars() const
bool isParallelLoop(Operation &op)
std::optional< uint64_t > getConstantTripCount(AffineForOp forOp)
Returns the trip count of the loop if it's a constant, std::nullopt otherwise.
void getDependenceComponents(AffineForOp forOp, unsigned maxLoopDepth, std::vector< SmallVector< DependenceComponent, 2 >> *depCompsVec)
Returns in 'depCompsVec', dependence components for dependences between all load and store ops in loo...
LogicalResult coalesceLoops(MutableArrayRef< AffineForOp > loops)
Replace a perfect nest of "for" loops with a single linearized loop.
void fullyComposeAffineMapAndOperands(AffineMap *map, SmallVectorImpl< Value > *operands)
Given an affine map map and its input operands, this method composes into map, maps of AffineApplyOps...
LogicalResult loopUnrollFull(AffineForOp forOp)
Unrolls this for operation completely if the trip count is known to be constant.
LogicalResult promoteIfSingleIteration(AffineForOp forOp)
Promotes the loop body of a AffineForOp to its containing block if the loop was known to have a singl...
LogicalResult affineDataCopyGenerate(Block::iterator begin, Block::iterator end, const AffineCopyOptions ©Options, std::optional< Value > filterMemRef, DenseSet< Operation * > ©Nests)
Performs explicit copying for the contiguous sequence of operations in the block iterator range [‘beg...
LogicalResult loopUnrollJamUpToFactor(AffineForOp forOp, uint64_t unrollJamFactor)
Unrolls and jams this loop by the specified factor or by the trip count (if constant),...
void extractForInductionVars(ArrayRef< AffineForOp > forInsts, SmallVectorImpl< Value > *ivs)
Extracts the induction variables from a list of AffineForOps and places them in the output argument i...
LogicalResult loopUnrollByFactor(AffineForOp forOp, uint64_t unrollFactor, function_ref< void(unsigned, Operation *, OpBuilder)> annotateFn=nullptr, bool cleanUpUnroll=false)
Unrolls this for operation by the specified unroll factor.
void gatherLoops(func::FuncOp func, std::vector< SmallVector< AffineForOp, 2 >> &depthToLoops)
Gathers all AffineForOps in 'func.func' grouped by loop depth.
bool LLVM_ATTRIBUTE_UNUSED isPerfectlyNested(ArrayRef< AffineForOp > loops)
Returns true if loops is a perfectly nested loop nest, where loops appear in it from outermost to inn...
LogicalResult getIndexSet(MutableArrayRef< Operation * > ops, FlatAffineValueConstraints *domain)
Builds a system of constraints with dimensional variables corresponding to the loop IVs of the forOps...
AffineForOp createCanonicalizedAffineForOp(OpBuilder b, Location loc, ValueRange lbOperands, AffineMap lbMap, ValueRange ubOperands, AffineMap ubMap, int64_t step=1)
Creates an AffineForOp while ensuring that the lower and upper bounds are canonicalized,...
void getPerfectlyNestedLoops(SmallVectorImpl< AffineForOp > &nestedLoops, AffineForOp root)
Get perfectly nested sequence of loops starting at root of loop nest (the first op being another Affi...
LogicalResult affineForOpBodySkew(AffineForOp forOp, ArrayRef< uint64_t > shifts, bool unrollPrologueEpilogue=false)
Skew the operations in an affine.for's body with the specified operation-wise shifts.
unsigned permuteLoops(MutableArrayRef< AffineForOp > inputNest, ArrayRef< unsigned > permMap)
Performs a loop permutation on a perfectly nested loop nest inputNest (where the contained loops appe...
void getTripCountMapAndOperands(AffineForOp forOp, AffineMap *map, SmallVectorImpl< Value > *operands)
Returns the trip count of the loop as an affine map with its corresponding operands if the latter is ...
bool isValidLoopInterchangePermutation(ArrayRef< AffineForOp > loops, ArrayRef< unsigned > loopPermMap)
Checks if the loop interchange permutation 'loopPermMap', of the perfectly nested sequence of loops i...
void getSupportedReductions(AffineForOp forOp, SmallVectorImpl< LoopReduction > &supportedReductions)
Populate supportedReductions with descriptors of the supported reductions.
LogicalResult generateCopyForMemRegion(const MemRefRegion &memrefRegion, Operation *analyzedOp, const AffineCopyOptions &copyOptions, CopyGenerateResult &result)
generateCopyForMemRegion is similar to affineDataCopyGenerate, but works with a single memref region.
void canonicalizeMapAndOperands(AffineMap *map, SmallVectorImpl< Value > *operands)
Modifies both map and operands in-place so as to: (1) drop duplicate operands, (2) drop unused dims and symbols from map, (3) promote valid symbols to symbolic operands where they appear as dimensional operands, and (4) propagate constant operands and drop them.
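A sketch of the common compose-then-canonicalize pattern before materializing an affine.apply ('buildCanonicalApply' is a hypothetical helper):

    static Value buildCanonicalApply(OpBuilder &b, Location loc, AffineMap map,
                                     SmallVector<Value> operands) {
      // Fold any producing affine.apply ops into 'map' first, then drop
      // duplicate/unused operands and fold constants.
      affine::fullyComposeAffineMapAndOperands(&map, &operands);
      affine::canonicalizeMapAndOperands(&map, &operands);
      return b.create<affine::AffineApplyOp>(loc, map, operands).getResult();
    }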
LogicalResult loopUnrollUpToFactor(AffineForOp forOp, uint64_t unrollFactor)
Unrolls this loop by the specified unroll factor or its trip count, whichever is lower.
LogicalResult loopUnrollJamByFactor(AffineForOp forOp, uint64_t unrollJamFactor)
Unrolls and jams this loop by the specified factor.
LogicalResult tilePerfectlyNestedParametric(MutableArrayRef< AffineForOp > input, ArrayRef< Value > tileSizes, SmallVectorImpl< AffineForOp > *tiledNest=nullptr)
Tiles the specified band of perfectly nested loops creating tile-space loops and intra-tile loops, using SSA values as tiling parameters.
void canonicalizeSetAndOperands(IntegerSet *set, SmallVectorImpl< Value > *operands)
Canonicalizes an integer set the same way canonicalizeMapAndOperands does for affine maps.
void getAffineForIVs(Operation &op, SmallVectorImpl< AffineForOp > *loops)
Populates 'loops' with IVs of the affine.for ops surrounding 'op' ordered from the outermost 'affine.for' to the innermost one.
uint64_t getLargestDivisorOfTripCount(AffineForOp forOp)
Returns the greatest known integral divisor of the trip count.
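One use is picking an unroll-jam factor that needs no cleanup loop (a sketch; the power-of-two cap of 8 is an assumption):

    #include <numeric> // std::gcd

    static LogicalResult unrollJamEvenly(affine::AffineForOp forOp) {
      uint64_t divisor = affine::getLargestDivisorOfTripCount(forOp);
      // gcd with 8 gives the largest power of two <= 8 dividing the trip
      // count; such a factor leaves no cleanup iterations.
      uint64_t factor = std::gcd(divisor, uint64_t(8));
      if (factor < 2)
        return failure();
      return affine::loopUnrollJamByFactor(forOp, factor);
    }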
std::optional< uint64_t > getIntOrFloatMemRefSizeInBytes(MemRefType memRefType)
Returns the size of a memref with element type int or float in bytes if it's statically shaped, and std::nullopt otherwise.
unsigned getNestingDepth(Operation *op)
Returns the nesting depth of this operation, i.e., the number of loops surrounding this operation.
void mapLoopToProcessorIds(scf::ForOp forOp, ArrayRef< Value > processorId, ArrayRef< Value > numProcessors)
Maps forOp for execution on a parallel grid of virtual processorIds of size given by numProcessors.
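A sketch (the processor-id Values are assumed to be materialized elsewhere, e.g., from gpu.block_id and gpu.grid_dim):

    static void distributeOnGrid(scf::ForOp forOp, Value procId,
                                 Value numProcs) {
      // Embeds the ids into the bounds and step so that processor p runs
      // iterations p, p + numProcs, p + 2 * numProcs, ... (cyclic).
      affine::mapLoopToProcessorIds(forOp, {procId}, {numProcs});
    }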
bool isOpwiseShiftValid(AffineForOp forOp, ArrayRef< uint64_t > shifts)
Checks whether SSA dominance would be violated if a for op's body operations are shifted by the specified shifts.
SmallVector< SmallVector< AffineForOp, 8 >, 8 > tile(ArrayRef< AffineForOp > forOps, ArrayRef< uint64_t > sizes, ArrayRef< AffineForOp > targets)
Performs tiling of imperfectly nested loops (with interchange) by strip-mining the forOps by sizes and sinking them, in their order of occurrence in forOps, under each of the targets.
AffineForOp sinkSequentialLoops(AffineForOp forOp)
LogicalResult tilePerfectlyNested(MutableArrayRef< AffineForOp > input, ArrayRef< unsigned > tileSizes, SmallVectorImpl< AffineForOp > *tiledNest=nullptr)
Tiles the specified band of perfectly nested loops creating tile-space loops and intra-tile loops.
void interchangeLoops(AffineForOp forOpA, AffineForOp forOpB)
Performs loop interchange on 'forOpA' and 'forOpB'.
void getTileableBands(func::FuncOp f, std::vector< SmallVector< AffineForOp, 6 >> *bands)
Identify valid and profitable bands of loops to tile.
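Together with tilePerfectlyNested above, this enables the usual tiling flow (a sketch; the uniform tile size of 32 is an assumption):

    static void tileAllBands(func::FuncOp f) {
      std::vector<SmallVector<affine::AffineForOp, 6>> bands;
      affine::getTileableBands(f, &bands);
      for (auto &band : bands) {
        SmallVector<unsigned, 6> tileSizes(band.size(), 32);
        (void)affine::tilePerfectlyNested(band, tileSizes); // may fail
      }
    }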
LogicalResult replaceAllMemRefUsesWith(Value oldMemRef, Value newMemRef, ArrayRef< Value > extraIndices={}, AffineMap indexRemap=AffineMap(), ArrayRef< Value > extraOperands={}, ArrayRef< Value > symbolOperands={}, Operation *domOpFilter=nullptr, Operation *postDomOpFilter=nullptr, bool allowNonDereferencingOps=false, bool replaceInDeallocOp=false)
Replaces all "dereferencing" uses of oldMemRef with newMemRef while optionally remapping the old memr...
LogicalResult separateFullTiles(MutableArrayRef< AffineForOp > nest, SmallVectorImpl< AffineForOp > *fullTileNest=nullptr)
Separates full tiles from partial tiles for a perfect nest nest by generating a conditional guard that selects between the full tile version and the partial tile version using an AffineIfOp.
Value getReductionOp(AtomicRMWKind op, OpBuilder &builder, Location loc, Value lhs, Value rhs)
Returns the value obtained by applying the reduction operation kind associated with a binary AtomicRMWKind op to 'lhs' and 'rhs'.
void walk(Operation *op, function_ref< void(Region *)> callback, WalkOrder order)
Walk all of the regions, blocks, or operations nested under (and including) the given operation.
constexpr void enumerate(std::tuple< Tys... > &tuple, CallbackT &&callback)
This header declares functions that assist transformations in the MemRef dialect.
AffineMap simplifyAffineMap(AffineMap map)
Simplifies an affine map by simplifying its underlying AffineExpr results.
LogicalResult failure(bool isFailure=true)
Utility function to generate a LogicalResult.
LogicalResult applyOpPatternsAndFold(ArrayRef< Operation * > ops, const FrozenRewritePatternSet &patterns, GreedyRewriteConfig config=GreedyRewriteConfig(), bool *changed=nullptr, bool *allErased=nullptr)
Applies the specified rewrite patterns on ops while also trying to fold these ops.
void replaceAllUsesInRegionWith(Value orig, Value replacement, Region &region)
Replace all uses of orig within the given region with replacement.
AffineMap removeDuplicateExprs(AffineMap map)
Returns a map with the same dimension and symbol count as map, but whose results are the unique affine expressions of map.
int64_t floorDiv(int64_t lhs, int64_t rhs)
Returns the result of MLIR's floordiv operation on constants.
int64_t ceilDiv(int64_t lhs, int64_t rhs)
Returns the result of MLIR's ceildiv operation on constants.
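Unlike C++'s truncating integer division, these round toward negative and positive infinity respectively; for example:

    int64_t a = floorDiv(-7, 2); // -4 (C++'s -7 / 2 yields -3)
    int64_t b = ceilDiv(-7, 2);  // -3
    int64_t c = ceilDiv(7, 2);   // 4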
bool succeeded(LogicalResult result)
Utility function that returns true if the provided LogicalResult corresponds to a success value.
LogicalResult success(bool isSuccess=true)
Utility function to generate a LogicalResult.
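The idiomatic pattern for composing these utilities (a sketch):

    static LogicalResult promoteOrFail(affine::AffineForOp forOp) {
      // LogicalResult carries no payload: it is produced with
      // success()/failure() and queried with succeeded()/failed().
      if (failed(affine::promoteIfSingleIteration(forOp)))
        return failure();
      return success();
    }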
void bindSymbols(MLIRContext *ctx, AffineExprTy &...exprs)
Bind a list of AffineExpr references to SymbolExpr at positions: [0 .. sizeof...(exprs)].
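For example, to build the single-result map (d0)[s0, s1] -> (d0 * s0 + s1) (a sketch):

    static AffineMap buildScaleOffsetMap(MLIRContext *ctx) {
      AffineExpr s0, s1;
      bindSymbols(ctx, s0, s1); // s0 -> symbol 0, s1 -> symbol 1
      AffineExpr d0 = getAffineDimExpr(0, ctx);
      return AffineMap::get(/*dimCount=*/1, /*symbolCount=*/2, d0 * s0 + s1);
    }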
auto get(MLIRContext *context, Ts &&...params)
Helper method that injects context only if needed; this helps unify some of the attribute construction methods.
@ ExistingOps
Only pre-existing ops are processed.
bool failed(LogicalResult result)
Utility function that returns true if the provided LogicalResult corresponds to a failure value.
std::vector< std::pair< Block::iterator, Block::iterator > > subBlocks
This class represents an efficient way to signal success or failure.
Explicit copy / DMA generation options for mlir::affineDataCopyGenerate.
uint64_t fastMemCapacityBytes
Result for calling generateCopyForMemRegion.
std::optional< int64_t > ub
std::optional< int64_t > lb
A description of a (parallelizable) reduction in an affine loop.
A region of a memref's data space; this is typically constructed by analyzing load/store ops on this memref.
FlatAffineValueConstraints * getConstraints()
std::optional< int64_t > getConstantBoundingSizeAndShape(SmallVectorImpl< int64_t > *shape=nullptr, std::vector< SmallVector< int64_t, 4 >> *lbs=nullptr, SmallVectorImpl< int64_t > *lbDivisors=nullptr) const
Returns a constant upper bound on the number of elements in this region if bounded by a known constant amount, std::nullopt otherwise.
void getLowerAndUpperBound(unsigned pos, AffineMap &lbMap, AffineMap &ubMap) const
Gets the lower and upper bound map for the dimensional variable at pos.
Value memref
Memref that this region corresponds to.
Location loc
If there is more than one load/store op associated with the region, the location information would correspond to one of those ops.
Eliminates variable at the specified position using Fourier-Motzkin variable elimination.