#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

#define DEBUG_TYPE "loop-utils"

using namespace affine;
using namespace presburger;
using llvm::SmallMapVector;
// Computes the cleanup loop lower bound for the loop being unrolled with the
// specified unroll factor.
auto lbMap = forOp.getLowerBoundMap();
auto lb = b.create<AffineApplyOp>(forOp.getLoc(), lbMap,
                                  forOp.getLowerBoundOperands());

// For each result of the trip count map, compute how far the unrolled loop
// advances: (tripCount - tripCount % unrollFactor) * step.
int64_t step = forOp.getStepAsInt();
SmallVector<AffineExpr, 4> bumpExprs(tripCountMap.getNumResults());
for (unsigned i = 0, e = tripCountMap.getNumResults(); i < e; i++) {
  auto tripCountExpr = tripCountMap.getResult(i);
  bumpExprs[i] = (tripCountExpr - tripCountExpr % unrollFactor) * step;
}

// Materialize the bumps with affine.apply ops.
SmallVector<Value, 4> bumpValues(tripCountMap.getNumResults());
for (unsigned i = 0, e = tripCountMap.getNumResults(); i < e; i++) {
  auto bumpMap = AffineMap::get(tripCountMap.getNumInputs(), 0, bumpExprs[i]);
  bumpValues[i] =
      b.create<AffineApplyOp>(forOp.getLoc(), bumpMap, tripCountOperands);
}

// Build the cleanup lower bound map: lb + bump for each bump expression.
SmallVector<AffineExpr, 4> newUbExprs(tripCountMap.getNumResults());
for (unsigned i = 0, e = bumpExprs.size(); i < e; i++)
  newUbExprs[i] = b.getAffineDimExpr(0) + b.getAffineDimExpr(i + 1);

cleanupLbOperands.clear();
cleanupLbOperands.push_back(lb);
cleanupLbOperands.append(bumpValues.begin(), bumpValues.end());

// Erase any bump values that ended up unused.
for (auto v : bumpValues)
  if (v.use_empty())
    v.getDefiningOp()->erase();
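// Illustration of the arithmetic above: for a loop with lower bound lb,
// step s, trip count tc, and unroll factor u, the cleanup loop starts at
//   lb + (tc - tc mod u) * s
// i.e. right after the largest multiple of u iterations that the unrolled
// loop covers.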
// Replace uses of loop-carried values (iter_args) with the initial values,
// and uses of the loop results with the values yielded from the body.
auto iterOperands = forOp.getInits();
auto iterArgs = forOp.getRegionIterArgs();
for (auto e : llvm::zip(iterOperands, iterArgs))
  std::get<1>(e).replaceAllUsesWith(std::get<0>(e));

auto outerResults = forOp.getResults();
auto innerResults = forOp.getBody()->getTerminator()->getOperands();
for (auto e : llvm::zip(outerResults, innerResults))
  std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
if (!tripCount || *tripCount != 1)
  return failure();

// This is a single-iteration loop if the lower bound map has a single result.
if (forOp.getLowerBoundMap().getNumResults() != 1)
  return failure();

// Replace all uses of the induction variable with its single-iteration value.
auto iv = forOp.getInductionVar();
auto *parentBlock = forOp->getBlock();
if (!iv.use_empty()) {
  if (forOp.hasConstantLowerBound()) {
    auto func = forOp->getParentOfType<FunctionOpInterface>();
    OpBuilder builder(forOp->getContext());
    if (func)
      builder.setInsertionPointToStart(&func.getFunctionBody().front());
    else
      builder.setInsertionPoint(forOp);
    auto constOp = builder.create<arith::ConstantIndexOp>(
        forOp.getLoc(), forOp.getConstantLowerBound());
    iv.replaceAllUsesWith(constOp);
  } else {
    auto lbOperands = forOp.getLowerBoundOperands();
    auto lbMap = forOp.getLowerBoundMap();
    OpBuilder builder(forOp);
    if (lbMap == builder.getDimIdentityMap()) {
      // No affine.apply needed; the IV equals the single operand.
      iv.replaceAllUsesWith(lbOperands[0]);
    } else {
      auto affineApplyOp =
          builder.create<AffineApplyOp>(forOp.getLoc(), lbMap, lbOperands);
      iv.replaceAllUsesWith(affineApplyOp);
    }
  }
}

// Move the loop body operations, except for the terminator, to the loop's
// containing block.
forOp.getBody()->back().erase();
parentBlock->getOperations().splice(Block::iterator(forOp),
                                    forOp.getBody()->getOperations());
forOp.erase();
return success();
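// Illustration: affine.for %i = 5 to 6 has a single iteration; its body is
// promoted into the enclosing block with %i replaced by constant 5 (a sketch
// of the effect of promoteIfSingleIteration).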
/// Generates an affine.for op with the specified lower and upper bounds while
/// generating the right IV remappings for the shifted operations.
static AffineForOp generateShiftedLoop(
    AffineMap lbMap, AffineMap ubMap,
    const std::vector<std::pair<uint64_t, ArrayRef<Operation *>>> &opGroupQueue,
    unsigned offset, AffineForOp srcForOp, OpBuilder b) {
  auto lbOperands = srcForOp.getLowerBoundOperands();
  auto ubOperands = srcForOp.getUpperBoundOperands();

  auto loopChunk =
      b.create<AffineForOp>(srcForOp.getLoc(), lbOperands, lbMap, ubOperands,
                            ubMap, srcForOp.getStepAsInt());
  auto loopChunkIV = loopChunk.getInductionVar();
  auto srcIV = srcForOp.getInductionVar();

  IRMapping operandMap;
  auto bodyBuilder = OpBuilder::atBlockTerminator(loopChunk.getBody());
  for (const auto &it : llvm::drop_begin(opGroupQueue, offset)) {
    uint64_t shift = it.first;
    auto ops = it.second;
    // All 'same shift' operations get added with their operands being
    // remapped to results of cloned operations, and their IV used remapped.
    // Generate the remapping if the shift is not zero: remappedIV = newIV -
    // shift.
    if (!srcIV.use_empty() && shift != 0) {
      auto ivRemap = bodyBuilder.create<AffineApplyOp>(
          srcForOp.getLoc(),
          bodyBuilder.getSingleDimShiftAffineMap(
              -static_cast<int64_t>(srcForOp.getStepAsInt() * shift)),
          loopChunkIV);
      operandMap.map(srcIV, ivRemap);
    } else {
      operandMap.map(srcIV, loopChunkIV);
    }
    for (auto *op : ops)
      bodyBuilder.clone(*op, operandMap);
  }
  if (succeeded(promoteIfSingleIteration(loopChunk)))
    return AffineForOp();
  return loopChunk;
}
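// Illustration: with step 1 and shift 2, an operation originally reading %iv
// is cloned into the shifted loop reading %iv - 2 (via the single-dim shift
// map d0 - 2), so it still observes the value of the iteration it was shifted
// away from.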
/// Skew the operations in an affine.for's body with the specified
/// operation-wise shifts (multiplied by the loop step before being applied).
LogicalResult mlir::affine::affineForOpBodySkew(AffineForOp forOp,
                                                ArrayRef<uint64_t> shifts,
                                                bool unrollPrologueEpilogue) {
  assert(forOp.getBody()->getOperations().size() == shifts.size() &&
         "too few/many shifts");
  if (forOp.getBody()->begin() == std::prev(forOp.getBody()->end()))
    return success();

  // If the trip count isn't constant, we would need versioning and
  // conditional guards; such loops are better tiled first.
  std::optional<uint64_t> mayBeConstTripCount = getConstantTripCount(forOp);
  if (!mayBeConstTripCount) {
    LLVM_DEBUG(forOp.emitRemark("non-constant trip count loop not handled"));
    return success();
  }
  uint64_t tripCount = *mayBeConstTripCount;

  assert(isOpwiseShiftValid(forOp, shifts) &&
         "shifts will lead to an invalid transformation\n");

  int64_t step = forOp.getStepAsInt();
  unsigned numChildOps = shifts.size();

  // Do a linear time (counting) sort for the shifts.
  uint64_t maxShift = *llvm::max_element(shifts);
  if (maxShift >= numChildOps) {
    // Large shifts are not the typical use case.
    forOp.emitWarning("not shifting because shifts are unrealistically large");
    return success();
  }

  // An array of operation groups sorted by shift amount; each group has all
  // operations with the same shift in their original body order.
  std::vector<std::vector<Operation *>> sortedOpGroups(maxShift + 1);
  unsigned pos = 0;
  for (auto &op : forOp.getBody()->without_terminator()) {
    auto shift = shifts[pos++];
    sortedOpGroups[shift].push_back(&op);
  }

  AffineForOp prologue, epilogue;

  // Do a sweep over the sorted shifts while storing open groups in a list,
  // and generating loop portions as necessary during the sweep.
  std::vector<std::pair<uint64_t, ArrayRef<Operation *>>> opGroupQueue;

  auto origLbMap = forOp.getLowerBoundMap();
  uint64_t lbShift = 0;
  OpBuilder b(forOp);
  for (uint64_t d = 0, e = sortedOpGroups.size(); d < e; ++d) {
    // If nothing is shifted by d, continue.
    if (sortedOpGroups[d].empty())
      continue;
    if (!opGroupQueue.empty()) {
      assert(d > 0 &&
             "Queue expected to be empty when the first block is found");
      // The interval for which the loop needs to be generated here is
      // [lbShift, min(lbShift + tripCount, d)) and the body of the loop needs
      // to have all operations in opGroupQueue in that order.
      AffineForOp res;
      if (lbShift + tripCount * step < d * step) {
        res = generateShiftedLoop(
            b.getShiftedAffineMap(origLbMap, lbShift),
            b.getShiftedAffineMap(origLbMap, lbShift + tripCount * step),
            opGroupQueue, /*offset=*/0, forOp, b);
        // The entire loop for the queued op groups was generated; empty it.
        opGroupQueue.clear();
        lbShift += tripCount * step;
      } else {
        res = generateShiftedLoop(b.getShiftedAffineMap(origLbMap, lbShift),
                                  b.getShiftedAffineMap(origLbMap, d * step),
                                  opGroupQueue, /*offset=*/0, forOp, b);
        lbShift = d * step;
      }

      if (res) {
        // Simplify/canonicalize the affine.for.
        RewritePatternSet patterns(res.getContext());
        AffineForOp::getCanonicalizationPatterns(patterns, res.getContext());
        GreedyRewriteConfig config;
        config.setStrictness(GreedyRewriteStrictness::ExistingAndNewOps);
        bool erased;
        (void)applyOpPatternsGreedily(res.getOperation(), std::move(patterns),
                                      config, /*changed=*/nullptr, &erased);
        if (!erased && !prologue)
          prologue = res;
        if (!erased)
          epilogue = res;
      }
    } else {
      // Start of the first interval.
      lbShift = d * step;
    }
    // Augment the list of operations in the current open interval.
    opGroupQueue.emplace_back(d, sortedOpGroups[d]);
  }

  // The operation groups left in the queue now need to be processed (FIFO)
  // and their loops completed.
  for (unsigned i = 0, e = opGroupQueue.size(); i < e; ++i) {
    uint64_t ubShift = (opGroupQueue[i].first + tripCount) * step;
    epilogue = generateShiftedLoop(b.getShiftedAffineMap(origLbMap, lbShift),
                                   b.getShiftedAffineMap(origLbMap, ubShift),
                                   opGroupQueue, /*offset=*/i, forOp, b);
    lbShift = ubShift;
    if (!prologue)
      prologue = epilogue;
  }

  // Erase the original for op.
  forOp.erase();

  if (unrollPrologueEpilogue && prologue)
    (void)loopUnrollFull(prologue);
  if (unrollPrologueEpilogue && epilogue && epilogue != prologue)
    (void)loopUnrollFull(epilogue);

  return success();
}
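// Illustration: with shifts = {0, 1} on a two-op body, the second op's
// iterations execute one iteration later than the first's; the sweep above
// emits a prologue running only the first op, a steady-state loop running
// both, and an epilogue running only the second.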
// checkIfHyperRectangular: 0-d or 1-d nests are trivially hyper-rectangular.
if (input.size() <= 1)
  return success();
if (failed(getIndexSet(ops, &cst))) {
  LLVM_DEBUG(llvm::dbgs() << "Index set computation failed!\n");
  return failure();
}
if (!cst.isHyperRectangular(0, input.size())) {
  LLVM_DEBUG(llvm::dbgs()
             << "Non-hyperrectangular nests not supported for tiling!\n");
  return failure();
}
/// Check if the input nest is supported for tiling and whether tiling would
/// be legal or not.
template <typename t>
static LogicalResult performPreTilingChecks(MutableArrayRef<AffineForOp> input,
                                            ArrayRef<t> tileSizes) {
  assert(input.size() == tileSizes.size() && "Too few/many tile sizes");

  if (llvm::any_of(input,
                   [](AffineForOp op) { return op.getNumResults() > 0; })) {
    LLVM_DEBUG(llvm::dbgs()
               << "Cannot tile nest where a loop has yield values\n");
    return failure();
  }

  // Check if the supplied 'for' ops are all successively nested.
  if (!isPerfectlyNested(input)) {
    LLVM_DEBUG(llvm::dbgs() << "input loops not perfectly nested");
    return failure();
  }

  return checkIfHyperRectangular(input);
}
/// Move the loop body of AffineForOp 'src' into the specified location in the
/// destination's body, ignoring the terminator.
static void moveLoopBodyImpl(AffineForOp src, AffineForOp dest,
                             Block::iterator loc) {
  auto &ops = src.getBody()->getOperations();
  dest.getBody()->getOperations().splice(loc, ops, ops.begin(),
                                         std::prev(ops.end()));
}
/// Constructs a tiled loop nest, without setting the loop bounds, and moves
/// the body of the original loop nest into the innermost new loop.
static void constructTiledLoopNest(MutableArrayRef<AffineForOp> origLoops,
                                   AffineForOp rootAffineForOp, unsigned width,
                                   MutableArrayRef<AffineForOp> tiledLoops) {
  Location loc = rootAffineForOp.getLoc();

  // The outermost among the loops as we add more.
  Operation *topLoop = rootAffineForOp.getOperation();
  AffineForOp innermostPointLoop;

  // Add intra-tile (or point) loops.
  for (unsigned i = 0; i < width; i++) {
    OpBuilder b(topLoop);
    // Loop bounds will be set later.
    AffineForOp pointLoop = b.create<AffineForOp>(loc, 0, 0);
    pointLoop.getBody()->getOperations().splice(
        pointLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
        topLoop);
    tiledLoops[2 * width - 1 - i] = pointLoop;
    topLoop = pointLoop.getOperation();
    if (i == 0)
      innermostPointLoop = pointLoop;
  }

  // Add tile space loops.
  for (unsigned i = width; i < 2 * width; i++) {
    OpBuilder b(topLoop);
    // Loop bounds will be set later.
    AffineForOp tileSpaceLoop = b.create<AffineForOp>(loc, 0, 0);
    tileSpaceLoop.getBody()->getOperations().splice(
        tileSpaceLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
        topLoop);
    tiledLoops[2 * width - i - 1] = tileSpaceLoop;
    topLoop = tileSpaceLoop.getOperation();
  }

  // Move the body of the original nest into the new one.
  moveLoopBody(origLoops.back(), innermostPointLoop);
}
/// Set lower and upper bounds of intra-tile loops for parametric tiling.
static void setIntraTileBoundsParametric(OpBuilder &b, AffineForOp origLoop,
                                         AffineForOp newInterTileLoop,
                                         AffineForOp newIntraTileLoop,
                                         Value tileSize) {
  // The intra-tile loop's bounds are derived from the inter-tile loop's IV
  // and the parametric tile size; the input loops must have a constant lower
  // bound for the derivation below to hold.
  assert(origLoop.hasConstantLowerBound() &&
         "expected input loops to have constant lower bound.");

  lbOperands.push_back(newInterTileLoop.getInductionVar());
  ubOperands.push_back(newInterTileLoop.getInductionVar());

  lbOperands.push_back(tileSize);
  ubOperands.push_back(tileSize);

  // Lower bound: (iv - origLb) * tileSize + origLb.
  lbBoundExprs.push_back(
      ((lbLoopIvExpr - origLowerBoundExpr) * lbTileParameter) +
      origLowerBoundExpr);

  // Upper bound: (iv - origLb) * tileSize + tileSize * step + origLb.
  ubBoundExprs.push_back(
      ((ubLoopIvExpr - origLowerBoundExpr) * ubTileParameter) +
      (ubTileParameter * origLoopStep) + origLowerBoundExpr);

  ubBoundExprs.append(origUbMap.getResults().begin(),
                      origUbMap.getResults().end());

  newIntraTileLoop.setLowerBound(lbOperands, lbMap);
  newIntraTileLoop.setUpperBound(ubOperands, ubMap);
  newIntraTileLoop.setStep(origLoop.getStepAsInt());
}
/// Set lower and upper bounds of inter-tile loops for parametric tiling.
static void setInterTileBoundsParametric(OpBuilder &b, AffineForOp origLoop,
                                         AffineForOp newLoop, Value tileSize) {
  OperandRange newLbOperands = origLoop.getLowerBoundOperands();

  // The lower bound of the inter-tile loop is kept the same as the original.
  newLoop.setLowerBound(newLbOperands, origLoop.getLowerBoundMap());

  assert(origLoop.hasConstantLowerBound() &&
         "expected input loops to have constant lower bound.");

  ubOperands.push_back(tileSize);

  int64_t origUpperBound;

  if (origLoop.hasConstantUpperBound()) {
    origUpperBound = origLoop.getConstantUpperBound();
    // Upper bound: origLb + ceildiv(origUb - origLb, tileParameter).
    boundExprs.push_back(
        origLowerBoundExpr +
        (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter));
  }

  // For a non-constant upper bound, the same ceildiv expression is appended
  // for each result of the original upper bound map.
  boundExprs.push_back(
      origLowerBoundExpr +
      (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter));

  newLoop.setUpperBound(ubOperands, ubMap);
  newLoop.setStep(origLoop.getStepAsInt());
}
assert(!origLoops.empty() && "expected at least one loop in band");
assert(origLoops.size() == tileSizes.size() &&
       "expected tiling parameter for each loop in band.");

OpBuilder b(origLoops[0].getOperation());
unsigned width = origLoops.size();

// Set bounds of the tile-space (inter-tile) loops.
for (unsigned i = 0; i < width; ++i) {
  setInterTileBoundsParametric(b, origLoops[i], newLoops[i], tileSizes[i]);
}

// Set bounds of the intra-tile loops.
for (unsigned i = 0; i < width; ++i) {
  setIntraTileBoundsParametric(b, origLoops[i], newLoops[i],
                               newLoops[i + width], tileSizes[i]);
}
assert(!origLoops.empty());
assert(origLoops.size() == tileSizes.size());

OpBuilder b(origLoops[0].getOperation());
unsigned width = origLoops.size();

// Bounds for the tile space loops.
for (unsigned i = 0; i < width; i++) {
  OperandRange newLbOperands = origLoops[i].getLowerBoundOperands();
  OperandRange newUbOperands = origLoops[i].getUpperBoundOperands();
  newLoops[i].setLowerBound(newLbOperands, origLoops[i].getLowerBoundMap());
  newLoops[i].setUpperBound(newUbOperands, origLoops[i].getUpperBoundMap());
  // If the step size of the original loop is x and the tile size is y, the
  // tile space loop's step size becomes x * y.
  newLoops[i].setStep(tileSizes[i] * origLoops[i].getStepAsInt());
}

// Bounds for the intra-tile loops.
for (unsigned i = 0; i < width; i++) {
  int64_t largestDiv = getLargestDivisorOfTripCount(origLoops[i]);
  std::optional<uint64_t> mayBeConstantCount =
      getConstantTripCount(origLoops[i]);
  // The lower bound is just the tile-space loop's IV.
  AffineMap lbMap = b.getDimIdentityMap();
  newLoops[width + i].setLowerBound(newLoops[i].getInductionVar(), lbMap);
  // The step size of an intra-tile loop is the original loop's step size.
  newLoops[width + i].setStep(origLoops[i].getStepAsInt());

  // Set the upper bound.
  if (mayBeConstantCount && *mayBeConstantCount < tileSizes[i]) {
    // Trip count is less than the tile size: upper bound is lower bound +
    // trip count * step.
    AffineMap ubMap = b.getSingleDimShiftAffineMap(
        *mayBeConstantCount * origLoops[i].getStepAsInt());
    newLoops[width + i].setUpperBound(newLoops[i].getInductionVar(), ubMap);
  } else if (largestDiv % tileSizes[i] != 0) {
    // The intra-tile loop runs from iv to min(iv + tileSize * step, origUb):
    // append iv + tileSize * step to the original upper bound map, with the
    // tile-space IV as an extra operand.
    ubOperands.push_back(newLoops[i].getInductionVar());
    boundExprs.push_back(dim + tileSizes[i] * origLoops[i].getStepAsInt());
    boundExprs.append(origUbMap.getResults().begin(),
                      origUbMap.getResults().end());
    newLoops[width + i].setUpperBound(ubOperands, ubMap);
  } else {
    // No need for the min expression.
    AffineMap ubMap = AffineMap::get(
        1, 0, dim + tileSizes[i] * origLoops[i].getStepAsInt());
    newLoops[width + i].setUpperBound(newLoops[i].getInductionVar(), ubMap);
  }
}
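// Illustration: with an original step of 1 and tileSizes[i] == 32, the tile
// space loop steps by 32 and the intra-tile loop runs from iv to
// min(iv + 32, origUb); the min is dropped when 32 divides the trip count.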
AffineForOp rootAffineForOp = origLoops[0];
// Note that width is at least one since the band isn't empty.
unsigned width = input.size();

// Replace the original loop IVs with the intra-tile loop IVs.
for (unsigned i = 0; i < width; i++)
  origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar());

// Erase the old loop nest.
rootAffineForOp.erase();

if (tiledNest)
  *tiledNest = std::move(tiledLoops);

// The parametric variant (tilePerfectlyNestedParametric) performs the same
// epilogue once its parametric bounds have been set:
AffineForOp rootAffineForOp = origLoops[0];
unsigned width = input.size();

for (unsigned i = 0; i < width; i++)
  origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar());

rootAffineForOp.erase();

if (tiledNest)
  *tiledNest = std::move(tiledLoops);
nestedLoops.push_back(root);
Block &body = root.getRegion().front();
if (body.begin() != std::prev(body.end(), 2))
  return;
root = dyn_cast<AffineForOp>(&body.front());
// Get the maximal perfect nest of 'affine.for' ops starting from each
// top-level loop in the function.
for (AffineForOp forOp : f.getOps<AffineForOp>()) {
  SmallVector<AffineForOp, 6> band;
  getPerfectlyNestedLoops(band, forOp);
  bands->push_back(band);
}
/// Unrolls this loop completely if the trip count is known to be constant.
LogicalResult mlir::affine::loopUnrollFull(AffineForOp forOp) {
  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
  if (mayBeConstantTripCount.has_value()) {
    uint64_t tripCount = *mayBeConstantTripCount;
    if (tripCount == 0)
      return success();
    if (tripCount == 1)
      return promoteIfSingleIteration(forOp);
    return loopUnrollByFactor(forOp, tripCount);
  }
  return failure();
}

/// Unrolls this loop by the specified factor or by the trip count, whichever
/// is lower.
LogicalResult mlir::affine::loopUnrollUpToFactor(AffineForOp forOp,
                                                 uint64_t unrollFactor) {
  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
  if (mayBeConstantTripCount.has_value() &&
      *mayBeConstantTripCount < unrollFactor)
    return loopUnrollByFactor(forOp, *mayBeConstantTripCount);
  return loopUnrollByFactor(forOp, unrollFactor);
}
/// Generates unrolled copies of the loop body (in place), remapping the IV
/// via `ivRemapFn` and chaining iter_args through the yielded values.
static void generateUnrolledLoop(
    Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor,
    function_ref<Value(unsigned, Value, OpBuilder)> ivRemapFn,
    function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn,
    ValueRange iterArgs, ValueRange yieldedValues) {
  // Builder to insert unrolled bodies just before the terminator of the body.
  auto builder = OpBuilder::atBlockTerminator(loopBodyBlock);

  constexpr auto defaultAnnotateFn = [](unsigned, Operation *, OpBuilder) {};
  if (!annotateFn)
    annotateFn = defaultAnnotateFn;

  // Keep a pointer to the last non-terminator operation in the loop body so
  // that we know what to clone (since we are doing this in-place).
  Block::iterator srcBlockEnd = std::prev(loopBodyBlock->end(), 2);

  // Unroll the contents of the body (append unrollFactor - 1 more copies).
  SmallVector<Value, 4> lastYielded(yieldedValues);

  for (unsigned i = 1; i < unrollFactor; i++) {
    IRMapping operandMap;

    // Prepare the operand map: this copy's iter_args read the previous copy's
    // yielded values.
    operandMap.map(iterArgs, lastYielded);

    // If the induction variable is used, create a remapping to the value for
    // this unrolled instance.
    if (!forOpIV.use_empty()) {
      Value ivUnroll = ivRemapFn(i, forOpIV, builder);
      operandMap.map(forOpIV, ivUnroll);
    }

    // Clone the original body of the loop.
    for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd);
         it++) {
      Operation *clonedOp = builder.clone(*it, operandMap);
      annotateFn(i, clonedOp, builder);
    }

    // Update the yielded values: only values defined inside the body need to
    // be remapped.
    for (unsigned i = 0, e = lastYielded.size(); i < e; i++) {
      Operation *defOp = yieldedValues[i].getDefiningOp();
      if (defOp && defOp->getBlock() == loopBodyBlock)
        lastYielded[i] = operandMap.lookup(yieldedValues[i]);
    }
  }

  // Annotate the ops of the original body last so that the annotations are
  // not copied into the clones above.
  for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++)
    annotateFn(0, &*it, builder);

  // Update the operands of the yield.
  loopBodyBlock->getTerminator()->setOperands(lastYielded);
}
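// Illustration: unrolling a reduction loop with iter_arg %acc by 2, the
// second body copy consumes the value yielded by the first copy, and the
// final yield forwards the second copy's result, preserving the sequential
// reduction order.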
/// Helper to generate a cleanup loop for unroll or unroll-and-jam when the
/// trip count is not a multiple of the unroll factor.
static LogicalResult generateCleanupLoopForUnroll(AffineForOp forOp,
                                                  uint64_t unrollFactor) {
  // Insert the cleanup loop right after 'forOp'.
  OpBuilder builder(forOp->getBlock(), std::next(Block::iterator(forOp)));
  auto cleanupForOp = cast<AffineForOp>(builder.clone(*forOp));

  // The cleanup loop takes over `forOp`'s results for the original users.
  auto results = forOp.getResults();
  auto cleanupResults = cleanupForOp.getResults();
  auto cleanupIterOperands = cleanupForOp.getInits();

  for (auto e : llvm::zip(results, cleanupResults, cleanupIterOperands)) {
    std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
    cleanupForOp->replaceUsesOfWith(std::get<2>(e), std::get<0>(e));
  }

  cleanupForOp.setLowerBound(cleanupOperands, cleanupMap);

  // Promote the loop body up if this has turned into a single iteration loop.
  (void)promoteIfSingleIteration(cleanupForOp);

  // The new upper bound of the unrolled loop is the cleanup loop's lower
  // bound.
  forOp.setUpperBound(cleanupOperands, cleanupMap);
  return success();
}
/// Unrolls this loop by the specified unroll factor.
LogicalResult mlir::affine::loopUnrollByFactor(
    AffineForOp forOp, uint64_t unrollFactor,
    function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn,
    bool cleanUpUnroll) {
  assert(unrollFactor > 0 && "unroll factor should be positive");

  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
  if (unrollFactor == 1) {
    if (mayBeConstantTripCount && *mayBeConstantTripCount == 1 &&
        failed(promoteIfSingleIteration(forOp)))
      return failure();
    return success();
  }

  // Nothing in the loop body other than the terminator.
  if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
    return success();

  // If the trip count is lower than the unroll factor, no unrolled body.
  if (mayBeConstantTripCount && *mayBeConstantTripCount < unrollFactor) {
    if (cleanUpUnroll) {
      // Unroll fully if cleanUpUnroll is specified.
      return loopUnrollFull(forOp);
    }
    return failure();
  }

  // Generate the cleanup loop if the trip count isn't a multiple of
  // unrollFactor.
  if (getLargestDivisorOfTripCount(forOp) % unrollFactor != 0) {
    // The cleanup loop's lower bound can't be expressed as an affine function
    // when the lower bound is a max or the upper bound a min expression.
    if (forOp.getLowerBoundMap().getNumResults() != 1 ||
        forOp.getUpperBoundMap().getNumResults() != 1)
      return failure();
    if (cleanUpUnroll)
      // Force unroll including the cleanup loop.
      return loopUnrollFull(forOp);
    if (failed(generateCleanupLoopForUnroll(forOp, unrollFactor)))
      assert(false && "cleanup loop lower bound map for single result lower "
                      "and upper bound maps can always be determined");
  }

  ValueRange iterArgs(forOp.getRegionIterArgs());
  auto yieldedValues = forOp.getBody()->getTerminator()->getOperands();

  // Scale the step of the loop being unrolled by the unroll factor.
  int64_t step = forOp.getStepAsInt();
  forOp.setStep(step * unrollFactor);
  generateUnrolledLoop(
      forOp.getBody(), forOp.getInductionVar(), unrollFactor,
      [&](unsigned i, Value iv, OpBuilder b) {
        // iv' = iv + i * step.
        auto d0 = b.getAffineDimExpr(0);
        auto bumpMap = AffineMap::get(1, 0, d0 + i * step);
        return b.create<AffineApplyOp>(forOp.getLoc(), bumpMap, iv);
      },
      annotateFn, iterArgs, yieldedValues);

  // Promote the loop body up if this has turned into a single iteration loop.
  (void)promoteIfSingleIteration(forOp);
  return success();
}
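// Illustration: unrolling affine.for %i = 0 to 16 step 1 by a factor of 4
// scales the step to 4 and clones the body three more times with the IV
// remapped to %i + 1, %i + 2, and %i + 3 via affine.apply ops.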
/// Unrolls and jams this loop by the specified factor or by the trip count
/// (if constant), whichever is lower.
LogicalResult mlir::affine::loopUnrollJamUpToFactor(AffineForOp forOp,
                                                    uint64_t unrollJamFactor) {
  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
  if (mayBeConstantTripCount.has_value() &&
      *mayBeConstantTripCount < unrollJamFactor)
    return loopUnrollJamByFactor(forOp, *mayBeConstantTripCount);
  return loopUnrollJamByFactor(forOp, unrollJamFactor);
}

/// Check if all control operands of all loops are defined outside of `forOp`
/// and return false if not.
static bool areInnerBoundsInvariant(AffineForOp forOp) {
  auto walkResult = forOp.walk([&](AffineForOp aForOp) {
    for (auto controlOperand : aForOp.getControlOperands()) {
      if (!forOp.isDefinedOutsideOfLoop(controlOperand))
        return WalkResult::interrupt();
    }
    return WalkResult::advance();
  });
  return !walkResult.wasInterrupted();
}
/// Unrolls and jams this loop by the specified factor.
LogicalResult mlir::affine::loopUnrollJamByFactor(AffineForOp forOp,
                                                  uint64_t unrollJamFactor) {
  assert(unrollJamFactor > 0 && "unroll jam factor should be positive");

  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
  if (unrollJamFactor == 1) {
    if (mayBeConstantTripCount && *mayBeConstantTripCount == 1 &&
        failed(promoteIfSingleIteration(forOp)))
      return failure();
    return success();
  }

  // Nothing in the loop body other than the terminator.
  if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
    return success();

  // If the trip count is lower than the unroll-jam factor, no unroll jam.
  if (mayBeConstantTripCount && *mayBeConstantTripCount < unrollJamFactor) {
    LLVM_DEBUG(llvm::dbgs() << "[failed] trip count < unroll-jam factor\n");
    return failure();
  }

  // Gather all loops with iter_args; their operand lists will be extended.
  SmallVector<AffineForOp, 4> loopsWithIterArgs;
  forOp.walk([&](AffineForOp aForOp) {
    if (aForOp.getNumIterOperands() > 0)
      loopsWithIterArgs.push_back(aForOp);
  });

  // Get supported reductions to be used for creating reduction ops at the
  // end.
  SmallVector<LoopReduction> reductions;
  if (forOp.getNumIterOperands() > 0)
    getSupportedReductions(forOp, reductions);

  // Generate the cleanup loop if the trip count isn't a multiple of the
  // unroll-jam factor.
  if (getLargestDivisorOfTripCount(forOp) % unrollJamFactor != 0) {
    if (forOp.getLowerBoundMap().getNumResults() != 1 ||
        forOp.getUpperBoundMap().getNumResults() != 1)
      return failure();
    if (failed(generateCleanupLoopForUnroll(forOp, unrollJamFactor)))
      assert(false && "cleanup loop lower bound map for single result lower "
                      "and upper bound maps can always be determined");
  }
  // `operandMaps[i - 1]` carries the old->new operand mapping for the ith
  // unrolled iteration.
  SmallVector<IRMapping, 4> operandMaps(unrollJamFactor - 1);

  // For any loop with iter_args, replace it with a new loop that has
  // `unrollJamFactor` copies of its iter operands, iter_args, and yield
  // operands.
  SmallVector<AffineForOp, 4> newLoopsWithIterArgs;
  IRRewriter rewriter(forOp.getContext());
  for (AffineForOp oldForOp : loopsWithIterArgs) {
    SmallVector<Value> dupIterOperands, dupYieldOperands;
    ValueRange oldIterOperands = oldForOp.getInits();
    ValueRange oldIterArgs = oldForOp.getRegionIterArgs();
    ValueRange oldYieldOperands =
        cast<AffineYieldOp>(oldForOp.getBody()->getTerminator()).getOperands();
    // Gather the additional operands; iter operands and yield operands are
    // fixed up after the sub-blocks are cloned.
    for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
      dupIterOperands.append(oldIterOperands.begin(), oldIterOperands.end());
      dupYieldOperands.append(oldYieldOperands.begin(), oldYieldOperands.end());
    }
    // Create a new loop with the additional operands; it takes over the loop
    // body of the original loop.
    bool forOpReplaced = oldForOp == forOp;
    AffineForOp newForOp =
        cast<AffineForOp>(*oldForOp.replaceWithAdditionalYields(
            rewriter, dupIterOperands, /*replaceInitOperandUsesInLoop=*/false,
            [&](OpBuilder &b, Location loc, ArrayRef<BlockArgument> newBbArgs) {
              return dupYieldOperands;
            }));
    newLoopsWithIterArgs.push_back(newForOp);
    // `forOp` has been replaced with a new loop.
    if (forOpReplaced)
      forOp = newForOp;
    // Update `operandMaps` for the new loop's iter_args and results.
    ValueRange newIterArgs = newForOp.getRegionIterArgs();
    unsigned oldNumIterArgs = oldIterArgs.size();
    ValueRange newResults = newForOp.getResults();
    unsigned oldNumResults = newResults.size() / unrollJamFactor;
    assert(oldNumIterArgs == oldNumResults &&
           "oldNumIterArgs must be the same as oldNumResults");
    for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
      for (unsigned j = 0; j < oldNumIterArgs; ++j) {
        // Map the original iter_args and results to the `i`th duplicated set.
        operandMaps[i - 1].map(newIterArgs[j],
                               newIterArgs[i * oldNumIterArgs + j]);
        operandMaps[i - 1].map(newResults[j],
                               newResults[i * oldNumResults + j]);
      }
    }
  }
  // Scale the step of the loop being unroll-jammed by the unroll-jam factor.
  int64_t step = forOp.getStepAsInt();
  forOp.setStep(step * unrollJamFactor);

  auto forOpIV = forOp.getInductionVar();
  // Unroll and jam (append unrollJamFactor - 1 additional copies); the
  // `subBlocks` were gathered for jamming above.
  for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
    for (auto &subBlock : subBlocks) {
      // Builder to insert unroll-jammed bodies, right at the end of the
      // sub-block.
      OpBuilder builder(subBlock.first->getBlock(), std::next(subBlock.second));

      // If the induction variable is used, remap it for this unrolled
      // instance: iv' = iv + i * step.
      if (!forOpIV.use_empty()) {
        auto d0 = builder.getAffineDimExpr(0);
        auto bumpMap = AffineMap::get(1, 0, d0 + i * step);
        auto ivUnroll =
            builder.create<AffineApplyOp>(forOp.getLoc(), bumpMap, forOpIV);
        operandMaps[i - 1].map(forOpIV, ivUnroll);
      }
      // Clone the sub-block being unroll-jammed.
      for (auto it = subBlock.first; it != std::next(subBlock.second); ++it)
        builder.clone(*it, operandMaps[i - 1]);
    }
    // Fix iter operands and yield operands of the newly created loops.
    for (auto newForOp : newLoopsWithIterArgs) {
      unsigned oldNumIterOperands =
          newForOp.getNumIterOperands() / unrollJamFactor;
      unsigned numControlOperands = newForOp.getNumControlOperands();
      auto yieldOp = cast<AffineYieldOp>(newForOp.getBody()->getTerminator());
      unsigned oldNumYieldOperands = yieldOp.getNumOperands() / unrollJamFactor;
      assert(oldNumIterOperands == oldNumYieldOperands &&
             "oldNumIterOperands must be the same as oldNumYieldOperands");
      for (unsigned j = 0; j < oldNumIterOperands; ++j) {
        // The `i`th duplication of an old iter operand or yield operand is
        // replaced with the value mapped in `operandMaps[i - 1]`, if any.
        newForOp.setOperand(numControlOperands + i * oldNumIterOperands + j,
                            operandMaps[i - 1].lookupOrDefault(
                                newForOp.getOperand(numControlOperands + j)));
        yieldOp.setOperand(
            i * oldNumYieldOperands + j,
            operandMaps[i - 1].lookupOrDefault(yieldOp.getOperand(j)));
      }
    }
  }
  if (forOp.getNumResults() > 0) {
    // Create reduction ops to combine every `unrollJamFactor` related results
    // into one value. For example, for %0:2 = affine.for ... iter_args(%arg0
    // = %x, %arg1 = %y) with unrollJamFactor = 2, the reduction ops are
    // %1 = arith.addi %0#0, %0#2 and %2 = arith.addi %0#1, %0#3.
    rewriter.setInsertionPointAfter(forOp);
    auto loc = forOp.getLoc();
    unsigned oldNumResults = forOp.getNumResults() / unrollJamFactor;
    for (LoopReduction &reduction : reductions) {
      unsigned pos = reduction.iterArgPosition;
      Value lhs = forOp.getResult(pos);
      Value rhs;
      SmallPtrSet<Operation *, 4> newOps;
      for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
        rhs = forOp.getResult(i * oldNumResults + pos);
        lhs = arith::getReductionOp(reduction.kind, rewriter, loc, lhs, rhs);
        Operation *op = lhs.getDefiningOp();
        assert(op && "Reduction op should have been created");
        newOps.insert(op);
      }
      // Replace all uses except those in the newly created reduction ops.
      forOp.getResult(pos).replaceAllUsesExcept(lhs, newOps);
    }
  }

  // Promote the loop body up if this has turned into a single iteration loop.
  (void)promoteIfSingleIteration(forOp);
  return success();
}
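// Note: combining the per-copy results after the loop keeps each unrolled
// copy's reduction independent inside the loop body, which is what permits
// jamming the copies together in the first place.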
void mlir::affine::interchangeLoops(AffineForOp forOpA, AffineForOp forOpB) {
  assert(&*forOpA.getBody()->begin() == forOpB.getOperation());
  auto &forOpABody = forOpA.getBody()->getOperations();
  auto &forOpBBody = forOpB.getBody()->getOperations();

  // 1) Splice forOpA's non-terminator operations (which is just forOpB) just
  // before forOpA in forOpA's parent block.
  forOpA->getBlock()->getOperations().splice(Block::iterator(forOpA),
                                             forOpABody, forOpABody.begin(),
                                             std::prev(forOpABody.end()));
  // 2) Splice forOpB's non-terminator operations into the beginning of
  // forOpA's body.
  forOpABody.splice(forOpABody.begin(), forOpBBody, forOpBBody.begin(),
                    std::prev(forOpBBody.end()));
  // 3) Splice forOpA into the beginning of forOpB's body.
  forOpBBody.splice(forOpBBody.begin(), forOpA->getBlock()->getOperations(),
                    Block::iterator(forOpA));
}
  // Invert the permutation map.
  unsigned maxLoopDepth = loops.size();
  SmallVector<unsigned, 4> loopPermMapInv;
  loopPermMapInv.resize(maxLoopDepth);
  for (unsigned i = 0; i < maxLoopDepth; ++i)
    loopPermMapInv[loopPermMap[i]] = i;

  // Check each dependence component against the permutation map, and return
  // false if the first non-zero component after permutation is negative.
  for (const auto &depComps : depCompsVec) {
    assert(depComps.size() >= maxLoopDepth);
    for (unsigned j = 0; j < maxLoopDepth; ++j) {
      unsigned permIndex = loopPermMapInv[j];
      assert(depComps[permIndex].lb);
      int64_t depCompLb = *depComps[permIndex].lb;
      if (depCompLb > 0)
        break;
      if (depCompLb < 0)
        return false;
    }
  }
  return true;
assert(loopPermMap.size() == loops.size() && "invalid loop perm map");
unsigned maxLoopDepth = loops.size();
if (maxLoopDepth == 1)
  return true;
// Gather dependence components for dependences between all ops in the loop
// nest rooted at loops[0], at loop depths in range [1, maxLoopDepth].
std::vector<SmallVector<DependenceComponent, 2>> depCompsVec;
getDependenceComponents(loops[0], maxLoopDepth, &depCompsVec);
return checkLoopInterchangeDependences(depCompsVec, loops, loopPermMap);
/// Returns true if `loops` is a perfectly nested loop nest, where the loops
/// appear in it from outermost to innermost.
bool LLVM_ATTRIBUTE_UNUSED
mlir::affine::isPerfectlyNested(ArrayRef<AffineForOp> loops) {
  assert(!loops.empty() && "no loops provided");

  // We already know that the block can't be empty.
  auto hasTwoElements = [](Block *block) {
    auto secondOpIt = std::next(block->begin());
    return secondOpIt != block->end() && &*secondOpIt == &block->back();
  };

  auto enclosingLoop = loops.front();
  for (auto loop : loops.drop_front()) {
    auto parentForOp = dyn_cast<AffineForOp>(loop->getParentOp());
    // parentForOp's body should be just this loop and the terminator.
    if (parentForOp != enclosingLoop || !hasTwoElements(parentForOp.getBody()))
      return false;
    enclosingLoop = loop;
  }
  return true;
}
/// Performs a loop permutation on the perfectly nested loop nest `input`:
/// loop i in `input` is mapped to location permMap[i] of the transformed nest.
unsigned mlir::affine::permuteLoops(ArrayRef<AffineForOp> input,
                                    ArrayRef<unsigned> permMap) {
  assert(input.size() == permMap.size() && "invalid permutation map size");
  // Check whether the permutation spec is valid: sorted, it must be the
  // identity.
  SmallVector<unsigned, 4> checkPermMap(permMap);
  llvm::sort(checkPermMap);
  if (llvm::any_of(llvm::enumerate(checkPermMap),
                   [](const auto &en) { return en.value() != en.index(); }))
    assert(false && "invalid permutation map");

  // Nothing to do.
  if (input.size() < 2)
    return 0;

  assert(isPerfectlyNested(input) && "input not perfectly nested");

  // Compute the inverse mapping: position i of the permuted nest is occupied
  // by input[invPermMap[i].second].
  SmallVector<std::pair<unsigned, unsigned>, 4> invPermMap;
  for (unsigned i = 0, e = input.size(); i < e; ++i)
    invPermMap.push_back({permMap[i], i});
  llvm::sort(invPermMap);

  // Move the innermost loop body to the loop that will be the innermost in
  // the permuted nest (only if the innermost loop will change).
  if (permMap.back() != input.size() - 1) {
    Block *destBody = ((AffineForOp)input[invPermMap.back().second]).getBody();
    Block *srcBody = ((AffineForOp)input.back()).getBody();
    destBody->getOperations().splice(destBody->begin(),
                                     srcBody->getOperations(), srcBody->begin(),
                                     std::prev(srcBody->end()));
  }

  // Move each loop in reverse order so that its body is empty when moved;
  // this incurs zero copies and no erasing.
  for (int i = input.size() - 1; i >= 0; --i) {
    // If this has to become the outermost loop after permutation, add it to
    // the parent block of the original root.
    if (permMap[i] == 0) {
      // If the root remains the same, nothing to do.
      if (i == 0)
        continue;
      // Make input[i] the new outermost loop, moving it into parentBlock.
      auto *parentBlock = input[0]->getBlock();
      parentBlock->getOperations().splice(Block::iterator(input[0]),
                                          input[i]->getBlock()->getOperations(),
                                          Block::iterator(input[i]));
      continue;
    }

    // If the parent in the permuted order is the same as in the original,
    // nothing to do.
    unsigned parentPosInInput = invPermMap[permMap[i] - 1].second;
    if (i > 0 && static_cast<unsigned>(i - 1) == parentPosInInput)
      continue;

    // Move input[i] to its surrounding loop in the transformed nest.
    auto *destBody = ((AffineForOp)input[parentPosInInput]).getBody();
    destBody->getOperations().splice(destBody->begin(),
                                     input[i]->getBlock()->getOperations(),
                                     Block::iterator(input[i]));
  }

  return invPermMap[0].second;
}
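// Illustration: for a 2-d nest {i, j}, permMap = {1, 0} moves loop i to
// position 1 and loop j to position 0, i.e. interchanges the two loops; the
// returned value is the index in `input` of the new outermost loop (j here).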
// Sinks all sequential loops to the innermost levels and moves all parallel
// loops to the outermost, preserving relative order within each class.
AffineForOp mlir::affine::sinkSequentialLoops(AffineForOp forOp) {
  SmallVector<AffineForOp, 4> loops;
  getPerfectlyNestedLoops(loops, forOp);
  if (loops.size() < 2)
    return forOp;

  // Gather dependence components for dependences between all ops in the loop
  // nest rooted at 'loops[0]', at loop depths in range [1, maxLoopDepth].
  unsigned maxLoopDepth = loops.size();
  std::vector<SmallVector<DependenceComponent, 2>> depCompsVec;
  getDependenceComponents(loops[0], maxLoopDepth, &depCompsVec);

  // Mark loops as either parallel or sequential.
  SmallVector<bool, 8> isParallelLoop(maxLoopDepth, true);
  for (auto &depComps : depCompsVec) {
    assert(depComps.size() >= maxLoopDepth);
    for (unsigned j = 0; j < maxLoopDepth; ++j) {
      DependenceComponent &depComp = depComps[j];
      assert(depComp.lb.has_value() && depComp.ub.has_value());
      if (*depComp.lb != 0 || *depComp.ub != 0)
        isParallelLoop[j] = false;
    }
  }

  unsigned numParallelLoops = llvm::count(isParallelLoop, true);

  // Compute a permutation that sinks sequential loops (and thus raises
  // parallel ones) while preserving relative order in each class.
  SmallVector<unsigned, 4> loopPermMap(maxLoopDepth);
  unsigned nextSequentialLoop = numParallelLoops;
  unsigned nextParallelLoop = 0;
  for (unsigned i = 0; i < maxLoopDepth; ++i) {
    if (isParallelLoop[i])
      loopPermMap[i] = nextParallelLoop++;
    else
      loopPermMap[i] = nextSequentialLoop++;
  }

  // Check whether the permutation would violate dependences; if so, bail out.
  if (!checkLoopInterchangeDependences(depCompsVec, loops, loopPermMap))
    return forOp;
  // Perform the permutation.
  unsigned loopNestRootIndex = permuteLoops(loops, loopPermMap);
  return loops[loopNestRootIndex];
}
static void augmentMapAndBounds(OpBuilder &b, Value iv, AffineMap *map,
                                SmallVector<Value, 4> *operands,
                                int64_t offset = 0) {
  auto bounds = llvm::to_vector<4>(map->getResults());
  bounds.push_back(b.getAffineDimExpr(map->getNumDims()) + offset);
  operands->insert(operands->begin() + map->getNumDims(), iv);
  *map = AffineMap::get(map->getNumDims() + 1, map->getNumSymbols(), bounds,
                        b.getContext());
  canonicalizeMapAndOperands(map, operands);
}
static SmallVector<AffineForOp, 8> stripmineSink(AffineForOp forOp,
                                                 uint64_t factor,
                                                 ArrayRef<AffineForOp> targets) {
  auto originalStep = forOp.getStepAsInt();
  auto scaledStep = originalStep * factor;
  forOp.setStep(scaledStep);

  OpBuilder b(forOp->getBlock(), std::next(Block::iterator(forOp)));

  // Lower-bound map creation.
  auto lbMap = forOp.getLowerBoundMap();
  SmallVector<Value, 4> lbOperands(forOp.getLowerBoundOperands());
  augmentMapAndBounds(b, forOp.getInductionVar(), &lbMap, &lbOperands);

  // Upper-bound map creation: bumped by the scaled step.
  auto ubMap = forOp.getUpperBoundMap();
  SmallVector<Value, 4> ubOperands(forOp.getUpperBoundOperands());
  augmentMapAndBounds(b, forOp.getInductionVar(), &ubMap, &ubOperands,
                      /*offset=*/scaledStep);

  auto iv = forOp.getInductionVar();
  SmallVector<AffineForOp, 8> innerLoops;
  for (auto t : targets) {
    // Insert the new loop before the terminator of `t` and move `t`'s
    // non-terminator operations into it.
    auto b = OpBuilder::atBlockTerminator(t.getBody());
    auto newForOp = b.create<AffineForOp>(t.getLoc(), lbOperands, lbMap,
                                          ubOperands, ubMap, originalStep);
    auto begin = t.getBody()->begin();
    // Skip the terminator and `newForOp`, which was just inserted before it.
    auto nOps = t.getBody()->getOperations().size() - 2;
    newForOp.getBody()->getOperations().splice(
        newForOp.getBody()->getOperations().begin(),
        t.getBody()->getOperations(), begin, std::next(begin, nOps));
    replaceAllUsesInRegionWith(iv, newForOp.getInductionVar(),
                               newForOp.getRegion());
    innerLoops.push_back(newForOp);
  }

  return innerLoops;
}
// Stripmines `forOp` by `factor` and sinks it under a single `target`.
template <typename SizeType>
static AffineForOp stripmineSink(AffineForOp forOp, SizeType factor,
                                 AffineForOp target) {
  auto res = stripmineSink(forOp, factor, ArrayRef<AffineForOp>(target));
  assert(res.size() == 1 && "Expected 1 inner forOp");
  return res[0];
}

SmallVector<SmallVector<AffineForOp, 8>, 8>
mlir::affine::tile(ArrayRef<AffineForOp> forOps, ArrayRef<uint64_t> sizes,
                   ArrayRef<AffineForOp> targets) {
  SmallVector<SmallVector<AffineForOp, 8>, 8> res;
  SmallVector<AffineForOp, 8> currentTargets(targets);
  for (auto it : llvm::zip(forOps, sizes)) {
    auto step = stripmineSink(std::get<0>(it), std::get<1>(it), currentTargets);
    res.push_back(step);
    currentTargets = step;
  }
  return res;
}

SmallVector<AffineForOp, 8> mlir::affine::tile(ArrayRef<AffineForOp> forOps,
                                               ArrayRef<uint64_t> sizes,
                                               AffineForOp target) {
  SmallVector<AffineForOp, 8> res;
  for (auto loops : tile(forOps, sizes, ArrayRef<AffineForOp>(target)))
    res.push_back(llvm::getSingleElement(loops));
  return res;
}
/// Replace a perfect nest of "for" loops with a single linearized loop.
LogicalResult mlir::affine::coalesceLoops(MutableArrayRef<AffineForOp> loops) {
  if (loops.size() < 2)
    return success();

  AffineForOp innermost = loops.back();
  AffineForOp outermost = loops.front();
  AffineBound ub = outermost.getUpperBound();
  AffineMap origUbMap = ub.getMap();
  Location loc = outermost.getLoc();
  OpBuilder builder(outermost);
  for (AffineForOp loop : loops) {
    // We only work on normalized loops.
    if (loop.getStepAsInt() != 1 || !loop.hasConstantLowerBound() ||
        loop.getConstantLowerBound() != 0)
      return failure();
  }
  SmallVector<Value, 4> upperBoundSymbols;
  SmallVector<Value, 4> ubOperands(ub.getOperands().begin(),
                                   ub.getOperands().end());

  // 1. Store the upper bound of the outermost loop in a variable.
  Value prev;
  if (!llvm::hasSingleElement(origUbMap.getResults()))
    prev = builder.create<AffineMinOp>(loc, origUbMap, ubOperands);
  else
    prev = builder.create<AffineApplyOp>(loc, origUbMap, ubOperands);
  upperBoundSymbols.push_back(prev);

  // 2. Emit code computing the upper bound of the coalesced loop as the
  // product of the iteration counts of all loops.
  for (AffineForOp loop : loops.drop_front()) {
    ub = loop.getUpperBound();
    origUbMap = ub.getMap();
    ubOperands = ub.getOperands();
    Value upperBound;
    // If the upper bound map has more than one result, take their minimum.
    if (!llvm::hasSingleElement(origUbMap.getResults()))
      upperBound = builder.create<AffineMinOp>(loc, origUbMap, ubOperands);
    else
      upperBound = builder.create<AffineApplyOp>(loc, origUbMap, ubOperands);
    upperBoundSymbols.push_back(upperBound);
    SmallVector<Value, 4> operands;
    operands.push_back(prev);
    operands.push_back(upperBound);
    // Maintain the running product of the loop upper bounds.
    prev = builder.create<AffineApplyOp>(
        loc,
        AffineMap::get(/*dimCount=*/1, /*symbolCount=*/1,
                       builder.getAffineDimExpr(0) *
                           builder.getAffineSymbolExpr(0)),
        operands);
  }
  // Set the upper bound of the coalesced loop.
  AffineMap newUbMap = AffineMap::get(/*dimCount=*/0, /*symbolCount=*/1,
                                      builder.getAffineSymbolExpr(0),
                                      builder.getContext());
  outermost.setUpperBound(prev, newUbMap);

  builder.setInsertionPointToStart(outermost.getBody());

  // 3. Remap the induction variables: each original IV is recovered from the
  // linearized IV via a floordiv (stripping off inner iterations) and a mod
  // (by this loop's extent).
  Value previous = outermost.getInductionVar();
  for (unsigned idx = loops.size(); idx > 0; --idx) {
    if (idx != loops.size()) {
      SmallVector<Value, 4> operands;
      operands.push_back(previous);
      operands.push_back(upperBoundSymbols[idx]);
      previous = builder.create<AffineApplyOp>(
          loc,
          AffineMap::get(/*dimCount=*/1, /*symbolCount=*/1,
                         builder.getAffineDimExpr(0).floorDiv(
                             builder.getAffineSymbolExpr(0))),
          operands);
    }
    Value inductionVariable;
    if (idx == 1) {
      inductionVariable = previous;
    } else {
      SmallVector<Value, 4> applyOperands;
      applyOperands.push_back(previous);
      applyOperands.push_back(upperBoundSymbols[idx - 1]);
      inductionVariable = builder.create<AffineApplyOp>(
          loc,
          AffineMap::get(/*dimCount=*/1, /*symbolCount=*/1,
                         builder.getAffineDimExpr(0) %
                             builder.getAffineSymbolExpr(0)),
          applyOperands);
    }
    replaceAllUsesInRegionWith(loops[idx - 1].getInductionVar(),
                               inductionVariable, loops.back().getRegion());
  }

  // 4. Move the operations from the innermost loop to just above the second
  // outermost loop, then erase the extra terminator and the now-empty loop.
  AffineForOp secondOutermostLoop = loops[1];
  innermost.getBody()->back().erase();
  outermost.getBody()->getOperations().splice(
      Block::iterator(secondOutermostLoop.getOperation()),
      innermost.getBody()->getOperations());
  secondOutermostLoop.erase();
  return success();
}
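// Illustration: coalescing two normalized loops with upper bounds %N and %M
// yields a single loop of %N * %M iterations whose IV %iv is delinearized as
// %i = %iv floordiv %M and %j = %iv mod %M.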
/// Maps `forOp` for execution on a parallel grid of virtual processor ids of
/// size given by `numProcessors`.
void mlir::affine::mapLoopToProcessorIds(scf::ForOp forOp,
                                         ArrayRef<Value> processorId,
                                         ArrayRef<Value> numProcessors) {
  assert(processorId.size() == numProcessors.size());
  if (processorId.empty())
    return;

  OpBuilder b(forOp);
  Location loc(forOp.getLoc());
  AffineExpr lhs, rhs;
  bindSymbols(forOp.getContext(), lhs, rhs);
  auto mulMap = AffineMap::get(0, 2, lhs * rhs);
  auto addMap = AffineMap::get(0, 2, lhs + rhs);

  // Linearize the processor ids.
  Value linearIndex = processorId.front();
  for (unsigned i = 1, e = processorId.size(); i < e; ++i) {
    auto mulApplyOp = b.create<AffineApplyOp>(
        loc, mulMap, ValueRange{linearIndex, numProcessors[i]});
    linearIndex = b.create<AffineApplyOp>(
        loc, addMap, ValueRange{mulApplyOp, processorId[i]});
  }

  // New lower bound: lb + linearIndex * step.
  auto mulApplyOp = b.create<AffineApplyOp>(
      loc, mulMap, ValueRange{linearIndex, forOp.getStep()});
  Value lb = b.create<AffineApplyOp>(
      loc, addMap, ValueRange{mulApplyOp, forOp.getLowerBound()});
  forOp.setLowerBound(lb);

  // New step: step * product of all numProcessors.
  Value step = forOp.getStep();
  for (auto numProcs : numProcessors)
    step = b.create<AffineApplyOp>(loc, mulMap, ValueRange{numProcs, step});
  forOp.setStep(step);
}
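// Illustration (GPU-style mapping): with processorId = {%bid, %tid} and
// numProcessors = {%gridDim, %blockDim}, the loop becomes
//   for %i = lb + (%bid * %blockDim + %tid) * step to ub
//            step %gridDim * %blockDim * step
// so each virtual processor executes a cyclically distributed slice.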
/// Given a memref region, determine the lowest depth at which transfers can
/// be placed for it, returning the placement block and start positions.
static void findHighestBlockForPlacement(
    const MemRefRegion &region, Block &block, Block::iterator &begin,
    Block::iterator &end, Block **copyPlacementBlock,
    Block::iterator *copyInPlacementStart,
    Block::iterator *copyOutPlacementStart) {
  const auto *cst = region.getConstraints();
  SmallVector<Value, 4> symbols;
  cst->getValues(cst->getNumDimVars(), cst->getNumDimAndSymbolVars(), &symbols);

  SmallVector<Operation *, 4> enclosingAffineOps;
  getEnclosingAffineOps(*block.begin(), &enclosingAffineOps);
  // Walk up the loop parents until we find an IV on which this region is
  // variant.
  auto it = enclosingAffineOps.rbegin();
  AffineForOp lastInvariantFor;
  for (auto e = enclosingAffineOps.rend(); it != e; ++it) {
    Operation *enclosingOp = *it;
    // We can't hoist past the definition of the memref being copied.
    if (!region.memref.getParentRegion()->isAncestor(
            enclosingOp->getParentRegion())) {
      LLVM_DEBUG(
          llvm::dbgs()
          << "memref definition will end up not dominating hoist location\n");
      break;
    }
    auto affineFor = dyn_cast<AffineForOp>(enclosingOp);
    if (!affineFor)
      break;
    if (llvm::is_contained(symbols, affineFor.getInductionVar()))
      break;
    lastInvariantFor = affineFor;
  }

  if (it != enclosingAffineOps.rbegin()) {
    *copyInPlacementStart = Block::iterator(lastInvariantFor.getOperation());
    *copyOutPlacementStart = std::next(*copyInPlacementStart);
    *copyPlacementBlock = lastInvariantFor->getBlock();
  } else {
    *copyInPlacementStart = begin;
    *copyOutPlacementStart = end;
    *copyPlacementBlock = &block;
  }
}
// Returns striding information for a copy/transfer of this region with
// potentially multiple striding levels.
if (bufferShape.size() <= 1)
  return;

int64_t numEltPerStride = 1;
int64_t stride = 1;
for (int d = bufferShape.size() - 1; d >= 1; d--) {
  int64_t dimSize = cast<MemRefType>(region.memref.getType()).getDimSize(d);
  stride *= dimSize;
  numEltPerStride *= bufferShape[d];
  // A stride is needed only if the region has a shorter extent than the
  // memref along the dimension *and* has an extent greater than one along
  // the next major dimension.
  if (bufferShape[d] < dimSize && bufferShape[d - 1] > 1) {
    strideInfos->push_back({stride, numEltPerStride});
  }
}
/// Generates a point-wise copy from/to a non-zero ranked `memref` to/from
/// `fastMemRef` and returns the outermost AffineForOp of the copy nest.
static AffineForOp
generatePointWiseCopy(Location loc, Value memref, Value fastMemRef,
                      ArrayRef<AffineMap> lbMaps, ArrayRef<Value> lbOperands,
                      ArrayRef<AffineMap> ubMaps, ArrayRef<Value> ubOperands,
                      ArrayRef<AffineExpr> fastBufOffsets, bool isCopyOut,
                      OpBuilder b) {
  assert(llvm::all_of(lbMaps, [&](AffineMap lbMap) {
    return lbMap.getNumInputs() == lbOperands.size();
  }));
  assert(llvm::all_of(ubMaps, [&](AffineMap ubMap) {
    return ubMap.getNumInputs() == ubOperands.size();
  }));

  unsigned rank = cast<MemRefType>(memref.getType()).getRank();
  // A copy nest can't be generated for 0-ranked memrefs.
  assert(rank != 0 && "non-zero rank memref expected");
  assert(lbMaps.size() == rank && "wrong number of lb maps");
  assert(ubMaps.size() == rank && "wrong number of ub maps");

  SmallVector<Value, 4> memIndices;
  SmallVector<AffineExpr, 4> fastBufExprs;
  SmallVector<Value, 4> fastBufMapOperands;
  AffineForOp copyNestRoot;
  SmallVector<AffineApplyOp, 4> mayBeDeadApplys;
  for (unsigned d = 0; d < rank; ++d) {
    auto forOp = createCanonicalizedAffineForOp(b, loc, lbOperands, lbMaps[d],
                                                ubOperands, ubMaps[d]);
    if (d == 0)
      copyNestRoot = forOp;

    b = OpBuilder::atBlockTerminator(forOp.getBody());

    auto fastBufOffsetMap =
        AffineMap::get(lbOperands.size(), 0, fastBufOffsets[d]);
    auto offset = b.create<AffineApplyOp>(loc, fastBufOffsetMap, lbOperands);

    // Construct the subscript for the fast memref being copied into/from:
    // x - offset_x.
    fastBufExprs.push_back(b.getAffineDimExpr(2 * d + 1) -
                           b.getAffineDimExpr(2 * d));
    fastBufMapOperands.push_back(offset);
    fastBufMapOperands.push_back(forOp.getInductionVar());
    mayBeDeadApplys.push_back(offset);

    // Subscript for the slow memref being copied.
    memIndices.push_back(forOp.getInductionVar());
  }

  // Drop any dead affine.applys.
  for (auto applyOp : mayBeDeadApplys)
    if (applyOp.use_empty())
      applyOp.erase();

  if (!isCopyOut) {
    // Copy in.
    auto load = b.create<AffineLoadOp>(loc, memref, memIndices);
    b.create<AffineStoreOp>(loc, load, fastMemRef, fastBufMap,
                            fastBufMapOperands);
    return copyNestRoot;
  }

  // Copy out.
  auto load =
      b.create<AffineLoadOp>(loc, fastMemRef, fastBufMap, fastBufMapOperands);
  b.create<AffineStoreOp>(loc, load, memref, memIndices);
  return copyNestRoot;
}
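// Sketch of a generated copy-in nest for a 2-d region whose lower bounds
// along each dimension are %oi and %oj (the fast buffer is indexed relative
// to those offsets):
//   affine.for %i = lb0 to ub0 {
//     affine.for %j = lb1 to ub1 {
//       %v = affine.load %memref[%i, %j]
//       affine.store %v, %fastMemRef[%i - %oi, %j - %oj]
//     }
//   }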
  auto f = begin->getParentOfType<FunctionOpInterface>();
  OpBuilder topBuilder(f.getFunctionBody());
  Value zeroIndex = topBuilder.create<arith::ConstantIndexOp>(f.getLoc(), 0);

  // Is the copy-out point at the end of the block where we are doing explicit
  // copying?
  bool isCopyOutAtEndOfBlock = (end == copyOutPlacementStart);

  // Copies for read regions are inserted at `copyInPlacementStart`, those for
  // write regions at `copyOutPlacementStart`.
  OpBuilder prologue(copyPlacementBlock, copyInPlacementStart);
  OpBuilder epilogue(copyPlacementBlock, copyOutPlacementStart);

  auto loc = region.loc;
  auto memref = region.memref;
  auto memRefType = cast<MemRefType>(memref.getType());

  if (!memRefType.getLayout().isIdentity()) {
    LLVM_DEBUG(llvm::dbgs() << "Non-identity layout map not yet supported\n");
    return failure();
  }

  unsigned rank = memRefType.getRank();
  if (rank == 0) {
    LLVM_DEBUG(llvm::dbgs() << "Only non-zero ranked memrefs supported\n");
    return failure();
  }

  // Compute the extents of the buffer.
  std::optional<int64_t> numElements =
      region.getConstantBoundingSizeAndShape(&fastBufferShape, &lbs);
  if (!numElements) {
    LLVM_DEBUG(llvm::dbgs() << "Non-constant region size not supported\n");
    return failure();
  }

  if (llvm::any_of(lbs, [](AffineMap lb) { return lb.getNumResults() > 1; })) {
    LLVM_DEBUG(llvm::dbgs()
               << "Max lower bound for memref region start not supported\n");
    return failure();
  }

  if (*numElements == 0) {
    LLVM_DEBUG(llvm::dbgs() << "Nothing to copy\n");
    return success();
  }

  SmallVector<AffineMap, 4> lbMaps(rank), ubMaps(rank);
  for (unsigned i = 0; i < rank; ++i) {
    region.getLowerAndUpperBound(i, lbMaps[i], ubMaps[i]);
    if (lbMaps[i].getNumResults() == 0 || ubMaps[i].getNumResults() == 0) {
      LLVM_DEBUG(llvm::dbgs()
                 << "Missing lower or upper bound for region along dimension: "
                 << i << '\n');
      return failure();
    }
  }
  // Compute the offsets (the region's lower bounds) at which the fast buffer
  // is accessed relative to the original memref.
  fastBufOffsets.reserve(rank);
  for (unsigned d = 0; d < rank; d++) {
    assert(lbs[d].getNumSymbols() == cst->getNumCols() - rank - 1 &&
           "incorrect bound size");

    // If the lower bound is just a constant, use it directly.
    if (lbs[d].isSingleConstant()) {
      auto indexVal = lbs[d].getSingleConstantResult();
      if (indexVal == 0) {
        memIndices.push_back(zeroIndex);
      } else {
        memIndices.push_back(
            top.create<arith::ConstantIndexOp>(loc, indexVal).getResult());
      }
    } else {
      // The coordinate of the start location is the lower bound along the
      // corresponding dimension of the region. Remap the symbols of lbs[d] to
      // the dims of the affine.apply created below.
      SmallVector<AffineExpr, 4> symReplacements(lbs[d].getNumSymbols());
      for (unsigned i = 0, e = lbs[d].getNumSymbols(); i < e; ++i)
        symReplacements[i] = top.getAffineDimExpr(i);
      lbs[d] = lbs[d].replaceDimsAndSymbols(
          {}, symReplacements, lbs[d].getNumSymbols(),
          /*numResultSyms=*/0);
      memIndices.push_back(b.create<AffineApplyOp>(loc, lbs[d], regionSymbols));
    }

    // The fast buffer is copied into at location zero; addressing is relative.
    bufIndices.push_back(zeroIndex);

    // Record the offsets since they are needed to remap the memory accesses
    // of the original memref further below.
    fastBufOffsets.push_back(lbs[d].getResult(0));
  }
  // Check if a buffer was already created for this memref.
  bool existingBuf = fastBufferMap.count(memref) > 0;
  if (!existingBuf) {
    auto fastMemRefType =
        MemRefType::get(fastBufferShape, memRefType.getElementType(),
                        b.getMultiDimIdentityMap(rank),
                        copyOptions.fastMemorySpace);

    // Create the fast memory space buffer just before the 'affine.for'
    // operation, and record it.
    fastMemRef =
        prologue.create<memref::AllocOp>(loc, fastMemRefType).getResult();
    fastBufferMap[memref] = fastMemRef;
    // fastMemRefType is a constant shaped memref.
    auto maySizeInBytes = getIntOrFloatMemRefSizeInBytes(fastMemRefType);
    *sizeInBytes = maySizeInBytes.value_or(0);
    LLVM_DEBUG(emitRemarkForBlock(*block)
               << "Creating fast buffer of type " << fastMemRefType << '\n');
  } else {
    // Reuse the one already created.
    fastMemRef = fastBufferMap[memref];
  }

  auto numElementsSSA = top.create<arith::ConstantIndexOp>(loc, *numElements);

  Value dmaStride;
  Value numEltPerDmaStride;
  if (copyOptions.generateDma) {
    SmallVector<StrideInfo, 4> dmaStrideInfos;
    getMultiLevelStrides(region, fastBufferShape, &dmaStrideInfos);

    // TODO: use all stride levels once DmaStartOp is extended for multi-level
    // strides.
    if (dmaStrideInfos.size() > 1) {
      LLVM_DEBUG(llvm::dbgs() << "Only up to one level of stride supported\n");
      return failure();
    }

    if (!dmaStrideInfos.empty()) {
      dmaStride =
          top.create<arith::ConstantIndexOp>(loc, dmaStrideInfos[0].stride);
      numEltPerDmaStride = top.create<arith::ConstantIndexOp>(
          loc, dmaStrideInfos[0].numEltPerStride);
    }
  }
  // Record the last operation where we want the memref replacement to end.
  auto postDomFilter = std::prev(end);

  if (!copyOptions.generateDma) {
    auto copyNest =
        generatePointWiseCopy(loc, memref, fastMemRef, lbMaps,
                              /*lbOperands=*/regionSymbols, ubMaps,
                              /*ubOperands=*/regionSymbols, fastBufOffsets,
                              /*isCopyOut=*/region.isWrite(), b);

    // Record this so that we can skip it from yet another copy.
    copyNests.insert(copyNest);

    // Since new ops are being appended (for copy-outs), adjust the end to
    // mark the end of the block range being processed if necessary.
    if (region.isWrite() && isCopyOutAtEndOfBlock)
      *nEnd = Block::iterator(copyNest.getOperation());
  } else {
    // DMA generation: create a tag (single-element 1-d memref) for the DMA.
    auto tagMemRef = prologue.create<memref::AllocOp>(loc, tagMemRefType);
    if (!region.isWrite()) {
      // Non-blocking DMA read from the original buffer to the fast buffer.
      b.create<AffineDmaStartOp>(loc, memref, memAffineMap, memIndices,
                                 fastMemRef, bufAffineMap, bufIndices,
                                 tagMemRef, tagAffineMap, tagIndices,
                                 numElementsSSA, dmaStride, numEltPerDmaStride);
    } else {
      // Non-blocking DMA write from the fast buffer to the original memref.
      auto op = b.create<AffineDmaStartOp>(
          loc, fastMemRef, bufAffineMap, bufIndices, memref, memAffineMap,
          memIndices, tagMemRef, tagAffineMap, tagIndices, numElementsSSA,
          dmaStride, numEltPerDmaStride);
      // Adjust the end to mark the end of the block range being processed.
      if (isCopyOutAtEndOfBlock)
        *nEnd = Block::iterator(op.getOperation());
    }

    // Generate the dealloc for the tag.
    auto tagDeallocOp = epilogue.create<memref::DeallocOp>(loc, tagMemRef);
    if (*nEnd == end && isCopyOutAtEndOfBlock)
      *nEnd = Block::iterator(tagDeallocOp.getOperation());
  }

  // Generate the dealloc for the buffer.
  if (!existingBuf) {
    auto bufDeallocOp = epilogue.create<memref::DeallocOp>(loc, fastMemRef);
    if (!copyOptions.generateDma && *nEnd == end && isCopyOutAtEndOfBlock)
      *nEnd = Block::iterator(bufDeallocOp.getOperation());
  }
  // Replace all uses of the old memref with the faster one while remapping
  // the access indices (subtracting out the lower bound offsets along each
  // dimension). E.g. load %A[%i, %j] becomes load %Abuf[%i - %iT, %j - %jT].
  SmallVector<AffineExpr, 4> remapExprs;
  remapExprs.reserve(rank);
  for (unsigned i = 0; i < rank; i++) {
    // The starting operands of indexRemap are the region symbols; those
    // corresponding to the memref's original indices follow.
    auto dimExpr = b.getAffineDimExpr(regionSymbols.size() + i);
    remapExprs.push_back(dimExpr - fastBufOffsets[i]);
  }
  auto indexRemap = AffineMap::get(regionSymbols.size() + rank, 0, remapExprs,
                                   b.getContext());

  // Record 'begin' since it may be invalidated by the memref replacement.
  Block::iterator prevOfBegin;
  bool isBeginAtStartOfBlock = (begin == block->begin());
  if (!isBeginAtStartOfBlock)
    prevOfBegin = std::prev(begin);

  *nBegin = isBeginAtStartOfBlock ? block->begin() : std::next(prevOfBegin);

  return success();
}
/// Construct the memref region to just include the entire memref. Returns
/// false for dynamically shaped memrefs. `numParamLoopIVs` is the number of
/// enclosing loop IVs of `op` (starting from the outermost) that the region
/// is parametric on.
static bool getFullMemRefAsRegion(Operation *op, unsigned numParamLoopIVs,
                                  MemRefRegion *region) {
  unsigned rank;
  if (auto loadOp = dyn_cast<AffineLoadOp>(op)) {
    rank = loadOp.getMemRefType().getRank();
    region->memref = loadOp.getMemRef();
  } else if (auto storeOp = dyn_cast<AffineStoreOp>(op)) {
    rank = storeOp.getMemRefType().getRank();
    region->memref = storeOp.getMemRef();
  } else {
    assert(false && "expected load or store op");
    return false;
  }
  auto memRefType = cast<MemRefType>(region->memref.getType());
  if (!memRefType.hasStaticShape())
    return false;

  auto *regionCst = region->getConstraints();

  // Just get the first numParamLoopIVs IVs, which the memref region is
  // parametric on.
  SmallVector<AffineForOp, 4> ivs;
  getAffineForIVs(*op, &ivs);
  ivs.resize(numParamLoopIVs);
  SmallVector<Value, 4> symbols;
  extractForInductionVars(ivs, &symbols);
  regionCst->setValues(rank, rank + numParamLoopIVs, symbols);

  // The memref dim sizes provide the bounds.
  for (unsigned d = 0; d < rank; d++) {
    auto dimSize = memRefType.getDimSize(d);
    assert(dimSize > 0 && "filtered dynamic shapes above");
    regionCst->addBound(BoundType::LB, d, 0);
    regionCst->addBound(BoundType::UB, d, dimSize - 1);
  }
  return true;
}
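// Illustration: for a memref<128x64xf32>, the constraint system above becomes
// 0 <= d0 <= 127 and 0 <= d1 <= 63, i.e. the whole memref is treated as the
// region (a conservative over-approximation).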
/// Performs explicit copying for the contiguous sequence of operations in the
/// block iterator range [`begin`, `end`).
LogicalResult mlir::affine::affineDataCopyGenerate(
    Block::iterator begin, Block::iterator end,
    const AffineCopyOptions &copyOptions, std::optional<Value> filterMemRef,
    DenseSet<Operation *> &copyNests) {
  if (begin == end)
    return success();

  assert(begin->getBlock() == std::prev(end)->getBlock() &&
         "Inconsistent block begin/end args");
  assert(end != end->getBlock()->end() && "end can't be the block terminator");

  Block *block = begin->getBlock();

  // Copies will be generated for this depth, i.e., symbolic in all loops
  // surrounding this block range.
  unsigned copyDepth = getNestingDepth(&*begin);

  LLVM_DEBUG(llvm::dbgs() << "Generating copies at depth " << copyDepth
                          << "\n");
  LLVM_DEBUG(llvm::dbgs() << "from begin: " << *begin << "\n");
  LLVM_DEBUG(llvm::dbgs() << "to inclusive end: " << *std::prev(end) << "\n");

  // List of memory regions to copy for. A map vector is used to guarantee a
  // deterministic iteration order.
  SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4> readRegions;
  SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4> writeRegions;
  // Walk this range of operations to gather all memory regions.
  block->walk(begin, end, [&](Operation *opInst) {
    Value memref;
    MemRefType memrefType;
    // Gather regions to allocate to buffers in the faster memory space.
    if (auto loadOp = dyn_cast<AffineLoadOp>(opInst)) {
      memref = loadOp.getMemRef();
      memrefType = loadOp.getMemRefType();
    } else if (auto storeOp = dyn_cast<AffineStoreOp>(opInst)) {
      memref = storeOp.getMemRef();
      memrefType = storeOp.getMemRefType();
    }
    // Not an affine.load/store op.
    if (!memref)
      return;

    if ((filterMemRef.has_value() && filterMemRef != memref) ||
        (isa_and_nonnull<IntegerAttr>(memrefType.getMemorySpace()) &&
         memrefType.getMemorySpaceAsInt() != copyOptions.slowMemorySpace))
      return;

    if (!memref.getParentRegion()->isAncestor(block->getParent())) {
      LLVM_DEBUG(llvm::dbgs() << "memref definition is inside of the depth at "
                                 "which copy-in/copy-out would happen\n");
      return;
    }

    // Compute the MemRefRegion accessed.
    auto region = std::make_unique<MemRefRegion>(opInst->getLoc());
    if (failed(region->compute(opInst, copyDepth, /*sliceState=*/nullptr,
                               /*addMemRefDimBounds=*/false))) {
      LLVM_DEBUG(llvm::dbgs()
                 << "Error obtaining memory region: semi-affine maps?\n");
      LLVM_DEBUG(llvm::dbgs() << "over-approximating to the entire memref\n");
      if (!getFullMemRefAsRegion(opInst, copyDepth, region.get())) {
        LLVM_DEBUG(
            opInst->emitError("non-constant memref sizes not yet supported"));
        error = true;
        return;
      }
    }

    // Add to the appropriate region if it's not already in it, or take a
    // bounding box union with the existing one if it is.
    auto updateRegion =
        [&](const SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4>
                &targetRegions) {
          const auto *const it = targetRegions.find(region->memref);
          if (it == targetRegions.end())
            return false;

          // Perform a union with the existing region.
          if (failed(it->second->unionBoundingBox(*region))) {
            LLVM_DEBUG(llvm::dbgs()
                       << "Memory region bounding box failed; "
                          "over-approximating to the entire memref\n");
            // If the union fails, over-approximate.
            if (!getFullMemRefAsRegion(opInst, copyDepth, region.get())) {
              LLVM_DEBUG(opInst->emitError(
                  "non-constant memref sizes not yet supported"));
              error = true;
              return true;
            }
            it->second->getConstraints()->clearAndCopyFrom(
                *region->getConstraints());
          } else {
            // The union was computed and stored in 'it->second'; copy it to
            // 'region'.
            region->getConstraints()->clearAndCopyFrom(
                *it->second->getConstraints());
          }
          return true;
        };

    bool existsInRead = updateRegion(readRegions);
    if (error)
      return;
    bool existsInWrite = updateRegion(writeRegions);
    if (error)
      return;

    // Finally add it to the region list.
    if (region->isWrite() && !existsInWrite) {
      writeRegions[region->memref] = std::move(region);
    } else if (!region->isWrite() && !existsInRead) {
      readRegions[region->memref] = std::move(region);
    }
  });

  if (error) {
    LLVM_DEBUG(begin->emitError(
        "copy generation failed for one or more memref's in this block\n"));
    return failure();
  }
  uint64_t totalCopyBuffersSizeInBytes = 0;
  bool ret = true;
  auto processRegions =
      [&](const SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4>
              &regions) {
        for (const auto &regionEntry : regions) {
          // For each region, hoist the copy in/out past all hoistable
          // 'affine.for's.
          Block::iterator copyInPlacementStart, copyOutPlacementStart;
          Block *copyPlacementBlock;
          findHighestBlockForPlacement(
              *regionEntry.second, *block, begin, end, &copyPlacementBlock,
              &copyInPlacementStart, &copyOutPlacementStart);

          uint64_t sizeInBytes;
          Block::iterator nBegin, nEnd;
          LogicalResult iRet = generateCopy(
              *regionEntry.second, block, begin, end, copyPlacementBlock,
              copyInPlacementStart, copyOutPlacementStart, copyOptions,
              fastBufferMap, copyNests, &sizeInBytes, &nBegin, &nEnd);
          if (succeeded(iRet)) {
            // begin/end could have been invalidated and need an update.
            begin = nBegin;
            end = nEnd;
            totalCopyBuffersSizeInBytes += sizeInBytes;
          }
          ret = ret & succeeded(iRet);
        }
      };
  processRegions(readRegions);
  processRegions(writeRegions);

  if (!ret) {
    LLVM_DEBUG(begin->emitError(
        "copy generation failed for one or more memref's in this block\n"));
    return failure();
  }

  // For a debug build, print the total size of the fast buffers allocated.
  AffineForOp forOp;
  if (llvm::DebugFlag && (forOp = dyn_cast<AffineForOp>(&*begin))) {
    LLVM_DEBUG(forOp.emitRemark()
               << llvm::divideCeil(totalCopyBuffersSizeInBytes, 1024)
               << " KiB of copy buffers in fast memory space for this block");
  }

  if (totalCopyBuffersSizeInBytes > copyOptions.fastMemCapacityBytes) {
    block->getParentOp()->emitWarning(
        "total size of all copy buffers' for this block exceeds fast memory "
        "capacity");
  }

  return success();
}

LogicalResult mlir::affine::affineDataCopyGenerate(
    AffineForOp forOp, const AffineCopyOptions &copyOptions,
    std::optional<Value> filterMemRef, DenseSet<Operation *> &copyNests) {
  return affineDataCopyGenerate(forOp.getBody()->begin(),
                                std::prev(forOp.getBody()->end()), copyOptions,
                                filterMemRef, copyNests);
}
  Block *block = analyzedOp->getBlock();
  auto begin = analyzedOp->getIterator();
  auto end = std::next(begin);
  DenseMap<Value, Value> fastBufferMap;
  DenseSet<Operation *> copyNests;

  auto err = generateCopy(memrefRegion, block, begin, end, block, begin, end,
                          copyOptions, fastBufferMap, copyNests,
                          &result.sizeInBytes, &begin, &end);
  if (failed(err))
    return err;

  const auto &en = fastBufferMap.find(memrefRegion.memref);
  // In some cases (e.g. empty loops), no copy generation would have happened.
  if (en == fastBufferMap.end())
    return failure();
  result.alloc = en->second.getDefiningOp();
  assert(result.alloc && "fast buffer expected to be locally allocated");
  assert(copyNests.size() <= 1 && "At most one copy nest is expected.");
  result.copyNest = copyNests.empty() ? nullptr : *copyNests.begin();
  return success();
}
/// Gathers all AffineForOps in 'block' at 'currLoopDepth' in 'depthToLoops'.
static void
gatherLoopsInBlock(Block *block, unsigned currLoopDepth,
                   std::vector<SmallVector<AffineForOp, 2>> &depthToLoops) {
  // Add a new empty level to the output if it doesn't exist already.
  assert(currLoopDepth <= depthToLoops.size() && "Unexpected currLoopDepth");
  if (currLoopDepth == depthToLoops.size())
    depthToLoops.emplace_back();

  for (auto &op : *block) {
    if (auto forOp = dyn_cast<AffineForOp>(op)) {
      depthToLoops[currLoopDepth].push_back(forOp);
      gatherLoopsInBlock(forOp.getBody(), currLoopDepth + 1, depthToLoops);
    }
  }
}

/// Gathers all AffineForOps in 'func.func' grouped by loop depth.
void mlir::affine::gatherLoops(
    func::FuncOp func, std::vector<SmallVector<AffineForOp, 2>> &depthToLoops) {
  for (auto &block : func)
    gatherLoopsInBlock(&block, /*currLoopDepth=*/0, depthToLoops);

  // Remove the last loop level from the output since it's empty.
  if (!depthToLoops.empty()) {
    assert(depthToLoops.back().empty() && "Last loop level is not empty?");
    depthToLoops.pop_back();
  }
}
return b.create<AffineForOp>(loc, lowerOperands, lbMap, upperOperands, ubMap,
                             step);
/// Creates an AffineIfOp that encodes the conditional to choose between the
/// constant trip count (full tile) version and the unknown trip count version
/// of this nest of loops.
static AffineIfOp createSeparationCondition(MutableArrayRef<AffineForOp> loops,
                                            OpBuilder b) {
  auto *context = loops[0].getContext();

  FlatAffineValueConstraints cst;
  SmallVector<Operation *, 8> ops;
  llvm::append_range(ops, loops);
  (void)getIndexSet(ops, &cst);

  // Remove constraints that are independent of these loop IVs.
  cst.removeIndependentConstraints(/*pos=*/0, /*num=*/loops.size());

  // Construct the constraint set representing the guard for full tiles: the
  // lower bound (resp. upper bound) corresponding to the full tile should be
  // larger (resp. smaller) than any other lower (resp. upper) bound.
  SmallVector<int64_t, 8> fullTileLb, fullTileUb;
  for (auto loop : loops) {
    // TODO: Non-unit stride is not an issue to generalize to.
    assert(loop.getStepAsInt() == 1 && "point loop step expected to be one");
    // Mark everything as symbols for the purpose of finding a constant diff
    // pair.
    cst.setDimSymbolSeparation(/*newSymbolCount=*/cst.getNumDimAndSymbolVars() -
                               1);
    unsigned fullTileLbPos, fullTileUbPos;
    if (!((IntegerRelation)cst)
             .getConstantBoundOnDimSize(0, /*lb=*/nullptr,
                                        /*boundFloorDivisor=*/nullptr,
                                        /*ub=*/nullptr, &fullTileLbPos,
                                        &fullTileUbPos)) {
      LLVM_DEBUG(llvm::dbgs() << "Can't get constant diff pair for a loop\n");
      return nullptr;
    }

    SmallVector<unsigned, 4> lbIndices, ubIndices;
    cst.getLowerAndUpperBoundIndices(/*pos=*/0, &lbIndices, &ubIndices);

    auto fLb = cst.getInequality(fullTileLbPos);
    auto fUb = cst.getInequality(fullTileUbPos);
    fullTileLb.assign(fLb.begin(), fLb.end());
    fullTileUb.assign(fUb.begin(), fUb.end());

    // The full tile lower bound should be >= any other lower bound.
    for (auto lbIndex : lbIndices)
      for (unsigned i = 0, e = cst.getNumCols(); i < e; ++i)
        cst.atIneq(lbIndex, i) = fullTileLb[i] - cst.atIneq(lbIndex, i);

    // The full tile upper bound should be <= any other upper bound.
    for (auto ubIndex : ubIndices)
      for (unsigned i = 0, e = cst.getNumCols(); i < e; ++i)
        cst.atIneq(ubIndex, i) -= fullTileUb[i];

    cst.removeVar(0);
  }

  // Simplify the guard set, turn everything back into dims, and build the
  // integer set.
  cst.removeTrivialRedundancy();
  cst.setDimSymbolSeparation(0);
  IntegerSet ifCondSet = cst.getAsIntegerSet(context);
  // ifCondSet can be null if cst was empty, e.g. if all loops in the nest
  // have constant trip counts.
  if (!ifCondSet)
    return nullptr;

  SmallVector<Value, 4> setOperands;
  cst.getValues(0, cst.getNumDimAndSymbolVars(), &setOperands);
  canonicalizeSetAndOperands(&ifCondSet, &setOperands);
  return b.create<AffineIfOp>(loops[0].getLoc(), ifCondSet, setOperands,
                              /*withElseRegion=*/true);
}
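// Illustration: for an intra-tile loop %ii running from %i to
// min(%i + 32, %N), the separation condition is essentially %i + 32 <= %N;
// the "then" region then holds the constant trip count (full tile) version
// and the "else" region the original (partial tile) nest.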
/// Create the full tile loop nest (along with its body).
static LogicalResult
createFullTiles(MutableArrayRef<AffineForOp> inputNest,
                SmallVectorImpl<AffineForOp> &fullTileLoops, OpBuilder b) {
  fullTileLoops.reserve(inputNest.size());

  // For each loop in the original nest, identify a lower/upper bound pair
  // such that their difference is a constant.
  FlatAffineValueConstraints cst;
  for (auto loop : inputNest) {
    // TODO: straightforward to generalize to a non-unit stride.
    if (loop.getStepAsInt() != 1) {
      LLVM_DEBUG(llvm::dbgs()
                 << "[tile separation] non-unit stride not implemented\n");
      return failure();
    }
    SmallVector<Operation *, 1> loopOp{loop.getOperation()};
    (void)getIndexSet(loopOp, &cst);
    // Mark everything other than this loop IV as a symbol to get a <lb, ub>
    // pair with a constant difference.
    cst.setDimSymbolSeparation(cst.getNumDimAndSymbolVars() - 1);
    unsigned lbPos, ubPos;
    if (!((IntegerRelation)cst)
             .getConstantBoundOnDimSize(/*pos=*/0, /*lb=*/nullptr,
                                        /*boundFloorDivisor=*/nullptr,
                                        /*ub=*/nullptr, &lbPos, &ubPos) ||
        lbPos == ubPos) {
      LLVM_DEBUG(llvm::dbgs() << "[tile separation] Can't get constant diff / "
                                 "equalities not yet handled\n");
      return failure();
    }

    // Set all variables as dimensions uniformly since some of those marked as
    // symbols above could be outer loop IVs (corresponding tile space IVs).
    cst.setDimSymbolSeparation(/*newSymbolCount=*/0);

    AffineValueMap lbVmap, ubVmap;
    cst.getIneqAsAffineValueMap(/*pos=*/0, lbPos, lbVmap, b.getContext());
    cst.getIneqAsAffineValueMap(/*pos=*/0, ubPos, ubVmap, b.getContext());
    AffineForOp fullTileLoop = createCanonicalizedAffineForOp(
        b, loop.getLoc(), lbVmap.getOperands(), lbVmap.getAffineMap(),
        ubVmap.getOperands(), ubVmap.getAffineMap());
    b = OpBuilder::atBlockTerminator(fullTileLoop.getBody());
    fullTileLoops.push_back(fullTileLoop);
  }

  // Add the body of the full tile loop nest, remapping the original IVs.
  IRMapping operandMap;
  for (const auto &loopEn : llvm::enumerate(inputNest))
    operandMap.map(loopEn.value().getInductionVar(),
                   fullTileLoops[loopEn.index()].getInductionVar());
  b = OpBuilder::atBlockTerminator(fullTileLoops.back().getBody());
  for (auto &op : inputNest.back().getBody()->without_terminator())
    b.clone(op, operandMap);
  return success();
}
  if (inputNest.empty())
    return success();
  // Check whether the supplied loops are contiguously nested.
  auto firstLoop = inputNest[0];
  auto prevLoop = firstLoop;
  for (auto loop : inputNest.drop_front(1)) {
    assert(loop->getParentOp() == prevLoop && "input not contiguously nested");
    prevLoop = loop;
  }

  // Create the full tile loop nest.
  SmallVector<AffineForOp, 4> fullTileLoops;
  OpBuilder b(firstLoop);
  if (failed(createFullTiles(inputNest, fullTileLoops, b))) {
    if (!fullTileLoops.empty())
      fullTileLoops.front().erase();
    return failure();
  }

  // Create and insert the version selection right before the root of the
  // nest.
  b = OpBuilder(firstLoop);
  AffineIfOp ifOp = createSeparationCondition(inputNest, b);
  if (!ifOp) {
    fullTileLoops.front().erase();
    LLVM_DEBUG(llvm::dbgs() << "All tiles are full tiles, or failure creating "
                               "separation condition\n");
    return failure();
  }

  // Move the full tile into the then block.
  Block *thenBlock = ifOp.getThenBlock();
  AffineForOp outermostFullTileLoop = fullTileLoops[0];
  thenBlock->getOperations().splice(
      std::prev(thenBlock->end()),
      outermostFullTileLoop->getBlock()->getOperations(),
      Block::iterator(outermostFullTileLoop));

  // Move the partial tile (the original loop nest) into the else block.
  Block *elseBlock = ifOp.getElseBlock();
  elseBlock->getOperations().splice(std::prev(elseBlock->end()),
                                    firstLoop->getBlock()->getOperations(),
                                    Block::iterator(firstLoop));

  if (fullTileNest)
    *fullTileNest = std::move(fullTileLoops);

  return success();
  LogicalResult result(failure());
  SmallVector<AffineForOp> loops;
  getPerfectlyNestedLoops(loops, op);
  if (loops.size() <= 1)
    return success();

  // 1. For each loop, find above which parent loop its bound operands are
  // defined.
  SmallVector<unsigned> operandsDefinedAbove(loops.size());
  for (unsigned i = 0, e = loops.size(); i < e; ++i) {
    operandsDefinedAbove[i] = i;
    for (unsigned j = 0; j < i; ++j) {
      if (areValuesDefinedAbove(loops[i].getOperands(), loops[j].getRegion())) {
        operandsDefinedAbove[i] = j;
        break;
      }
    }
  }

  // 2. Identify bands of loops whose operands are all defined above the first
  // loop in the band. Traverse the nest bottom-up so that modifications don't
  // invalidate the inner loops.
  for (unsigned end = loops.size(); end > 0; --end) {
    unsigned start = 0;
    for (; start < end - 1; ++start) {
      auto maxPos =
          *std::max_element(std::next(operandsDefinedAbove.begin(), start),
                            std::next(operandsDefinedAbove.begin(), end));
      if (maxPos > start)
        continue;
      assert(maxPos == start &&
             "expected loop bounds to be known at the start of the band");
      auto band = llvm::MutableArrayRef(loops.data() + start, end - start);
      if (succeeded(coalesceLoops(band)))
        result = success();
      break;
    }
    // If a band was found and transformed, keep looking at the loops above
    // the outermost transformed loop.
    if (start != end - 1)
      end = start + 1;
  }

  return result;
/// Counts the number of loops surrounding `operand` such that `operand` could
/// be hoisted above.
while (auto loopOp = currentOp->getParentOfType<LoopLikeOpInterface>()) {
  if (!loopOp.isDefinedOutsideOfLoop(operand.get()))
    break;
  currentOp = loopOp;
  count++;
}
return count;
static Value max(ImplicitLocOpBuilder &builder, Value value, Value bound)
Base type for affine expression.
AffineExpr floorDiv(uint64_t v) const
A multi-dimensional affine map Affine map's are immutable like Type's, and they are uniqued.
static AffineMap get(MLIRContext *context)
Returns a zero result affine map with no dimensions or symbols: () -> ().
unsigned getNumSymbols() const
unsigned getNumDims() const
ArrayRef< AffineExpr > getResults() const
unsigned getNumResults() const
unsigned getNumInputs() const
AffineExpr getResult(unsigned idx) const
Block represents an ordered list of Operations.
OpListType::iterator iterator
Region * getParent() const
Provide a 'getParent' method for ilist_node_with_parent methods.
RetT walk(FnT &&callback)
Walk all nested operations, blocks (including this block) or regions, depending on the type of callba...
Operation * getTerminator()
Get the terminator operation of this block.
OpListType & getOperations()
Operation * getParentOp()
Returns the closest surrounding operation that contains this block.
AffineMap getSingleDimShiftAffineMap(int64_t shift)
Returns a map that shifts its (single) input dimension by 'shift'.
AffineMap getShiftedAffineMap(AffineMap map, int64_t shift)
Returns an affine map that is a translation (shift) of all result expressions in 'map' by 'shift'.
AffineMap getDimIdentityMap()
AffineMap getMultiDimIdentityMap(unsigned rank)
AffineExpr getAffineSymbolExpr(unsigned position)
AffineExpr getAffineConstantExpr(int64_t constant)
AffineExpr getAffineDimExpr(unsigned position)
MLIRContext * getContext() const
IntegerSet getAsIntegerSet(MLIRContext *context) const
Returns the constraint system as an integer set.
void getValues(unsigned start, unsigned end, SmallVectorImpl< Value > *values) const
Returns the Values associated with variables in range [start, end).
This class allows control over how the GreedyPatternRewriteDriver works.
GreedyRewriteConfig & setStrictness(GreedyRewriteStrictness mode)
This is a utility class for mapping one set of IR entities to another.
auto lookup(T from) const
Lookup a mapped value within the map.
void map(Value from, Value to)
Inserts a new mapping for 'from' to 'to'.
IRValueT get() const
Return the current value being used by this operand.
This class coordinates rewriting a piece of IR outside of a pattern rewrite, providing a way to keep ...
This class represents a diagnostic that is inflight and set to be reported.
An integer set representing a conjunction of one or more affine equalities and inequalities.
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
This class helps build Operations.
Operation * clone(Operation &op, IRMapping &mapper)
Creates a deep copy of the specified operation, remapping any operands that use values outside of the...
void setInsertionPointToStart(Block *block)
Sets the insertion point to the start of the specified block.
static OpBuilder atBlockTerminator(Block *block, Listener *listener=nullptr)
Create a builder and set the insertion point to before the block terminator.
Operation * create(const OperationState &state)
Creates an operation given the fields represented as an OperationState.
void setInsertionPointAfter(Operation *op)
Sets the insertion point to the node after the specified operation, which will cause subsequent inser...
This class represents an operand of an operation.
Operation * getOperation()
Inherit getOperation from OpState.
This class implements the operand iterators for the Operation class.
Operation is the basic unit of execution within MLIR.
Operation * clone(IRMapping &mapper, CloneOptions options=CloneOptions::all())
Create a deep copy of this operation, remapping any operands that use values outside of the operation...
Location getLoc()
The source location the operation was defined or derived from.
InFlightDiagnostic emitError(const Twine &message={})
Emit an error about fatal conditions with this operation, reporting up to any diagnostic handlers tha...
Block * getBlock()
Returns the operation block that contains this operation.
OpTy getParentOfType()
Return the closest surrounding parent operation that is of type 'OpTy'.
void replaceAllUsesWith(ValuesT &&values)
Replace all uses of results of this operation with the provided 'values'.
void setOperands(ValueRange operands)
Replace the current operands of this operation with the ones provided in 'operands'.
Region * getParentRegion()
Returns the region to which the instruction belongs.
InFlightDiagnostic emitRemark(const Twine &message={})
Emit a remark about this operation, reporting up to any diagnostic handlers that may be listening.
bool isAncestor(Region *other)
Return true if this region is ancestor of the other region.
ParentT getParentOfType()
Find the first parent operation of the given type, or nullptr if there is no ancestor operation.
This class provides an abstraction over the different types of ranges over Values.
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
bool use_empty() const
Returns true if this value has no uses.
Type getType() const
Return the type of this value.
Operation * getDefiningOp() const
If this value is the result of an operation, return the operation that defines it.
Region * getParentRegion()
Return the Region in which this Value is defined.
static WalkResult advance()
AffineBound represents a lower or upper bound in the for operation.
Value getOperand(unsigned idx)
operand_range getOperands()
unsigned getNumOperands()
AffineDmaStartOp starts a non-blocking DMA operation that transfers data from a source memref to a de...
AffineDmaWaitOp blocks until the completion of a DMA operation associated with the tag element 'tag[i...
An AffineValueMap is an affine map plus its ML value operands and results for analysis purposes.
ArrayRef< Value > getOperands() const
AffineMap getAffineMap() const
FlatAffineValueConstraints is an extension of FlatLinearValueConstraints with helper functions for Af...
Specialization of arith.constant op that returns an integer of index type.
Operation * getOwner() const
Return the owner of this operand.
An IntegerRelation represents the set of points from a PresburgerSpace that satisfy a list of affine ...
void removeIndependentConstraints(unsigned pos, unsigned num)
Removes constraints that are independent of (i.e., do not have a coefficient) variables in the range ...
void removeTrivialRedundancy()
Removes duplicate constraints, trivially true constraints, and constraints that can be detected as re...
ArrayRef< DynamicAPInt > getInequality(unsigned idx) const
DynamicAPInt atIneq(unsigned i, unsigned j) const
Returns the value at the specified inequality row and column.
bool isHyperRectangular(unsigned pos, unsigned num) const
Returns true if the set can be trivially detected as being hyper-rectangular on the specified contigu...
unsigned getNumVars() const
void setDimSymbolSeparation(unsigned newSymbolCount)
Changes the partition between dimensions and symbols.
unsigned getNumDimAndSymbolVars() const
unsigned getNumCols() const
Returns the number of columns in the constraint system.
void getLowerAndUpperBoundIndices(unsigned pos, SmallVectorImpl< unsigned > *lbIndices, SmallVectorImpl< unsigned > *ubIndices, SmallVectorImpl< unsigned > *eqIndices=nullptr, unsigned offset=0, unsigned num=0) const
Gather positions of all lower and upper bounds of the variable at pos, and optionally any equalities ...
void removeVar(VarKind kind, unsigned pos)
Removes variables of the specified kind with the specified pos (or within the specified range) from t...
bool isParallelLoop(Operation &op)
std::optional< uint64_t > getConstantTripCount(AffineForOp forOp)
Returns the trip count of the loop if it's a constant, std::nullopt otherwise.
void getDependenceComponents(AffineForOp forOp, unsigned maxLoopDepth, std::vector< SmallVector< DependenceComponent, 2 >> *depCompsVec)
Returns in 'depCompsVec', dependence components for dependences between all load and store ops in loo...
LogicalResult coalesceLoops(MutableArrayRef< AffineForOp > loops)
Replace a perfect nest of "for" loops with a single linearized loop.
void fullyComposeAffineMapAndOperands(AffineMap *map, SmallVectorImpl< Value > *operands)
Given an affine map map and its input operands, this method composes into map, maps of AffineApplyOps...
LogicalResult loopUnrollFull(AffineForOp forOp)
Unrolls this for operation completely if the trip count is known to be constant.
LogicalResult promoteIfSingleIteration(AffineForOp forOp)
Promotes the loop body of a AffineForOp to its containing block if the loop was known to have a singl...
LogicalResult affineDataCopyGenerate(Block::iterator begin, Block::iterator end, const AffineCopyOptions ©Options, std::optional< Value > filterMemRef, DenseSet< Operation * > ©Nests)
Performs explicit copying for the contiguous sequence of operations in the block iterator range [‘beg...
LogicalResult loopUnrollJamUpToFactor(AffineForOp forOp, uint64_t unrollJamFactor)
Unrolls and jams this loop by the specified factor or by the trip count (if constant),...
void extractForInductionVars(ArrayRef< AffineForOp > forInsts, SmallVectorImpl< Value > *ivs)
Extracts the induction variables from a list of AffineForOps and places them in the output argument i...
LogicalResult loopUnrollByFactor(AffineForOp forOp, uint64_t unrollFactor, function_ref< void(unsigned, Operation *, OpBuilder)> annotateFn=nullptr, bool cleanUpUnroll=false)
Unrolls this for operation by the specified unroll factor.
void getEnclosingAffineOps(Operation &op, SmallVectorImpl< Operation * > *ops)
Populates 'ops' with affine operations enclosing op ordered from outermost to innermost while stoppin...
void gatherLoops(func::FuncOp func, std::vector< SmallVector< AffineForOp, 2 >> &depthToLoops)
Gathers all AffineForOps in 'func.func' grouped by loop depth.
bool LLVM_ATTRIBUTE_UNUSED isPerfectlyNested(ArrayRef< AffineForOp > loops)
Returns true if loops is a perfectly nested loop nest, where loops appear in it from outermost to inn...
LogicalResult getIndexSet(MutableArrayRef< Operation * > ops, FlatAffineValueConstraints *domain)
Builds a system of constraints with dimensional variables corresponding to the loop IVs of the forOps...
AffineForOp createCanonicalizedAffineForOp(OpBuilder b, Location loc, ValueRange lbOperands, AffineMap lbMap, ValueRange ubOperands, AffineMap ubMap, int64_t step=1)
Creates an AffineForOp while ensuring that the lower and upper bounds are canonicalized, i.e., unused and duplicate operands are removed and constant operands are propagated/folded in.
void getPerfectlyNestedLoops(SmallVectorImpl< AffineForOp > &nestedLoops, AffineForOp root)
Get perfectly nested sequence of loops starting at root of loop nest (the first op being another AffineFor, and the second op - a terminator).
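This helper combines naturally with the interchange utilities on this page; a sketch (assuming `root` heads the nest):

// Gather the perfect nest under `root` and swap its two outermost loops
// if dependences permit the interchange.
SmallVector<AffineForOp, 4> nest;
getPerfectlyNestedLoops(nest, root);
if (nest.size() >= 2 &&
    isValidLoopInterchangePermutation({nest[0], nest[1]}, {1, 0}))
  interchangeLoops(nest[0], nest[1]);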
LogicalResult affineForOpBodySkew(AffineForOp forOp, ArrayRef< uint64_t > shifts, bool unrollPrologueEpilogue=false)
Skew the operations in an affine.for's body with the specified operation-wise shifts.
void getTripCountMapAndOperands(AffineForOp forOp, AffineMap *map, SmallVectorImpl< Value > *operands)
Returns the trip count of the loop as an affine map with its corresponding operands if the latter is expressible as an affine expression, and nullptr otherwise.
bool isValidLoopInterchangePermutation(ArrayRef< AffineForOp > loops, ArrayRef< unsigned > loopPermMap)
Checks if the loop interchange permutation 'loopPermMap', of the perfectly nested sequence of loops in 'loops', would violate dependences.
void getSupportedReductions(AffineForOp forOp, SmallVectorImpl< LoopReduction > &supportedReductions)
Populate supportedReductions with descriptors of the supported reductions.
LogicalResult generateCopyForMemRegion(const MemRefRegion &memrefRegion, Operation *analyzedOp, const AffineCopyOptions ©Options, CopyGenerateResult &result)
generateCopyForMemRegion is similar to affineDataCopyGenerate, but works with a single memref region.
void canonicalizeMapAndOperands(AffineMap *map, SmallVectorImpl< Value > *operands)
Modifies both map and operands in-place so as to: drop duplicate operands, drop unused dims and symbols from map, and promote valid symbols to symbolic operands where they appeared as dimensional operands.
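A sketch of the usual compose-then-canonicalize sequence (assuming `loadOp` is an AffineLoadOp whose access map we want in simplest form):

// Pull producing affine.apply ops into the map, then drop duplicate and
// unused operands and simplify the map in place.
AffineMap map = loadOp.getAffineMap();
SmallVector<Value, 4> operands = llvm::to_vector<4>(loadOp.getMapOperands());
fullyComposeAffineMapAndOperands(&map, &operands);
canonicalizeMapAndOperands(&map, &operands);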
LogicalResult loopUnrollUpToFactor(AffineForOp forOp, uint64_t unrollFactor)
Unrolls this loop by the specified unroll factor or its trip count, whichever is lower.
unsigned permuteLoops(ArrayRef< AffineForOp > inputNest, ArrayRef< unsigned > permMap)
Performs a loop permutation on a perfectly nested loop nest inputNest (where the contained loops appear from outermost to innermost) as specified by the permutation permMap: loop 'i' in inputNest is mapped to location 'permMap[i]'.
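For example, rotating a triple nest, guarded by the legality check above (a sketch; assumes `root` heads a perfect 3-d nest):

SmallVector<AffineForOp, 4> nest;
getPerfectlyNestedLoops(nest, root);
// Map loop i to position permMap[i]: (i, j, k) -> (j, k, i).
if (nest.size() == 3 && isValidLoopInterchangePermutation(nest, {2, 0, 1})) {
  unsigned newOuterPos = permuteLoops(nest, {2, 0, 1});
  (void)newOuterPos; // index in `nest` of the loop that became outermost
}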
LogicalResult loopUnrollJamByFactor(AffineForOp forOp, uint64_t unrollJamFactor)
Unrolls and jams this loop by the specified factor.
LogicalResult tilePerfectlyNestedParametric(MutableArrayRef< AffineForOp > input, ArrayRef< Value > tileSizes, SmallVectorImpl< AffineForOp > *tiledNest=nullptr)
Tiles the specified band of perfectly nested loops creating tile-space loops and intra-tile loops, using SSA values as tiling parameters.
void canonicalizeSetAndOperands(IntegerSet *set, SmallVectorImpl< Value > *operands)
Canonicalizes an integer set the same way canonicalizeMapAndOperands does for affine maps.
void getAffineForIVs(Operation &op, SmallVectorImpl< AffineForOp > *loops)
Populates 'loops' with IVs of the affine.for ops surrounding 'op' ordered from the outermost 'affine.for' operation to the innermost one.
uint64_t getLargestDivisorOfTripCount(AffineForOp forOp)
Returns the greatest known integral divisor of the trip count.
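One way this is typically used: pick an unroll factor that divides the trip count evenly so no cleanup loop is generated (a sketch; the cap of 8 is arbitrary):

#include <numeric> // std::gcd

// An unroll factor that divides the trip count and is at most 8.
uint64_t divisor = getLargestDivisorOfTripCount(forOp);
uint64_t factor = std::gcd(divisor, uint64_t(8));
if (factor > 1)
  (void)loopUnrollByFactor(forOp, factor);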
std::optional< uint64_t > getIntOrFloatMemRefSizeInBytes(MemRefType memRefType)
Returns the size of a memref with element type int or float in bytes if it's statically shaped, std::nullopt otherwise.
int64_t numEnclosingInvariantLoops(OpOperand &operand)
Count the number of loops surrounding operand such that operand could be hoisted above.
unsigned getNestingDepth(Operation *op)
Returns the nesting depth of this operation, i.e., the number of loops surrounding this operation.
void mapLoopToProcessorIds(scf::ForOp forOp, ArrayRef< Value > processorId, ArrayRef< Value > numProcessors)
Maps forOp for execution on a parallel grid of virtual processorIds of size given by numProcessors.
bool isOpwiseShiftValid(AffineForOp forOp, ArrayRef< uint64_t > shifts)
Checks whether SSA dominance would be violated if a for op's body operations are shifted by the specified shifts.
SmallVector< SmallVector< AffineForOp, 8 >, 8 > tile(ArrayRef< AffineForOp > forOps, ArrayRef< uint64_t > sizes, ArrayRef< AffineForOp > targets)
Performs tiling of imperfectly nested loops (with interchange) by strip-mining the forOps by sizes and sinking them, in their order of occurrence in forOps, under each of the targets.
AffineForOp sinkSequentialLoops(AffineForOp forOp)
LogicalResult tilePerfectlyNested(MutableArrayRef< AffineForOp > input, ArrayRef< unsigned > tileSizes, SmallVectorImpl< AffineForOp > *tiledNest=nullptr)
Tiles the specified band of perfectly nested loops creating tile-space loops and intra-tile loops.
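A sketch combining this with getTileableBands below (assuming `f` is a func::FuncOp; the uniform tile size of 32 is illustrative):

// Tile every identified band with a uniform tile size of 32.
std::vector<SmallVector<AffineForOp, 6>> bands;
getTileableBands(f, &bands);
for (auto &band : bands) {
  SmallVector<unsigned, 6> tileSizes(band.size(), 32);
  if (failed(tilePerfectlyNested(band, tileSizes)))
    band.front().emitRemark("tiling did not apply");
}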
void interchangeLoops(AffineForOp forOpA, AffineForOp forOpB)
Performs loop interchange on 'forOpA' and 'forOpB'.
LogicalResult coalescePerfectlyNestedAffineLoops(AffineForOp op)
Walk an affine.for to find a band to coalesce.
void getTileableBands(func::FuncOp f, std::vector< SmallVector< AffineForOp, 6 >> *bands)
Identify valid and profitable bands of loops to tile.
LogicalResult replaceAllMemRefUsesWith(Value oldMemRef, Value newMemRef, ArrayRef< Value > extraIndices={}, AffineMap indexRemap=AffineMap(), ArrayRef< Value > extraOperands={}, ArrayRef< Value > symbolOperands={}, Operation *domOpFilter=nullptr, Operation *postDomOpFilter=nullptr, bool allowNonDereferencingOps=false, bool replaceInDeallocOp=false)
Replaces all "dereferencing" uses of oldMemRef with newMemRef while optionally remapping the old memref's indices using the supplied affine map, indexRemap.
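A hedged sketch of the index-remapping use (the helper redirectToTransposed is hypothetical; it assumes both memrefs are 2-d and newMemRef holds the transposed data):

// Redirect all dereferencing uses of a 2-d memref to a transposed buffer,
// rewriting each access (i, j) into (j, i) via a permutation map.
static LogicalResult redirectToTransposed(Value oldMemRef, Value newMemRef) {
  AffineMap transpose =
      AffineMap::getPermutationMap({1, 0}, oldMemRef.getContext());
  return replaceAllMemRefUsesWith(oldMemRef, newMemRef,
                                  /*extraIndices=*/{},
                                  /*indexRemap=*/transpose);
}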
LogicalResult separateFullTiles(MutableArrayRef< AffineForOp > nest, SmallVectorImpl< AffineForOp > *fullTileNest=nullptr)
Separates full tiles from partial tiles for a perfect nest nest by generating a conditional guard that selects between the full tile version and the partial tile version using an AffineIfOp.
Value getReductionOp(AtomicRMWKind op, OpBuilder &builder, Location loc, Value lhs, Value rhs)
Returns the value obtained by applying the reduction operation kind associated with a binary AtomicRMWKind op to lhs and rhs.
constexpr void enumerate(std::tuple< Tys... > &tuple, CallbackT &&callback)
llvm::TypeSize divideCeil(llvm::TypeSize numerator, uint64_t denominator)
Divides the known min value of the numerator by the denominator and rounds the result up to the next integer.
AffineMap simplifyAffineMap(AffineMap map)
Simplifies an affine map by simplifying its underlying AffineExpr results.
void replaceAllUsesInRegionWith(Value orig, Value replacement, Region ®ion)
Replace all uses of orig within the given region with replacement.
AffineMap removeDuplicateExprs(AffineMap map)
Returns a map with the same dimension and symbol count as map, but whose results are the unique affine expressions of map.
LogicalResult applyOpPatternsGreedily(ArrayRef< Operation * > ops, const FrozenRewritePatternSet &patterns, GreedyRewriteConfig config=GreedyRewriteConfig(), bool *changed=nullptr, bool *allErased=nullptr)
Rewrite the specified ops by repeatedly applying the highest benefit patterns in a greedy worklist driven manner until a fixpoint is reached.
const FrozenRewritePatternSet & patterns
void bindSymbols(MLIRContext *ctx, AffineExprTy &...exprs)
Bind a list of AffineExpr references to SymbolExpr at positions: [0 .. sizeof...(exprs)].
auto get(MLIRContext *context, Ts &&...params)
Helper method that injects context only if needed; this helps unify some of the attribute construction methods.
bool areValuesDefinedAbove(Range values, Region &limit)
Check if all values in the provided range are defined above the limit region.
@ ExistingAndNewOps
Only pre-existing and newly created ops are processed.
SmallVector< std::pair< Block::iterator, Block::iterator > > subBlocks
Explicit copy / DMA generation options for mlir::affineDataCopyGenerate.
Result for calling generateCopyForMemRegion.
std::optional< int64_t > ub
std::optional< int64_t > lb
A description of a (parallelizable) reduction in an affine loop.
A region of a memref's data space; this is typically constructed by analyzing load/store ops on this memref.
std::optional< int64_t > getConstantBoundingSizeAndShape(SmallVectorImpl< int64_t > *shape=nullptr, SmallVectorImpl< AffineMap > *lbs=nullptr) const
Returns a constant upper bound on the number of elements in this region if bounded by a known constant (always possible for static shapes), std::nullopt otherwise.
FlatAffineValueConstraints * getConstraints()
void getLowerAndUpperBound(unsigned pos, AffineMap &lbMap, AffineMap &ubMap) const
Gets the lower and upper bound map for the dimensional variable at pos.
Value memref
Memref that this region corresponds to.
Location loc
If there is more than one load/store op associated with the region, the location information would correspond to one of those ops.
Eliminates variable at the specified position using Fourier-Motzkin variable elimination.