#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

#define DEBUG_TYPE "loop-utils"

using namespace affine;
using namespace presburger;
using llvm::SmallMapVector;
auto lbMap = forOp.getLowerBoundMap();
auto lb = b.create<AffineApplyOp>(forOp.getLoc(), lbMap,
                                  forOp.getLowerBoundOperands());
int64_t step = forOp.getStepAsInt();
for (unsigned i = 0, e = tripCountMap.getNumResults(); i < e; i++) {
  auto tripCountExpr = tripCountMap.getResult(i);
  bumpExprs[i] = (tripCountExpr - tripCountExpr % unrollFactor) * step;
}

b.create<AffineApplyOp>(forOp.getLoc(), bumpMap, tripCountOperands);

for (unsigned i = 0, e = bumpExprs.size(); i < e; i++)

cleanupLbOperands.clear();
cleanupLbOperands.push_back(lb);
cleanupLbOperands.append(bumpValues.begin(), bumpValues.end());

for (auto v : bumpValues)
  if (v.use_empty())
    v.getDefiningOp()->erase();
auto iterOperands = forOp.getInits();
auto iterArgs = forOp.getRegionIterArgs();
for (auto e : llvm::zip(iterOperands, iterArgs))
  std::get<1>(e).replaceAllUsesWith(std::get<0>(e));

auto outerResults = forOp.getResults();
auto innerResults = forOp.getBody()->getTerminator()->getOperands();
for (auto e : llvm::zip(outerResults, innerResults))
  std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
if (!tripCount || *tripCount != 1)

if (forOp.getLowerBoundMap().getNumResults() != 1)

auto iv = forOp.getInductionVar();
auto *parentBlock = forOp->getBlock();
if (!iv.use_empty()) {
  if (forOp.hasConstantLowerBound()) {
    auto func = forOp->getParentOfType<FunctionOpInterface>();
    if (func)
      builder.setInsertionPointToStart(&func.getFunctionBody().front());
    else
      builder.setInsertionPoint(forOp);
    auto constOp = builder.create<arith::ConstantIndexOp>(
        forOp.getLoc(), forOp.getConstantLowerBound());
    iv.replaceAllUsesWith(constOp);
  } else {
    auto lbOperands = forOp.getLowerBoundOperands();
    auto lbMap = forOp.getLowerBoundMap();

    iv.replaceAllUsesWith(lbOperands[0]);

    builder.create<AffineApplyOp>(forOp.getLoc(), lbMap, lbOperands);
  }
}

forOp.getBody()->back().erase();
parentBlock->getOperations().splice(Block::iterator(forOp),
                                    forOp.getBody()->getOperations());
unsigned offset, AffineForOp srcForOp, OpBuilder b) {
auto lbOperands = srcForOp.getLowerBoundOperands();
auto ubOperands = srcForOp.getUpperBoundOperands();

auto loopChunk =
    b.create<AffineForOp>(srcForOp.getLoc(), lbOperands, lbMap, ubOperands,
                          ubMap, srcForOp.getStepAsInt());
auto loopChunkIV = loopChunk.getInductionVar();
auto srcIV = srcForOp.getInductionVar();

for (const auto &it : llvm::drop_begin(opGroupQueue, offset)) {
  uint64_t shift = it.first;
  auto ops = it.second;

  if (!srcIV.use_empty() && shift != 0) {
    auto ivRemap = bodyBuilder.create<AffineApplyOp>(
        srcForOp.getLoc(),
        bodyBuilder.getSingleDimShiftAffineMap(
            -static_cast<int64_t>(srcForOp.getStepAsInt() * shift)),
        loopChunkIV);
    operandMap.map(srcIV, ivRemap);
  } else {
    operandMap.map(srcIV, loopChunkIV);
  }
  for (auto *op : ops)
    bodyBuilder.clone(*op, operandMap);
}
return AffineForOp();
bool unrollPrologueEpilogue) {
assert(forOp.getBody()->getOperations().size() == shifts.size() &&
       "too few/many shifts");
if (forOp.getBody()->begin() == std::prev(forOp.getBody()->end()))

if (!mayBeConstTripCount) {
  LLVM_DEBUG(forOp.emitRemark("non-constant trip count loop not handled"));

uint64_t tripCount = *mayBeConstTripCount;
       "shifts will lead to an invalid transformation\n");

int64_t step = forOp.getStepAsInt();
unsigned numChildOps = shifts.size();

uint64_t maxShift = *llvm::max_element(shifts);
if (maxShift >= numChildOps) {
  forOp.emitWarning("not shifting because shifts are unrealistically large");

std::vector<std::vector<Operation *>> sortedOpGroups(maxShift + 1);
for (auto &op : forOp.getBody()->without_terminator()) {
  auto shift = shifts[pos++];
  sortedOpGroups[shift].push_back(&op);
}
AffineForOp prologue, epilogue;

std::vector<std::pair<uint64_t, ArrayRef<Operation *>>> opGroupQueue;

auto origLbMap = forOp.getLowerBoundMap();
uint64_t lbShift = 0;
for (uint64_t d = 0, e = sortedOpGroups.size(); d < e; ++d) {
  if (sortedOpGroups[d].empty())

  if (!opGroupQueue.empty()) {
           "Queue expected to be empty when the first block is found");
    if (lbShift + tripCount * step < d * step) {
          opGroupQueue, 0, forOp, b);
      opGroupQueue.clear();
      lbShift += tripCount * step;
          opGroupQueue, 0, forOp, b);

      AffineForOp::getCanonicalizationPatterns(patterns, res.getContext());
          config, nullptr, &erased);
      if (!erased && !prologue)

  opGroupQueue.emplace_back(d, sortedOpGroups[d]);

for (unsigned i = 0, e = opGroupQueue.size(); i < e; ++i) {
  uint64_t ubShift = (opGroupQueue[i].first + tripCount) * step;
      opGroupQueue, i, forOp, b);

if (unrollPrologueEpilogue && prologue)
if (unrollPrologueEpilogue && !epilogue && epilogue != prologue)
if (input.size() <= 1)

LLVM_DEBUG(llvm::dbgs() << "Index set computation failed!\n");

LLVM_DEBUG(llvm::dbgs()
           << "Non-hyperrectangular nests not supported for tiling!\n");
template <typename t>

assert(input.size() == tileSizes.size() && "Too few/many tile sizes");

if (llvm::any_of(input,
                 [](AffineForOp op) { return op.getNumResults() > 0; })) {
  LLVM_DEBUG(llvm::dbgs()
             << "Cannot tile nest where a loop has yield values\n");

LLVM_DEBUG(llvm::dbgs() << "input loops not perfectly nested");
auto &ops = src.getBody()->getOperations();
dest.getBody()->getOperations().splice(loc, ops, ops.begin(),
                                       std::prev(ops.end()));
AffineForOp rootAffineForOp, unsigned width,

Location loc = rootAffineForOp.getLoc();

Operation *topLoop = rootAffineForOp.getOperation();
AffineForOp innermostPointLoop;

for (unsigned i = 0; i < width; i++) {
  AffineForOp pointLoop = b.create<AffineForOp>(loc, 0, 0);
  pointLoop.getBody()->getOperations().splice(

  tiledLoops[2 * width - 1 - i] = pointLoop;
  topLoop = pointLoop.getOperation();
    innermostPointLoop = pointLoop;

for (unsigned i = width; i < 2 * width; i++) {
  AffineForOp tileSpaceLoop = b.create<AffineForOp>(loc, 0, 0);
  tileSpaceLoop.getBody()->getOperations().splice(

  tiledLoops[2 * width - i - 1] = tileSpaceLoop;
  topLoop = tileSpaceLoop.getOperation();
AffineForOp newInterTileLoop,
AffineForOp newIntraTileLoop,

assert(origLoop.hasConstantLowerBound() &&
       "expected input loops to have constant lower bound.");

lbOperands.push_back(newInterTileLoop.getInductionVar());
ubOperands.push_back(newInterTileLoop.getInductionVar());

lbOperands.push_back(tileSize);
ubOperands.push_back(tileSize);

lbBoundExprs.push_back(
    ((lbLoopIvExpr - origLowerBoundExpr) * lbTileParameter) +
    origLowerBoundExpr);

ubBoundExprs.push_back(
    ((ubLoopIvExpr - origLowerBoundExpr) * ubTileParameter) +
    (ubTileParameter * origLoopStep) + origLowerBoundExpr);

ubBoundExprs.append(origUbMap.getResults().begin(),
                    origUbMap.getResults().end());

newIntraTileLoop.setLowerBound(lbOperands, lbMap);
newIntraTileLoop.setUpperBound(ubOperands, ubMap);
newIntraTileLoop.setStep(origLoop.getStepAsInt());
AffineForOp newLoop, Value tileSize) {
OperandRange newLbOperands = origLoop.getLowerBoundOperands();

newLoop.setLowerBound(newLbOperands, origLoop.getLowerBoundMap());

assert(origLoop.hasConstantLowerBound() &&
       "expected input loops to have constant lower bound.");

ubOperands.push_back(tileSize);

int64_t origUpperBound;

if (origLoop.hasConstantUpperBound()) {
  origUpperBound = origLoop.getConstantUpperBound();

  boundExprs.push_back(
      (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter));

boundExprs.push_back(
    (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter));

newLoop.setUpperBound(ubOperands, ubMap);

newLoop.setStep(origLoop.getStepAsInt());
assert(!origLoops.empty() && "expected at least one loop in band");
assert(origLoops.size() == tileSizes.size() &&
       "expected tiling parameter for each loop in band.");

OpBuilder b(origLoops[0].getOperation());
unsigned width = origLoops.size();

for (unsigned i = 0; i < width; ++i) {

for (unsigned i = 0; i < width; ++i) {
      newLoops[i + width], tileSizes[i]);
assert(!origLoops.empty());
assert(origLoops.size() == tileSizes.size());

OpBuilder b(origLoops[0].getOperation());
unsigned width = origLoops.size();

for (unsigned i = 0; i < width; i++) {
  OperandRange newLbOperands = origLoops[i].getLowerBoundOperands();
  OperandRange newUbOperands = origLoops[i].getUpperBoundOperands();
  newLoops[i].setLowerBound(newLbOperands, origLoops[i].getLowerBoundMap());
  newLoops[i].setUpperBound(newUbOperands, origLoops[i].getUpperBoundMap());
  newLoops[i].setStep(tileSizes[i] * origLoops[i].getStepAsInt());
}

for (unsigned i = 0; i < width; i++) {
  std::optional<uint64_t> mayBeConstantCount =

  newLoops[width + i].setLowerBound(newLoops[i].getInductionVar(), lbMap);
  newLoops[width + i].setStep(origLoops[i].getStepAsInt());

  if (mayBeConstantCount && *mayBeConstantCount < tileSizes[i]) {
        *mayBeConstantCount * origLoops[i].getStepAsInt());
    newLoops[width + i].setUpperBound(newLoops[i].getInductionVar(), ubMap);
  } else if (largestDiv % tileSizes[i] != 0) {
    ubOperands.push_back(newLoops[i].getInductionVar());

    boundExprs.push_back(dim + tileSizes[i] * origLoops[i].getStepAsInt());
    boundExprs.append(origUbMap.getResults().begin(),
                      origUbMap.getResults().end());

    newLoops[width + i].setUpperBound(ubOperands, ubMap);

        1, 0, dim + tileSizes[i] * origLoops[i].getStepAsInt());
    newLoops[width + i].setUpperBound(newLoops[i].getInductionVar(), ubMap);
AffineForOp rootAffineForOp = origLoops[0];

unsigned width = input.size();

for (unsigned i = 0; i < width; i++)
  origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar());

rootAffineForOp.erase();

*tiledNest = std::move(tiledLoops);
AffineForOp rootAffineForOp = origLoops[0];
unsigned width = input.size();

for (unsigned i = 0; i < width; i++)
  origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar());

rootAffineForOp.erase();

*tiledNest = std::move(tiledLoops);
nestedLoops.push_back(root);

if (body.begin() != std::prev(body.end(), 2))

root = dyn_cast<AffineForOp>(&body.front());
for (AffineForOp forOp : f.getOps<AffineForOp>()) {

  bands->push_back(band);
if (mayBeConstantTripCount.has_value()) {
  uint64_t tripCount = *mayBeConstantTripCount;

uint64_t unrollFactor) {
if (mayBeConstantTripCount.has_value() &&
    *mayBeConstantTripCount < unrollFactor)
Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor,

  annotateFn = defaultAnnotateFn;

for (unsigned i = 1; i < unrollFactor; i++) {

  operandMap.map(iterArgs, lastYielded);

  Value ivUnroll = ivRemapFn(i, forOpIV, builder);
  operandMap.map(forOpIV, ivUnroll);

  for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++) {
    Operation *clonedOp = builder.clone(*it, operandMap);
    annotateFn(i, clonedOp, builder);
  }

  for (unsigned i = 0, e = lastYielded.size(); i < e; i++) {
    Operation *defOp = yieldedValues[i].getDefiningOp();
    if (defOp && defOp->getBlock() == loopBodyBlock)
      lastYielded[i] = operandMap.lookup(yieldedValues[i]);
  }
}

for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++)
  annotateFn(0, &*it, builder);
uint64_t unrollFactor) {

auto cleanupForOp = cast<AffineForOp>(builder.clone(*forOp));

auto results = forOp.getResults();
auto cleanupResults = cleanupForOp.getResults();
auto cleanupIterOperands = cleanupForOp.getInits();

for (auto e : llvm::zip(results, cleanupResults, cleanupIterOperands)) {
  std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
  cleanupForOp->replaceUsesOfWith(std::get<2>(e), std::get<0>(e));
}

cleanupForOp.setLowerBound(cleanupOperands, cleanupMap);

forOp.setUpperBound(cleanupOperands, cleanupMap);
AffineForOp forOp, uint64_t unrollFactor,
bool cleanUpUnroll) {
assert(unrollFactor > 0 && "unroll factor should be positive");

if (unrollFactor == 1) {
  if (mayBeConstantTripCount && *mayBeConstantTripCount == 1 &&

if (llvm::hasSingleElement(forOp.getBody()->getOperations()))

if (mayBeConstantTripCount && *mayBeConstantTripCount < unrollFactor) {
  if (cleanUpUnroll) {

if (forOp.getLowerBoundMap().getNumResults() != 1 ||
    forOp.getUpperBoundMap().getNumResults() != 1)

assert(false && "cleanup loop lower bound map for single result lower "
                "and upper bound maps can always be determined");

ValueRange iterArgs(forOp.getRegionIterArgs());
auto yieldedValues = forOp.getBody()->getTerminator()->getOperands();

int64_t step = forOp.getStepAsInt();
forOp.setStep(step * unrollFactor);
generateUnrolledLoop(
    forOp.getBody(), forOp.getInductionVar(), unrollFactor,
    [&](unsigned i, Value iv, OpBuilder b) {
      auto d0 = b.getAffineDimExpr(0);
      auto bumpMap = AffineMap::get(1, 0, d0 + i * step);
      return b.create<AffineApplyOp>(forOp.getLoc(), bumpMap, iv);
    },
    annotateFn, iterArgs, yieldedValues);
uint64_t unrollJamFactor) {
if (mayBeConstantTripCount.has_value() &&
    *mayBeConstantTripCount < unrollJamFactor)

auto walkResult = forOp.walk([&](AffineForOp aForOp) {
  for (auto controlOperand : aForOp.getControlOperands()) {
    if (!forOp.isDefinedOutsideOfLoop(controlOperand))
      return WalkResult::interrupt();

return !walkResult.wasInterrupted();
uint64_t unrollJamFactor) {
assert(unrollJamFactor > 0 && "unroll jam factor should be positive");

if (unrollJamFactor == 1) {
  if (mayBeConstantTripCount && *mayBeConstantTripCount == 1 &&

if (llvm::hasSingleElement(forOp.getBody()->getOperations()))

if (mayBeConstantTripCount && *mayBeConstantTripCount < unrollJamFactor) {
  LLVM_DEBUG(llvm::dbgs() << "[failed] trip count < unroll-jam factor\n");

forOp.walk([&](AffineForOp aForOp) {
  if (aForOp.getNumIterOperands() > 0)
    loopsWithIterArgs.push_back(aForOp);

if (forOp.getNumIterOperands() > 0)

if (forOp.getLowerBoundMap().getNumResults() != 1 ||
    forOp.getUpperBoundMap().getNumResults() != 1)

assert(false && "cleanup loop lower bound map for single result lower "
                "and upper bound maps can always be determined");
for (AffineForOp oldForOp : loopsWithIterArgs) {

  ValueRange oldIterOperands = oldForOp.getInits();
  ValueRange oldIterArgs = oldForOp.getRegionIterArgs();
      cast<AffineYieldOp>(oldForOp.getBody()->getTerminator()).getOperands();

  for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
    dupIterOperands.append(oldIterOperands.begin(), oldIterOperands.end());
    dupYieldOperands.append(oldYieldOperands.begin(), oldYieldOperands.end());
  }

  bool forOpReplaced = oldForOp == forOp;
  AffineForOp newForOp =
      cast<AffineForOp>(*oldForOp.replaceWithAdditionalYields(
          rewriter, dupIterOperands, /*replaceInitOperandUsesInLoop=*/false,
          [&](OpBuilder &b, Location loc, ArrayRef<BlockArgument> newBbArgs) {
            return dupYieldOperands;
          }));

  newLoopsWithIterArgs.push_back(newForOp);

  ValueRange newIterArgs = newForOp.getRegionIterArgs();
  unsigned oldNumIterArgs = oldIterArgs.size();
  ValueRange newResults = newForOp.getResults();
  unsigned oldNumResults = newResults.size() / unrollJamFactor;
  assert(oldNumIterArgs == oldNumResults &&
         "oldNumIterArgs must be the same as oldNumResults");
  for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
    for (unsigned j = 0; j < oldNumIterArgs; ++j) {
      operandMaps[i - 1].map(newIterArgs[j],
                             newIterArgs[i * oldNumIterArgs + j]);
      operandMaps[i - 1].map(newResults[j],
                             newResults[i * oldNumResults + j]);
    }
  }
}
int64_t step = forOp.getStepAsInt();
forOp.setStep(step * unrollJamFactor);

auto forOpIV = forOp.getInductionVar();

for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
  for (auto &subBlock : subBlocks) {
    OpBuilder builder(subBlock.first->getBlock(), std::next(subBlock.second));

    auto ivUnroll =
        builder.create<AffineApplyOp>(forOp.getLoc(), bumpMap, forOpIV);
    operandMaps[i - 1].map(forOpIV, ivUnroll);

    for (auto it = subBlock.first; it != std::next(subBlock.second); ++it)
      builder.clone(*it, operandMaps[i - 1]);
  }

  for (auto newForOp : newLoopsWithIterArgs) {
    unsigned oldNumIterOperands =
        newForOp.getNumIterOperands() / unrollJamFactor;
    unsigned numControlOperands = newForOp.getNumControlOperands();
    auto yieldOp = cast<AffineYieldOp>(newForOp.getBody()->getTerminator());
    unsigned oldNumYieldOperands = yieldOp.getNumOperands() / unrollJamFactor;
    assert(oldNumIterOperands == oldNumYieldOperands &&
           "oldNumIterOperands must be the same as oldNumYieldOperands");
    for (unsigned j = 0; j < oldNumIterOperands; ++j) {
      newForOp.setOperand(numControlOperands + i * oldNumIterOperands + j,
                          operandMaps[i - 1].lookupOrDefault(
                              newForOp.getOperand(numControlOperands + j)));
      yieldOp.setOperand(
          i * oldNumYieldOperands + j,
          operandMaps[i - 1].lookupOrDefault(yieldOp.getOperand(j)));
    }
  }
}
if (forOp.getNumResults() > 0) {

  auto loc = forOp.getLoc();
  unsigned oldNumResults = forOp.getNumResults() / unrollJamFactor;

    unsigned pos = reduction.iterArgPosition;
    Value lhs = forOp.getResult(pos);

    for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
      rhs = forOp.getResult(i * oldNumResults + pos);

    assert(op && "Reduction op should have been created");

    forOp.getResult(pos).replaceAllUsesExcept(lhs, newOps);
assert(&*forOpA.getBody()->begin() == forOpB.getOperation());
auto &forOpABody = forOpA.getBody()->getOperations();
auto &forOpBBody = forOpB.getBody()->getOperations();

    forOpABody, forOpABody.begin(), std::prev(forOpABody.end()));

forOpABody.splice(forOpABody.begin(), forOpBBody, forOpBBody.begin(),
                  std::prev(forOpBBody.end()));

forOpBBody.splice(forOpBBody.begin(), forOpA->getBlock()->getOperations(),
unsigned maxLoopDepth = loops.size();

loopPermMapInv.resize(maxLoopDepth);
for (unsigned i = 0; i < maxLoopDepth; ++i)
  loopPermMapInv[loopPermMap[i]] = i;

for (const auto &depComps : depCompsVec) {
  assert(depComps.size() >= maxLoopDepth);

  for (unsigned j = 0; j < maxLoopDepth; ++j) {
    unsigned permIndex = loopPermMapInv[j];
    assert(depComps[permIndex].lb);
    int64_t depCompLb = *depComps[permIndex].lb;
assert(loopPermMap.size() == loops.size());
unsigned maxLoopDepth = loops.size();
std::vector<SmallVector<DependenceComponent, 2>> depCompsVec;
bool LLVM_ATTRIBUTE_UNUSED

assert(!loops.empty() && "no loops provided");

auto hasTwoElements = [](Block *block) {
  auto secondOpIt = std::next(block->begin());
  return secondOpIt != block->end() && &*secondOpIt == &block->back();
};

auto enclosingLoop = loops.front();
for (auto loop : loops.drop_front()) {
  auto parentForOp = dyn_cast<AffineForOp>(loop->getParentOp());

  if (parentForOp != enclosingLoop || !hasTwoElements(parentForOp.getBody()))

  enclosingLoop = loop;
assert(input.size() == permMap.size() && "invalid permutation map size");

llvm::sort(checkPermMap);
if (llvm::any_of(llvm::enumerate(checkPermMap),
                 [](const auto &en) { return en.value() != en.index(); }))
  assert(false && "invalid permutation map");

if (input.size() < 2)

for (unsigned i = 0, e = input.size(); i < e; ++i)
  invPermMap.push_back({permMap[i], i});
llvm::sort(invPermMap);

if (permMap.back() != input.size() - 1) {
  Block *destBody = ((AffineForOp)input[invPermMap.back().second]).getBody();
  Block *srcBody = ((AffineForOp)input.back()).getBody();
                                  std::prev(srcBody->end()));

for (int i = input.size() - 1; i >= 0; --i) {

  if (permMap[i] == 0) {

    auto *parentBlock = input[0]->getBlock();
                                     input[i]->getBlock()->getOperations(),

  unsigned parentPosInInput = invPermMap[permMap[i] - 1].second;
  if (i > 0 && static_cast<unsigned>(i - 1) == parentPosInInput)

  auto *destBody = ((AffineForOp)input[parentPosInInput]).getBody();
  destBody->getOperations().splice(destBody->begin(),
                                   input[i]->getBlock()->getOperations(),

return invPermMap[0].second;
if (loops.size() < 2)

unsigned maxLoopDepth = loops.size();
std::vector<SmallVector<DependenceComponent, 2>> depCompsVec;

for (auto &depComps : depCompsVec) {
  assert(depComps.size() >= maxLoopDepth);
  for (unsigned j = 0; j < maxLoopDepth; ++j) {
    assert(depComp.lb.has_value() && depComp.ub.has_value());
    if (*depComp.lb != 0 || *depComp.ub != 0)

unsigned nextSequentialLoop = numParallelLoops;
unsigned nextParallelLoop = 0;
for (unsigned i = 0; i < maxLoopDepth; ++i) {
    loopPermMap[i] = nextParallelLoop++;
    loopPermMap[i] = nextSequentialLoop++;

unsigned loopNestRootIndex = permuteLoops(loops, loopPermMap);
return loops[loopNestRootIndex];
int64_t offset = 0) {
auto bounds = llvm::to_vector<4>(map->getResults());

operands->insert(operands->begin() + map->getNumDims(), iv);
auto originalStep = forOp.getStepAsInt();
auto scaledStep = originalStep * factor;
forOp.setStep(scaledStep);

auto lbMap = forOp.getLowerBoundMap();

auto ubMap = forOp.getUpperBoundMap();

auto iv = forOp.getInductionVar();

for (auto t : targets) {

  auto newForOp = b.create<AffineForOp>(t.getLoc(), lbOperands, lbMap,
                                        ubOperands, ubMap, originalStep);
  auto begin = t.getBody()->begin();

  auto nOps = t.getBody()->getOperations().size() - 2;
  newForOp.getBody()->getOperations().splice(
      newForOp.getBody()->getOperations().begin(),
      t.getBody()->getOperations(), begin, std::next(begin, nOps));
  replaceAllUsesInRegionWith(iv, newForOp.getInductionVar(),
                             newForOp.getRegion());
  innerLoops.push_back(newForOp);
template <typename SizeType>
                          AffineForOp target) {

assert(res.size() == 1 && "Expected 1 inner forOp");

for (auto it : llvm::zip(forOps, sizes)) {
  auto step = stripmineSink(std::get<0>(it), std::get<1>(it), currentTargets);
  res.push_back(step);
  currentTargets = step;
}

                          AffineForOp target) {
assert(loops.size() == 1);
res.push_back(loops[0]);
if (loops.size() < 2)

AffineForOp innermost = loops.back();
AffineForOp outermost = loops.front();

for (AffineForOp loop : loops) {
  if (loop.getStepAsInt() != 1 || !loop.hasConstantLowerBound() ||
      loop.getConstantLowerBound() != 0)

if (!llvm::hasSingleElement(origUbMap.getResults()))
  prev = builder.create<AffineMinOp>(loc, origUbMap, ubOperands);
else
  prev = builder.create<AffineApplyOp>(loc, origUbMap, ubOperands);
upperBoundSymbols.push_back(prev);

for (AffineForOp loop : loops.drop_front()) {
  ub = loop.getUpperBound();

  if (!llvm::hasSingleElement(origUbMap.getResults()))
    upperBound = builder.create<AffineMinOp>(loc, origUbMap, ubOperands);
  else
    upperBound = builder.create<AffineApplyOp>(loc, origUbMap, ubOperands);
  upperBoundSymbols.push_back(upperBound);

  operands.push_back(prev);
  operands.push_back(upperBound);

  prev = builder.create<AffineApplyOp>(

outermost.setUpperBound(prev, newUbMap);

Value previous = outermost.getInductionVar();
for (unsigned idx = loops.size(); idx > 0; --idx) {
  if (idx != loops.size()) {
    operands.push_back(previous);
    operands.push_back(upperBoundSymbols[idx]);
    previous = builder.create<AffineApplyOp>(

  Value inductionVariable;
  if (idx == 1) {
    inductionVariable = previous;
  } else {
    applyOperands.push_back(previous);
    applyOperands.push_back(upperBoundSymbols[idx - 1]);
    inductionVariable = builder.create<AffineApplyOp>(

  replaceAllUsesInRegionWith(loops[idx - 1].getInductionVar(),
                             inductionVariable, loops.back().getRegion());
}

AffineForOp secondOutermostLoop = loops[1];
innermost.getBody()->back().erase();
outermost.getBody()->getOperations().splice(
    outermost.getBody()->getOperations().begin(),
    innermost.getBody()->getOperations());
secondOutermostLoop.erase();
assert(processorId.size() == numProcessors.size());
if (processorId.empty())

Value linearIndex = processorId.front();
for (unsigned i = 1, e = processorId.size(); i < e; ++i) {
  auto mulApplyOp = b.create<AffineApplyOp>(
      loc, mulMap, ValueRange{linearIndex, numProcessors[i]});
  linearIndex = b.create<AffineApplyOp>(
      loc, addMap, ValueRange{mulApplyOp, processorId[i]});
}

auto mulApplyOp = b.create<AffineApplyOp>(
    loc, mulMap, ValueRange{linearIndex, forOp.getStep()});
Value lb = b.create<AffineApplyOp>(
    loc, addMap, ValueRange{mulApplyOp, forOp.getLowerBound()});
forOp.setLowerBound(lb);

Value step = forOp.getStep();
for (auto numProcs : numProcessors)
  step = b.create<AffineApplyOp>(loc, mulMap, ValueRange{numProcs, step});
forOp.setStep(step);
Block **copyPlacementBlock,

cst->getValues(cst->getNumDimVars(), cst->getNumDimAndSymbolVars(), &symbols);

auto it = enclosingFors.rbegin();
for (auto e = enclosingFors.rend(); it != e; ++it) {
  if (llvm::is_contained(symbols, it->getInductionVar()))

if (it != enclosingFors.rbegin()) {
  auto lastInvariantIV = *std::prev(it);
  *copyInPlacementStart = Block::iterator(lastInvariantIV.getOperation());
  *copyOutPlacementStart = std::next(*copyInPlacementStart);
  *copyPlacementBlock = lastInvariantIV->getBlock();
} else {
  *copyInPlacementStart = begin;
  *copyOutPlacementStart = end;
  *copyPlacementBlock = &block;
}
if (bufferShape.size() <= 1)

int64_t numEltPerStride = 1;

for (int d = bufferShape.size() - 1; d >= 1; d--) {
  int64_t dimSize = cast<MemRefType>(region.memref.getType()).getDimSize(d);

  numEltPerStride *= bufferShape[d];

  if (bufferShape[d] < dimSize && bufferShape[d - 1] > 1) {
    strideInfos->push_back({stride, numEltPerStride});
assert(llvm::all_of(lbMaps, [&](AffineMap lbMap) {

assert(llvm::all_of(ubMaps, [&](AffineMap ubMap) {

unsigned rank = cast<MemRefType>(memref.getType()).getRank();
assert(lbMaps.size() == rank && "wrong number of lb maps");
assert(ubMaps.size() == rank && "wrong number of ub maps");

AffineForOp copyNestRoot;

for (unsigned d = 0; d < rank; ++d) {
                                       ubOperands, ubMaps[d]);
  if (d == 0)
    copyNestRoot = forOp;

  auto fastBufOffsetMap =

  auto offset = b.create<AffineApplyOp>(loc, fastBufOffsetMap, lbOperands);

  fastBufMapOperands.push_back(offset);
  fastBufMapOperands.push_back(forOp.getInductionVar());
  mayBeDeadApplys.push_back(offset);

  memIndices.push_back(forOp.getInductionVar());
}

for (auto applyOp : mayBeDeadApplys)
  if (applyOp.use_empty())
    applyOp.erase();

if (!isCopyOut) {
  auto load = b.create<AffineLoadOp>(loc, memref, memIndices);
  b.create<AffineStoreOp>(loc, load, fastMemRef, fastBufMap,
                          fastBufMapOperands);
  return copyNestRoot;
}

auto load =
    b.create<AffineLoadOp>(loc, fastMemRef, fastBufMap, fastBufMapOperands);
b.create<AffineStoreOp>(loc, load, memref, memIndices);
return copyNestRoot;
auto f = begin->getParentOfType<FunctionOpInterface>();
OpBuilder topBuilder(f.getFunctionBody());
Value zeroIndex = topBuilder.create<arith::ConstantIndexOp>(f.getLoc(), 0);

bool isCopyOutAtEndOfBlock = (end == copyOutPlacementStart);

OpBuilder prologue(copyPlacementBlock, copyInPlacementStart);

OpBuilder epilogue(copyPlacementBlock, copyOutPlacementStart);

auto loc = region.loc;
auto memref = region.memref;
auto memRefType = cast<MemRefType>(memref.getType());

if (!memRefType.getLayout().isIdentity()) {
  LLVM_DEBUG(llvm::dbgs() << "Non-identity layout map not yet supported\n");

unsigned rank = memRefType.getRank();

std::vector<SmallVector<int64_t, 4>> lbs;
                          &fastBufferShape, &lbs, &lbDivisors);

LLVM_DEBUG(llvm::dbgs() << "Non-constant region size not supported\n");

if (*numElements == 0) {
  LLVM_DEBUG(llvm::dbgs() << "Nothing to copy\n");

for (unsigned i = 0; i < rank; ++i)

fastBufOffsets.reserve(rank);
for (unsigned d = 0; d < rank; d++) {
  assert(lbs[d].size() == cst->getNumCols() - rank && "incorrect bound size");

  AffineExpr offset = top.getAffineConstantExpr(0);
  for (unsigned j = 0, e = cst->getNumCols() - rank - 1; j < e; j++)
    offset = offset + lbs[d][j] * top.getAffineDimExpr(j);
  assert(lbDivisors[d] > 0);
  offset =
      (offset + lbs[d][cst->getNumCols() - 1 - rank]).floorDiv(lbDivisors[d]);

  if (auto caf = dyn_cast<AffineConstantExpr>(offset)) {
    auto indexVal = caf.getValue();
    if (indexVal == 0) {
      memIndices.push_back(zeroIndex);
    } else {
      memIndices.push_back(
          top.create<arith::ConstantIndexOp>(loc, indexVal).getResult());
    }
  } else {
    memIndices.push_back(b.create<AffineApplyOp>(loc, map, regionSymbols));
  }

  bufIndices.push_back(zeroIndex);

  fastBufOffsets.push_back(offset);
}
bool existingBuf = fastBufferMap.count(memref) > 0;

auto fastMemRefType =

fastMemRef =
    prologue.create<memref::AllocOp>(loc, fastMemRefType).getResult();

fastBufferMap[memref] = fastMemRef;

*sizeInBytes = maySizeInBytes.value_or(0);

LLVM_DEBUG(llvm::dbgs() << "Creating fast buffer of type " << fastMemRefType

fastMemRef = fastBufferMap[memref];

auto numElementsSSA = top.create<arith::ConstantIndexOp>(loc, *numElements);

Value numEltPerDmaStride;

if (dmaStrideInfos.size() > 1) {
  LLVM_DEBUG(llvm::dbgs() << "Only up to one level of stride supported\n");

if (!dmaStrideInfos.empty()) {
  dmaStride =
      top.create<arith::ConstantIndexOp>(loc, dmaStrideInfos[0].stride);
  numEltPerDmaStride = top.create<arith::ConstantIndexOp>(
      loc, dmaStrideInfos[0].numEltPerStride);
auto postDomFilter = std::prev(end);

    regionSymbols, ubMaps,
    regionSymbols, fastBufOffsets,

copyNests.insert(copyNest);

if (region.isWrite() && isCopyOutAtEndOfBlock)

auto tagMemRef = prologue.create<memref::AllocOp>(loc, tagMemRefType);

    fastMemRef, bufAffineMap, bufIndices,
    tagMemRef, tagAffineMap, tagIndices,
    numElementsSSA, dmaStride, numEltPerDmaStride);

    loc, fastMemRef, bufAffineMap, bufIndices, memref, memAffineMap,
    memIndices, tagMemRef, tagAffineMap, tagIndices, numElementsSSA,
    dmaStride, numEltPerDmaStride);

if (isCopyOutAtEndOfBlock)

auto tagDeallocOp = epilogue.create<memref::DeallocOp>(loc, tagMemRef);
if (*nEnd == end && isCopyOutAtEndOfBlock)

auto bufDeallocOp = epilogue.create<memref::DeallocOp>(loc, fastMemRef);

if (!copyOptions.generateDma && *nEnd == end && isCopyOutAtEndOfBlock)
remapExprs.reserve(rank);
for (unsigned i = 0; i < rank; i++) {

  remapExprs.push_back(dimExpr - fastBufOffsets[i]);

auto indexRemap = AffineMap::get(regionSymbols.size() + rank, 0, remapExprs,

bool isBeginAtStartOfBlock = (begin == block->begin());
if (!isBeginAtStartOfBlock)
  prevOfBegin = std::prev(begin);

*nBegin = isBeginAtStartOfBlock ? block->begin() : std::next(prevOfBegin);
if (auto loadOp = dyn_cast<AffineLoadOp>(op)) {
  rank = loadOp.getMemRefType().getRank();
  region->memref = loadOp.getMemRef();
} else if (auto storeOp = dyn_cast<AffineStoreOp>(op)) {
  rank = storeOp.getMemRefType().getRank();
  region->memref = storeOp.getMemRef();
} else {
  assert(false && "expected load or store op");
}

auto memRefType = cast<MemRefType>(region->memref.getType());
if (!memRefType.hasStaticShape())

ivs.resize(numParamLoopIVs);

regionCst->setValues(rank, rank + numParamLoopIVs, symbols);

for (unsigned d = 0; d < rank; d++) {
  auto dimSize = memRefType.getDimSize(d);
  assert(dimSize > 0 && "filtered dynamic shapes above");
  regionCst->addBound(BoundType::LB, d, 0);
  regionCst->addBound(BoundType::UB, d, dimSize - 1);
}
std::optional<Value> filterMemRef,

assert(begin->getBlock() == std::prev(end)->getBlock() &&
       "Inconsistent block begin/end args");
assert(end != end->getBlock()->end() && "end can't be the block terminator");

Block *block = begin->getBlock();

LLVM_DEBUG(llvm::dbgs() << "Generating copies at depth " << copyDepth

LLVM_DEBUG(llvm::dbgs() << "from begin: " << *begin << "\n");
LLVM_DEBUG(llvm::dbgs() << "to inclusive end: " << *std::prev(end) << "\n");

SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4> readRegions;
SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4> writeRegions;
MemRefType memrefType;

if (auto loadOp = dyn_cast<AffineLoadOp>(opInst)) {
  memref = loadOp.getMemRef();
  memrefType = loadOp.getMemRefType();
} else if (auto storeOp = dyn_cast<AffineStoreOp>(opInst)) {
  memref = storeOp.getMemRef();
  memrefType = storeOp.getMemRefType();
}

auto memorySpaceAttr =
    dyn_cast_or_null<IntegerAttr>(memrefType.getMemorySpace());
if ((filterMemRef.has_value() && filterMemRef != memref) ||

auto region = std::make_unique<MemRefRegion>(opInst->getLoc());
if (failed(region->compute(opInst, copyDepth, nullptr,

  LLVM_DEBUG(llvm::dbgs()
             << "Error obtaining memory region: semi-affine maps?\n");
  LLVM_DEBUG(llvm::dbgs() << "over-approximating to the entire memref\n");

      opInst->emitError("non-constant memref sizes not yet supported"));
[&](const SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4>

  const auto *const it = targetRegions.find(region->memref);
  if (it == targetRegions.end())

  if (failed(it->second->unionBoundingBox(*region))) {
    LLVM_DEBUG(llvm::dbgs()
               << "Memory region bounding box failed; "
                  "over-approximating to the entire memref\n");

        "non-constant memref sizes not yet supported"));

    it->second->getConstraints()->clearAndCopyFrom(
        *region->getConstraints());

    region->getConstraints()->clearAndCopyFrom(
        *it->second->getConstraints());

bool existsInRead = updateRegion(readRegions);

bool existsInWrite = updateRegion(writeRegions);

if (region->isWrite() && !existsInWrite) {
  writeRegions[region->memref] = std::move(region);
} else if (!region->isWrite() && !existsInRead) {
  readRegions[region->memref] = std::move(region);
}

LLVM_DEBUG(begin->emitError(
    "copy generation failed for one or more memref's in this block\n"));
uint64_t totalCopyBuffersSizeInBytes = 0;

auto processRegions =
    [&](const SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4>

for (const auto &regionEntry : regions) {

  Block *copyPlacementBlock;
      *regionEntry.second, *block, begin, end, &copyPlacementBlock,
      &copyInPlacementStart, &copyOutPlacementStart);

  uint64_t sizeInBytes;
      *regionEntry.second, block, begin, end, copyPlacementBlock,
      copyInPlacementStart, copyOutPlacementStart, copyOptions,
      fastBufferMap, copyNests, &sizeInBytes, &nBegin, &nEnd);
  if (succeeded(iRet)) {
    totalCopyBuffersSizeInBytes += sizeInBytes;

  ret = ret & succeeded(iRet);

processRegions(readRegions);
processRegions(writeRegions);

LLVM_DEBUG(begin->emitError(
    "copy generation failed for one or more memref's in this block\n"));

if (llvm::DebugFlag && (forOp = dyn_cast<AffineForOp>(&*begin))) {
  LLVM_DEBUG(forOp.emitRemark()
             << " KiB of copy buffers in fast memory space for this block");

  "total size of all copy buffers' for this block exceeds fast memory "

    std::prev(forOp.getBody()->end()), copyOptions, filterMemRef, copyNests);
auto begin = analyzedOp->getIterator();
auto end = std::next(begin);

auto err = generateCopy(memrefRegion, block, begin, end, block, begin, end,
                        copyOptions, fastBufferMap, copyNests,

const auto &en = fastBufferMap.find(memrefRegion.memref);

if (en == fastBufferMap.end())

result.alloc = en->second.getDefiningOp();
assert(result.alloc && "fast buffer expected to be locally allocated");
assert(copyNests.size() <= 1 && "At most one copy nest is expected.");
result.copyNest = copyNests.empty() ? nullptr : *copyNests.begin();
assert(currLoopDepth <= depthToLoops.size() && "Unexpected currLoopDepth");
if (currLoopDepth == depthToLoops.size())
  depthToLoops.emplace_back();

for (auto &op : *block) {
  if (auto forOp = dyn_cast<AffineForOp>(op)) {
    depthToLoops[currLoopDepth].push_back(forOp);

for (auto &block : func)

if (!depthToLoops.empty()) {
  assert(depthToLoops.back().empty() && "Last loop level is not empty?");
  depthToLoops.pop_back();
return b.create<AffineForOp>(loc, lowerOperands, lbMap, upperOperands, ubMap,
auto *context = loops[0].getContext();

llvm::append_range(ops, loops);

for (auto loop : loops) {

  assert(loop.getStepAsInt() == 1 && "point loop step expected to be one");

unsigned fullTileLbPos, fullTileUbPos;
        nullptr, &fullTileLbPos,
  LLVM_DEBUG(llvm::dbgs() << "Can't get constant diff pair for a loop\n");

fullTileLb.assign(fLb.begin(), fLb.end());
fullTileUb.assign(fUb.begin(), fUb.end());

for (auto lbIndex : lbIndices)
  for (unsigned i = 0, e = cst.getNumCols(); i < e; ++i)
    cst.atIneq(lbIndex, i) = fullTileLb[i] - cst.atIneq(lbIndex, i);

for (auto ubIndex : ubIndices)
  for (unsigned i = 0, e = cst.getNumCols(); i < e; ++i)
    cst.atIneq(ubIndex, i) -= fullTileUb[i];

return b.create<AffineIfOp>(loops[0].getLoc(), ifCondSet, setOperands,
static LogicalResult

fullTileLoops.reserve(inputNest.size());

for (auto loop : inputNest) {

  if (loop.getStepAsInt() != 1) {
    LLVM_DEBUG(llvm::dbgs()
               << "[tile separation] non-unit stride not implemented\n");

  unsigned lbPos, ubPos;
          nullptr, &lbPos, &ubPos) ||
    LLVM_DEBUG(llvm::dbgs() << "[tile separation] Can't get constant diff / "
                               "equalities not yet handled\n");

  fullTileLoops.push_back(fullTileLoop);

operandMap.map(loopEn.value().getInductionVar(),
               fullTileLoops[loopEn.index()].getInductionVar());

for (auto &op : inputNest.back().getBody()->without_terminator())
  b.clone(op, operandMap);
if (inputNest.empty())

auto firstLoop = inputNest[0];

auto prevLoop = firstLoop;
for (auto loop : inputNest.drop_front(1)) {
  assert(loop->getParentOp() == prevLoop && "input not contiguously nested");

if (!fullTileLoops.empty())
  fullTileLoops.front().erase();

  fullTileLoops.front().erase();
  LLVM_DEBUG(llvm::dbgs() << "All tiles are full tiles, or failure creating "
                             "separation condition\n");

Block *thenBlock = ifOp.getThenBlock();
AffineForOp outermostFullTileLoop = fullTileLoops[0];
    std::prev(thenBlock->end()),
    outermostFullTileLoop->getBlock()->getOperations(),

Block *elseBlock = ifOp.getElseBlock();
    firstLoop->getBlock()->getOperations(),

*fullTileNest = std::move(fullTileLoops);
LogicalResult result(failure());

if (loops.size() <= 1)

for (unsigned i = 0, e = loops.size(); i < e; ++i) {
  operandsDefinedAbove[i] = i;
  for (unsigned j = 0; j < i; ++j) {
      operandsDefinedAbove[i] = j;

for (unsigned end = loops.size(); end > 0; --end) {

  for (; start < end - 1; ++start) {
        *std::max_element(std::next(operandsDefinedAbove.begin(), start),
                          std::next(operandsDefinedAbove.begin(), end));

    assert(maxPos == start &&
           "expected loop bounds to be known at the start of the band");

  if (start != end - 1)
while (auto loopOp = currentOp->getParentOfType<LoopLikeOpInterface>()) {
  if (!loopOp.isDefinedOutsideOfLoop(operand.get()))
static LogicalResult performPreTilingChecks(MutableArrayRef< AffineForOp > input, ArrayRef< t > tileSizes)
Check if the input nest is supported for tiling and whether tiling would be legal or not.
static void constructParametricallyTiledIndexSetHyperRect(MutableArrayRef< AffineForOp > origLoops, MutableArrayRef< AffineForOp > newLoops, ArrayRef< Value > tileSizes)
Constructs and sets new loop bounds after tiling for the case of hyper-rectangular index sets,...
static void constructTiledLoopNest(MutableArrayRef< AffineForOp > origLoops, AffineForOp rootAffineForOp, unsigned width, MutableArrayRef< AffineForOp > tiledLoops)
Constructs the tiled loop nest, without setting the loop bounds, and moves the body of the original loop ne...
static bool getFullMemRefAsRegion(Operation *op, unsigned numParamLoopIVs, MemRefRegion *region)
Construct the memref region to just include the entire memref.
static SmallVector< AffineForOp, 8 > stripmineSink(AffineForOp forOp, uint64_t factor, ArrayRef< AffineForOp > targets)
static InFlightDiagnostic LLVM_ATTRIBUTE_UNUSED emitRemarkForBlock(Block &block)
static LogicalResult checkIfHyperRectangular(MutableArrayRef< AffineForOp > input)
Checks whether a loop nest is hyper-rectangular or not.
static void findHighestBlockForPlacement(const MemRefRegion ®ion, Block &block, Block::iterator &begin, Block::iterator &end, Block **copyPlacementBlock, Block::iterator *copyInPlacementStart, Block::iterator *copyOutPlacementStart)
Given a memref region, determine the lowest depth at which transfers can be placed for it,...
static AffineForOp generateShiftedLoop(AffineMap lbMap, AffineMap ubMap, const std::vector< std::pair< uint64_t, ArrayRef< Operation * >>> &opGroupQueue, unsigned offset, AffineForOp srcForOp, OpBuilder b)
Generates an affine.for op with the specified lower and upper bounds while generating the right IV re...
static void moveLoopBodyImpl(AffineForOp src, AffineForOp dest, Block::iterator loc)
Move the loop body of AffineForOp 'src' from 'src' into the specified location in destination's body,...
static void setInterTileBoundsParametric(OpBuilder &b, AffineForOp origLoop, AffineForOp newLoop, Value tileSize)
Set lower and upper bounds of inter-tile loops for parametric tiling.
static void setIntraTileBoundsParametric(OpBuilder &b, AffineForOp origLoop, AffineForOp newInterTileLoop, AffineForOp newIntraTileLoop, Value tileSize)
Set lower and upper bounds of intra-tile loops for parametric tiling.
static void gatherLoopsInBlock(Block *block, unsigned currLoopDepth, std::vector< SmallVector< AffineForOp, 2 >> &depthToLoops)
Gathers all AffineForOps in 'block' at 'currLoopDepth' in 'depthToLoops'.
static void generateUnrolledLoop(Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor, function_ref< Value(unsigned, Value, OpBuilder)> ivRemapFn, function_ref< void(unsigned, Operation *, OpBuilder)> annotateFn, ValueRange iterArgs, ValueRange yieldedValues)
Generates unrolled copies of AffineForOp 'loopBodyBlock', with associated 'forOpIV' by 'unrollFactor'...
static LogicalResult generateCopy(const MemRefRegion ®ion, Block *block, Block::iterator begin, Block::iterator end, Block *copyPlacementBlock, Block::iterator copyInPlacementStart, Block::iterator copyOutPlacementStart, const AffineCopyOptions ©Options, DenseMap< Value, Value > &fastBufferMap, DenseSet< Operation * > ©Nests, uint64_t *sizeInBytes, Block::iterator *nBegin, Block::iterator *nEnd)
Creates a buffer in the faster memory space for the specified memref region; generates a copy from th...
static LogicalResult createFullTiles(MutableArrayRef< AffineForOp > inputNest, SmallVectorImpl< AffineForOp > &fullTileLoops, OpBuilder b)
Create the full tile loop nest (along with its body).
static void getMultiLevelStrides(const MemRefRegion ®ion, ArrayRef< int64_t > bufferShape, SmallVectorImpl< StrideInfo > *strideInfos)
Returns striding information for a copy/transfer of this region with potentially multiple striding le...
static void constructTiledIndexSetHyperRect(MutableArrayRef< AffineForOp > origLoops, MutableArrayRef< AffineForOp > newLoops, ArrayRef< unsigned > tileSizes)
Constructs and sets new loop bounds after tiling for the case of hyper-rectangular index sets,...
static LogicalResult generateCleanupLoopForUnroll(AffineForOp forOp, uint64_t unrollFactor)
Helper to generate cleanup loop for unroll or unroll-and-jam when the trip count is not a multiple of...
static bool areInnerBoundsInvariant(AffineForOp forOp)
Check if all control operands of all loops are defined outside of forOp and return false if not.
static bool checkLoopInterchangeDependences(const std::vector< SmallVector< DependenceComponent, 2 >> &depCompsVec, ArrayRef< AffineForOp > loops, ArrayRef< unsigned > loopPermMap)
static void moveLoopBody(AffineForOp src, AffineForOp dest)
Move the loop body of AffineForOp 'src' from 'src' to the start of dest body.
static AffineIfOp createSeparationCondition(MutableArrayRef< AffineForOp > loops, OpBuilder b)
Creates an AffineIfOp that encodes the conditional to choose between the constant trip count version ...
static void getCleanupLoopLowerBound(AffineForOp forOp, unsigned unrollFactor, AffineMap &cleanupLbMap, SmallVectorImpl< Value > &cleanupLbOperands)
Computes the cleanup loop lower bound of the loop being unrolled with the specified unroll factor; th...
static void augmentMapAndBounds(OpBuilder &b, Value iv, AffineMap *map, SmallVector< Value, 4 > *operands, int64_t offset=0)
static AffineForOp generatePointWiseCopy(Location loc, Value memref, Value fastMemRef, ArrayRef< AffineMap > lbMaps, ArrayRef< Value > lbOperands, ArrayRef< AffineMap > ubMaps, ArrayRef< Value > ubOperands, ArrayRef< AffineExpr > fastBufOffsets, bool isCopyOut, OpBuilder b)
Generates a point-wise copy from/to ‘memref’ to/from ‘fastMemRef’ and returns the outermost AffineFor...
static void replaceIterArgsAndYieldResults(AffineForOp forOp)
Helper to replace uses of loop carried values (iter_args) and loop yield values while promoting singl...
bool isParallelLoop(Operation &op)
std::optional< uint64_t > getConstantTripCount(AffineForOp forOp)
Returns the trip count of the loop if it's a constant, std::nullopt otherwise.
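For illustration, a minimal sketch of guarding a transformation on this query (the full-unroll choice and the `forOp` variable are assumptions, not this file's code):

// Fully unroll only short loops; getConstantTripCount returns
// std::nullopt when the trip count is not a compile-time constant.
if (std::optional<uint64_t> tripCount = affine::getConstantTripCount(forOp))
  if (*tripCount <= 4)
    (void)affine::loopUnrollFull(forOp);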
void getDependenceComponents(AffineForOp forOp, unsigned maxLoopDepth, std::vector< SmallVector< DependenceComponent, 2 >> *depCompsVec)
Returns in 'depCompsVec', dependence components for dependences between all load and store ops in loo...
LogicalResult coalesceLoops(MutableArrayRef< AffineForOp > loops)
Replace a perfect nest of "for" loops with a single linearized loop.
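A minimal usage sketch, assuming `outermost` names the root of a perfectly nested affine band (a hypothetical variable, not from this file):

// Gather the band rooted at `outermost`, then flatten it into a single
// loop; coalesceLoops rewrites the bounds and remaps the old IVs.
SmallVector<AffineForOp, 4> band;
affine::getPerfectlyNestedLoops(band, outermost);
if (failed(affine::coalesceLoops(band)))
  outermost.emitRemark("coalescing was not applied");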
void fullyComposeAffineMapAndOperands(AffineMap *map, SmallVectorImpl< Value > *operands)
Given an affine map map and its input operands, this method composes into map, maps of AffineApplyOps...
LogicalResult loopUnrollFull(AffineForOp forOp)
Unrolls this for operation completely if the trip count is known to be constant.
LogicalResult promoteIfSingleIteration(AffineForOp forOp)
Promotes the loop body of an AffineForOp to its containing block if the loop was known to have a singl...
LogicalResult affineDataCopyGenerate(Block::iterator begin, Block::iterator end, const AffineCopyOptions ©Options, std::optional< Value > filterMemRef, DenseSet< Operation * > ©Nests)
Performs explicit copying for the contiguous sequence of operations in the block iterator range [‘beg...
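A hedged sketch of one way to invoke this over a loop body; the memory-space numbers and the 32 KiB capacity are illustrative assumptions only:

// Copy data accessed in forOp's body into memory space 1 using plain
// affine load/store copy loops (no DMA).
AffineCopyOptions options = {/*generateDma=*/false, /*slowMemorySpace=*/0,
                             /*fastMemorySpace=*/1, /*tagMemorySpace=*/0,
                             /*fastMemCapacityBytes=*/32 * 1024};
DenseSet<Operation *> copyNests;
(void)affine::affineDataCopyGenerate(forOp.getBody()->begin(),
                                     std::prev(forOp.getBody()->end()),
                                     options, /*filterMemRef=*/std::nullopt,
                                     copyNests);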
LogicalResult loopUnrollJamUpToFactor(AffineForOp forOp, uint64_t unrollJamFactor)
Unrolls and jams this loop by the specified factor or by the trip count (if constant),...
void extractForInductionVars(ArrayRef< AffineForOp > forInsts, SmallVectorImpl< Value > *ivs)
Extracts the induction variables from a list of AffineForOps and places them in the output argument i...
LogicalResult loopUnrollByFactor(AffineForOp forOp, uint64_t unrollFactor, function_ref< void(unsigned, Operation *, OpBuilder)> annotateFn=nullptr, bool cleanUpUnroll=false)
Unrolls this for operation by the specified unroll factor.
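For illustration, a hypothetical driver that unrolls every innermost loop in a function by a factor of 4 (the walk and the factor are assumptions, not part of this file):

// Innermost loops have no nested affine.for in their body.
func.walk([&](AffineForOp forOp) {
  if (!forOp.getBody()->getOps<AffineForOp>().empty())
    return;
  if (failed(affine::loopUnrollByFactor(forOp, /*unrollFactor=*/4)))
    forOp.emitRemark("unroll by 4 was not applied");
});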
void gatherLoops(func::FuncOp func, std::vector< SmallVector< AffineForOp, 2 >> &depthToLoops)
Gathers all AffineForOps in 'func.func' grouped by loop depth.
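A small sketch of consuming the depth-bucketed result (the remark is just a placeholder action):

// depthToLoops[d] holds all affine.for ops at loop depth d; the deepest
// non-empty bucket is the innermost level.
std::vector<SmallVector<AffineForOp, 2>> depthToLoops;
affine::gatherLoops(func, depthToLoops);
if (!depthToLoops.empty())
  for (AffineForOp innermost : depthToLoops.back())
    innermost.emitRemark("innermost loop");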
bool LLVM_ATTRIBUTE_UNUSED isPerfectlyNested(ArrayRef< AffineForOp > loops)
Returns true if loops is a perfectly nested loop nest, where loops appear in it from outermost to inn...
LogicalResult getIndexSet(MutableArrayRef< Operation * > ops, FlatAffineValueConstraints *domain)
Builds a system of constraints with dimensional variables corresponding to the loop IVs of the forOps...
AffineForOp createCanonicalizedAffineForOp(OpBuilder b, Location loc, ValueRange lbOperands, AffineMap lbMap, ValueRange ubOperands, AffineMap ubMap, int64_t step=1)
Creates an AffineForOp while ensuring that the lower and upper bounds are canonicalized,...
void getPerfectlyNestedLoops(SmallVectorImpl< AffineForOp > &nestedLoops, AffineForOp root)
Get perfectly nested sequence of loops starting at root of loop nest (the first op being another Affi...
LogicalResult affineForOpBodySkew(AffineForOp forOp, ArrayRef< uint64_t > shifts, bool unrollPrologueEpilogue=false)
Skew the operations in an affine.for's body with the specified operation-wise shifts.
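A hedged sketch of driving the skew entry point; per the assert in the body above, `shifts` must have one entry per operation in the loop body, and legality is checked first with isOpwiseShiftValid (the shift pattern here is an arbitrary example):

// Shift the first body op one iteration later, leaving the rest alone.
SmallVector<uint64_t, 8> shifts(forOp.getBody()->getOperations().size(), 0);
shifts.front() = 1;
if (affine::isOpwiseShiftValid(forOp, shifts))
  (void)affine::affineForOpBodySkew(forOp, shifts);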
void getTripCountMapAndOperands(AffineForOp forOp, AffineMap *map, SmallVectorImpl< Value > *operands)
Returns the trip count of the loop as an affine map with its corresponding operands if the latter is ...
bool isValidLoopInterchangePermutation(ArrayRef< AffineForOp > loops, ArrayRef< unsigned > loopPermMap)
Checks if the loop interchange permutation 'loopPermMap', of the perfectly nested sequence of loops i...
void getSupportedReductions(AffineForOp forOp, SmallVectorImpl< LoopReduction > &supportedReductions)
Populate supportedReductions with descriptors of the supported reductions.
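Sketch (hypothetical forOp):
  SmallVector<LoopReduction, 2> reductions;
  getSupportedReductions(forOp, reductions);
  // An empty vector means no recognized reduction is carried by the
  // loop's iter_args.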
LogicalResult generateCopyForMemRegion(const MemRefRegion &memrefRegion, Operation *analyzedOp, const AffineCopyOptions ©Options, CopyGenerateResult &result)
generateCopyForMemRegion is similar to affineDataCopyGenerate, but works with a single memref region.
void canonicalizeMapAndOperands(AffineMap *map, SmallVectorImpl< Value > *operands)
Modifies both map and operands in-place so as to:
LogicalResult loopUnrollUpToFactor(AffineForOp forOp, uint64_t unrollFactor)
Unrolls this loop by the specified unroll factor or its trip count, whichever is lower.
unsigned permuteLoops(ArrayRef< AffineForOp > inputNest, ArrayRef< unsigned > permMap)
Performs a loop permutation on a perfectly nested loop nest inputNest (where the contained loops appe...
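A sketch that swaps the two loops of a 2-d perfect nest; nest is a hypothetical handle ordered outermost first:
  SmallVector<unsigned, 4> perm = {1, 0}; // loop i moves to position perm[i]
  unsigned outermostIdx = permuteLoops(nest, perm);
  // outermostIdx identifies the loop that is now outermost.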
LogicalResult loopUnrollJamByFactor(AffineForOp forOp, uint64_t unrollJamFactor)
Unrolls and jams this loop by the specified factor.
LogicalResult tilePerfectlyNestedParametric(MutableArrayRef< AffineForOp > input, ArrayRef< Value > tileSizes, SmallVectorImpl< AffineForOp > *tiledNest=nullptr)
Tiles the specified band of perfectly nested loops creating tile-space loops and intra-tile loops,...
void canonicalizeSetAndOperands(IntegerSet *set, SmallVectorImpl< Value > *operands)
Canonicalizes an integer set the same way canonicalizeMapAndOperands does for affine maps.
void getAffineForIVs(Operation &op, SmallVectorImpl< AffineForOp > *loops)
Populates 'loops' with IVs of the affine.for ops surrounding 'op' ordered from the outermost 'affine....
uint64_t getLargestDivisorOfTripCount(AffineForOp forOp)
Returns the greatest known integral divisor of the trip count.
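Sketch showing the typical legality query this enables:
  uint64_t divisor = getLargestDivisorOfTripCount(forOp);
  // Unrolling by any factor that divides 'divisor' needs no cleanup loop.
  bool unrollBy4IsExact = (divisor % 4 == 0);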
std::optional< uint64_t > getIntOrFloatMemRefSizeInBytes(MemRefType memRefType)
Returns the size of a memref with element type int or float in bytes if it's statically shaped,...
int64_t numEnclosingInvariantLoops(OpOperand &operand)
Count the number of loops surrounding 'operand' above which 'operand' could be hoisted.
unsigned getNestingDepth(Operation *op)
Returns the nesting depth of this operation, i.e., the number of loops surrounding this operation.
void mapLoopToProcessorIds(scf::ForOp forOp, ArrayRef< Value > processorId, ArrayRef< Value > numProcessors)
Maps forOp for execution on a parallel grid of virtual processorIds of size given by numProcessors.
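Sketch for a 1-d grid; blockId and gridDim are Values assumed to be produced elsewhere (e.g. from gpu.block_id / gpu.grid_dim):
  mapLoopToProcessorIds(scfForOp, {blockId}, {gridDim});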
bool isOpwiseShiftValid(AffineForOp forOp, ArrayRef< uint64_t > shifts)
Checks whether SSA dominance would be violated if a for op's body operations are shifted by the specifi...
SmallVector< SmallVector< AffineForOp, 8 >, 8 > tile(ArrayRef< AffineForOp > forOps, ArrayRef< uint64_t > sizes, ArrayRef< AffineForOp > targets)
Performs tiling of imperfectly nested loops (with interchange) by strip-mining the forOps by sizes an...
AffineForOp sinkSequentialLoops(AffineForOp forOp)
LogicalResult tilePerfectlyNested(MutableArrayRef< AffineForOp > input, ArrayRef< unsigned > tileSizes, SmallVectorImpl< AffineForOp > *tiledNest=nullptr)
Tiles the specified band of perfectly nested loops creating tile-space loops and intra-tile loops.
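Sketch for a 2-d band; band is a hypothetical perfectly nested pair of loops:
  SmallVector<unsigned, 4> tileSizes = {32, 32};
  SmallVector<AffineForOp, 6> tiledNest;
  if (failed(tilePerfectlyNested(band, tileSizes, &tiledNest)))
    return; // band left untiled
  // On success, tiledNest holds the tile-space loops followed by the
  // intra-tile loops.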
void interchangeLoops(AffineForOp forOpA, AffineForOp forOpB)
Performs loop interchange on 'forOpA' and 'forOpB'.
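Sketch; forOpA is assumed to be the immediate parent of forOpB (validity can be checked beforehand with isValidLoopInterchangePermutation):
  interchangeLoops(forOpA, forOpB);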
LogicalResult coalescePerfectlyNestedAffineLoops(AffineForOp op)
Walk an affine.for to find a band to coalesce.
void getTileableBands(func::FuncOp f, std::vector< SmallVector< AffineForOp, 6 >> *bands)
Identify valid and profitable bands of loops to tile.
LogicalResult replaceAllMemRefUsesWith(Value oldMemRef, Value newMemRef, ArrayRef< Value > extraIndices={}, AffineMap indexRemap=AffineMap(), ArrayRef< Value > extraOperands={}, ArrayRef< Value > symbolOperands={}, Operation *domOpFilter=nullptr, Operation *postDomOpFilter=nullptr, bool allowNonDereferencingOps=false, bool replaceInDeallocOp=false)
Replaces all "dereferencing" uses of oldMemRef with newMemRef while optionally remapping the old memr...
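Sketch of the simplest form, with every remapping argument left at its default; oldMemRef and newMemRef are hypothetical values of compatible memref type:
  if (failed(replaceAllMemRefUsesWith(oldMemRef, newMemRef)))
    return; // some use could not be rewritten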
LogicalResult separateFullTiles(MutableArrayRef< AffineForOp > nest, SmallVectorImpl< AffineForOp > *fullTileNest=nullptr)
Separates full tiles from partial tiles for a perfect nest nest by generating a conditional guard tha...
Value getReductionOp(AtomicRMWKind op, OpBuilder &builder, Location loc, Value lhs, Value rhs)
Returns the value obtained by applying the reduction operation kind associated with a binary AtomicRM...
constexpr void enumerate(std::tuple< Tys... > &tuple, CallbackT &&callback)
llvm::TypeSize divideCeil(llvm::TypeSize numerator, uint64_t denominator)
Divides the known min value of the numerator by the denominator and rounds the result up to the next ...
AffineMap simplifyAffineMap(AffineMap map)
Simplifies an affine map by simplifying its underlying AffineExpr results.
void replaceAllUsesInRegionWith(Value orig, Value replacement, Region ®ion)
Replace all uses of orig within the given region with replacement.
AffineMap removeDuplicateExprs(AffineMap map)
Returns a map with the same dimension and symbol count as map, but whose results are the unique affin...
GreedyRewriteConfig config
LogicalResult applyOpPatternsGreedily(ArrayRef< Operation * > ops, const FrozenRewritePatternSet &patterns, GreedyRewriteConfig config=GreedyRewriteConfig(), bool *changed=nullptr, bool *allErased=nullptr)
Rewrite the specified ops by repeatedly applying the highest benefit patterns in a greedy worklist dr...
const FrozenRewritePatternSet & patterns
void bindSymbols(MLIRContext *ctx, AffineExprTy &...exprs)
Bind a list of AffineExpr references to SymbolExpr at positions: [0 .. sizeof...(exprs)]
auto get(MLIRContext *context, Ts &&...params)
Helper method that injects context only if needed; this helps unify some of the attribute constructio...
bool areValuesDefinedAbove(Range values, Region &limit)
Check if all values in the provided range are defined above the limit region.
GreedyRewriteStrictness::ExistingOps
Only pre-existing ops are processed.
SmallVector< std::pair< Block::iterator, Block::iterator > > subBlocks
Explicit copy / DMA generation options for mlir::affineDataCopyGenerate.
uint64_t fastMemCapacityBytes
Result for calling generateCopyForMemRegion.
std::optional< int64_t > ub
std::optional< int64_t > lb
A description of a (parallelizable) reduction in an affine loop.
A region of a memref's data space; this is typically constructed by analyzing load/store ops on this...
FlatAffineValueConstraints * getConstraints()
std::optional< int64_t > getConstantBoundingSizeAndShape(SmallVectorImpl< int64_t > *shape=nullptr, std::vector< SmallVector< int64_t, 4 >> *lbs=nullptr, SmallVectorImpl< int64_t > *lbDivisors=nullptr) const
Returns a constant upper bound on the number of elements in this region if bounded by a known constan...
void getLowerAndUpperBound(unsigned pos, AffineMap &lbMap, AffineMap &ubMap) const
Gets the lower and upper bound map for the dimensional variable at pos.
Value memref
Memref that this region corresponds to.
Location loc
If there is more than one load/store op associated with the region, the location information would co...
Eliminates variable at the specified position using Fourier-Motzkin variable elimination.