MLIR  22.0.0git
Utils.cpp
Go to the documentation of this file.
1 //===- Utils.cpp ---- Misc utilities for loop transformation ----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements miscellaneous loop transformation routines.
10 //
11 //===----------------------------------------------------------------------===//
12 
20 #include "mlir/IR/IRMapping.h"
21 #include "mlir/IR/OpDefinition.h"
22 #include "mlir/IR/PatternMatch.h"
25 #include "llvm/ADT/APInt.h"
26 #include "llvm/ADT/STLExtras.h"
27 #include "llvm/ADT/SmallVector.h"
28 #include "llvm/Support/DebugLog.h"
29 #include <cstdint>
30 
31 using namespace mlir;
32 
33 #define DEBUG_TYPE "scf-utils"
34 
    // NOTE(review): the declarator line of this definition (the
    // `SmallVector<scf::ForOp> mlir::replaceLoopNestWithNewYields(` line) is
    // missing from this rendering of the file — restore it before compiling.
    RewriterBase &rewriter, MutableArrayRef<scf::ForOp> loopNest,
    ValueRange newIterOperands, const NewYieldValuesFn &newYieldValuesFn,
    bool replaceIterOperandsUsesInLoop) {
  // An empty nest has nothing to rewrite.
  if (loopNest.empty())
    return {};
  // This method is recursive (to make it more readable). Adding an
  // assertion here to limit the recursion. (See
  // https://discourse.llvm.org/t/rfc-update-to-mlir-developer-policy-on-recursion/62235)
  assert(loopNest.size() <= 10 &&
         "exceeded recursion limit when yielding value from loop nest");

  // To yield a value from a perfectly nested loop nest, the following
  // pattern needs to be created, i.e. starting with
  //
  // ```mlir
  //  scf.for .. {
  //    scf.for .. {
  //      scf.for .. {
  //        %value = ...
  //      }
  //    }
  //  }
  // ```
  //
  // needs to be modified to
  //
  // ```mlir
  //  %0 = scf.for .. iter_args(%arg0 = %init) {
  //    %1 = scf.for .. iter_args(%arg1 = %arg0) {
  //      %2 = scf.for .. iter_args(%arg2 = %arg1) {
  //        %value = ...
  //        scf.yield %value
  //      }
  //      scf.yield %2
  //    }
  //    scf.yield %1
  //  }
  // ```
  //
  // The inner most loop is handled using the `replaceWithAdditionalYields`
  // that works on a single loop.
  if (loopNest.size() == 1) {
    auto innerMostLoop =
        cast<scf::ForOp>(*loopNest.back().replaceWithAdditionalYields(
            rewriter, newIterOperands, replaceIterOperandsUsesInLoop,
            newYieldValuesFn));
    return {innerMostLoop};
  }
  // The outer loops are modified by calling this method recursively
  // - The return value of the inner loop is the value yielded by this loop.
  // - The region iter args of this loop are the init_args for the inner loop.
  SmallVector<scf::ForOp> newLoopNest;
  NewYieldValuesFn fn =
      [&](OpBuilder &innerBuilder, Location loc,
          ArrayRef<BlockArgument> innerNewBBArgs) -> SmallVector<Value> {
    // Recurse on the rest of the nest; this loop yields the trailing results
    // of the (new) immediately nested loop.
    newLoopNest = replaceLoopNestWithNewYields(rewriter, loopNest.drop_front(),
                                               innerNewBBArgs, newYieldValuesFn,
                                               replaceIterOperandsUsesInLoop);
    return llvm::to_vector(llvm::map_range(
        newLoopNest.front().getResults().take_back(innerNewBBArgs.size()),
        [](OpResult r) -> Value { return r; }));
  };
  scf::ForOp outerMostLoop =
      cast<scf::ForOp>(*loopNest.front().replaceWithAdditionalYields(
          rewriter, newIterOperands, replaceIterOperandsUsesInLoop, fn));
  newLoopNest.insert(newLoopNest.begin(), outerMostLoop);
  return newLoopNest;
}
104 
105 /// Outline a region with a single block into a new FuncOp.
106 /// Assumes the FuncOp result types is the type of the yielded operands of the
107 /// single block. This constraint makes it easy to determine the result.
108 /// This method also clones the `arith::ConstantIndexOp` at the start of
109 /// `outlinedFuncBody` to allow simple canonicalizations. If `callOp` is
110 /// provided, it will be set to point to the operation that calls the outlined
111 /// function.
112 // TODO: support more than single-block regions.
113 // TODO: more flexible constant handling.
FailureOr<func::FuncOp> mlir::outlineSingleBlockRegion(RewriterBase &rewriter,
                                                       Location loc,
                                                       Region &region,
                                                       StringRef funcName,
                                                       func::CallOp *callOp) {
  assert(!funcName.empty() && "funcName cannot be empty");
  // Only single-block regions are supported (see TODO above).
  if (!region.hasOneBlock())
    return failure();

  Block *originalBlock = &region.front();
  Operation *originalTerminator = originalBlock->getTerminator();

  // Outline before current function.
  OpBuilder::InsertionGuard g(rewriter);
  rewriter.setInsertionPoint(region.getParentOfType<FunctionOpInterface>());

  // Values defined above `region` must become extra arguments of the
  // outlined function.
  SetVector<Value> captures;
  getUsedValuesDefinedAbove(region, captures);

  ValueRange outlinedValues(captures.getArrayRef());
  SmallVector<Type> outlinedFuncArgTypes;
  SmallVector<Location> outlinedFuncArgLocs;
  // Region's arguments are exactly the first block's arguments as per
  // Region::getArguments().
  // Func's arguments are cat(regions's arguments, captures arguments).
  for (BlockArgument arg : region.getArguments()) {
    outlinedFuncArgTypes.push_back(arg.getType());
    outlinedFuncArgLocs.push_back(arg.getLoc());
  }
  for (Value value : outlinedValues) {
    outlinedFuncArgTypes.push_back(value.getType());
    outlinedFuncArgLocs.push_back(value.getLoc());
  }
  // The outlined function returns whatever the original terminator yielded.
  FunctionType outlinedFuncType =
      FunctionType::get(rewriter.getContext(), outlinedFuncArgTypes,
                        originalTerminator->getOperandTypes());
  auto outlinedFunc =
      func::FuncOp::create(rewriter, loc, funcName, outlinedFuncType);
  Block *outlinedFuncBody = outlinedFunc.addEntryBlock();

  // Merge blocks while replacing the original block operands.
  // Warning: `mergeBlocks` erases the original block, reconstruct it later.
  int64_t numOriginalBlockArguments = originalBlock->getNumArguments();
  auto outlinedFuncBlockArgs = outlinedFuncBody->getArguments();
  {
    OpBuilder::InsertionGuard g(rewriter);
    rewriter.setInsertionPointToEnd(outlinedFuncBody);
    rewriter.mergeBlocks(
        originalBlock, outlinedFuncBody,
        outlinedFuncBlockArgs.take_front(numOriginalBlockArguments));
    // Explicitly set up a new ReturnOp terminator.
    rewriter.setInsertionPointToEnd(outlinedFuncBody);
    func::ReturnOp::create(rewriter, loc, originalTerminator->getResultTypes(),
                           originalTerminator->getOperands());
  }

  // Reconstruct the block that was deleted and add a
  // terminator(call_results).
  Block *newBlock = rewriter.createBlock(
      &region, region.begin(),
      TypeRange{outlinedFuncArgTypes}.take_front(numOriginalBlockArguments),
      ArrayRef<Location>(outlinedFuncArgLocs)
          .take_front(numOriginalBlockArguments));
  {
    OpBuilder::InsertionGuard g(rewriter);
    rewriter.setInsertionPointToEnd(newBlock);
    // The call forwards the reconstructed block arguments plus the captures.
    SmallVector<Value> callValues;
    llvm::append_range(callValues, newBlock->getArguments());
    llvm::append_range(callValues, outlinedValues);
    auto call = func::CallOp::create(rewriter, loc, outlinedFunc, callValues);
    if (callOp)
      *callOp = call;

    // `originalTerminator` was moved to `outlinedFuncBody` and is still valid.
    // Clone `originalTerminator` to take the callOp results then erase it from
    // `outlinedFuncBody`.
    IRMapping bvm;
    bvm.map(originalTerminator->getOperands(), call->getResults());
    rewriter.clone(*originalTerminator, bvm);
    rewriter.eraseOp(originalTerminator);
  }

  // Lastly, explicit RAUW outlinedValues, only for uses within `outlinedFunc`.
  // Clone the `arith::ConstantIndexOp` at the start of `outlinedFuncBody`.
  for (auto it : llvm::zip(outlinedValues, outlinedFuncBlockArgs.take_back(
                                               outlinedValues.size()))) {
    Value orig = std::get<0>(it);
    Value repl = std::get<1>(it);
    {
      OpBuilder::InsertionGuard g(rewriter);
      rewriter.setInsertionPointToStart(outlinedFuncBody);
      // Index constants are re-materialized inside the function instead of
      // being passed as arguments, enabling simple canonicalizations.
      if (Operation *cst = orig.getDefiningOp<arith::ConstantIndexOp>()) {
        repl = rewriter.clone(*cst)->getResult(0);
      }
    }
    orig.replaceUsesWithIf(repl, [&](OpOperand &opOperand) {
      return outlinedFunc->isProperAncestor(opOperand.getOwner());
    });
  }

  return outlinedFunc;
}
216 
217 LogicalResult mlir::outlineIfOp(RewriterBase &b, scf::IfOp ifOp,
218  func::FuncOp *thenFn, StringRef thenFnName,
219  func::FuncOp *elseFn, StringRef elseFnName) {
220  IRRewriter rewriter(b);
221  Location loc = ifOp.getLoc();
222  FailureOr<func::FuncOp> outlinedFuncOpOrFailure;
223  if (thenFn && !ifOp.getThenRegion().empty()) {
224  outlinedFuncOpOrFailure = outlineSingleBlockRegion(
225  rewriter, loc, ifOp.getThenRegion(), thenFnName);
226  if (failed(outlinedFuncOpOrFailure))
227  return failure();
228  *thenFn = *outlinedFuncOpOrFailure;
229  }
230  if (elseFn && !ifOp.getElseRegion().empty()) {
231  outlinedFuncOpOrFailure = outlineSingleBlockRegion(
232  rewriter, loc, ifOp.getElseRegion(), elseFnName);
233  if (failed(outlinedFuncOpOrFailure))
234  return failure();
235  *elseFn = *outlinedFuncOpOrFailure;
236  }
237  return success();
238 }
239 
  // NOTE(review): the declarator line of this definition (presumably
  // `bool mlir::getInnermostParallelLoops(Operation *rootOp,
  // SmallVectorImpl<scf::ParallelOp> &result)`) is missing from this
  // rendering of the file — restore it before compiling.
  assert(rootOp != nullptr && "Root operation must not be a nullptr.");
  // Whether any region under `rootOp` transitively contains an scf.parallel.
  bool rootEnclosesPloops = false;
  for (Region &region : rootOp->getRegions()) {
    for (Block &block : region.getBlocks()) {
      for (Operation &op : block) {
        // Recurse first so we know whether `op` itself encloses any
        // parallel loops.
        bool enclosesPloops = getInnermostParallelLoops(&op, result);
        rootEnclosesPloops |= enclosesPloops;
        if (auto ploop = dyn_cast<scf::ParallelOp>(op)) {
          rootEnclosesPloops = true;

          // Collect parallel loop if it is an innermost one.
          if (!enclosesPloops)
            result.push_back(ploop);
        }
      }
    }
  }
  return rootEnclosesPloops;
}
261 
262 // Build the IR that performs ceil division of a positive value by a constant:
263 // ceildiv(a, B) = divis(a + (B-1), B)
264 // where divis is rounding-to-zero division.
265 static Value ceilDivPositive(OpBuilder &builder, Location loc, Value dividend,
266  int64_t divisor) {
267  assert(divisor > 0 && "expected positive divisor");
268  assert(dividend.getType().isIntOrIndex() &&
269  "expected integer or index-typed value");
270 
271  Value divisorMinusOneCst = arith::ConstantOp::create(
272  builder, loc, builder.getIntegerAttr(dividend.getType(), divisor - 1));
273  Value divisorCst = arith::ConstantOp::create(
274  builder, loc, builder.getIntegerAttr(dividend.getType(), divisor));
275  Value sum = arith::AddIOp::create(builder, loc, dividend, divisorMinusOneCst);
276  return arith::DivUIOp::create(builder, loc, sum, divisorCst);
277 }
278 
279 // Build the IR that performs ceil division of a positive value by another
280 // positive value:
281 // ceildiv(a, b) = divis(a + (b - 1), b)
282 // where divis is rounding-to-zero division.
283 static Value ceilDivPositive(OpBuilder &builder, Location loc, Value dividend,
284  Value divisor) {
285  assert(dividend.getType().isIntOrIndex() &&
286  "expected integer or index-typed value");
287  Value cstOne = arith::ConstantOp::create(
288  builder, loc, builder.getOneAttr(dividend.getType()));
289  Value divisorMinusOne = arith::SubIOp::create(builder, loc, divisor, cstOne);
290  Value sum = arith::AddIOp::create(builder, loc, dividend, divisorMinusOne);
291  return arith::DivUIOp::create(builder, loc, sum, divisor);
292 }
293 
    // NOTE(review): the declarator line of this definition (presumably
    // `static void generateUnrolledLoop(`) is missing from this rendering of
    // the file — restore it before compiling.
    Block *loopBodyBlock, Value iv, uint64_t unrollFactor,
    function_ref<Value(unsigned, Value, OpBuilder)> ivRemapFn,
    function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn,
    ValueRange iterArgs, ValueRange yieldedValues,
    IRMapping *clonedToSrcOpsMap) {

  // Check if the op was cloned from another source op, and return it if found
  // (or the same op if not found).
  auto findOriginalSrcOp =
      [](Operation *op, const IRMapping &clonedToSrcOpsMap) -> Operation * {
    Operation *srcOp = op;
    // If the source op derives from another op: traverse the chain to find the
    // original source op.
    while (srcOp && clonedToSrcOpsMap.contains(srcOp))
      srcOp = clonedToSrcOpsMap.lookup(srcOp);
    return srcOp;
  };

  // Builder to insert unrolled bodies just before the terminator of the body of
  // the loop.
  auto builder = OpBuilder::atBlockTerminator(loopBodyBlock);

  // Default the annotation callback to a no-op so it can be called
  // unconditionally below.
  static const auto noopAnnotateFn = [](unsigned, Operation *, OpBuilder) {};
  if (!annotateFn)
    annotateFn = noopAnnotateFn;

  // Keep a pointer to the last non-terminator operation in the original block
  // so that we know what to clone (since we are doing this in-place).
  Block::iterator srcBlockEnd = std::prev(loopBodyBlock->end(), 2);

  // Unroll the contents of the loop body (append unrollFactor - 1 additional
  // copies).
  SmallVector<Value, 4> lastYielded(yieldedValues);

  for (unsigned i = 1; i < unrollFactor; i++) {
    // Prepare operand map: each copy consumes the previous copy's yields.
    IRMapping operandMap;
    operandMap.map(iterArgs, lastYielded);

    // If the induction variable is used, create a remapping to the value for
    // this unrolled instance.
    if (!iv.use_empty()) {
      Value ivUnroll = ivRemapFn(i, iv, builder);
      operandMap.map(iv, ivUnroll);
    }

    // Clone the original body of 'forOp'.
    for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++) {
      Operation *srcOp = &(*it);
      Operation *clonedOp = builder.clone(*srcOp, operandMap);
      annotateFn(i, clonedOp, builder);
      // Record the clone's provenance, collapsing chains of clones back to
      // the original op.
      if (clonedToSrcOpsMap)
        clonedToSrcOpsMap->map(clonedOp,
                               findOriginalSrcOp(srcOp, *clonedToSrcOpsMap));
    }

    // Update yielded values.
    for (unsigned i = 0, e = lastYielded.size(); i < e; i++)
      lastYielded[i] = operandMap.lookupOrDefault(yieldedValues[i]);
  }

  // Make sure we annotate the Ops in the original body. We do this last so that
  // any annotations are not copied into the cloned Ops above.
  for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++)
    annotateFn(0, &*it, builder);

  // Update operands of the yield statement.
  loopBodyBlock->getTerminator()->setOperands(lastYielded);
}
364 
/// Unrolls 'forOp' by 'unrollFactor', returns the unrolled main loop and the
/// epilogue loop, if the loop is unrolled.
FailureOr<UnrolledLoopInfo> mlir::loopUnrollByFactor(
    scf::ForOp forOp, uint64_t unrollFactor,
    function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn) {
  assert(unrollFactor > 0 && "expected positive unroll factor");

  // Return if the loop body is empty.
  if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
    return UnrolledLoopInfo{forOp, std::nullopt};

  // Compute tripCount = ceilDiv((upperBound - lowerBound), step) and populate
  // 'upperBoundUnrolled' and 'stepUnrolled' for static and dynamic cases.
  OpBuilder boundsBuilder(forOp);
  IRRewriter rewriter(forOp.getContext());
  auto loc = forOp.getLoc();
  Value step = forOp.getStep();
  Value upperBoundUnrolled;
  Value stepUnrolled;
  bool generateEpilogueLoop = true;

  std::optional<APInt> constTripCount = forOp.getStaticTripCount();
  if (constTripCount) {
    // Constant loop bounds computation.
    int64_t lbCst = getConstantIntValue(forOp.getLowerBound()).value();
    int64_t ubCst = getConstantIntValue(forOp.getUpperBound()).value();
    int64_t stepCst = getConstantIntValue(forOp.getStep()).value();
    if (unrollFactor == 1) {
      // Unrolling by 1 is a no-op except that a known single-iteration loop
      // can still be promoted into its parent block.
      if (*constTripCount == 1 &&
          failed(forOp.promoteIfSingleIteration(rewriter)))
        return failure();
      return UnrolledLoopInfo{forOp, std::nullopt};
    }

    // Largest multiple of `unrollFactor` not exceeding the trip count; the
    // remainder iterations go to the epilogue loop.
    int64_t tripCountEvenMultiple =
        constTripCount->getSExtValue() -
        (constTripCount->getSExtValue() % unrollFactor);
    int64_t upperBoundUnrolledCst = lbCst + tripCountEvenMultiple * stepCst;
    int64_t stepUnrolledCst = stepCst * unrollFactor;

    // Create constant for 'upperBoundUnrolled' and set epilogue loop flag.
    generateEpilogueLoop = upperBoundUnrolledCst < ubCst;
    if (generateEpilogueLoop)
      upperBoundUnrolled = arith::ConstantOp::create(
          boundsBuilder, loc,
          boundsBuilder.getIntegerAttr(forOp.getUpperBound().getType(),
                                       upperBoundUnrolledCst));
    else
      upperBoundUnrolled = forOp.getUpperBound();

    // Create constant for 'stepUnrolled'.
    stepUnrolled =
        stepCst == stepUnrolledCst
            ? step
            : arith::ConstantOp::create(boundsBuilder, loc,
                                        boundsBuilder.getIntegerAttr(
                                            step.getType(), stepUnrolledCst));
  } else {
    // Dynamic loop bounds computation.
    // TODO: Add dynamic asserts for negative lb/ub/step, or
    // consider using ceilDiv from AffineApplyExpander.
    auto lowerBound = forOp.getLowerBound();
    auto upperBound = forOp.getUpperBound();
    Value diff =
        arith::SubIOp::create(boundsBuilder, loc, upperBound, lowerBound);
    Value tripCount = ceilDivPositive(boundsBuilder, loc, diff, step);
    Value unrollFactorCst = arith::ConstantOp::create(
        boundsBuilder, loc,
        boundsBuilder.getIntegerAttr(tripCount.getType(), unrollFactor));
    Value tripCountRem =
        arith::RemSIOp::create(boundsBuilder, loc, tripCount, unrollFactorCst);
    // Compute tripCountEvenMultiple = tripCount - (tripCount % unrollFactor)
    Value tripCountEvenMultiple =
        arith::SubIOp::create(boundsBuilder, loc, tripCount, tripCountRem);
    // Compute upperBoundUnrolled = lowerBound + tripCountEvenMultiple * step
    upperBoundUnrolled = arith::AddIOp::create(
        boundsBuilder, loc, lowerBound,
        arith::MulIOp::create(boundsBuilder, loc, tripCountEvenMultiple, step));
    // Scale 'step' by 'unrollFactor'.
    stepUnrolled =
        arith::MulIOp::create(boundsBuilder, loc, step, unrollFactorCst);
  }

  UnrolledLoopInfo resultLoops;

  // Create epilogue clean up loop starting at 'upperBoundUnrolled'.
  if (generateEpilogueLoop) {
    OpBuilder epilogueBuilder(forOp->getContext());
    epilogueBuilder.setInsertionPointAfter(forOp);
    auto epilogueForOp = cast<scf::ForOp>(epilogueBuilder.clone(*forOp));
    epilogueForOp.setLowerBound(upperBoundUnrolled);

    // Update uses of loop results.
    auto results = forOp.getResults();
    auto epilogueResults = epilogueForOp.getResults();

    for (auto e : llvm::zip(results, epilogueResults)) {
      std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
    }
    // The epilogue resumes its iter_args from the main loop's results.
    epilogueForOp->setOperands(epilogueForOp.getNumControlOperands(),
                               epilogueForOp.getInitArgs().size(), results);
    if (epilogueForOp.promoteIfSingleIteration(rewriter).failed())
      resultLoops.epilogueLoopOp = epilogueForOp;
  }

  // Create unrolled loop.
  forOp.setUpperBound(upperBoundUnrolled);
  forOp.setStep(stepUnrolled);

  auto iterArgs = ValueRange(forOp.getRegionIterArgs());
  auto yieldedValues = forOp.getBody()->getTerminator()->getOperands();

  // NOTE(review): the callee line of this call (presumably
  // `generateUnrolledLoop(`) is missing from this rendering of the file;
  // only its arguments are visible below — restore it before compiling.
      forOp.getBody(), forOp.getInductionVar(), unrollFactor,
      [&](unsigned i, Value iv, OpBuilder b) {
        // iv' = iv + step * i;
        auto stride = arith::MulIOp::create(
            b, loc, step,
            arith::ConstantOp::create(b, loc,
                                      b.getIntegerAttr(iv.getType(), i)));
        return arith::AddIOp::create(b, loc, iv, stride);
      },
      annotateFn, iterArgs, yieldedValues);
  // Promote the loop body up if this has turned into a single iteration loop.
  if (forOp.promoteIfSingleIteration(rewriter).failed())
    resultLoops.mainLoopOp = forOp;
  return resultLoops;
}
493 
494 /// Unrolls this loop completely.
495 LogicalResult mlir::loopUnrollFull(scf::ForOp forOp) {
496  IRRewriter rewriter(forOp.getContext());
497  std::optional<APInt> mayBeConstantTripCount = forOp.getStaticTripCount();
498  if (!mayBeConstantTripCount.has_value())
499  return failure();
500  const APInt &tripCount = *mayBeConstantTripCount;
501  if (tripCount.isZero())
502  return success();
503  if (tripCount.getSExtValue() == 1)
504  return forOp.promoteIfSingleIteration(rewriter);
505  return loopUnrollByFactor(forOp, tripCount.getSExtValue());
506 }
507 
508 /// Check if bounds of all inner loops are defined outside of `forOp`
509 /// and return false if not.
510 static bool areInnerBoundsInvariant(scf::ForOp forOp) {
511  auto walkResult = forOp.walk([&](scf::ForOp innerForOp) {
512  if (!forOp.isDefinedOutsideOfLoop(innerForOp.getLowerBound()) ||
513  !forOp.isDefinedOutsideOfLoop(innerForOp.getUpperBound()) ||
514  !forOp.isDefinedOutsideOfLoop(innerForOp.getStep()))
515  return WalkResult::interrupt();
516 
517  return WalkResult::advance();
518  });
519  return !walkResult.wasInterrupted();
520 }
521 
/// Unrolls and jams this loop by the specified factor.
LogicalResult mlir::loopUnrollJamByFactor(scf::ForOp forOp,
                                          uint64_t unrollJamFactor) {
  assert(unrollJamFactor > 0 && "unroll jam factor should be positive");

  if (unrollJamFactor == 1)
    return success();

  // If any control operand of any inner loop of `forOp` is defined within
  // `forOp`, no unroll jam.
  if (!areInnerBoundsInvariant(forOp)) {
    LDBG() << "failed to unroll and jam: inner bounds are not invariant";
    return failure();
  }

  // Currently, for operations with results are not supported.
  if (forOp->getNumResults() > 0) {
    LDBG() << "failed to unroll and jam: unsupported loop with results";
    return failure();
  }

  // Currently, only constant trip count that divided by the unroll factor is
  // supported.
  std::optional<APInt> tripCount = forOp.getStaticTripCount();
  if (!tripCount.has_value()) {
    // If the trip count is dynamic, do not unroll & jam.
    LDBG() << "failed to unroll and jam: trip count could not be determined";
    return failure();
  }
  if (unrollJamFactor > tripCount->getZExtValue()) {
    LDBG() << "unroll and jam factor is greater than trip count, set factor to "
              "trip "
              "count";
    unrollJamFactor = tripCount->getZExtValue();
  } else if (tripCount->getSExtValue() % unrollJamFactor != 0) {
    LDBG() << "failed to unroll and jam: unsupported trip count that is not a "
              "multiple of unroll jam factor";
    return failure();
  }

  // Nothing in the loop body other than the terminator.
  if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
    return success();

  // Gather all sub-blocks to jam upon the loop being unrolled.
  // NOTE(review): the declaration of `jbg` (presumably a
  // `JamBlockGatherer<scf::ForOp>`) is missing from this rendering of the
  // file — restore it before compiling.
  jbg.walk(forOp);
  auto &subBlocks = jbg.subBlocks;

  // Collect inner loops.
  SmallVector<scf::ForOp> innerLoops;
  forOp.walk([&](scf::ForOp innerForOp) { innerLoops.push_back(innerForOp); });

  // `operandMaps[i - 1]` carries old->new operand mapping for the ith unrolled
  // iteration. There are (`unrollJamFactor` - 1) iterations.
  SmallVector<IRMapping> operandMaps(unrollJamFactor - 1);

  // For any loop with iter_args, replace it with a new loop that has
  // `unrollJamFactor` copies of its iterOperands, iter_args and yield
  // operands.
  SmallVector<scf::ForOp> newInnerLoops;
  IRRewriter rewriter(forOp.getContext());
  for (scf::ForOp oldForOp : innerLoops) {
    SmallVector<Value> dupIterOperands, dupYieldOperands;
    ValueRange oldIterOperands = oldForOp.getInits();
    ValueRange oldIterArgs = oldForOp.getRegionIterArgs();
    ValueRange oldYieldOperands =
        cast<scf::YieldOp>(oldForOp.getBody()->getTerminator()).getOperands();
    // Get additional iterOperands, iterArgs, and yield operands. We will
    // fix iterOperands and yield operands after cloning of sub-blocks.
    for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
      dupIterOperands.append(oldIterOperands.begin(), oldIterOperands.end());
      dupYieldOperands.append(oldYieldOperands.begin(), oldYieldOperands.end());
    }
    // Create a new loop with additional iterOperands, iter_args and yield
    // operands. This new loop will take the loop body of the original loop.
    bool forOpReplaced = oldForOp == forOp;
    scf::ForOp newForOp =
        cast<scf::ForOp>(*oldForOp.replaceWithAdditionalYields(
            rewriter, dupIterOperands, /*replaceInitOperandUsesInLoop=*/false,
            [&](OpBuilder &b, Location loc, ArrayRef<BlockArgument> newBbArgs) {
              return dupYieldOperands;
            }));
    newInnerLoops.push_back(newForOp);
    // `forOp` has been replaced with a new loop.
    if (forOpReplaced)
      forOp = newForOp;
    // Update `operandMaps` for `newForOp` iterArgs and results.
    ValueRange newIterArgs = newForOp.getRegionIterArgs();
    unsigned oldNumIterArgs = oldIterArgs.size();
    ValueRange newResults = newForOp.getResults();
    unsigned oldNumResults = newResults.size() / unrollJamFactor;
    assert(oldNumIterArgs == oldNumResults &&
           "oldNumIterArgs must be the same as oldNumResults");
    for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
      for (unsigned j = 0; j < oldNumIterArgs; ++j) {
        // `newForOp` has `unrollJamFactor` - 1 new sets of iterArgs and
        // results. Update `operandMaps[i - 1]` to map old iterArgs and results
        // to those in the `i`th new set.
        operandMaps[i - 1].map(newIterArgs[j],
                               newIterArgs[i * oldNumIterArgs + j]);
        operandMaps[i - 1].map(newResults[j],
                               newResults[i * oldNumResults + j]);
      }
    }
  }

  // Scale the step of loop being unroll-jammed by the unroll-jam factor.
  rewriter.setInsertionPoint(forOp);
  int64_t step = forOp.getConstantStep()->getSExtValue();
  auto newStep = rewriter.createOrFold<arith::MulIOp>(
      forOp.getLoc(), forOp.getStep(),
      rewriter.createOrFold<arith::ConstantOp>(
          forOp.getLoc(), rewriter.getIndexAttr(unrollJamFactor)));
  forOp.setStep(newStep);
  auto forOpIV = forOp.getInductionVar();

  // Unroll and jam (appends unrollJamFactor - 1 additional copies).
  for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
    for (auto &subBlock : subBlocks) {
      // Builder to insert unroll-jammed bodies. Insert right at the end of
      // sub-block.
      OpBuilder builder(subBlock.first->getBlock(), std::next(subBlock.second));

      // If the induction variable is used, create a remapping to the value for
      // this unrolled instance.
      if (!forOpIV.use_empty()) {
        // iv' = iv + i * step, i = 1 to unrollJamFactor-1.
        auto ivTag = builder.createOrFold<arith::ConstantOp>(
            forOp.getLoc(), builder.getIndexAttr(step * i));
        auto ivUnroll =
            builder.createOrFold<arith::AddIOp>(forOp.getLoc(), forOpIV, ivTag);
        operandMaps[i - 1].map(forOpIV, ivUnroll);
      }
      // Clone the sub-block being unroll-jammed.
      for (auto it = subBlock.first; it != std::next(subBlock.second); ++it)
        builder.clone(*it, operandMaps[i - 1]);
    }
    // Fix iterOperands and yield op operands of newly created loops.
    for (auto newForOp : newInnerLoops) {
      unsigned oldNumIterOperands =
          newForOp.getNumRegionIterArgs() / unrollJamFactor;
      unsigned numControlOperands = newForOp.getNumControlOperands();
      auto yieldOp = cast<scf::YieldOp>(newForOp.getBody()->getTerminator());
      unsigned oldNumYieldOperands = yieldOp.getNumOperands() / unrollJamFactor;
      assert(oldNumIterOperands == oldNumYieldOperands &&
             "oldNumIterOperands must be the same as oldNumYieldOperands");
      for (unsigned j = 0; j < oldNumIterOperands; ++j) {
        // The `i`th duplication of an old iterOperand or yield op operand
        // needs to be replaced with a mapped value from `operandMaps[i - 1]`
        // if such mapped value exists.
        newForOp.setOperand(numControlOperands + i * oldNumIterOperands + j,
                            operandMaps[i - 1].lookupOrDefault(
                                newForOp.getOperand(numControlOperands + j)));
        yieldOp.setOperand(
            i * oldNumYieldOperands + j,
            operandMaps[i - 1].lookupOrDefault(yieldOp.getOperand(j)));
      }
    }
  }

  // Promote the loop body up if this has turned into a single iteration loop.
  (void)forOp.promoteIfSingleIteration(rewriter);
  return success();
}
687 
    // NOTE(review): the declarator line of this definition (presumably
    // `static Range emitNormalizedLoopBoundsForIndexType(RewriterBase
    // &rewriter,`) is missing from this rendering of the file.
    Location loc, OpFoldResult lb,
    OpFoldResult ub,
    OpFoldResult step) {
  // A normalized loop runs from 0 with stride 1.
  Range normalizedLoopBounds;
  normalizedLoopBounds.offset = rewriter.getIndexAttr(0);
  normalizedLoopBounds.stride = rewriter.getIndexAttr(1);
  // size = ceildiv(ub - lb, step), computed as a folded composed
  // affine.apply over (lb, ub, step).
  AffineExpr s0, s1, s2;
  bindSymbols(rewriter.getContext(), s0, s1, s2);
  AffineExpr e = (s1 - s0).ceilDiv(s2);
  normalizedLoopBounds.size =
      affine::makeComposedFoldedAffineApply(rewriter, loc, e, {lb, ub, step});
  return normalizedLoopBounds;
}
702 
    // NOTE(review): the declarator line of this definition (presumably
    // `Range mlir::emitNormalizedLoopBounds(RewriterBase &rewriter, Location
    // loc,`) is missing from this rendering of the file.
    OpFoldResult lb, OpFoldResult ub,
    OpFoldResult step) {
  // Index-typed bounds are normalized with affine ops rather than arith.
  if (getType(lb).isIndex()) {
    return emitNormalizedLoopBoundsForIndexType(rewriter, loc, lb, ub, step);
  }
  // For non-index types, generate `arith` instructions
  // Check if the loop is already known to have a constant zero lower bound or
  // a constant one step.
  bool isZeroBased = false;
  if (auto lbCst = getConstantIntValue(lb))
    isZeroBased = lbCst.value() == 0;

  bool isStepOne = false;
  if (auto stepCst = getConstantIntValue(step))
    isStepOne = stepCst.value() == 1;

  Type rangeType = getType(lb);
  assert(rangeType == getType(ub) && rangeType == getType(step) &&
         "expected matching types");

  // Compute the number of iterations the loop executes: ceildiv(ub - lb, step)
  // assuming the step is strictly positive. Update the bounds and the step
  // of the loop to go from 0 to the number of iterations, if necessary.
  if (isZeroBased && isStepOne)
    return {lb, ub, step};

  OpFoldResult diff = ub;
  if (!isZeroBased) {
    diff = rewriter.createOrFold<arith::SubIOp>(
        loc, getValueOrCreateConstantIntOp(rewriter, loc, ub),
        getValueOrCreateConstantIntOp(rewriter, loc, lb));
  }
  OpFoldResult newUpperBound = diff;
  if (!isStepOne) {
    newUpperBound = rewriter.createOrFold<arith::CeilDivSIOp>(
        loc, getValueOrCreateConstantIntOp(rewriter, loc, diff),
        getValueOrCreateConstantIntOp(rewriter, loc, step));
  }

  OpFoldResult newLowerBound = rewriter.getZeroAttr(rangeType);
  OpFoldResult newStep = rewriter.getOneAttr(rangeType);

  return {newLowerBound, newUpperBound, newStep};
}
748 
    // NOTE(review): the declarator line of this definition (presumably
    // `static void denormalizeInductionVariableForIndexType(RewriterBase
    // &rewriter,`) is missing from this rendering of the file.
    Location loc,
    Value normalizedIv,
    OpFoldResult origLb,
    OpFoldResult origStep) {
  // denormalized iv = normalizedIv * origStep + origLb, built as a folded
  // composed affine.apply.
  AffineExpr d0, s0, s1;
  bindSymbols(rewriter.getContext(), s0, s1);
  bindDims(rewriter.getContext(), d0);
  AffineExpr e = d0 * s1 + s0;
  // NOTE(review): the line defining `denormalizedIv` (presumably a call to
  // affine::makeComposedFoldedAffineApply) is missing from this rendering of
  // the file; only its argument list is visible below.
      rewriter, loc, e, ArrayRef<OpFoldResult>{normalizedIv, origLb, origStep});
  Value denormalizedIvVal =
      getValueOrCreateConstantIndexOp(rewriter, loc, denormalizedIv);
  SmallPtrSet<Operation *, 1> preservedUses;
  // If an `affine.apply` operation is generated for denormalization, the use
  // of `origLb` in those ops must not be replaced. These aren't generated
  // when `origLb == 0` and `origStep == 1`.
  if (!isZeroInteger(origLb) || !isOneInteger(origStep)) {
    if (Operation *preservedUse = denormalizedIvVal.getDefiningOp()) {
      preservedUses.insert(preservedUse);
    }
  }
  rewriter.replaceAllUsesExcept(normalizedIv, denormalizedIvVal, preservedUses);
}
773 
    // NOTE(review): the declarator line of this definition (presumably
    // `void mlir::denormalizeInductionVariable(RewriterBase &rewriter,
    // Location loc,`) is missing from this rendering of the file.
    Value normalizedIv, OpFoldResult origLb,
    OpFoldResult origStep) {
  // Index-typed IVs are denormalized with affine ops rather than arith.
  if (getType(origLb).isIndex()) {
    return denormalizeInductionVariableForIndexType(rewriter, loc, normalizedIv,
                                                    origLb, origStep);
  }
  Value denormalizedIv;
  // NOTE(review): the declaration of `preserve` (presumably a
  // `SmallPtrSet<Operation *, N>`) is missing from this rendering of the
  // file — restore it before compiling.
  bool isStepOne = isOneInteger(origStep);
  bool isZeroBased = isZeroInteger(origLb);

  // scaled = normalizedIv * origStep (skipped when the step is one).
  Value scaled = normalizedIv;
  if (!isStepOne) {
    Value origStepValue =
        getValueOrCreateConstantIntOp(rewriter, loc, origStep);
    scaled = arith::MulIOp::create(rewriter, loc, normalizedIv, origStepValue);
    // The scaling op itself legitimately uses `normalizedIv`; keep that use.
    preserve.insert(scaled.getDefiningOp());
  }
  // denormalized iv = scaled + origLb (skipped when the lb is zero).
  denormalizedIv = scaled;
  if (!isZeroBased) {
    Value origLbValue = getValueOrCreateConstantIntOp(rewriter, loc, origLb);
    denormalizedIv = arith::AddIOp::create(rewriter, loc, scaled, origLbValue);
    preserve.insert(denormalizedIv.getDefiningOp());
  }

  rewriter.replaceAllUsesExcept(normalizedIv, denormalizedIv, preserve);
}
802 
804  ArrayRef<OpFoldResult> values) {
805  assert(!values.empty() && "unexecpted empty array");
806  AffineExpr s0, s1;
807  bindSymbols(rewriter.getContext(), s0, s1);
808  AffineExpr mul = s0 * s1;
809  OpFoldResult products = rewriter.getIndexAttr(1);
810  for (auto v : values) {
812  rewriter, loc, mul, ArrayRef<OpFoldResult>{products, v});
813  }
814  return products;
815 }
816 
817 /// Helper function to multiply a sequence of values.
819  ArrayRef<Value> values) {
820  assert(!values.empty() && "unexpected empty list");
821  if (getType(values.front()).isIndex()) {
823  OpFoldResult product = getProductOfIndexes(rewriter, loc, ofrs);
824  return getValueOrCreateConstantIndexOp(rewriter, loc, product);
825  }
826  std::optional<Value> productOf;
827  for (auto v : values) {
828  auto vOne = getConstantIntValue(v);
829  if (vOne && vOne.value() == 1)
830  continue;
831  if (productOf)
832  productOf = arith::MulIOp::create(rewriter, loc, productOf.value(), v)
833  .getResult();
834  else
835  productOf = v;
836  }
837  if (!productOf) {
838  productOf = arith::ConstantOp::create(
839  rewriter, loc, rewriter.getOneAttr(getType(values.front())))
840  .getResult();
841  }
842  return productOf.value();
843 }
844 
845 /// For each original loop, the value of the
846 /// induction variable can be obtained by dividing the induction variable of
847 /// the linearized loop by the total number of iterations of the loops nested
848 /// in it modulo the number of iterations in this loop (remove the values
849 /// related to the outer loops):
850 /// iv_i = floordiv(iv_linear, product-of-loop-ranges-until-i) mod range_i.
851 /// Compute these iteratively from the innermost loop by creating a "running
852 /// quotient" of division by the range.
853 static std::pair<SmallVector<Value>, SmallPtrSet<Operation *, 2>>
855  Value linearizedIv, ArrayRef<Value> ubs) {
856 
857  if (linearizedIv.getType().isIndex()) {
858  Operation *delinearizedOp = affine::AffineDelinearizeIndexOp::create(
859  rewriter, loc, linearizedIv, ubs);
860  auto resultVals = llvm::map_to_vector(
861  delinearizedOp->getResults(), [](OpResult r) -> Value { return r; });
862  return {resultVals, SmallPtrSet<Operation *, 2>{delinearizedOp}};
863  }
864 
865  SmallVector<Value> delinearizedIvs(ubs.size());
866  SmallPtrSet<Operation *, 2> preservedUsers;
867 
868  llvm::BitVector isUbOne(ubs.size());
869  for (auto [index, ub] : llvm::enumerate(ubs)) {
870  auto ubCst = getConstantIntValue(ub);
871  if (ubCst && ubCst.value() == 1)
872  isUbOne.set(index);
873  }
874 
875  // Prune the lead ubs that are all ones.
876  unsigned numLeadingOneUbs = 0;
877  for (auto [index, ub] : llvm::enumerate(ubs)) {
878  if (!isUbOne.test(index)) {
879  break;
880  }
881  delinearizedIvs[index] = arith::ConstantOp::create(
882  rewriter, loc, rewriter.getZeroAttr(ub.getType()));
883  numLeadingOneUbs++;
884  }
885 
886  Value previous = linearizedIv;
887  for (unsigned i = numLeadingOneUbs, e = ubs.size(); i < e; ++i) {
888  unsigned idx = ubs.size() - (i - numLeadingOneUbs) - 1;
889  if (i != numLeadingOneUbs && !isUbOne.test(idx + 1)) {
890  previous = arith::DivSIOp::create(rewriter, loc, previous, ubs[idx + 1]);
891  preservedUsers.insert(previous.getDefiningOp());
892  }
893  Value iv = previous;
894  if (i != e - 1) {
895  if (!isUbOne.test(idx)) {
896  iv = arith::RemSIOp::create(rewriter, loc, previous, ubs[idx]);
897  preservedUsers.insert(iv.getDefiningOp());
898  } else {
899  iv = arith::ConstantOp::create(
900  rewriter, loc, rewriter.getZeroAttr(ubs[idx].getType()));
901  }
902  }
903  delinearizedIvs[idx] = iv;
904  }
905  return {delinearizedIvs, preservedUsers};
906 }
907 
908 LogicalResult mlir::coalesceLoops(RewriterBase &rewriter,
910  if (loops.size() < 2)
911  return failure();
912 
913  scf::ForOp innermost = loops.back();
914  scf::ForOp outermost = loops.front();
915 
916  // 1. Make sure all loops iterate from 0 to upperBound with step 1. This
917  // allows the following code to assume upperBound is the number of iterations.
918  for (auto loop : loops) {
919  OpBuilder::InsertionGuard g(rewriter);
920  rewriter.setInsertionPoint(outermost);
921  Value lb = loop.getLowerBound();
922  Value ub = loop.getUpperBound();
923  Value step = loop.getStep();
924  auto newLoopRange =
925  emitNormalizedLoopBounds(rewriter, loop.getLoc(), lb, ub, step);
926 
927  rewriter.modifyOpInPlace(loop, [&]() {
928  loop.setLowerBound(getValueOrCreateConstantIntOp(rewriter, loop.getLoc(),
929  newLoopRange.offset));
930  loop.setUpperBound(getValueOrCreateConstantIntOp(rewriter, loop.getLoc(),
931  newLoopRange.size));
932  loop.setStep(getValueOrCreateConstantIntOp(rewriter, loop.getLoc(),
933  newLoopRange.stride));
934  });
935  rewriter.setInsertionPointToStart(innermost.getBody());
936  denormalizeInductionVariable(rewriter, loop.getLoc(),
937  loop.getInductionVar(), lb, step);
938  }
939 
940  // 2. Emit code computing the upper bound of the coalesced loop as product
941  // of the number of iterations of all loops.
942  OpBuilder::InsertionGuard g(rewriter);
943  rewriter.setInsertionPoint(outermost);
944  Location loc = outermost.getLoc();
945  SmallVector<Value> upperBounds = llvm::map_to_vector(
946  loops, [](auto loop) { return loop.getUpperBound(); });
947  Value upperBound = getProductOfIntsOrIndexes(rewriter, loc, upperBounds);
948  outermost.setUpperBound(upperBound);
949 
950  rewriter.setInsertionPointToStart(innermost.getBody());
951  auto [delinearizeIvs, preservedUsers] = delinearizeInductionVariable(
952  rewriter, loc, outermost.getInductionVar(), upperBounds);
953  rewriter.replaceAllUsesExcept(outermost.getInductionVar(), delinearizeIvs[0],
954  preservedUsers);
955 
956  for (int i = loops.size() - 1; i > 0; --i) {
957  auto outerLoop = loops[i - 1];
958  auto innerLoop = loops[i];
959 
960  Operation *innerTerminator = innerLoop.getBody()->getTerminator();
961  auto yieldedVals = llvm::to_vector(innerTerminator->getOperands());
962  assert(llvm::equal(outerLoop.getRegionIterArgs(), innerLoop.getInitArgs()));
963  for (Value &yieldedVal : yieldedVals) {
964  // The yielded value may be an iteration argument of the inner loop
965  // which is about to be inlined.
966  auto iter = llvm::find(innerLoop.getRegionIterArgs(), yieldedVal);
967  if (iter != innerLoop.getRegionIterArgs().end()) {
968  unsigned iterArgIndex = iter - innerLoop.getRegionIterArgs().begin();
969  // `outerLoop` iter args identical to the `innerLoop` init args.
970  assert(iterArgIndex < innerLoop.getInitArgs().size());
971  yieldedVal = innerLoop.getInitArgs()[iterArgIndex];
972  }
973  }
974  rewriter.eraseOp(innerTerminator);
975 
976  SmallVector<Value> innerBlockArgs;
977  innerBlockArgs.push_back(delinearizeIvs[i]);
978  llvm::append_range(innerBlockArgs, outerLoop.getRegionIterArgs());
979  rewriter.inlineBlockBefore(innerLoop.getBody(), outerLoop.getBody(),
980  Block::iterator(innerLoop), innerBlockArgs);
981  rewriter.replaceOp(innerLoop, yieldedVals);
982  }
983  return success();
984 }
985 
987  if (loops.empty()) {
988  return failure();
989  }
990  IRRewriter rewriter(loops.front().getContext());
991  return coalesceLoops(rewriter, loops);
992 }
993 
994 LogicalResult mlir::coalescePerfectlyNestedSCFForLoops(scf::ForOp op) {
995  LogicalResult result(failure());
997  getPerfectlyNestedLoops(loops, op);
998 
999  // Look for a band of loops that can be coalesced, i.e. perfectly nested
1000  // loops with bounds defined above some loop.
1001 
1002  // 1. For each loop, find above which parent loop its bounds operands are
1003  // defined.
1004  SmallVector<unsigned> operandsDefinedAbove(loops.size());
1005  for (unsigned i = 0, e = loops.size(); i < e; ++i) {
1006  operandsDefinedAbove[i] = i;
1007  for (unsigned j = 0; j < i; ++j) {
1008  SmallVector<Value> boundsOperands = {loops[i].getLowerBound(),
1009  loops[i].getUpperBound(),
1010  loops[i].getStep()};
1011  if (areValuesDefinedAbove(boundsOperands, loops[j].getRegion())) {
1012  operandsDefinedAbove[i] = j;
1013  break;
1014  }
1015  }
1016  }
1017 
1018  // 2. For each inner loop check that the iter_args for the immediately outer
1019  // loop are the init for the immediately inner loop and that the yields of the
1020  // return of the inner loop is the yield for the immediately outer loop. Keep
1021  // track of where the chain starts from for each loop.
1022  SmallVector<unsigned> iterArgChainStart(loops.size());
1023  iterArgChainStart[0] = 0;
1024  for (unsigned i = 1, e = loops.size(); i < e; ++i) {
1025  // By default set the start of the chain to itself.
1026  iterArgChainStart[i] = i;
1027  auto outerloop = loops[i - 1];
1028  auto innerLoop = loops[i];
1029  if (outerloop.getNumRegionIterArgs() != innerLoop.getNumRegionIterArgs()) {
1030  continue;
1031  }
1032  if (!llvm::equal(outerloop.getRegionIterArgs(), innerLoop.getInitArgs())) {
1033  continue;
1034  }
1035  auto outerloopTerminator = outerloop.getBody()->getTerminator();
1036  if (!llvm::equal(outerloopTerminator->getOperands(),
1037  innerLoop.getResults())) {
1038  continue;
1039  }
1040  iterArgChainStart[i] = iterArgChainStart[i - 1];
1041  }
1042 
1043  // 3. Identify bands of loops such that the operands of all of them are
1044  // defined above the first loop in the band. Traverse the nest bottom-up
1045  // so that modifications don't invalidate the inner loops.
1046  for (unsigned end = loops.size(); end > 0; --end) {
1047  unsigned start = 0;
1048  for (; start < end - 1; ++start) {
1049  auto maxPos =
1050  *std::max_element(std::next(operandsDefinedAbove.begin(), start),
1051  std::next(operandsDefinedAbove.begin(), end));
1052  if (maxPos > start)
1053  continue;
1054  if (iterArgChainStart[end - 1] > start)
1055  continue;
1056  auto band = llvm::MutableArrayRef(loops.data() + start, end - start);
1057  if (succeeded(coalesceLoops(band)))
1058  result = success();
1059  break;
1060  }
1061  // If a band was found and transformed, keep looking at the loops above
1062  // the outermost transformed loop.
1063  if (start != end - 1)
1064  end = start + 1;
1065  }
1066  return result;
1067 }
1068 
1070  RewriterBase &rewriter, scf::ParallelOp loops,
1071  ArrayRef<std::vector<unsigned>> combinedDimensions) {
1072  OpBuilder::InsertionGuard g(rewriter);
1073  rewriter.setInsertionPoint(loops);
1074  Location loc = loops.getLoc();
1075 
1076  // Presort combined dimensions.
1077  auto sortedDimensions = llvm::to_vector<3>(combinedDimensions);
1078  for (auto &dims : sortedDimensions)
1079  llvm::sort(dims);
1080 
1081  // Normalize ParallelOp's iteration pattern.
1082  SmallVector<Value, 3> normalizedUpperBounds;
1083  for (unsigned i = 0, e = loops.getNumLoops(); i < e; ++i) {
1084  OpBuilder::InsertionGuard g2(rewriter);
1085  rewriter.setInsertionPoint(loops);
1086  Value lb = loops.getLowerBound()[i];
1087  Value ub = loops.getUpperBound()[i];
1088  Value step = loops.getStep()[i];
1089  auto newLoopRange = emitNormalizedLoopBounds(rewriter, loc, lb, ub, step);
1090  normalizedUpperBounds.push_back(getValueOrCreateConstantIntOp(
1091  rewriter, loops.getLoc(), newLoopRange.size));
1092 
1093  rewriter.setInsertionPointToStart(loops.getBody());
1094  denormalizeInductionVariable(rewriter, loc, loops.getInductionVars()[i], lb,
1095  step);
1096  }
1097 
1098  // Combine iteration spaces.
1099  SmallVector<Value, 3> lowerBounds, upperBounds, steps;
1100  auto cst0 = arith::ConstantIndexOp::create(rewriter, loc, 0);
1101  auto cst1 = arith::ConstantIndexOp::create(rewriter, loc, 1);
1102  for (auto &sortedDimension : sortedDimensions) {
1103  Value newUpperBound = arith::ConstantIndexOp::create(rewriter, loc, 1);
1104  for (auto idx : sortedDimension) {
1105  newUpperBound = arith::MulIOp::create(rewriter, loc, newUpperBound,
1106  normalizedUpperBounds[idx]);
1107  }
1108  lowerBounds.push_back(cst0);
1109  steps.push_back(cst1);
1110  upperBounds.push_back(newUpperBound);
1111  }
1112 
1113  // Create new ParallelLoop with conversions to the original induction values.
1114  // The loop below uses divisions to get the relevant range of values in the
1115  // new induction value that represent each range of the original induction
1116  // value. The remainders then determine based on that range, which iteration
1117  // of the original induction value this represents. This is a normalized value
1118  // that is un-normalized already by the previous logic.
1119  auto newPloop = scf::ParallelOp::create(
1120  rewriter, loc, lowerBounds, upperBounds, steps,
1121  [&](OpBuilder &insideBuilder, Location, ValueRange ploopIVs) {
1122  for (unsigned i = 0, e = combinedDimensions.size(); i < e; ++i) {
1123  Value previous = ploopIVs[i];
1124  unsigned numberCombinedDimensions = combinedDimensions[i].size();
1125  // Iterate over all except the last induction value.
1126  for (unsigned j = numberCombinedDimensions - 1; j > 0; --j) {
1127  unsigned idx = combinedDimensions[i][j];
1128 
1129  // Determine the current induction value's current loop iteration
1130  Value iv = arith::RemSIOp::create(insideBuilder, loc, previous,
1131  normalizedUpperBounds[idx]);
1132  replaceAllUsesInRegionWith(loops.getBody()->getArgument(idx), iv,
1133  loops.getRegion());
1134 
1135  // Remove the effect of the current induction value to prepare for
1136  // the next value.
1137  previous = arith::DivSIOp::create(insideBuilder, loc, previous,
1138  normalizedUpperBounds[idx]);
1139  }
1140 
1141  // The final induction value is just the remaining value.
1142  unsigned idx = combinedDimensions[i][0];
1143  replaceAllUsesInRegionWith(loops.getBody()->getArgument(idx),
1144  previous, loops.getRegion());
1145  }
1146  });
1147 
1148  // Replace the old loop with the new loop.
1149  loops.getBody()->back().erase();
1150  newPloop.getBody()->getOperations().splice(
1151  Block::iterator(newPloop.getBody()->back()),
1152  loops.getBody()->getOperations());
1153  loops.erase();
1154 }
1155 
1156 // Hoist the ops within `outer` that appear before `inner`.
1157 // Such ops include the ops that have been introduced by parametric tiling.
1158 // Ops that come from triangular loops (i.e. that belong to the program slice
1159 // rooted at `outer`) and ops that have side effects cannot be hoisted.
1160 // Return failure when any op fails to hoist.
1161 static LogicalResult hoistOpsBetween(scf::ForOp outer, scf::ForOp inner) {
1162  SetVector<Operation *> forwardSlice;
1164  options.filter = [&inner](Operation *op) {
1165  return op != inner.getOperation();
1166  };
1167  getForwardSlice(outer.getInductionVar(), &forwardSlice, options);
1168  LogicalResult status = success();
1170  for (auto &op : outer.getBody()->without_terminator()) {
1171  // Stop when encountering the inner loop.
1172  if (&op == inner.getOperation())
1173  break;
1174  // Skip over non-hoistable ops.
1175  if (forwardSlice.count(&op) > 0) {
1176  status = failure();
1177  continue;
1178  }
1179  // Skip intermediate scf::ForOp, these are not considered a failure.
1180  if (isa<scf::ForOp>(op))
1181  continue;
1182  // Skip other ops with regions.
1183  if (op.getNumRegions() > 0) {
1184  status = failure();
1185  continue;
1186  }
1187  // Skip if op has side effects.
1188  // TODO: loads to immutable memory regions are ok.
1189  if (!isMemoryEffectFree(&op)) {
1190  status = failure();
1191  continue;
1192  }
1193  toHoist.push_back(&op);
1194  }
1195  auto *outerForOp = outer.getOperation();
1196  for (auto *op : toHoist)
1197  op->moveBefore(outerForOp);
1198  return status;
1199 }
1200 
1201 // Traverse the interTile and intraTile loops and try to hoist ops such that
1202 // bands of perfectly nested loops are isolated.
1203 // Return failure if either perfect interTile or perfect intraTile bands cannot
1204 // be formed.
1205 static LogicalResult tryIsolateBands(const TileLoops &tileLoops) {
1206  LogicalResult status = success();
1207  const Loops &interTile = tileLoops.first;
1208  const Loops &intraTile = tileLoops.second;
1209  auto size = interTile.size();
1210  assert(size == intraTile.size());
1211  if (size <= 1)
1212  return success();
1213  for (unsigned s = 1; s < size; ++s)
1214  status = succeeded(status) ? hoistOpsBetween(intraTile[0], intraTile[s])
1215  : failure();
1216  for (unsigned s = 1; s < size; ++s)
1217  status = succeeded(status) ? hoistOpsBetween(interTile[0], interTile[s])
1218  : failure();
1219  return status;
1220 }
1221 
1222 /// Collect perfectly nested loops starting from `rootForOps`. Loops are
1223 /// perfectly nested if each loop is the first and only non-terminator operation
1224 /// in the parent loop. Collect at most `maxLoops` loops and append them to
1225 /// `forOps`.
1226 template <typename T>
1228  SmallVectorImpl<T> &forOps, T rootForOp,
1229  unsigned maxLoops = std::numeric_limits<unsigned>::max()) {
1230  for (unsigned i = 0; i < maxLoops; ++i) {
1231  forOps.push_back(rootForOp);
1232  Block &body = rootForOp.getRegion().front();
1233  if (body.begin() != std::prev(body.end(), 2))
1234  return;
1235 
1236  rootForOp = dyn_cast<T>(&body.front());
1237  if (!rootForOp)
1238  return;
1239  }
1240 }
1241 
1242 static Loops stripmineSink(scf::ForOp forOp, Value factor,
1243  ArrayRef<scf::ForOp> targets) {
1244  assert(!forOp.getUnsignedCmp() && "unsigned loops are not supported");
1245  auto originalStep = forOp.getStep();
1246  auto iv = forOp.getInductionVar();
1247 
1248  OpBuilder b(forOp);
1249  forOp.setStep(arith::MulIOp::create(b, forOp.getLoc(), originalStep, factor));
1250 
1251  Loops innerLoops;
1252  for (auto t : targets) {
1253  assert(!t.getUnsignedCmp() && "unsigned loops are not supported");
1254 
1255  // Save information for splicing ops out of t when done
1256  auto begin = t.getBody()->begin();
1257  auto nOps = t.getBody()->getOperations().size();
1258 
1259  // Insert newForOp before the terminator of `t`.
1260  auto b = OpBuilder::atBlockTerminator((t.getBody()));
1261  Value stepped = arith::AddIOp::create(b, t.getLoc(), iv, forOp.getStep());
1262  Value ub =
1263  arith::MinSIOp::create(b, t.getLoc(), forOp.getUpperBound(), stepped);
1264 
1265  // Splice [begin, begin + nOps - 1) into `newForOp` and replace uses.
1266  auto newForOp = scf::ForOp::create(b, t.getLoc(), iv, ub, originalStep);
1267  newForOp.getBody()->getOperations().splice(
1268  newForOp.getBody()->getOperations().begin(),
1269  t.getBody()->getOperations(), begin, std::next(begin, nOps - 1));
1270  replaceAllUsesInRegionWith(iv, newForOp.getInductionVar(),
1271  newForOp.getRegion());
1272 
1273  innerLoops.push_back(newForOp);
1274  }
1275 
1276  return innerLoops;
1277 }
1278 
1279 // Stripmines a `forOp` by `factor` and sinks it under a single `target`.
1280 // Returns the new for operation, nested immediately under `target`.
1281 template <typename SizeType>
1282 static scf::ForOp stripmineSink(scf::ForOp forOp, SizeType factor,
1283  scf::ForOp target) {
1284  // TODO: Use cheap structural assertions that targets are nested under
1285  // forOp and that targets are not nested under each other when DominanceInfo
1286  // exposes the capability. It seems overkill to construct a whole function
1287  // dominance tree at this point.
1288  auto res = stripmineSink(forOp, factor, ArrayRef<scf::ForOp>(target));
1289  assert(res.size() == 1 && "Expected 1 inner forOp");
1290  return res[0];
1291 }
1292 
1294  ArrayRef<Value> sizes,
1295  ArrayRef<scf::ForOp> targets) {
1297  SmallVector<scf::ForOp, 8> currentTargets(targets);
1298  for (auto it : llvm::zip(forOps, sizes)) {
1299  auto step = stripmineSink(std::get<0>(it), std::get<1>(it), currentTargets);
1300  res.push_back(step);
1301  currentTargets = step;
1302  }
1303  return res;
1304 }
1305 
1307  scf::ForOp target) {
1309  for (auto loops : tile(forOps, sizes, ArrayRef<scf::ForOp>(target)))
1310  res.push_back(llvm::getSingleElement(loops));
1311  return res;
1312 }
1313 
1314 Loops mlir::tilePerfectlyNested(scf::ForOp rootForOp, ArrayRef<Value> sizes) {
1315  // Collect perfectly nested loops. If more size values provided than nested
1316  // loops available, truncate `sizes`.
1318  forOps.reserve(sizes.size());
1319  getPerfectlyNestedLoopsImpl(forOps, rootForOp, sizes.size());
1320  if (forOps.size() < sizes.size())
1321  sizes = sizes.take_front(forOps.size());
1322 
1323  return ::tile(forOps, sizes, forOps.back());
1324 }
1325 
1327  scf::ForOp root) {
1328  getPerfectlyNestedLoopsImpl(nestedLoops, root);
1329 }
1330 
1332  ArrayRef<int64_t> sizes) {
1333  // Collect perfectly nested loops. If more size values provided than nested
1334  // loops available, truncate `sizes`.
1336  forOps.reserve(sizes.size());
1337  getPerfectlyNestedLoopsImpl(forOps, rootForOp, sizes.size());
1338  if (forOps.size() < sizes.size())
1339  sizes = sizes.take_front(forOps.size());
1340 
1341  // Compute the tile sizes such that i-th outer loop executes size[i]
1342  // iterations. Given that the loop current executes
1343  // numIterations = ceildiv((upperBound - lowerBound), step)
1344  // iterations, we need to tile with size ceildiv(numIterations, size[i]).
1345  SmallVector<Value, 4> tileSizes;
1346  tileSizes.reserve(sizes.size());
1347  for (unsigned i = 0, e = sizes.size(); i < e; ++i) {
1348  assert(sizes[i] > 0 && "expected strictly positive size for strip-mining");
1349 
1350  auto forOp = forOps[i];
1351  OpBuilder builder(forOp);
1352  auto loc = forOp.getLoc();
1353  Value diff = arith::SubIOp::create(builder, loc, forOp.getUpperBound(),
1354  forOp.getLowerBound());
1355  Value numIterations = ceilDivPositive(builder, loc, diff, forOp.getStep());
1356  Value iterationsPerBlock =
1357  ceilDivPositive(builder, loc, numIterations, sizes[i]);
1358  tileSizes.push_back(iterationsPerBlock);
1359  }
1360 
1361  // Call parametric tiling with the given sizes.
1362  auto intraTile = tile(forOps, tileSizes, forOps.back());
1363  TileLoops tileLoops = std::make_pair(forOps, intraTile);
1364 
1365  // TODO: for now we just ignore the result of band isolation.
1366  // In the future, mapping decisions may be impacted by the ability to
1367  // isolate perfectly nested bands.
1368  (void)tryIsolateBands(tileLoops);
1369 
1370  return tileLoops;
1371 }
1372 
1373 scf::ForallOp mlir::fuseIndependentSiblingForallLoops(scf::ForallOp target,
1374  scf::ForallOp source,
1375  RewriterBase &rewriter) {
1376  unsigned numTargetOuts = target.getNumResults();
1377  unsigned numSourceOuts = source.getNumResults();
1378 
1379  // Create fused shared_outs.
1380  SmallVector<Value> fusedOuts;
1381  llvm::append_range(fusedOuts, target.getOutputs());
1382  llvm::append_range(fusedOuts, source.getOutputs());
1383 
1384  // Create a new scf.forall op after the source loop.
1385  rewriter.setInsertionPointAfter(source);
1386  scf::ForallOp fusedLoop = scf::ForallOp::create(
1387  rewriter, source.getLoc(), source.getMixedLowerBound(),
1388  source.getMixedUpperBound(), source.getMixedStep(), fusedOuts,
1389  source.getMapping());
1390 
1391  // Map control operands.
1392  IRMapping mapping;
1393  mapping.map(target.getInductionVars(), fusedLoop.getInductionVars());
1394  mapping.map(source.getInductionVars(), fusedLoop.getInductionVars());
1395 
1396  // Map shared outs.
1397  mapping.map(target.getRegionIterArgs(),
1398  fusedLoop.getRegionIterArgs().take_front(numTargetOuts));
1399  mapping.map(source.getRegionIterArgs(),
1400  fusedLoop.getRegionIterArgs().take_back(numSourceOuts));
1401 
1402  // Append everything except the terminator into the fused operation.
1403  rewriter.setInsertionPointToStart(fusedLoop.getBody());
1404  for (Operation &op : target.getBody()->without_terminator())
1405  rewriter.clone(op, mapping);
1406  for (Operation &op : source.getBody()->without_terminator())
1407  rewriter.clone(op, mapping);
1408 
1409  // Fuse the old terminator in_parallel ops into the new one.
1410  scf::InParallelOp targetTerm = target.getTerminator();
1411  scf::InParallelOp sourceTerm = source.getTerminator();
1412  scf::InParallelOp fusedTerm = fusedLoop.getTerminator();
1413  rewriter.setInsertionPointToStart(fusedTerm.getBody());
1414  for (Operation &op : targetTerm.getYieldingOps())
1415  rewriter.clone(op, mapping);
1416  for (Operation &op : sourceTerm.getYieldingOps())
1417  rewriter.clone(op, mapping);
1418 
1419  // Replace old loops by substituting their uses by results of the fused loop.
1420  rewriter.replaceOp(target, fusedLoop.getResults().take_front(numTargetOuts));
1421  rewriter.replaceOp(source, fusedLoop.getResults().take_back(numSourceOuts));
1422 
1423  return fusedLoop;
1424 }
1425 
1426 scf::ForOp mlir::fuseIndependentSiblingForLoops(scf::ForOp target,
1427  scf::ForOp source,
1428  RewriterBase &rewriter) {
1429  assert(source.getUnsignedCmp() == target.getUnsignedCmp() &&
1430  "incompatible signedness");
1431  unsigned numTargetOuts = target.getNumResults();
1432  unsigned numSourceOuts = source.getNumResults();
1433 
1434  // Create fused init_args, with target's init_args before source's init_args.
1435  SmallVector<Value> fusedInitArgs;
1436  llvm::append_range(fusedInitArgs, target.getInitArgs());
1437  llvm::append_range(fusedInitArgs, source.getInitArgs());
1438 
1439  // Create a new scf.for op after the source loop (with scf.yield terminator
1440  // (without arguments) only in case its init_args is empty).
1441  rewriter.setInsertionPointAfter(source);
1442  scf::ForOp fusedLoop = scf::ForOp::create(
1443  rewriter, source.getLoc(), source.getLowerBound(), source.getUpperBound(),
1444  source.getStep(), fusedInitArgs, /*bodyBuilder=*/nullptr,
1445  source.getUnsignedCmp());
1446 
1447  // Map original induction variables and operands to those of the fused loop.
1448  IRMapping mapping;
1449  mapping.map(target.getInductionVar(), fusedLoop.getInductionVar());
1450  mapping.map(target.getRegionIterArgs(),
1451  fusedLoop.getRegionIterArgs().take_front(numTargetOuts));
1452  mapping.map(source.getInductionVar(), fusedLoop.getInductionVar());
1453  mapping.map(source.getRegionIterArgs(),
1454  fusedLoop.getRegionIterArgs().take_back(numSourceOuts));
1455 
1456  // Merge target's body into the new (fused) for loop and then source's body.
1457  rewriter.setInsertionPointToStart(fusedLoop.getBody());
1458  for (Operation &op : target.getBody()->without_terminator())
1459  rewriter.clone(op, mapping);
1460  for (Operation &op : source.getBody()->without_terminator())
1461  rewriter.clone(op, mapping);
1462 
1463  // Build fused yield results by appropriately mapping original yield operands.
1464  SmallVector<Value> yieldResults;
1465  for (Value operand : target.getBody()->getTerminator()->getOperands())
1466  yieldResults.push_back(mapping.lookupOrDefault(operand));
1467  for (Value operand : source.getBody()->getTerminator()->getOperands())
1468  yieldResults.push_back(mapping.lookupOrDefault(operand));
1469  if (!yieldResults.empty())
1470  scf::YieldOp::create(rewriter, source.getLoc(), yieldResults);
1471 
1472  // Replace old loops by substituting their uses by results of the fused loop.
1473  rewriter.replaceOp(target, fusedLoop.getResults().take_front(numTargetOuts));
1474  rewriter.replaceOp(source, fusedLoop.getResults().take_back(numSourceOuts));
1475 
1476  return fusedLoop;
1477 }
1478 
1479 FailureOr<scf::ForallOp> mlir::normalizeForallOp(RewriterBase &rewriter,
1480  scf::ForallOp forallOp) {
1481  SmallVector<OpFoldResult> lbs = forallOp.getMixedLowerBound();
1482  SmallVector<OpFoldResult> ubs = forallOp.getMixedUpperBound();
1483  SmallVector<OpFoldResult> steps = forallOp.getMixedStep();
1484 
1485  if (forallOp.isNormalized())
1486  return forallOp;
1487 
1488  OpBuilder::InsertionGuard g(rewriter);
1489  auto loc = forallOp.getLoc();
1490  rewriter.setInsertionPoint(forallOp);
1492  for (auto [lb, ub, step] : llvm::zip_equal(lbs, ubs, steps)) {
1493  Range normalizedLoopParams =
1494  emitNormalizedLoopBounds(rewriter, loc, lb, ub, step);
1495  newUbs.push_back(normalizedLoopParams.size);
1496  }
1497  (void)foldDynamicIndexList(newUbs);
1498 
1499  // Use the normalized builder since the lower bounds are always 0 and the
1500  // steps are always 1.
1501  auto normalizedForallOp = scf::ForallOp::create(
1502  rewriter, loc, newUbs, forallOp.getOutputs(), forallOp.getMapping(),
1503  [](OpBuilder &, Location, ValueRange) {});
1504 
1505  rewriter.inlineRegionBefore(forallOp.getBodyRegion(),
1506  normalizedForallOp.getBodyRegion(),
1507  normalizedForallOp.getBodyRegion().begin());
1508  // Remove the original empty block in the new loop.
1509  rewriter.eraseBlock(&normalizedForallOp.getBodyRegion().back());
1510 
1511  rewriter.setInsertionPointToStart(normalizedForallOp.getBody());
1512  // Update the users of the original loop variables.
1513  for (auto [idx, iv] :
1514  llvm::enumerate(normalizedForallOp.getInductionVars())) {
1515  auto origLb = getValueOrCreateConstantIndexOp(rewriter, loc, lbs[idx]);
1516  auto origStep = getValueOrCreateConstantIndexOp(rewriter, loc, steps[idx]);
1517  denormalizeInductionVariable(rewriter, loc, iv, origLb, origStep);
1518  }
1519 
1520  rewriter.replaceOp(forallOp, normalizedForallOp);
1521  return normalizedForallOp;
1522 }
1523 
1526  assert(!loops.empty() && "unexpected empty loop nest");
1527  if (loops.size() == 1)
1528  return isa_and_nonnull<scf::ForOp>(loops.front().getOperation());
1529  for (auto [outerLoop, innerLoop] :
1530  llvm::zip_equal(loops.drop_back(), loops.drop_front())) {
1531  auto outerFor = dyn_cast_or_null<scf::ForOp>(outerLoop.getOperation());
1532  auto innerFor = dyn_cast_or_null<scf::ForOp>(innerLoop.getOperation());
1533  if (!outerFor || !innerFor)
1534  return false;
1535  auto outerBBArgs = outerFor.getRegionIterArgs();
1536  auto innerIterArgs = innerFor.getInitArgs();
1537  if (outerBBArgs.size() != innerIterArgs.size())
1538  return false;
1539 
1540  for (auto [outerBBArg, innerIterArg] :
1541  llvm::zip_equal(outerBBArgs, innerIterArgs)) {
1542  if (!llvm::hasSingleElement(outerBBArg.getUses()) ||
1543  innerIterArg != outerBBArg)
1544  return false;
1545  }
1546 
1547  ValueRange outerYields =
1548  cast<scf::YieldOp>(outerFor.getBody()->getTerminator())->getOperands();
1549  ValueRange innerResults = innerFor.getResults();
1550  if (outerYields.size() != innerResults.size())
1551  return false;
1552  for (auto [outerYield, innerResult] :
1553  llvm::zip_equal(outerYields, innerResults)) {
1554  if (!llvm::hasSingleElement(innerResult.getUses()) ||
1555  outerYield != innerResult)
1556  return false;
1557  }
1558  }
1559  return true;
1560 }
1561 
1563 mlir::getConstLoopTripCounts(mlir::LoopLikeOpInterface loopOp) {
1564  std::optional<SmallVector<OpFoldResult>> loBnds = loopOp.getLoopLowerBounds();
1565  std::optional<SmallVector<OpFoldResult>> upBnds = loopOp.getLoopUpperBounds();
1566  std::optional<SmallVector<OpFoldResult>> steps = loopOp.getLoopSteps();
1567  if (!loBnds || !upBnds || !steps)
1568  return {};
1569  llvm::SmallVector<int64_t> tripCounts;
1570  for (auto [lb, ub, step] : llvm::zip(*loBnds, *upBnds, *steps)) {
1571  std::optional<llvm::APInt> numIter = constantTripCount(
1572  lb, ub, step, /*isSigned=*/true, scf::computeUbMinusLb);
1573  if (!numIter)
1574  return {};
1575  tripCounts.push_back(numIter->getSExtValue());
1576  }
1577  return tripCounts;
1578 }
1579 
1580 FailureOr<scf::ParallelOp> mlir::parallelLoopUnrollByFactors(
1581  scf::ParallelOp op, ArrayRef<uint64_t> unrollFactors,
1582  RewriterBase &rewriter,
1583  function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn,
1584  IRMapping *clonedToSrcOpsMap) {
1585  const unsigned numLoops = op.getNumLoops();
1586  assert(llvm::none_of(unrollFactors, [](uint64_t f) { return f == 0; }) &&
1587  "Expected positive unroll factors");
1588  assert((!unrollFactors.empty() && (unrollFactors.size() <= numLoops)) &&
1589  "Expected non-empty unroll factors of size <= to the number of loops");
1590 
1591  // Bail out if no valid unroll factors were provided
1592  if (llvm::all_of(unrollFactors, [](uint64_t f) { return f == 1; }))
1593  return rewriter.notifyMatchFailure(
1594  op, "Unrolling not applied if all factors are 1");
1595 
1596  // Return if the loop body is empty.
1597  if (llvm::hasSingleElement(op.getBody()->getOperations()))
1598  return rewriter.notifyMatchFailure(op, "Cannot unroll an empty loop body");
1599 
1600  // If the provided unroll factors do not cover all the loop dims, they are
1601  // applied to the inner loop dimensions.
1602  const unsigned firstLoopDimIdx = numLoops - unrollFactors.size();
1603 
1604  // Make sure that the unroll factors divide the iteration space evenly
1605  // TODO: Support unrolling loops with dynamic iteration spaces.
1606  const llvm::SmallVector<int64_t> tripCounts = getConstLoopTripCounts(op);
1607  if (tripCounts.empty())
1608  return rewriter.notifyMatchFailure(
1609  op, "Failed to compute constant trip counts for the loop. Note that "
1610  "dynamic loop sizes are not supported.");
1611 
1612  for (unsigned dimIdx = firstLoopDimIdx; dimIdx < numLoops; dimIdx++) {
1613  const uint64_t unrollFactor = unrollFactors[dimIdx - firstLoopDimIdx];
1614  if (tripCounts[dimIdx] % unrollFactor)
1615  return rewriter.notifyMatchFailure(
1616  op, "Unroll factors don't divide the iteration space evenly");
1617  }
1618 
1619  std::optional<SmallVector<OpFoldResult>> maybeFoldSteps = op.getLoopSteps();
1620  if (!maybeFoldSteps)
1621  return rewriter.notifyMatchFailure(op, "Failed to retrieve loop steps");
1622  llvm::SmallVector<size_t> steps{};
1623  for (auto step : *maybeFoldSteps)
1624  steps.push_back(static_cast<size_t>(*getConstantIntValue(step)));
1625 
1626  for (unsigned dimIdx = firstLoopDimIdx; dimIdx < numLoops; dimIdx++) {
1627  const uint64_t unrollFactor = unrollFactors[dimIdx - firstLoopDimIdx];
1628  if (unrollFactor == 1)
1629  continue;
1630  const size_t origStep = steps[dimIdx];
1631  const int64_t newStep = origStep * unrollFactor;
1632  IRMapping clonedToSrcOpsMap;
1633 
1634  ValueRange iterArgs = ValueRange(op.getRegionIterArgs());
1635  auto yieldedValues = op.getBody()->getTerminator()->getOperands();
1636 
1638  op.getBody(), op.getInductionVars()[dimIdx], unrollFactor,
1639  [&](unsigned i, Value iv, OpBuilder b) {
1640  // iv' = iv + step * i;
1641  const AffineExpr expr = b.getAffineDimExpr(0) + (origStep * i);
1642  const auto map =
1643  b.getDimIdentityMap().dropResult(0).insertResult(expr, 0);
1644  return affine::AffineApplyOp::create(b, iv.getLoc(), map,
1645  ValueRange{iv});
1646  },
1647  /*annotateFn*/ annotateFn, iterArgs, yieldedValues, &clonedToSrcOpsMap);
1648 
1649  // Update loop step
1650  auto prevInsertPoint = rewriter.saveInsertionPoint();
1651  rewriter.setInsertionPoint(op);
1652  op.getStepMutable()[dimIdx].assign(
1653  arith::ConstantIndexOp::create(rewriter, op.getLoc(), newStep));
1654  rewriter.restoreInsertionPoint(prevInsertPoint);
1655  }
1656  return op;
1657 }
static OpFoldResult getProductOfIndexes(RewriterBase &rewriter, Location loc, ArrayRef< OpFoldResult > values)
Definition: Utils.cpp:803
static LogicalResult tryIsolateBands(const TileLoops &tileLoops)
Definition: Utils.cpp:1205
static void getPerfectlyNestedLoopsImpl(SmallVectorImpl< T > &forOps, T rootForOp, unsigned maxLoops=std::numeric_limits< unsigned >::max())
Collect perfectly nested loops starting from rootForOp.
Definition: Utils.cpp:1227
static LogicalResult hoistOpsBetween(scf::ForOp outer, scf::ForOp inner)
Definition: Utils.cpp:1161
static Range emitNormalizedLoopBoundsForIndexType(RewriterBase &rewriter, Location loc, OpFoldResult lb, OpFoldResult ub, OpFoldResult step)
Definition: Utils.cpp:688
static Loops stripmineSink(scf::ForOp forOp, Value factor, ArrayRef< scf::ForOp > targets)
Definition: Utils.cpp:1242
static std::pair< SmallVector< Value >, SmallPtrSet< Operation *, 2 > > delinearizeInductionVariable(RewriterBase &rewriter, Location loc, Value linearizedIv, ArrayRef< Value > ubs)
For each original loop, the value of the induction variable can be obtained by dividing the induction...
Definition: Utils.cpp:854
static Value ceilDivPositive(OpBuilder &builder, Location loc, Value dividend, int64_t divisor)
Definition: Utils.cpp:265
static Value getProductOfIntsOrIndexes(RewriterBase &rewriter, Location loc, ArrayRef< Value > values)
Helper function to multiply a sequence of values.
Definition: Utils.cpp:818
static void denormalizeInductionVariableForIndexType(RewriterBase &rewriter, Location loc, Value normalizedIv, OpFoldResult origLb, OpFoldResult origStep)
Definition: Utils.cpp:749
static bool areInnerBoundsInvariant(scf::ForOp forOp)
Check if bounds of all inner loops are defined outside of forOp and return false if not.
Definition: Utils.cpp:510
static int64_t product(ArrayRef< int64_t > vals)
static void generateUnrolledLoop(Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor, function_ref< Value(unsigned, Value, OpBuilder)> ivRemapFn, function_ref< void(unsigned, Operation *, OpBuilder)> annotateFn, ValueRange iterArgs, ValueRange yieldedValues)
Generates unrolled copies of AffineForOp 'loopBodyBlock', with associated 'forOpIV' by 'unrollFactor'...
Definition: LoopUtils.cpp:899
static llvm::ManagedStatic< PassManagerOptions > options
static Value max(ImplicitLocOpBuilder &builder, Value value, Value bound)
static std::optional< int64_t > getConstantIntValue(OpFoldResult ofr)
If ofr is a constant integer or an IntegerAttr, return the integer.
#define mul(a, b)
Base type for affine expression.
Definition: AffineExpr.h:68
This class represents an argument of a Block.
Definition: Value.h:309
Block represents an ordered list of Operations.
Definition: Block.h:33
OpListType::iterator iterator
Definition: Block.h:140
unsigned getNumArguments()
Definition: Block.h:128
Operation * getTerminator()
Get the terminator operation of this block.
Definition: Block.cpp:244
BlockArgListType getArguments()
Definition: Block.h:87
Operation & front()
Definition: Block.h:153
iterator end()
Definition: Block.h:144
iterator begin()
Definition: Block.h:143
IntegerAttr getIndexAttr(int64_t value)
Definition: Builders.cpp:108
IntegerAttr getIntegerAttr(Type type, int64_t value)
Definition: Builders.cpp:228
TypedAttr getZeroAttr(Type type)
Definition: Builders.cpp:324
MLIRContext * getContext() const
Definition: Builders.h:56
TypedAttr getOneAttr(Type type)
Definition: Builders.cpp:342
This is a utility class for mapping one set of IR entities to another.
Definition: IRMapping.h:26
auto lookupOrDefault(T from) const
Lookup a mapped value within the map.
Definition: IRMapping.h:65
auto lookup(T from) const
Lookup a mapped value within the map.
Definition: IRMapping.h:72
void map(Value from, Value to)
Inserts a new mapping for 'from' to 'to'.
Definition: IRMapping.h:30
bool contains(T from) const
Checks to see if a mapping for 'from' exists.
Definition: IRMapping.h:51
This class coordinates rewriting a piece of IR outside of a pattern rewrite, providing a way to keep ...
Definition: PatternMatch.h:774
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Definition: Location.h:76
RAII guard to reset the insertion point of the builder when destroyed.
Definition: Builders.h:348
This class helps build Operations.
Definition: Builders.h:207
InsertPoint saveInsertionPoint() const
Return a saved insertion point.
Definition: Builders.h:385
Block * createBlock(Region *parent, Region::iterator insertPt={}, TypeRange argTypes={}, ArrayRef< Location > locs={})
Add new block with 'argTypes' arguments and set the insertion point to the end of it.
Definition: Builders.cpp:430
Operation * clone(Operation &op, IRMapping &mapper)
Creates a deep copy of the specified operation, remapping any operands that use values outside of the...
Definition: Builders.cpp:562
void setInsertionPointToStart(Block *block)
Sets the insertion point to the start of the specified block.
Definition: Builders.h:431
void setInsertionPoint(Block *block, Block::iterator insertPoint)
Set the insertion point to the specified location.
Definition: Builders.h:398
static OpBuilder atBlockTerminator(Block *block, Listener *listener=nullptr)
Create a builder and set the insertion point to before the block terminator.
Definition: Builders.h:252
void setInsertionPointToEnd(Block *block)
Sets the insertion point to the end of the specified block.
Definition: Builders.h:436
void restoreInsertionPoint(InsertPoint ip)
Restore the insert point to a previously saved point.
Definition: Builders.h:390
void createOrFold(SmallVectorImpl< Value > &results, Location location, Args &&...args)
Create an operation of specific op type at the current insertion point, and immediately try to fold i...
Definition: Builders.h:526
void setInsertionPointAfter(Operation *op)
Sets the insertion point to the node after the specified operation, which will cause subsequent inser...
Definition: Builders.h:412
This class represents a single result from folding an operation.
Definition: OpDefinition.h:272
This class represents an operand of an operation.
Definition: Value.h:257
This is a value defined by a result of an operation.
Definition: Value.h:457
Operation is the basic unit of execution within MLIR.
Definition: Operation.h:88
Operation * clone(IRMapping &mapper, CloneOptions options=CloneOptions::all())
Create a deep copy of this operation, remapping any operands that use values outside of the operation...
Definition: Operation.cpp:719
OpResult getResult(unsigned idx)
Get the 'idx'th result of this operation.
Definition: Operation.h:407
MutableArrayRef< Region > getRegions()
Returns the regions held by this operation.
Definition: Operation.h:677
operand_type_range getOperandTypes()
Definition: Operation.h:397
result_type_range getResultTypes()
Definition: Operation.h:428
operand_range getOperands()
Returns an iterator on the underlying Value's.
Definition: Operation.h:378
void setOperands(ValueRange operands)
Replace the current operands of this operation with the ones provided in 'operands'.
Definition: Operation.cpp:237
result_range getResults()
Definition: Operation.h:415
This class contains a list of basic blocks and a link to the parent operation it is attached to.
Definition: Region.h:26
BlockArgListType getArguments()
Definition: Region.h:81
iterator begin()
Definition: Region.h:55
Block & front()
Definition: Region.h:65
ParentT getParentOfType()
Find the first parent operation of the given type, or nullptr if there is no ancestor operation.
Definition: Region.h:205
bool hasOneBlock()
Return true if this region has exactly one block.
Definition: Region.h:68
This class coordinates the application of a rewrite on a set of IR, providing a way for clients to tr...
Definition: PatternMatch.h:368
std::enable_if_t<!std::is_convertible< CallbackT, Twine >::value, LogicalResult > notifyMatchFailure(Location loc, CallbackT &&reasonCallback)
Used to notify the listener that the IR failed to be rewritten because of a match failure,...
Definition: PatternMatch.h:726
virtual void eraseBlock(Block *block)
This method erases all operations in a block.
virtual void replaceOp(Operation *op, ValueRange newValues)
Replace the results of the given (original) operation with the specified list of values (replacements...
virtual void eraseOp(Operation *op)
This method erases an operation that is known to have no uses.
void replaceAllUsesExcept(Value from, Value to, Operation *exceptedUser)
Find uses of from and replace them with to except if the user is exceptedUser.
Definition: PatternMatch.h:710
virtual void inlineBlockBefore(Block *source, Block *dest, Block::iterator before, ValueRange argValues={})
Inline the operations of block 'source' into block 'dest' before the given position.
void mergeBlocks(Block *source, Block *dest, ValueRange argValues={})
Inline the operations of block 'source' into the end of block 'dest'.
void modifyOpInPlace(Operation *root, CallableT &&callable)
This method is a utility wrapper around an in-place modification of an operation.
Definition: PatternMatch.h:638
void inlineRegionBefore(Region &region, Region &parent, Region::iterator before)
Move the blocks that belong to "region" before the given position in another region "parent".
This class provides an abstraction over the various different ranges of value types.
Definition: TypeRange.h:37
Instances of the Type class are uniqued, have an immutable identifier and an optional mutable compone...
Definition: Types.h:74
bool isIndex() const
Definition: Types.cpp:54
bool isIntOrIndex() const
Return true if this is an integer (of any signedness) or an index type.
Definition: Types.cpp:112
This class provides an abstraction over the different types of ranges over Values.
Definition: ValueRange.h:387
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition: Value.h:96
bool use_empty() const
Returns true if this value has no uses.
Definition: Value.h:208
void replaceUsesWithIf(Value newValue, function_ref< bool(OpOperand &)> shouldReplace)
Replace all uses of 'this' value with 'newValue' if the given callback returns true.
Definition: Value.cpp:91
Type getType() const
Return the type of this value.
Definition: Value.h:105
Operation * getDefiningOp() const
If this value is the result of an operation, return the operation that defines it.
Definition: Value.cpp:18
static WalkResult advance()
Definition: WalkResult.h:47
static WalkResult interrupt()
Definition: WalkResult.h:46
Specialization of arith.constant op that returns an integer of index type.
Definition: Arith.h:113
static ConstantIndexOp create(OpBuilder &builder, Location location, int64_t value)
Definition: ArithOps.cpp:359
Operation * getOwner() const
Return the owner of this operand.
Definition: UseDefLists.h:38
OpFoldResult makeComposedFoldedAffineApply(OpBuilder &b, Location loc, AffineMap map, ArrayRef< OpFoldResult > operands, bool composeAffineMin=false)
Constructs an AffineApplyOp that applies map to operands after composing the map with the maps of any...
Definition: AffineOps.cpp:1469
SmallVector< SmallVector< AffineForOp, 8 >, 8 > tile(ArrayRef< AffineForOp > forOps, ArrayRef< uint64_t > sizes, ArrayRef< AffineForOp > targets)
Performs tiling of imperfectly nested loops (with interchange) by strip-mining the forOps by sizes an...
Definition: LoopUtils.cpp:1584
constexpr void enumerate(std::tuple< Tys... > &tuple, CallbackT &&callback)
Definition: Matchers.h:344
detail::InFlightRemark failed(Location loc, RemarkOpts opts)
Report an optimization remark that failed.
Definition: Remarks.h:561
std::optional< llvm::APSInt > computeUbMinusLb(Value lb, Value ub, bool isSigned)
Helper function to compute the difference between two values.
Definition: SCF.cpp:114
Include the generated interface declarations.
void getPerfectlyNestedLoops(SmallVectorImpl< scf::ForOp > &nestedLoops, scf::ForOp root)
Get perfectly nested sequence of loops starting at root of loop nest (the first op being another Affi...
Definition: Utils.cpp:1326
bool isPerfectlyNestedForLoops(MutableArrayRef< LoopLikeOpInterface > loops)
Check if the provided loops are perfectly nested for-loops.
Definition: Utils.cpp:1524
LogicalResult outlineIfOp(RewriterBase &b, scf::IfOp ifOp, func::FuncOp *thenFn, StringRef thenFnName, func::FuncOp *elseFn, StringRef elseFnName)
Outline the then and/or else regions of ifOp as follows:
Definition: Utils.cpp:217
void replaceAllUsesInRegionWith(Value orig, Value replacement, Region &region)
Replace all uses of orig within the given region with replacement.
Definition: RegionUtils.cpp:35
SmallVector< scf::ForOp > replaceLoopNestWithNewYields(RewriterBase &rewriter, MutableArrayRef< scf::ForOp > loopNest, ValueRange newIterOperands, const NewYieldValuesFn &newYieldValuesFn, bool replaceIterOperandsUsesInLoop=true)
Update a perfectly nested loop nest to yield new values from the innermost loop and propagating it up...
Definition: Utils.cpp:35
std::optional< int64_t > getConstantIntValue(OpFoldResult ofr)
If ofr is a constant integer or an IntegerAttr, return the integer.
Type getType(OpFoldResult ofr)
Returns the int type of the integer in ofr.
Definition: Utils.cpp:304
LogicalResult coalescePerfectlyNestedSCFForLoops(scf::ForOp op)
Walk an scf.for to find a band to coalesce.
Definition: Utils.cpp:994
void bindDims(MLIRContext *ctx, AffineExprTy &...exprs)
Bind a list of AffineExpr references to DimExpr at positions: [0 .
Definition: AffineExpr.h:311
void generateUnrolledLoop(Block *loopBodyBlock, Value iv, uint64_t unrollFactor, function_ref< Value(unsigned, Value, OpBuilder)> ivRemapFn, function_ref< void(unsigned, Operation *, OpBuilder)> annotateFn, ValueRange iterArgs, ValueRange yieldedValues, IRMapping *clonedToSrcOpsMap=nullptr)
Generate unrolled copies of an scf loop's 'loopBodyBlock', with 'iterArgs' and 'yieldedValues' as the...
Definition: Utils.cpp:294
std::pair< Loops, Loops > TileLoops
Definition: Utils.h:155
Value getValueOrCreateConstantIntOp(OpBuilder &b, Location loc, OpFoldResult ofr)
Converts an OpFoldResult to a Value.
Definition: Utils.cpp:102
llvm::SmallVector< int64_t > getConstLoopTripCounts(mlir::LoopLikeOpInterface loopOp)
Get constant trip counts for each of the induction variables of the given loop operation.
Definition: Utils.cpp:1563
LogicalResult loopUnrollFull(scf::ForOp forOp)
Unrolls this loop completely.
Definition: Utils.cpp:495
void collapseParallelLoops(RewriterBase &rewriter, scf::ParallelOp loops, ArrayRef< std::vector< unsigned >> combinedDimensions)
Take the ParallelLoop and for each set of dimension indices, combine them into a single dimension.
Definition: Utils.cpp:1069
bool isMemoryEffectFree(Operation *op)
Returns true if the given operation is free of memory effects.
std::function< SmallVector< Value >(OpBuilder &b, Location loc, ArrayRef< BlockArgument > newBbArgs)> NewYieldValuesFn
A function that returns the additional yielded values during replaceWithAdditionalYields.
Loops tilePerfectlyNested(scf::ForOp rootForOp, ArrayRef< Value > sizes)
Tile a nest of scf::ForOp loops rooted at rootForOp with the given (parametric) sizes.
Definition: Utils.cpp:1314
FailureOr< UnrolledLoopInfo > loopUnrollByFactor(scf::ForOp forOp, uint64_t unrollFactor, function_ref< void(unsigned, Operation *, OpBuilder)> annotateFn=nullptr)
Unrolls this for operation by the specified unroll factor.
Definition: Utils.cpp:367
LogicalResult loopUnrollJamByFactor(scf::ForOp forOp, uint64_t unrollFactor)
Unrolls and jams this scf.for operation by the specified unroll factor.
Definition: Utils.cpp:523
bool getInnermostParallelLoops(Operation *rootOp, SmallVectorImpl< scf::ParallelOp > &result)
Get a list of innermost parallel loops contained in rootOp.
Definition: Utils.cpp:240
bool isZeroInteger(OpFoldResult v)
Return true if v is an IntegerAttr with value 0.
void bindSymbols(MLIRContext *ctx, AffineExprTy &...exprs)
Bind a list of AffineExpr references to SymbolExpr at positions: [0 .
Definition: AffineExpr.h:325
FailureOr< scf::ParallelOp > parallelLoopUnrollByFactors(scf::ParallelOp op, ArrayRef< uint64_t > unrollFactors, RewriterBase &rewriter, function_ref< void(unsigned, Operation *, OpBuilder)> annotateFn=nullptr, IRMapping *clonedToSrcOpsMap=nullptr)
Unroll this scf::Parallel loop by the specified unroll factors.
Definition: Utils.cpp:1580
void getUsedValuesDefinedAbove(Region &region, Region &limit, SetVector< Value > &values)
Fill values with a list of values defined at the ancestors of the limit region and used within region...
Definition: RegionUtils.cpp:70
Value getValueOrCreateConstantIndexOp(OpBuilder &b, Location loc, OpFoldResult ofr)
Converts an OpFoldResult to a Value.
Definition: Utils.cpp:111
SmallVector< Loops, 8 > tile(ArrayRef< scf::ForOp > forOps, ArrayRef< Value > sizes, ArrayRef< scf::ForOp > targets)
Performs tiling of imperfectly nested loops (with interchange) by strip-mining the forOps by sizes an...
Definition: Utils.cpp:1293
auto get(MLIRContext *context, Ts &&...params)
Helper method that injects context only if needed, this helps unify some of the attribute constructio...
FailureOr< func::FuncOp > outlineSingleBlockRegion(RewriterBase &rewriter, Location loc, Region &region, StringRef funcName, func::CallOp *callOp=nullptr)
Outline a region with a single block into a new FuncOp.
Definition: Utils.cpp:114
OpFoldResult getAsOpFoldResult(Value val)
Given a value, try to extract a constant Attribute.
bool areValuesDefinedAbove(Range values, Region &limit)
Check if all values in the provided range are defined above the limit region.
Definition: RegionUtils.h:26
void denormalizeInductionVariable(RewriterBase &rewriter, Location loc, Value normalizedIv, OpFoldResult origLb, OpFoldResult origStep)
Get back the original induction variable values after loop normalization.
Definition: Utils.cpp:774
scf::ForallOp fuseIndependentSiblingForallLoops(scf::ForallOp target, scf::ForallOp source, RewriterBase &rewriter)
Given two scf.forall loops, target and source, fuses target into source.
Definition: Utils.cpp:1373
LogicalResult coalesceLoops(MutableArrayRef< scf::ForOp > loops)
Replace a perfect nest of "for" loops with a single linearized loop.
Definition: Utils.cpp:986
scf::ForOp fuseIndependentSiblingForLoops(scf::ForOp target, scf::ForOp source, RewriterBase &rewriter)
Given two scf.for loops, target and source, fuses target into source.
Definition: Utils.cpp:1426
TileLoops extractFixedOuterLoops(scf::ForOp rootFOrOp, ArrayRef< int64_t > sizes)
Definition: Utils.cpp:1331
Range emitNormalizedLoopBounds(RewriterBase &rewriter, Location loc, OpFoldResult lb, OpFoldResult ub, OpFoldResult step)
Materialize bounds and step of a zero-based and unit-step loop derived by normalizing the specified b...
Definition: Utils.cpp:703
bool isOneInteger(OpFoldResult v)
Return true if v is an IntegerAttr with value 1.
std::optional< APInt > constantTripCount(OpFoldResult lb, OpFoldResult ub, OpFoldResult step, bool isSigned, llvm::function_ref< std::optional< llvm::APSInt >(Value, Value, bool)> computeUbMinusLb)
Return the number of iterations for a loop with a lower bound lb, upper bound ub and step step.
LogicalResult foldDynamicIndexList(SmallVectorImpl< OpFoldResult > &ofrs, bool onlyNonNegative=false, bool onlyNonZero=false)
Returns "success" when any of the elements in ofrs is a constant value.
FailureOr< scf::ForallOp > normalizeForallOp(RewriterBase &rewriter, scf::ForallOp forallOp)
Normalize an scf.forall operation.
Definition: Utils.cpp:1479
void getForwardSlice(Operation *op, SetVector< Operation * > *forwardSlice, const ForwardSliceOptions &options={})
Fills forwardSlice with the computed forward slice (i.e.
void walk(Operation *op)
SmallVector< std::pair< Block::iterator, Block::iterator > > subBlocks
Represents a range (offset, size, and stride) where each element of the triple may be dynamic or stat...
OpFoldResult stride
OpFoldResult size
OpFoldResult offset
std::optional< scf::ForOp > epilogueLoopOp
Definition: Utils.h:116
std::optional< scf::ForOp > mainLoopOp
Definition: Utils.h:115
Eliminates variable at the specified position using Fourier-Motzkin variable elimination.