doxygen/ACCComputeLowering_8cpp_source.html

//===- ACCComputeLowering.cpp - Lower ACC compute to compute_region -------===//

//

// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

// See https://llvm.org/LICENSE.txt for license information.

// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

//

//===----------------------------------------------------------------------===//

//

// This pass decomposes OpenACC compute constructs into a representation that

// separates the data environment from the compute portion and prepares for

// parallelism assignment and privatization at the appropriate level.

//

// Overview:

// ---------

// Each compute construct (`acc.parallel`, `acc.serial`, `acc.kernels`) is

// lowered to (1) `acc.kernel_environment`, which captures the data environment

// and (2) `acc.compute_region`, which holds the compute body. Inside the

// compute region, acc.loop is converted to SCF loops (`scf.parallel` or

// `scf.for`) with any predetermined parallelism expressed as `par_dims`. This

// decomposition allows later phases to assign parallelism and handle

// privatization at the right granularity.

//

// Transformations:

// ----------------

// 1. Compute constructs: acc.parallel, acc.serial, and acc.kernels are

//    replaced by acc.kernel_environment containing a single acc.compute_region.

//    For acc.parallel / acc.kernels, launch arguments (num_gangs, num_workers,

//    vector_length) become acc.par_width ops (each result is `index`) and are

//    passed as compute_region launch operands. Constructs that specify

//    num_gangs(1), num_workers(1), and vector_length(1), as well as

//    acc.serial, use a single sequential acc.par_width launch operand.

//

// 2. acc.loop: Converted according to context and attributes:

//    - Unstructured: body wrapped in scf.execute_region.

//    - Sequential (serial region, seq clause, or compute region with

//    num_gangs(1), num_workers(1), and vector_length(1)):

//      scf.parallel with par_dims = sequential.

//    - Auto (in parallel/kernels): scf.for with collapse when

//    multi-dimensional.

//    - Orphan (not inside a compute construct): scf.for, no collapse.

//    - Independent (in parallel/kernels): scf.parallel with par_dims from

//      gang/worker/vector mapping (e.g. block_x).

//

//===----------------------------------------------------------------------===//


#include "mlir/Dialect/OpenACC/Transforms/Passes.h"


#include "mlir/Dialect/Arith/IR/Arith.h"

#include "mlir/Dialect/Func/IR/FuncOps.h"

#include "mlir/Dialect/OpenACC/OpenACC.h"

#include "mlir/Dialect/OpenACC/OpenACCParMapping.h"

#include "mlir/Dialect/OpenACC/OpenACCUtils.h"

#include "mlir/Dialect/OpenACC/OpenACCUtilsCG.h"

#include "mlir/Dialect/OpenACC/OpenACCUtilsLoop.h"

#include "mlir/Dialect/Utils/StaticValueUtils.h"

#include "mlir/IR/IRMapping.h"

#include "mlir/IR/Matchers.h"

#include "mlir/Interfaces/FunctionInterfaces.h"

#include "mlir/Transforms/GreedyPatternRewriteDriver.h"

#include "mlir/Transforms/RegionUtils.h"

#include "llvm/ADT/STLExtras.h"


namespace mlir {

namespace acc {

#define GEN_PASS_DEF_ACCCOMPUTELOWERING

#include "mlir/Dialect/OpenACC/Transforms/Passes.h.inc"

} // namespace acc

} // namespace mlir


#define DEBUG_TYPE "acc-compute-lowering"


using namespace mlir;

using namespace mlir::acc;


namespace {


//===----------------------------------------------------------------------===//

// Helper functions

//===----------------------------------------------------------------------===//


/// Look through any chain of `arith.index_cast` ops and return the value that
/// feeds the innermost cast, so that a later constant check sees the original
/// producer. A value that is not the result of an index cast is returned as is.
static Value stripIndexCasts(Value val) {
  for (;;) {
    auto indexCast = val.getDefiningOp<arith::IndexCastOp>();
    if (!indexCast)
      return val;
    val = indexCast.getIn();
  }
}


template <typename ComputeOpT>

static bool isGangWorkerVectorAllOne(ComputeOpT op) {

  auto numGangs = op.getNumGangsValues();

  if (numGangs.empty())

    return false;

  for (Value gangSize : numGangs) {

    if (!isConstantIntValue(stripIndexCasts(gangSize), 1))

      return false;

  }

  Value numWorkers = op.getNumWorkersValue();

  if (!numWorkers)

    return false;

  Value vectorLength = op.getVectorLengthValue();

  if (!vectorLength)

    return false;

  return isConstantIntValue(stripIndexCasts(numWorkers), 1) &&

         isConstantIntValue(stripIndexCasts(vectorLength), 1);

}


/// Treat a compute construct as "effectively serial" when it requests
/// num_gangs(1), num_workers(1), and vector_length(1). Gang, worker, and
/// vector are the only parallelism dimensions OpenACC lets a program express,
/// so pinning all three to one matches how `serial` semantics are defined.
template <typename ComputeOpT>
static bool isEffectivelySerial(ComputeOpT op) {
  const bool allWidthsAreOne = isGangWorkerVectorAllOne(op);
  return allWidthsAreOne;
}


/// Return true when the region that directly contains `op` has an enclosing
/// compute construct according to `getEnclosingComputeOp`.
static bool isOpInComputeRegion(Operation *op) {
  Region &containingRegion = *op->getBlock()->getParent();
  Operation *enclosingCompute = getEnclosingComputeOp(containingRegion);
  return enclosingCompute != nullptr;
}


/// Decide whether `op` executes in a context whose parallelism is fixed to
/// sequential. Checked in order: an enclosing `acc.parallel` or `acc.kernels`
/// that is effectively serial, an enclosing `acc.serial`, an enclosing
/// `acc.compute_region` that reports itself effectively serial, and finally an
/// enclosing specialized acc routine whose level is `seq`.
static bool isOpInSerialRegion(Operation *op) {
  if (auto enclosingParallel = op->getParentOfType<ParallelOp>())
    return isEffectivelySerial(enclosingParallel);

  if (auto enclosingKernels = op->getParentOfType<KernelsOp>())
    return isEffectivelySerial(enclosingKernels);

  if (op->getParentOfType<SerialOp>())
    return true;

  if (auto enclosingRegion = op->getParentOfType<ComputeRegionOp>())
    return enclosingRegion.isEffectivelySerial();

  // No compute construct encloses the op; fall back to the routine level of
  // the surrounding function, if there is one.
  auto enclosingFunc = op->getParentOfType<FunctionOpInterface>();
  if (!enclosingFunc || !isSpecializedAccRoutine(enclosingFunc))
    return false;

  auto routineAttr = enclosingFunc->getAttrOfType<SpecializedRoutineAttr>(
      getSpecializedRoutineAttrName());
  return routineAttr && routineAttr.getLevel().getValue() == ParLevel::seq;
}


/// Attach `attr` to `op` under the attribute name published by
/// `GPUParallelDimsAttr::name`, replacing any attribute already stored there.
static void setParDimsAttr(Operation *op, GPUParallelDimsAttr attr) {
  const auto attrName = GPUParallelDimsAttr::name;
  op->setAttr(attrName, attr);
}


/// For every live-in value produced by a constant op, clone that op at the
/// start of `region`'s first block, redirect the uses inside `region` to the
/// clone, and drop the value from `liveInValues`. The remaining entries are the
/// values that still have to be passed through `acc.compute_region` ins.
static void materializeConstantLiveInsIntoRegion(Region &region,
                                                 SetVector<Value> &liveInValues,
                                                 RewriterBase &rewriter) {
  // Collect first: `liveInValues` is mutated below and must not be modified
  // while it is being iterated.
  SmallVector<Value> constants;
  for (Value liveIn : liveInValues) {
    Operation *producer = liveIn.getDefiningOp();
    if (!producer || !matchPattern(producer, m_Constant()))
      continue;
    // As per the definition of ConstantLike trait, constants must have a
    // single result.
    assert(producer->getNumResults() == 1 &&
           "constants must have a single result");
    constants.push_back(liveIn);
  }

  if (constants.empty())
    return;

  // Restore the caller's insertion point once the clones are in place.
  OpBuilder::InsertionGuard guard(rewriter);
  rewriter.setInsertionPointToStart(&region.front());

  for (Value outerConstant : constants) {
    Operation *localCopy = rewriter.clone(*outerConstant.getDefiningOp());
    replaceAllUsesInRegionWith(outerConstant, localCopy->getResult(0), region);
    liveInValues.remove(outerConstant);
  }
}


/// Add `parDim` to `parDims`, which is kept sorted by descending
/// GPUParallelDimAttr::getOrder. Nothing is inserted when the element found at
/// the insertion position already equals `parDim`.
static void insertParDim(SmallVectorImpl<GPUParallelDimAttr> &parDims,
                         GPUParallelDimAttr parDim) {
  auto hasHigherOrder = [](const GPUParallelDimAttr &lhs,
                           const GPUParallelDimAttr &rhs) {
    return lhs.getOrder() > rhs.getOrder();
  };

  GPUParallelDimAttr *position =
      llvm::lower_bound(parDims, parDim, hasHigherOrder);
  const bool alreadyPresent = position != parDims.end() && *position == parDim;
  if (alreadyPresent)
    return;
  parDims.insert(position, parDim);
}


/// Pick the device type whose gang/worker/vector clauses apply to `loopOp`.
/// The requested `deviceType` wins when it is not DeviceType::None and the
/// loop has any such clause for it; otherwise the default DeviceType::None is
/// returned.
static DeviceType getGangWorkerVectorDeviceType(LoopOp loopOp,
                                                DeviceType deviceType) {
  const bool useRequested = deviceType != DeviceType::None &&
                            loopOp.hasAnyGangWorkerVector(deviceType);
  return useRequested ? deviceType : DeviceType::None;
}


/// Compute-construct counterpart of the loop-level device type selection:
/// return `deviceType` when it is not DeviceType::None and `computeOp` reports
/// gang/worker/vector information for it, and DeviceType::None otherwise.
template <typename ComputeConstructT>
static DeviceType getParDimsDeviceType(ComputeConstructT computeOp,
                                       DeviceType deviceType) {
  const bool useRequested = deviceType != DeviceType::None &&
                            computeOp.hasAnyGangWorkerVector(deviceType);
  return useRequested ? deviceType : DeviceType::None;
}


/// Map loop parallelism clauses (gang/worker/vector) to GPU parallel
/// dimensions using the given mapping policy.
///
/// The clauses are read for `deviceType` when the loop has any
/// gang/worker/vector clause for it, and for DeviceType::None otherwise. The
/// returned list is ordered by descending GPUParallelDimAttr::getOrder and is
/// empty when the loop carries none of these clauses.
///
/// A `gang(dim: N)` operand is honored when N is a compile-time integer
/// constant, including index-typed constants and constants reached through
/// `arith.index_cast`. When a dim operand is present but not constant, no gang
/// dimension is recorded.
static SmallVector<GPUParallelDimAttr>
getParallelDimensions(LoopOp loopOp, const ACCToGPUMappingPolicy &policy,
                      DeviceType deviceType) {
  deviceType = getGangWorkerVectorDeviceType(loopOp, deviceType);
  SmallVector<GPUParallelDimAttr> parDims;
  auto *ctx = loopOp->getContext();

  if (loopOp.hasVector(deviceType))
    insertParDim(parDims, policy.vectorDim(ctx));
  if (loopOp.hasWorker(deviceType))
    insertParDim(parDims, policy.workerDim(ctx));

  if (Value gangDimValue = loopOp.getGangValue(GangArgType::Dim, deviceType)) {
    // Match any constant integer producer rather than only
    // arith::ConstantIntOp, so that an index-typed or index-cast constant does
    // not silently lose the gang dimension.
    if (auto gangDim = getConstantIntValue(stripIndexCasts(gangDimValue)))
      insertParDim(parDims, policy.gangDim(ctx, getGangParLevel(*gangDim)));
  } else if (loopOp.hasGang(deviceType)) {
    // A gang clause without an explicit dim maps to the first gang dimension.
    insertParDim(parDims, policy.gangDim(ctx, ParLevel::gang_dim1));
  }
  return parDims;
}


/// Create the `acc.par_width` values used as `acc.compute_region` launch
/// operands for `computeOp`.
///
/// - `acc.serial`, and `acc.parallel` / `acc.kernels` whose num_gangs,
///   num_workers, and vector_length are all the constant 1, yield a single
///   sequential `acc.par_width` with no width operand.
/// - Otherwise one `acc.par_width` is emitted per num_gangs operand (gang
///   dimension 1, 2, ... in operand order), followed by num_workers and
///   vector_length when present. Operands are read for `deviceType` when the
///   construct has gang/worker/vector information for it, else for
///   DeviceType::None. Each width is cast to `index` when needed.
template <typename ComputeConstructT>
static SmallVector<Value>
assignKnownLaunchArgs(ComputeConstructT computeOp, DeviceType deviceType,
                      RewriterBase &rewriter,
                      const ACCToGPUMappingPolicy &policy) {
  auto *ctx = rewriter.getContext();
  auto loc = computeOp->getLoc();

  // Launch configuration for a construct that runs sequentially.
  auto sequentialLaunch = [&]() -> SmallVector<Value> {
    return {ParWidthOp::create(rewriter, loc, Value(), policy.seqDim(ctx))};
  };

  if constexpr (std::is_same_v<ComputeConstructT, SerialOp>) {
    return sequentialLaunch();
  } else if constexpr (llvm::is_one_of<ComputeConstructT, ParallelOp,
                                       KernelsOp>::value) {
    if (isEffectivelySerial(computeOp))
      return sequentialLaunch();

    deviceType = getParDimsDeviceType(computeOp, deviceType);

    SmallVector<Value> launchWidths;
    auto indexTy = rewriter.getIndexType();

    // Cast `width` to index (at the width's own location) and wrap it in an
    // acc.par_width tagged with `dim`.
    auto appendParWidth = [&](Value width, GPUParallelDimAttr dim) {
      Value widthAsIndex = getValueOrCreateCastToIndexLike(
          rewriter, width.getLoc(), indexTy, width);
      launchWidths.push_back(
          ParWidthOp::create(rewriter, loc, widthAsIndex, dim));
    };

    // The i-th num_gangs operand sizes gang dimension i + 1.
    for (auto [gangDimIdx, gangSize] :
         llvm::enumerate(computeOp.getNumGangsValues(deviceType)))
      appendParWidth(gangSize,
                     policy.gangDim(ctx, getGangParLevel(gangDimIdx + 1)));

    if (Value numWorkers = computeOp.getNumWorkersValue(deviceType))
      appendParWidth(numWorkers, policy.workerDim(ctx));

    if (Value vectorLength = computeOp.getVectorLengthValue(deviceType))
      appendParWidth(vectorLength, policy.vectorDim(ctx));

    return launchWidths;
  } else {
    llvm_unreachable("assignKnownLaunchArgs: expected parallel, kernels, or "
                     "serial");
  }
}


//===----------------------------------------------------------------------===//

// Loop conversion pattern

//===----------------------------------------------------------------------===//


/// Rewrite pattern that replaces `acc.loop` with an SCF construct.
///
/// The replacement depends on the loop's attributes and on where it sits:
///  - unstructured loop: `scf.execute_region`;
///  - `seq` loop, or any loop in a serial context: `scf.parallel` tagged with
///    sequential par_dims;
///  - `auto` loop: `scf.for` (collapse enabled), tagged with par_dims when the
///    loop carries gang/worker/vector clauses;
///  - orphan loop outside any compute region and outside a specialized acc
///    routine: plain `scf.for` without collapse;
///  - otherwise (`independent`): `scf.parallel` tagged with the mapped
///    par_dims, if any.
class ACCLoopConversion : public OpRewritePattern<LoopOp> {
public:
  /// `policy` is stored by reference and must outlive the pattern.
  /// `deviceType` selects which device-specific clauses are consulted.
  ACCLoopConversion(MLIRContext *ctx, const ACCToGPUMappingPolicy &policy,
                    DeviceType deviceType)
      : OpRewritePattern<LoopOp>(ctx), policy(policy), deviceType(deviceType) {}

  LogicalResult matchAndRewrite(LoopOp loopOp,
                                PatternRewriter &rewriter) const override {
    if (loopOp.getUnstructured()) {
      auto executeRegion =
          convertUnstructuredACCLoopToSCFExecuteRegion(loopOp, rewriter);
      if (!executeRegion)
        return failure();
      rewriter.replaceOp(loopOp, executeRegion);
      return success();
    }

    LoopParMode parMode = loopOp.getDefaultOrDeviceTypeParallelism(deviceType);

    if (parMode == LoopParMode::loop_seq || isOpInSerialRegion(loopOp)) {
      // Although it might seem unintuitive, scf.parallel is used here because
      // the parallelism of the loop is already predetermined (as sequential).
      // scf.for will become a candidate for auto-parallelization analysis.
      auto parallelOp = convertACCLoopToSCFParallel(loopOp, rewriter);
      if (!parallelOp)
        return failure();
      setParDimsAttr(parallelOp,
                     GPUParallelDimsAttr::seq(loopOp->getContext()));
      rewriter.replaceOp(loopOp, parallelOp);
    } else if (parMode == LoopParMode::loop_auto) {
      // Loops in serial regions were handled by the branch above, so this loop
      // is known to be in a non-serial context. Emit scf.for so that later
      // auto-parallelization analysis can consider it.
      auto forOp =
          convertACCLoopToSCFFor(loopOp, rewriter, /*enableCollapse=*/true);
      if (!forOp)
        return failure();
      attachMappedParDims(loopOp, forOp);
      rewriter.replaceOp(loopOp, forOp);
    } else if (isOrphanOutsideAcceleratorCode(loopOp)) {
      // This loop is an orphan `acc loop` but it is not in any sort
      // of compute region. Thus it is just a sequential non-accelerator loop.
      auto forOp =
          convertACCLoopToSCFFor(loopOp, rewriter, /*enableCollapse=*/false);
      if (!forOp)
        return failure();
      rewriter.replaceOp(loopOp, forOp);
    } else {
      assert(parMode == LoopParMode::loop_independent &&
             "Expected loop to be independent");
      auto parallelOp = convertACCLoopToSCFParallel(loopOp, rewriter);
      if (!parallelOp)
        return failure();
      attachMappedParDims(loopOp, parallelOp);
      rewriter.replaceOp(loopOp, parallelOp);
    }
    return success();
  }

private:
  /// Return true when `loopOp` is neither inside a compute region nor inside
  /// a specialized acc routine. A loop without an enclosing function cannot be
  /// in a routine, so the routine query is skipped instead of being handed a
  /// null operation.
  static bool isOrphanOutsideAcceleratorCode(LoopOp loopOp) {
    if (isOpInComputeRegion(loopOp))
      return false;
    auto parentFunc = loopOp->getParentOfType<FunctionOpInterface>();
    return !parentFunc || !isSpecializedAccRoutine(parentFunc);
  }

  /// Tag `target` with the par_dims derived from `loopOp`'s
  /// gang/worker/vector clauses. `target` is left untouched when the loop has
  /// no such clauses.
  void attachMappedParDims(LoopOp loopOp, Operation *target) const {
    SmallVector<GPUParallelDimAttr> parDims =
        getParallelDimensions(loopOp, policy, deviceType);
    if (parDims.empty())
      return;
    setParDimsAttr(target,
                   GPUParallelDimsAttr::get(loopOp->getContext(), parDims));
  }

  const ACCToGPUMappingPolicy &policy;
  DeviceType deviceType;
};


//===----------------------------------------------------------------------===//

// Compute construct conversion pattern

//===----------------------------------------------------------------------===//


/// Rewrite pattern that replaces a compute construct of type
/// `ComputeConstructT` with a kernel environment plus an `acc.compute_region`
/// built from the construct's region. The pass below instantiates it for
/// ParallelOp, KernelsOp, and SerialOp.
template <typename ComputeConstructT>

class ComputeOpConversion : public OpRewritePattern<ComputeConstructT> {

public:

  /// `policy` is stored by reference and must outlive the pattern.
  /// `deviceType` is forwarded to kernel environment creation and to launch
  /// argument assignment.
  ComputeOpConversion(MLIRContext *ctx, const ACCToGPUMappingPolicy &policy,

                      DeviceType deviceType)

      : OpRewritePattern<ComputeConstructT>(ctx), policy(policy),

        deviceType(deviceType) {}


  LogicalResult matchAndRewrite(ComputeConstructT computeOp,

                                PatternRewriter &rewriter) const override {

    // New ops are created relative to the construct being replaced.
    rewriter.setInsertionPoint(computeOp);

    // NOTE(review): presumably this leaves the insertion point inside the new
    // kernel environment so the ops created below are nested in it - confirm
    // against KernelEnvironmentOp::createAndPopulate.
    auto kernelEnv =

        KernelEnvironmentOp::createAndPopulate(computeOp, deviceType, rewriter);

    // acc.par_width values that become the compute region's launch operands.
    auto launchArgs =

        assignKnownLaunchArgs(computeOp, deviceType, rewriter, policy);

    Region &region = computeOp.getRegion();

    // Values used inside the construct's region but defined outside of it.
    SetVector<Value> liveInValues;

    getUsedValuesDefinedAbove(region, region, liveInValues);

    // Constants are cloned into the region and removed from the live-in set,
    // so only the remaining values are passed as compute region inputs.
    materializeConstantLiveInsIntoRegion(region, liveInValues, rewriter);

    IRMapping mapping;

    // The compute region is built by cloning `region`; the construct's
    // operation name is passed as the origin.
    auto computeRegion = buildComputeRegion(

        computeOp->getLoc(), launchArgs, liveInValues.getArrayRef(),

        ComputeConstructT::getOperationName(), region, rewriter, mapping);

    if (!computeRegion) {

      // NOTE(review): the constants cloned into the original region above are
      // not undone on this failure path - confirm this is acceptable.
      rewriter.eraseOp(kernelEnv);

      return failure();

    }

    // The original construct is dropped once the compute region exists.
    rewriter.eraseOp(computeOp);

    return success();

  }


private:

  const ACCToGPUMappingPolicy &policy;

  DeviceType deviceType;

};


//===----------------------------------------------------------------------===//

// Pass implementation

//===----------------------------------------------------------------------===//


/// Pass driver. It first rewrites every `acc.loop`, then rewrites the compute
/// constructs, using the default ACC-to-GPU mapping policy for both steps.
class ACCComputeLowering
    : public acc::impl::ACCComputeLoweringBase<ACCComputeLowering> {
public:
  using ACCComputeLoweringBase::ACCComputeLoweringBase;

  void runOnOperation() override {
    auto root = getOperation();
    auto *ctx = root.getContext();

    // Both pattern sets hold a reference to this policy, so it is declared
    // before them and lives for the whole function.
    DefaultACCToGPUMappingPolicy mappingPolicy;

    // Phase 1: lower acc.loop to scf.parallel/scf.for. This has to run while
    // the enclosing compute construct still exists, because the construct
    // determines which conversion strategy a loop gets.
    {
      RewritePatternSet loopPatterns(ctx);
      loopPatterns.insert<ACCLoopConversion>(ctx, mappingPolicy, deviceType);
      if (failed(applyPatternsGreedily(root, std::move(loopPatterns)))) {
        signalPassFailure();
        return;
      }
    }

    // Phase 2: lower acc.parallel, acc.kernels, and acc.serial to
    // acc.kernel_environment { acc.compute_region { ... } }.
    RewritePatternSet computePatterns(ctx);
    computePatterns
        .insert<ComputeOpConversion<ParallelOp>, ComputeOpConversion<KernelsOp>,
                ComputeOpConversion<SerialOp>>(ctx, mappingPolicy, deviceType);
    if (failed(applyPatternsGreedily(root, std::move(computePatterns))))
      signalPassFailure();
  }
};


} // namespace

success
return success()

Passes.h

FuncOps.h

FunctionInterfaces.h

GreedyPatternRewriteDriver.h

IRMapping.h

b
b
Return true if permutation is a valid permutation of the outer_dims_perm (case OuterOrInnerPerm::Oute...
Definition LinalgTransformOps.cpp:2136

Matchers.h

OpenACCParMapping.h

OpenACCUtilsCG.h

OpenACCUtilsLoop.h

OpenACCUtils.h

RegionUtils.h

StaticValueUtils.h

llvm::SmallVectorImpl
Definition LLVM.h:66

llvm::SmallVector
Definition LLVM.h:64

mlir::Block::getParent
Region * getParent() const
Provide a 'getParent' method for ilist_node_with_parent methods.
Definition Block.cpp:27

mlir::Builder::getContext
MLIRContext * getContext() const
Definition Builders.h:56

mlir::Builder::getIndexType
IndexType getIndexType()
Definition Builders.cpp:55

mlir::IRMapping
This is a utility class for mapping one set of IR entities to another.
Definition IRMapping.h:26

mlir::MLIRContext
MLIRContext is the top-level object for a collection of MLIR operations.
Definition MLIRContext.h:63

mlir::OpBuilder::InsertionGuard
RAII guard to reset the insertion point of the builder when destroyed.
Definition Builders.h:350

mlir::OpBuilder::clone
Operation * clone(Operation &op, IRMapping &mapper)
Creates a deep copy of the specified operation, remapping any operands that use values outside of the...
Definition Builders.cpp:567

mlir::OpBuilder::setInsertionPointToStart
void setInsertionPointToStart(Block *block)
Sets the insertion point to the start of the specified block.
Definition Builders.h:433

mlir::OpBuilder::setInsertionPoint
void setInsertionPoint(Block *block, Block::iterator insertPoint)
Set the insertion point to the specified location.
Definition Builders.h:400

mlir::Operation
Operation is the basic unit of execution within MLIR.
Definition Operation.h:87

mlir::Operation::getBlock
Block * getBlock()
Returns the operation block that contains this operation.
Definition Operation.h:230

mlir::Operation::getResult
OpResult getResult(unsigned idx)
Get the 'idx'th result of this operation.
Definition Operation.h:432

mlir::Operation::getParentOfType
OpTy getParentOfType()
Return the closest surrounding parent operation that is of type 'OpTy'.
Definition Operation.h:255

mlir::Operation::getNumResults
unsigned getNumResults()
Return the number of results held by this operation.
Definition Operation.h:429

mlir::PatternRewriter
A special type of RewriterBase that coordinates the application of a rewrite pattern on the current I...
Definition PatternMatch.h:799

mlir::Region
This class contains a list of basic blocks and a link to the parent operation it is attached to.
Definition Region.h:26

mlir::Region::front
Block & front()
Definition Region.h:65

mlir::RewritePatternSet
Definition PatternMatch.h:822

mlir::RewritePatternSet::insert
RewritePatternSet & insert(ConstructorArg &&arg, ConstructorArgs &&...args)
Add an instance of each of the pattern types 'Ts' to the pattern list with the given arguments.
Definition PatternMatch.h:946

mlir::RewriterBase
This class coordinates the application of a rewrite on a set of IR, providing a way for clients to tr...
Definition PatternMatch.h:368

mlir::RewriterBase::replaceOp
virtual void replaceOp(Operation *op, ValueRange newValues)
Replace the results of the given (original) operation with the specified list of values (replacements...
Definition PatternMatch.cpp:127

mlir::RewriterBase::eraseOp
virtual void eraseOp(Operation *op)
This method erases an operation that is known to have no uses.
Definition PatternMatch.cpp:155

mlir::Value
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition Value.h:96

mlir::Value::getLoc
Location getLoc() const
Return the location of this value.
Definition Value.cpp:24

mlir::Value::getDefiningOp
Operation * getDefiningOp() const
If this value is the result of an operation, return the operation that defines it.
Definition Value.cpp:18

mlir::acc::ACCParMappingPolicy::seqDim
ParDimAttrT seqDim(MLIRContext *ctx) const
Definition OpenACCParMapping.h:88

mlir::acc::ACCParMappingPolicy::vectorDim
ParDimAttrT vectorDim(MLIRContext *ctx) const
Definition OpenACCParMapping.h:85

mlir::acc::ACCParMappingPolicy::workerDim
ParDimAttrT workerDim(MLIRContext *ctx) const
Definition OpenACCParMapping.h:82

mlir::acc::ACCParMappingPolicy::gangDim
ParDimAttrT gangDim(MLIRContext *ctx, ParLevel level) const
Convenience methods for specific parallelism levels.
Definition OpenACCParMapping.h:76

mlir::acc::DefaultACCToGPUMappingPolicy
Default policy that provides the standard GPU mapping: gang(dim:1) -> BlockX (gridDim....
Definition OpenACCParMapping.h:119

mlir::arith::ConstantIntOp
Specialization of arith.constant op that returns an integer value.
Definition Arith.h:55

Arith.h

OpenACC.h

mlir::acc
Definition OpenACCSupport.h:65

mlir::acc::getGangParLevel
ParLevel getGangParLevel(int64_t gangDimValue)
Convert a gang dimension value (1, 2, or 3) to the corresponding ParLevel.
Definition OpenACCParMapping.h:33

mlir::acc::getSpecializedRoutineAttrName
static constexpr StringLiteral getSpecializedRoutineAttrName()
Definition OpenACC.h:189

mlir::acc::buildComputeRegion
ComputeRegionOp buildComputeRegion(Location loc, ValueRange launchArgs, ValueRange inputArgs, llvm::StringRef origin, Region &regionToClone, RewriterBase &rewriter, IRMapping &mapping, ValueRange output={}, FlatSymbolRefAttr kernelFuncName={}, FlatSymbolRefAttr kernelModuleName={}, Value stream={}, ValueRange inputArgsToMap={})
Build an acc.compute_region operation by cloning a source region.
Definition OpenACCUtilsCG.cpp:58

mlir::acc::isSpecializedAccRoutine
bool isSpecializedAccRoutine(mlir::Operation *op)
Used to check whether this is a specialized accelerator version of acc routine function.
Definition OpenACC.h:201

mlir::acc::convertACCLoopToSCFParallel
scf::ParallelOp convertACCLoopToSCFParallel(LoopOp loopOp, RewriterBase &rewriter)
Convert acc.loop to scf.parallel.
Definition OpenACCUtilsLoop.cpp:259

mlir::acc::getEnclosingComputeOp
mlir::Operation * getEnclosingComputeOp(mlir::Region &region)
Used to obtain the enclosing compute construct operation that contains the provided region.
Definition OpenACCUtils.cpp:23

mlir::acc::convertUnstructuredACCLoopToSCFExecuteRegion
scf::ExecuteRegionOp convertUnstructuredACCLoopToSCFExecuteRegion(LoopOp loopOp, RewriterBase &rewriter)
Convert an unstructured acc.loop to scf.execute_region.
Definition OpenACCUtilsLoop.cpp:323

mlir::acc::convertACCLoopToSCFFor
scf::ForOp convertACCLoopToSCFFor(LoopOp loopOp, RewriterBase &rewriter, bool enableCollapse)
Convert a structured acc.loop to scf.for.
Definition OpenACCUtilsLoop.cpp:183

mlir::acc::ACCToGPUMappingPolicy
ACCParMappingPolicy< mlir::acc::GPUParallelDimAttr > ACCToGPUMappingPolicy
Type alias for the GPU-specific mapping policy.
Definition OpenACCParMapping.h:158

mlir
Include the generated interface declarations.
Definition ABIRewriteContext.h:29

mlir::matchPattern
bool matchPattern(Value value, const Pattern &pattern)
Entry point for matching a pattern over a Value.
Definition Matchers.h:490

mlir::isConstantIntValue
bool isConstantIntValue(OpFoldResult ofr, int64_t value)
Return true if ofr is constant integer equal to value.
Definition StaticValueUtils.cpp:168

mlir::replaceAllUsesInRegionWith
void replaceAllUsesInRegionWith(Value orig, Value replacement, Region &region)
Replace all uses of orig within the given region with replacement.
Definition RegionUtils.cpp:36

mlir::applyPatternsGreedily
LogicalResult applyPatternsGreedily(Region &region, const FrozenRewritePatternSet &patterns, GreedyRewriteConfig config=GreedyRewriteConfig(), bool *changed=nullptr)
Rewrite ops in the given region, which must be isolated from above, by repeatedly applying the highes...
Definition GreedyPatternRewriteDriver.cpp:934

mlir::SetVector
llvm::SetVector< T, Vector, Set, N > SetVector
Definition LLVM.h:125

mlir::getValueOrCreateCastToIndexLike
Value getValueOrCreateCastToIndexLike(OpBuilder &b, Location loc, Type targetType, Value value)
Create a cast from an index-like value (index or integer) to another index-like value.
Definition Utils.cpp:122

mlir::getUsedValuesDefinedAbove
void getUsedValuesDefinedAbove(Region &region, Region &limit, SetVector< Value > &values)
Fill values with a list of values defined at the ancestors of the limit region and used within region...
Definition RegionUtils.cpp:71

mlir::m_Constant
detail::constant_op_matcher m_Constant()
Matches a constant foldable operation.
Definition Matchers.h:369

mlir::OpRewritePattern
OpRewritePattern is a wrapper around RewritePattern that allows for matching and rewriting against an...
Definition PatternMatch.h:314