MLIR 23.0.0git
ShardToMPI.cpp
Go to the documentation of this file.
1//===- ShardToMPI.cpp - Shard to MPI dialect conversion -----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements a translation of Shard communication ops to MPI ops.
10//
11//===----------------------------------------------------------------------===//
12
15
33#include "mlir/IR/AffineMap.h"
34#include "mlir/IR/Builders.h"
38#include "mlir/IR/SymbolTable.h"
41
42#define DEBUG_TYPE "shard-to-mpi"
43
44namespace mlir {
45#define GEN_PASS_DEF_CONVERTSHARDTOMPIPASS
46#include "mlir/Conversion/Passes.h.inc"
47} // namespace mlir
48
49using namespace mlir;
50using namespace shard;
51
52namespace {
53/// Converts a vector of OpFoldResults (ints) into vector of Values of the
54/// provided type.
57 ValueRange dynamics,
58 Type type = Type()) {
59 SmallVector<Value> values;
60 auto dyn = dynamics.begin();
61 Type i64 = b.getI64Type();
62 if (!type)
63 type = i64;
64 assert((i64 == type || b.getIndexType() == type) &&
65 "expected an i64 or an intex type");
66 for (auto s : statics) {
67 if (s == ShapedType::kDynamic) {
68 values.emplace_back(*(dyn++));
69 } else {
70 TypedAttr val = type == i64 ? b.getI64IntegerAttr(s) : b.getIndexAttr(s);
71 values.emplace_back(arith::ConstantOp::create(b, loc, type, val));
72 }
73 }
74 return values;
75}
76
77/// Create operations converting a linear index to a multi-dimensional index.
78[[maybe_unused]] static SmallVector<Value>
79linearToMultiIndex(Location loc, OpBuilder b, Value linearIndex,
80 ValueRange dimensions) {
81 int n = dimensions.size();
82 SmallVector<Value> multiIndex(n);
83
84 for (int i = n - 1; i >= 0; --i) {
85 multiIndex[i] = arith::RemSIOp::create(b, loc, linearIndex, dimensions[i]);
86 if (i > 0)
87 linearIndex = arith::DivSIOp::create(b, loc, linearIndex, dimensions[i]);
88 }
89
90 return multiIndex;
91}
92
93/// Create operations converting a multi-dimensional index to a linear index.
94Value multiToLinearIndex(Location loc, OpBuilder b, ValueRange multiIndex,
95 ValueRange dimensions) {
96
97 Value linearIndex = arith::ConstantIndexOp::create(b, loc, 0);
98 Value stride = arith::ConstantIndexOp::create(b, loc, 1);
99
100 for (int i = multiIndex.size() - 1; i >= 0; --i) {
101 Value off = arith::MulIOp::create(b, loc, multiIndex[i], stride);
102 linearIndex = arith::AddIOp::create(b, loc, linearIndex, off);
103 stride = arith::MulIOp::create(b, loc, stride, dimensions[i]);
104 }
105
106 return linearIndex;
107}
108
109/// Replace GetShardingOp with related/dependent ShardingOp.
110struct ConvertGetShardingOp : public OpConversionPattern<GetShardingOp> {
111 using OpConversionPattern::OpConversionPattern;
112
113 LogicalResult
114 matchAndRewrite(GetShardingOp op, OpAdaptor adaptor,
115 ConversionPatternRewriter &rewriter) const override {
116 auto shardOp = adaptor.getSource().getDefiningOp<ShardOp>();
117 if (!shardOp)
118 return failure();
119 auto shardingOp = shardOp.getSharding().getDefiningOp<ShardingOp>();
120 if (!shardingOp)
121 return failure();
122
123 rewriter.replaceOp(op, shardingOp.getResult());
124 return success();
125 }
126};
127
/// Convert a sharding op to a tuple of tensors of its components
/// (SplitAxes, HaloSizes, ShardedDimsOffsets)
/// as defined by type converter.
struct ConvertShardingOp : public OpConversionPattern<ShardingOp> {
  using OpConversionPattern::OpConversionPattern;

  LogicalResult
  matchAndRewrite(ShardingOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    auto splitAxes = op.getSplitAxes().getAxes();
    // The widest split-group determines the column count of the 2d tensor.
    int64_t maxNAxes = 0;
    for (auto axes : splitAxes)
      maxNAxes = std::max<int64_t>(maxNAxes, axes.size());

    // To hold the split axes, create empty 2d tensor with shape
    // {splitAxes.size(), max-size-of-split-groups}.
    // Set trailing elements for smaller split-groups to -1.
    Location loc = op.getLoc();
    auto i16 = rewriter.getI16Type();
    auto i64 = rewriter.getI64Type();
    std::array<int64_t, 2> shape = {static_cast<int64_t>(splitAxes.size()),
                                    maxNAxes};
    Value resSplitAxes = tensor::EmptyOp::create(rewriter, loc, shape, i16);
    // -1 marks "no axis" padding for rows shorter than maxNAxes.
    auto attr = IntegerAttr::get(i16, -1);
    Value fillValue = arith::ConstantOp::create(rewriter, loc, i16, attr);
    resSplitAxes =
        linalg::FillOp::create(rewriter, loc, fillValue, resSplitAxes)
            .getResult(0);

    // explicitly write values into tensor row by row
    std::array<int64_t, 2> strides = {1, 1};
    int64_t nSplits = 0;
    ValueRange empty = {};
    for (auto [i, axes] : llvm::enumerate(splitAxes)) {
      int64_t size = axes.size();
      if (size > 0)
        ++nSplits;
      std::array<int64_t, 2> offs = {(int64_t)i, 0};
      std::array<int64_t, 2> sizes = {1, size};
      // Materialize the row as a constant 1-d tensor and insert it as a
      // slice at row i; empty rows insert a zero-sized slice (no-op data).
      auto tensorType = RankedTensorType::get({size}, i16);
      auto attrs = DenseIntElementsAttr::get(tensorType, axes.asArrayRef());
      auto vals = arith::ConstantOp::create(rewriter, loc, tensorType, attrs);
      resSplitAxes = tensor::InsertSliceOp::create(rewriter, loc, vals,
                                                   resSplitAxes, empty, empty,
                                                   empty, offs, sizes, strides);
    }

    // To hold halos sizes, create 2d Tensor with shape {nSplits, 2}.
    // Store the halo sizes in the tensor.
    SmallVector<Value> haloSizes =
        getMixedAsValues(rewriter, loc, adaptor.getStaticHaloSizes(),
                         adaptor.getDynamicHaloSizes());
    auto type = RankedTensorType::get({nSplits, 2}, i64);
    // No halo sizes -> emit a 0x0 placeholder tensor instead.
    Value resHaloSizes =
        haloSizes.empty()
            ? tensor::EmptyOp::create(rewriter, loc,
                                      std::array<int64_t, 2>{0, 0}, i64)
                  .getResult()
            : tensor::FromElementsOp::create(rewriter, loc, type, haloSizes)
                  .getResult();

    // To hold sharded dims offsets, create Tensor with shape {nSplits,
    // maxSplitSize+1}. Store the offsets in the tensor but set trailing
    // elements for smaller split-groups to -1. Computing the max size of the
    // split groups needs using collectiveProcessGroupSize (which needs the
    // GridOp)
    Value resOffsets;
    if (adaptor.getStaticShardedDimsOffsets().empty()) {
      // No offsets provided -> 0x0 placeholder, mirroring the halo case.
      resOffsets = tensor::EmptyOp::create(rewriter, loc,
                                           std::array<int64_t, 2>{0, 0}, i64);
    } else {
      SymbolTableCollection symbolTableCollection;
      auto gridOp = getGrid(op, symbolTableCollection);
      // Row width is the largest split-group size (+1 for the total, below).
      int64_t maxSplitSize = 0;
      for (auto axes : splitAxes) {
        int64_t splitSize =
            collectiveProcessGroupSize(axes.asArrayRef(), gridOp.getShape());
        assert(splitSize != ShapedType::kDynamic);
        maxSplitSize = std::max<int64_t>(maxSplitSize, splitSize);
      }
      assert(maxSplitSize);
      ++maxSplitSize; // add one for the total size

      resOffsets = tensor::EmptyOp::create(
          rewriter, loc, std::array<int64_t, 2>{nSplits, maxSplitSize}, i64);
      // NOTE(review): despite the name "zero", the fill value is
      // ShapedType::kDynamic, apparently used as an "unset" marker for
      // trailing elements — confirm intent against upstream.
      Value zero = arith::ConstantOp::create(
          rewriter, loc, i64, rewriter.getI64IntegerAttr(ShapedType::kDynamic));
      resOffsets =
          linalg::FillOp::create(rewriter, loc, zero, resOffsets).getResult(0);
      SmallVector<Value> offsets =
          getMixedAsValues(rewriter, loc, adaptor.getStaticShardedDimsOffsets(),
                           adaptor.getDynamicShardedDimsOffsets());
      // `curr` tracks the flat read position in `offsets`; each row consumes
      // splitSize+1 entries (offsets plus the total).
      int64_t curr = 0;
      for (auto [i, axes] : llvm::enumerate(splitAxes)) {
        int64_t splitSize =
            collectiveProcessGroupSize(axes.asArrayRef(), gridOp.getShape());
        assert(splitSize != ShapedType::kDynamic && splitSize < maxSplitSize);
        ++splitSize; // add one for the total size
        ArrayRef<Value> values(&offsets[curr], splitSize);
        Value vals = tensor::FromElementsOp::create(rewriter, loc, values);
        std::array<int64_t, 2> offs = {static_cast<int64_t>(i), 0};
        std::array<int64_t, 2> sizes = {1, splitSize};
        resOffsets = tensor::InsertSliceOp::create(rewriter, loc, vals,
                                                   resOffsets, empty, empty,
                                                   empty, offs, sizes, strides);
        curr += splitSize;
      }
    }

    // return a tuple of tensors as defined by type converter
    SmallVector<Type> resTypes;
    if (failed(getTypeConverter()->convertType(op.getResult().getType(),
                                               resTypes)))
      return failure();

    // Cast each component to the exact converted type before packing.
    resSplitAxes =
        tensor::CastOp::create(rewriter, loc, resTypes[0], resSplitAxes);
    resHaloSizes =
        tensor::CastOp::create(rewriter, loc, resTypes[1], resHaloSizes);
    resOffsets = tensor::CastOp::create(rewriter, loc, resTypes[2], resOffsets);

    rewriter.replaceOpWithNewOp<UnrealizedConversionCastOp>(
        op, TupleType::get(op.getContext(), resTypes),
        ValueRange{resSplitAxes, resHaloSizes, resOffsets});

    return success();
  }
};
256
257class ConvertProcessLinearIndexOp
258 : public OpConversionPattern<ProcessLinearIndexOp> {
259
260public:
261 using OpConversionPattern::OpConversionPattern;
262
263 LogicalResult
264 matchAndRewrite(ProcessLinearIndexOp op, OpAdaptor adaptor,
265 ConversionPatternRewriter &rewriter) const override {
266 // Create mpi::CommRankOp
267 Location loc = op.getLoc();
268 auto *ctx = op.getContext();
269 Value commWorld =
270 mpi::CommWorldOp::create(rewriter, loc, mpi::CommType::get(ctx));
271 auto rank = mpi::CommRankOp::create(
272 rewriter, loc,
273 TypeRange{mpi::RetvalType::get(ctx), rewriter.getI32Type()},
274 commWorld)
275 .getRank();
276 rewriter.replaceOpWithNewOp<arith::IndexCastOp>(op, rewriter.getIndexType(),
277 rank);
278 return success();
279 }
280};
281
/// Lower NeighborsLinearIndicesOp: yields the linear indices of the two
/// neighboring processes (down/up) along a single split axis.
struct ConvertNeighborsLinearIndicesOp
    : public OpConversionPattern<NeighborsLinearIndicesOp> {
  using OpConversionPattern::OpConversionPattern;

  LogicalResult
  matchAndRewrite(NeighborsLinearIndicesOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {

    // Computes the neighbors indices along a split axis by simply
    // adding/subtracting 1 to the current index in that dimension.
    // Assigns -1 if neighbor is out of bounds.

    auto axes = adaptor.getSplitAxes();
    // For now only single axis sharding is supported
    if (axes.size() != 1)
      return failure();

    Location loc = op.getLoc();
    SymbolTableCollection symbolTableCollection;
    auto gridOp = getGrid(op, symbolTableCollection);
    // Multi-dimensional index of this process; orgIdx is its coordinate on
    // the (single) split axis.
    auto mIdx = adaptor.getDevice();
    auto orgIdx = mIdx[axes[0]];
    // Materialize all grid dimension sizes as index constants.
    SmallVector<Value> dims;
    llvm::transform(
        gridOp.getShape(), std::back_inserter(dims), [&](int64_t i) {
          return arith::ConstantIndexOp::create(rewriter, loc, i).getResult();
        });
    Value dimSz = dims[axes[0]];
    Value one = arith::ConstantIndexOp::create(rewriter, loc, 1);
    Value minus1 = arith::ConstantIndexOp::create(rewriter, loc, -1);
    // "Down" neighbor: -1 at the lower border, otherwise linearize the
    // coordinate decremented by one.
    Value atBorder =
        arith::CmpIOp::create(rewriter, loc, arith::CmpIPredicate::sle, orgIdx,
                              arith::ConstantIndexOp::create(rewriter, loc, 0));
    auto down = scf::IfOp::create(
        rewriter, loc, atBorder,
        [&](OpBuilder &builder, Location loc) {
          scf::YieldOp::create(builder, loc, minus1);
        },
        [&](OpBuilder &builder, Location loc) {
          // NOTE(review): this callback uses the outer `rewriter` and
          // `op.getLoc()` instead of the supplied `builder`/`loc` — the ops
          // are created at the rewriter's insertion point, not inside the
          // else-region. Presumably intentional (values dominate the scf.if),
          // but confirm against upstream.
          SmallVector<Value> tmp = mIdx;
          tmp[axes[0]] =
              arith::SubIOp::create(rewriter, op.getLoc(), orgIdx, one)
                  .getResult();
          scf::YieldOp::create(builder, loc,
                               multiToLinearIndex(loc, rewriter, tmp, dims));
        });
    // "Up" neighbor: -1 at the upper border (orgIdx >= dimSz - 1),
    // otherwise linearize the coordinate incremented by one.
    atBorder = arith::CmpIOp::create(
        rewriter, loc, arith::CmpIPredicate::sge, orgIdx,
        arith::SubIOp::create(rewriter, loc, dimSz, one).getResult());
    auto up = scf::IfOp::create(
        rewriter, loc, atBorder,
        [&](OpBuilder &builder, Location loc) {
          scf::YieldOp::create(builder, loc, minus1);
        },
        [&](OpBuilder &builder, Location loc) {
          // NOTE(review): same rewriter-vs-builder mix as above.
          SmallVector<Value> tmp = mIdx;
          tmp[axes[0]] =
              arith::AddIOp::create(rewriter, op.getLoc(), orgIdx, one);
          scf::YieldOp::create(builder, loc,
                               multiToLinearIndex(loc, rewriter, tmp, dims));
        });
    rewriter.replaceOp(op, ValueRange{down.getResult(0), up.getResult(0)});
    return success();
  }
};
347
/// Lower ShardShapeOp: compute the local (per-process) shape of a sharded
/// tensor from the global shape, the sharding, and the device index.
struct ConvertShardShapeOp : public OpConversionPattern<ShardShapeOp> {
  using OpConversionPattern::OpConversionPattern;

  LogicalResult
  matchAndRewrite(ShardShapeOp op, OneToNOpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    auto sharding = op.getSharding().getDefiningOp<ShardingOp>();
    if (!sharding) {
      return op->emitError()
             << "Expected ShardingOp as defining op for sharding"
             << " but found " << adaptor.getSharding()[0].getDefiningOp();
    }

    // Compute the sharded shape by applying the sharding to the input shape.
    // If shardedDimsOffsets is not defined in the sharding, the shard shape is
    // computed by dividing the dimension size by the number of shards in that
    // dimension (which is given by the size of the grid axes provided in
    // split-axes). Odd elements get distributed to trailing shards. If a
    // shardedDimsOffsets is provided, the shard shape is computed by
    // subtracting the offset of the current shard from the offset of the next
    // shard.

    Location loc = op.getLoc();
    Type index = rewriter.getIndexType();

    // This is a 1:N conversion because the sharding op is a 1:3 conversion.
    // The operands in the adaptor are a vector<ValeRange>. For dims and device
    // we have a 1:1 conversion.
    // For simpler access fill a vector with the dynamic dims.
    SmallVector<Value> dynDims, dynDevice;
    for (auto dim : adaptor.getDimsDynamic()) {
      // type conversion should be 1:1 for ints
      dynDims.emplace_back(llvm::getSingleElement(dim));
    }
    // same for device
    for (auto device : adaptor.getDeviceDynamic()) {
      dynDevice.emplace_back(llvm::getSingleElement(device));
    }

    // To keep the code simple, convert dims/device to values when they are
    // attributes. Count on canonicalization to fold static values.
    SmallVector<Value> shape =
        getMixedAsValues(rewriter, loc, op.getDims(), dynDims, index);
    SmallVector<Value> multiIdx =
        getMixedAsValues(rewriter, loc, adaptor.getDevice(), dynDevice, index);

    // Get the GridOp, the grid shape is needed to compute the sharded shape.
    SymbolTableCollection symbolTableCollection;
    auto gridOp = getGrid(sharding, symbolTableCollection);
    // For now we only support static grid shapes
    if (ShapedType::isDynamicShape(gridOp.getShape()))
      return failure();

    auto splitAxes = sharding.getSplitAxes().getAxes();
    // shardedDimsOffsets are optional and might be Values (not attributes).
    // Also, the shardId might be dynamic which means the position in the
    // shardedDimsOffsets is not statically known. Create a tensor of the
    // shardedDimsOffsets and later extract the offsets for computing the
    // local shard-size.
    Value shardedDimsOffs;
    {
      SmallVector<Value> tmp = getMixedAsValues(
          rewriter, loc, sharding.getStaticShardedDimsOffsets(),
          sharding.getDynamicShardedDimsOffsets(), index);
      if (!tmp.empty())
        shardedDimsOffs = tensor::FromElementsOp::create(
            rewriter, loc, RankedTensorType::get({(int64_t)tmp.size()}, index),
            tmp);
    }

    // With static grid shape the sizes of the split axes are known.
    // Hence the start/pos for each split axes in shardDimsOffsets can be
    // computed statically.
    int64_t pos = 0;
    SmallVector<Value> shardShape;
    Value zero =
        arith::ConstantOp::create(rewriter, loc, rewriter.getZeroAttr(index));
    Value one =
        arith::ConstantOp::create(rewriter, loc, rewriter.getOneAttr(index));

    // Iterate over the dimensions of the tensor shape, get their split Axes,
    // and compute the sharded shape.
    for (auto [i, dim] : llvm::enumerate(shape)) {
      // Trailing dimensions might not be annotated.
      if (i < splitAxes.size() && !splitAxes[i].empty()) {
        auto axes = splitAxes[i];
        // The current dimension might not be sharded.
        // Create a value from the static position in shardDimsOffsets.
        Value posVal = arith::ConstantOp::create(rewriter, loc,
                                                 rewriter.getIndexAttr(pos));
        // Get the index of the local shard in the grid axis.
        Value idx = multiIdx[axes[0]];
        auto numShards =
            collectiveProcessGroupSize(axes.asArrayRef(), gridOp.getShape());
        if (shardedDimsOffs) {
          // If sharded dims offsets are provided, use them to compute the
          // sharded shape.
          if (axes.size() > 1) {
            return op->emitError() << "Only single axis sharding is "
                                   << "supported for each dimension.";
          }
          idx = arith::AddIOp::create(rewriter, loc, posVal, idx);
          // Compute size = shardedDimsOffs[idx+1] - shardedDimsOffs[idx].
          Value off =
              tensor::ExtractOp::create(rewriter, loc, shardedDimsOffs, idx);
          idx = arith::AddIOp::create(rewriter, loc, idx, one);
          Value nextOff =
              tensor::ExtractOp::create(rewriter, loc, shardedDimsOffs, idx);
          Value sz = arith::SubIOp::create(rewriter, loc, nextOff, off);
          shardShape.emplace_back(sz);
        } else {
          Value numShardsVal = arith::ConstantOp::create(
              rewriter, loc, rewriter.getIndexAttr(numShards));
          // Compute shard dim size by distributing odd elements to trailing
          // shards:
          // sz = dim / numShards
          //      + (idx >= (numShards - (dim % numShards)) ? 1 : 0)
          Value sz = arith::DivSIOp::create(rewriter, loc, dim, numShardsVal);
          Value sz1 = arith::RemSIOp::create(rewriter, loc, dim, numShardsVal);
          sz1 = arith::SubIOp::create(rewriter, loc, numShardsVal, sz1);
          auto cond = arith::CmpIOp::create(
              rewriter, loc, arith::CmpIPredicate::sge, idx, sz1);
          Value odd = arith::SelectOp::create(rewriter, loc, cond, one, zero);
          sz = arith::AddIOp::create(rewriter, loc, sz, odd);
          shardShape.emplace_back(sz);
        }
        // Advance the static read position past this dimension's row
        // (numShards offsets plus the total).
        pos += numShards + 1; // add one for the total size.
      } // else no sharding if split axis is empty or no split axis
      // If no size was added -> no sharding in this dimension.
      if (shardShape.size() <= i)
        shardShape.emplace_back(dim);
    }
    assert(shardShape.size() == shape.size());
    rewriter.replaceOp(op, shardShape);
    return success();
  }
};
485
486static mpi::MPI_ReductionOpEnumAttr getMPIReductionOp(ReductionKindAttr kind) {
487 auto *ctx = kind.getContext();
488 auto getReductionOp = [ctx](mpi::MPI_ReductionOpEnum redOp) {
489 return mpi::MPI_ReductionOpEnumAttr::get(ctx, redOp);
490 };
491
492 switch (kind.getValue()) {
493 case ReductionKind::Sum:
494 return getReductionOp(mpi::MPI_ReductionOpEnum::MPI_SUM);
495 case ReductionKind::Product:
496 return getReductionOp(mpi::MPI_ReductionOpEnum::MPI_PROD);
497 case ReductionKind::Min:
498 return getReductionOp(mpi::MPI_ReductionOpEnum::MPI_MIN);
499 case ReductionKind::Max:
500 return getReductionOp(mpi::MPI_ReductionOpEnum::MPI_MAX);
501 case ReductionKind::BitwiseAnd:
502 return getReductionOp(mpi::MPI_ReductionOpEnum::MPI_BAND);
503 case ReductionKind::BitwiseOr:
504 return getReductionOp(mpi::MPI_ReductionOpEnum::MPI_BOR);
505 case ReductionKind::BitwiseXor:
506 return getReductionOp(mpi::MPI_ReductionOpEnum::MPI_BXOR);
507 default:
508 llvm_unreachable("Unknown/unsupported reduction kind");
509 }
510}
511
512template <typename CommOp>
513struct CommOpPattern : public OpConversionPattern<CommOp> {
514 using OpConversionPattern<CommOp>::OpConversionPattern;
515
516 MemRefType getMemrefType(ShapedType tensorType) const {
517 return MemRefType::get(tensorType.getShape(), tensorType.getElementType());
518 }
519
520 Value getAsMemref(Value input, ImplicitLocOpBuilder &iBuilder,
521 bool readOnly) const {
522 auto itype = input.getType();
523 // If the source is a tensor, materialize a memref for it.
524 if (isa<RankedTensorType>(itype)) {
525 auto memrefType = getMemrefType(cast<ShapedType>(itype));
526 input = bufferization::ToBufferOp::create(iBuilder, memrefType, input,
527 readOnly);
528 } else {
529 assert(isa<MemRefType>(itype) &&
530 "expected input to be of MemRefType or TensorType");
531 }
532 return input;
533 }
534
535 FailureOr<GridOp> checkGrid(CommOp op,
536 SymbolTableCollection &symbolTableCollection,
537 bool allowDynamic = false) const {
538 GridOp gridOp = getGrid(op, symbolTableCollection);
539 if (!gridOp)
540 return op->emitError() << "Missing grid symbol.";
541 if (!allowDynamic && ShapedType::isDynamicShape(gridOp.getShape()))
542 return op->emitError() << "Dynamic grid shape not supported.";
543 return gridOp;
544 }
545
546 // Get an MPI_Comm_split for a given grid and axes.
547 // The color is the linear index of the process in the grid along the
548 // non-'grid-axes'. The key is the linear index of the process in the grid
549 // along the grid-axes.
550 Value getComm(GridOp &gridOp, ::llvm::ArrayRef<int16_t> gridAxes,
551 ImplicitLocOpBuilder &iBuilder) const {
552 size_t gridDims = gridOp.getShape().size();
553 auto commType = mpi::CommType::get(gridOp->getContext());
554 Value commWorld = mpi::CommWorldOp::create(iBuilder, commType);
555
556 if (gridAxes.empty() || gridAxes.size() >= gridDims) {
557 return commWorld;
558 }
559
560 SmallVector<GridAxis> otherAxes;
561 for (GridAxis i = 0; i < static_cast<GridAxis>(gridDims); ++i) {
562 if (!llvm::is_contained(gridAxes, i))
563 otherAxes.emplace_back(i);
564 }
565
566 SmallVector<Type> indexResultTypes(otherAxes.size(),
567 iBuilder.getIndexType());
568
569 Value color =
570 createProcessLinearIndex(iBuilder, gridOp.getSymName(), otherAxes);
571 color = arith::IndexCastOp::create(iBuilder, iBuilder.getI32Type(), color);
572
573 Value key =
574 createProcessLinearIndex(iBuilder, gridOp.getSymName(), gridAxes);
575 key = arith::IndexCastOp::create(iBuilder, iBuilder.getI32Type(), key);
576
577 // Finally split the communicator
578 return mpi::CommSplitOp::create(iBuilder, commType, commWorld, color, key)
579 .getNewcomm();
580 }
581};
582
583struct ConvertAllReduceOp : public CommOpPattern<AllReduceOp> {
584 using CommOpPattern::CommOpPattern;
585
586 LogicalResult
587 matchAndRewrite(AllReduceOp op, OpAdaptor adaptor,
588 ConversionPatternRewriter &rewriter) const override {
589 SymbolTableCollection symbolTableCollection;
590 FailureOr<GridOp> gridOp = checkGrid(op, symbolTableCollection);
591 if (failed(gridOp))
592 return failure();
593 ImplicitLocOpBuilder iBuilder(op.getLoc(), rewriter);
594 Value input = getAsMemref(adaptor.getInput(), iBuilder, true);
595 MemRefType inType = cast<MemRefType>(input.getType());
597 return op.emitError(
598 "Expected static shaped memref in contiguous row-major layout.");
599 MemRefType outType = getMemrefType(cast<ShapedType>(op.getType()));
601 return op.emitError(
602 "Expected static shaped memref in contiguous row-major layout.");
603
604 // Allocate buffer and copy input to buffer.
605 Value buffer = memref::AllocOp::create(iBuilder, outType);
606 linalg::CopyOp::create(iBuilder, input, buffer);
607 // Get the right communicator
608 Value comm = getComm(*gridOp, adaptor.getGridAxes(), iBuilder);
609 // Create the MPI AllReduce operation.
610 mpi::AllReduceOp::create(iBuilder, TypeRange(), buffer, buffer,
611 getMPIReductionOp(adaptor.getReductionAttr()),
612 comm);
613
614 // If the destination is a tensor, cast it to a tensor
615 if (isa<RankedTensorType>(op.getType()))
616 buffer = bufferization::ToTensorOp::create(iBuilder, op.getType(), buffer,
617 true);
618 rewriter.replaceOp(op, buffer);
619 return success();
620 }
621};
622
623struct ConvertReduceScatterOp : public CommOpPattern<ReduceScatterOp> {
624 using CommOpPattern::CommOpPattern;
625
626 // shard.reduce_scatter reduces and then scatters along a specified
627 // scatter-dim. mpi.reduce_scatter_block always scatters along the first
628 // dimension. Hence, if scatter-dim != 0, we need to rearrange the input
629 // data by expanding the scatter-dim into {nRanks, output_scatter_dim} and
630 // transposing nRanks to the first dimension.
631
632 LogicalResult
633 matchAndRewrite(ReduceScatterOp op, OpAdaptor adaptor,
634 ConversionPatternRewriter &rewriter) const override {
635 auto gridAxes = adaptor.getGridAxes();
636 int64_t scatterDim = adaptor.getScatterDimAttr().getInt();
637
638 SymbolTableCollection symbolTableCollection;
639 FailureOr<GridOp> gridOp = checkGrid(op, symbolTableCollection);
640 if (failed(gridOp))
641 return failure();
642
643 ImplicitLocOpBuilder ib(op.getLoc(), rewriter);
644 Value rawInput = adaptor.getInput();
645 auto inShapedType = cast<ShapedType>(rawInput.getType());
646 MemRefType outType = getMemrefType(cast<ShapedType>(op.getType()));
647 auto elemType = outType.getElementType();
648 auto inputShape = inShapedType.getShape();
649 auto outputShape = outType.getShape();
650 int64_t inputDimOnAxis = inputShape[scatterDim];
651 int64_t outputDimOnAxis = outputShape[scatterDim];
652
653 for (size_t i = 0; i < outputShape.size(); ++i)
654 if (outputShape[i] != inputShape[i] &&
655 i != static_cast<size_t>(scatterDim))
656 return op.emitError(
657 "Result and input shapes must match along non-scatter axes.");
658 if (outputDimOnAxis == 0)
659 return op.emitError(
660 "Output size along the scatter axis must be non-zero.");
661 if (inputDimOnAxis % outputDimOnAxis != 0)
662 return op.emitError(
663 "Input size along the scatter axis must be an exact "
664 "multiple of the output size along the scatter axis.");
665
667 return op.emitError("Result must be a statically shaped memref in "
668 "contiguous row-major layout.");
669
670 int64_t nRanks = inputDimOnAxis / outputDimOnAxis;
671
672 // Verify that nRanks matches the number of devices along the grid axes.
673 int64_t gridGroupSize =
674 collectiveProcessGroupSize(gridAxes, gridOp->getShape());
675 if (nRanks != gridGroupSize)
676 return op.emitError()
677 << "Expected the scatter factor (" << nRanks
678 << ") to match the number of devices along grid_axes ("
679 << gridGroupSize << ").";
680
681 // Get the right communicator.
682 Value comm = getComm(*gridOp, gridAxes, ib);
683
684 Value mpiInput;
685 if (scatterDim == 0) {
686 // scatter_dim == 0 maps directly to MPI_Reduce_scatter_block.
687 // Input must be contiguous for MPI.
688 Value input = getAsMemref(rawInput, ib, true);
689 MemRefType inType = cast<MemRefType>(input.getType());
691 return op.emitError("Input must be a statically shaped memref in "
692 "contiguous row-major layout.");
693 mpiInput = input;
694 } else {
695 // For scatter_dim != 0 we rearrange the input so the scatter factor
696 // becomes the first dimension.
697 //
698 // 1. Get a tensor representation of the input (avoid memref->tensor
699 // round-trip if the input is already a tensor).
700 Value tensorInput = rawInput;
701 if (!isa<RankedTensorType>(rawInput.getType())) {
702 auto inTensorType = RankedTensorType::get(inputShape, elemType);
703 tensorInput =
704 bufferization::ToTensorOp::create(ib, inTensorType, rawInput, true);
705 }
706
707 // 2. Expand the scatter dim from {d0, ..., d_sd, ..., dN} to
708 // {d0, ..., nRanks, o_sd, ..., dN}.
709 SmallVector<int64_t> expandedShape;
710 SmallVector<ReassociationIndices> expandReassociation;
711 int64_t expandedIdx = 0;
712 for (int64_t i = 0; i < static_cast<int64_t>(inputShape.size()); ++i) {
713 if (i == scatterDim) {
714 expandedShape.push_back(nRanks);
715 expandedShape.push_back(outputDimOnAxis);
716 expandReassociation.push_back({expandedIdx, expandedIdx + 1});
717 expandedIdx += 2;
718 } else {
719 expandedShape.push_back(inputShape[i]);
720 expandReassociation.push_back({expandedIdx});
721 expandedIdx += 1;
722 }
723 }
724 auto expandedType = RankedTensorType::get(expandedShape, elemType);
725 tensorInput = tensor::ExpandShapeOp::create(ib, expandedType, tensorInput,
726 expandReassociation);
727
728 // 3. Transpose to move nRanks (at position scatterDim) to position 0:
729 // {d0, ..., nRanks, o_sd, ..., dN} -> {nRanks, d0, ..., o_sd, ..., dN}
730 SmallVector<int64_t> permutation, transposedShape;
731 permutation.emplace_back(scatterDim);
732 for (int64_t i = 0; i < scatterDim; ++i)
733 permutation.emplace_back(i);
734 for (int64_t i = scatterDim + 1; i < (int64_t)expandedShape.size(); ++i)
735 permutation.emplace_back(i);
736 for (auto p : permutation)
737 transposedShape.emplace_back(expandedShape[p]);
738
739 Value permOutput = tensor::EmptyOp::create(ib, transposedShape, elemType);
740 tensorInput =
741 linalg::TransposeOp::create(ib, tensorInput, permOutput, permutation)
742 ->getResult(0);
743
744 // 4. Materialize as contiguous memref for MPI by copying into a
745 // freshly allocated buffer.
746 auto mpiInType = MemRefType::get(transposedShape, elemType);
747 Value transposedBuf =
748 bufferization::ToBufferOp::create(ib, mpiInType, tensorInput, true);
749 mpiInput = memref::AllocOp::create(ib, mpiInType);
750 linalg::CopyOp::create(ib, transposedBuf, mpiInput);
751 }
752
753 // Allocate output buffer.
754 Value output = memref::AllocOp::create(ib, outType);
755 // Create the MPI ReduceScatter operation.
756 mpi::ReduceScatterBlockOp::create(
757 ib, TypeRange(), mpiInput, output,
758 getMPIReductionOp(adaptor.getReductionAttr()), comm);
759
760 // If the destination is a tensor, cast it to a tensor.
761 if (isa<RankedTensorType>(op.getType()))
762 output =
763 bufferization::ToTensorOp::create(ib, op.getType(), output, true);
764 else if (scatterDim != 0) // Deallocate the temporary input buffer
765 memref::DeallocOp::create(ib, mpiInput);
766 // Notice: If this is called from tensor-world, then we assume an extra pass
767 // will take care of deallocating the intermediate buffers.
768
769 rewriter.replaceOp(op, output);
770 return success();
771 }
772};
773
774struct ConvertAllGatherOp : public CommOpPattern<AllGatherOp> {
775 using CommOpPattern::CommOpPattern;
776
777 // shard.allgather concatenates along a specified gather-axis.
778 // mpi.allgather always concatenates along the first dimension and
779 // there is no MPI operation that allows gathering along an arbitrary axis.
780 // Hence, if gather-axis != 0, we need to permute the output buffer
781 // accordingly.
782
783 LogicalResult
784 matchAndRewrite(AllGatherOp op, OpAdaptor adaptor,
785 ConversionPatternRewriter &rewriter) const override {
786 SymbolTableCollection symbolTableCollection;
787 FailureOr<GridOp> gridOp = checkGrid(op, symbolTableCollection);
788 if (failed(gridOp))
789 return failure();
790
791 ImplicitLocOpBuilder ib(op.getLoc(), rewriter);
792 Value input = getAsMemref(adaptor.getInput(), ib, true);
793 MemRefType inType = cast<MemRefType>(input.getType());
794 MemRefType outType = getMemrefType(cast<ShapedType>(op.getType()));
795 auto inputShape = inType.getShape();
796 auto outputShape = outType.getShape();
797 int64_t gatherAxis = adaptor.getGatherAxisAttr().getInt();
798 int64_t inputDimOnAxis = inputShape[gatherAxis];
799 int64_t outputDimOnAxis = outputShape[gatherAxis];
800
801 for (size_t i = 0; i < outputShape.size(); ++i)
802 if (outputShape[i] != inputShape[i] && i != (size_t)gatherAxis)
803 return op.emitError(
804 "Result and input shapes must match along non-gather axes.");
805 if (inputDimOnAxis == 0)
806 return op.emitError("Input size along the gather axis must be non-zero.");
807 if (inputDimOnAxis == 1) {
808 assert(outputDimOnAxis == inputDimOnAxis);
809 rewriter.replaceOp(op, adaptor.getInput());
810 return success();
811 }
812 if (outputDimOnAxis % inputDimOnAxis != 0)
813 return op.emitError("Result size along the gather axis must be an exact "
814 "multiple of the input size along the gather axis.");
815
818 return op.emitError("Input/result must be statically shaped memrefs in "
819 "contiguous row-major layout.");
820
821 // Get the right communicator.
822 Value comm = getComm(*gridOp, adaptor.getGridAxes(), ib);
823 Value nRanksV =
824 mpi::CommSizeOp::create(ib, ib.getI32Type(), comm).getSize();
825 nRanksV = arith::IndexCastOp::create(ib, ib.getIndexType(), nRanksV);
826 int64_t nRanks = outputDimOnAxis / inputDimOnAxis;
827 Value nRanksC = arith::ConstantIndexOp::create(ib, nRanks);
828 Value notError =
829 arith::CmpIOp::create(ib, arith::CmpIPredicate::eq, nRanksV, nRanksC);
830 cf::AssertOp::create(ib, notError,
831 "Expected number of ranks in the communicator to "
832 "match the output size along the gather axis divided "
833 "by the input size along the gather axis.");
834
835 // mpi.allgather always concatenates along the first dimension, so
836 // get a output buffer of shape {nRanks, dim0, ...}.
837 SmallVector<int64_t> gatherShape;
838 gatherShape.emplace_back(nRanks);
839 gatherShape.append(inputShape.begin(), inputShape.end());
840 auto gatherType = MemRefType::get(gatherShape, outType.getElementType());
841 Value finalOutput = memref::AllocOp::create(ib, gatherType);
842 // Create the MPI AllGather operation.
843 mpi::AllGatherOp::create(ib, TypeRange(), input, finalOutput, comm);
844
845 if (gatherAxis == 0) {
846 // If gather axis == 0, simply collapse the first 2 dims from {nRanks,
847 // dim0, ...} to {nRanks*dim0, ...}.
848 SmallVector<ReassociationIndices> reassociation;
849 reassociation.push_back({0, 1});
850 int64_t numGatherDims = gatherShape.size();
851 for (int64_t i = 2; i < numGatherDims; ++i)
852 reassociation.push_back({i});
853 finalOutput = memref::CollapseShapeOp::create(ib, outType, finalOutput,
854 reassociation);
855
856 // If the op's result is a tensor, cast it to a tensor.
857 if (isa<RankedTensorType>(op.getType()))
858 finalOutput = bufferization::ToTensorOp::create(ib, op.getType(),
859 finalOutput, true);
860 } else {
861 // 1. Enter tensor-land.
862 auto inType =
863 RankedTensorType::get(gatherShape, outType.getElementType());
864 finalOutput =
865 bufferization::ToTensorOp::create(ib, inType, finalOutput, true);
866
867 // 2. Permute the output buffer from {nRanks, dim0, ..., gatherAxis, ...}
868 // to {dim0, ..., nRanks, dim1,...}.
869 SmallVector<int64_t> outShapePermuted, permutation;
870 for (int i = 1; i <= gatherAxis; ++i) {
871 outShapePermuted.emplace_back(gatherShape[i]);
872 permutation.emplace_back(i);
873 }
874 outShapePermuted.emplace_back(gatherShape[0]);
875 permutation.emplace_back(0);
876 for (size_t i = gatherAxis + 1; i < gatherShape.size(); ++i) {
877 outShapePermuted.emplace_back(gatherShape[i]);
878 permutation.emplace_back(i);
879 }
880 Value permOutput = tensor::EmptyOp::create(ib, outShapePermuted,
881 outType.getElementType());
882 finalOutput =
883 linalg::TransposeOp::create(ib, finalOutput, permOutput, permutation)
884 ->getResult(0);
885
886 // 3. Collapse the output buffer from {dim0, ..., nRanks, gatherAxis, ...}
887 // to {dim0, ..., nRanks*gatherAxis, ...}.
888 SmallVector<ReassociationIndices> reassociation;
889 for (int64_t i = 0; i < gatherAxis; ++i) {
890 reassociation.push_back({i});
891 }
892 reassociation.push_back({gatherAxis, gatherAxis + 1});
893 for (int64_t i = gatherAxis + 2; i < (int64_t)outShapePermuted.size();
894 ++i) {
895 reassociation.push_back({i});
896 }
897 auto outTType =
898 RankedTensorType::get(outputShape, outType.getElementType());
899 finalOutput = tensor::CollapseShapeOp::create(ib, outTType, finalOutput,
900 reassociation);
901
902 // 4. Cast back to memref if needed.
903 if (isa<MemRefType>(op.getType()))
904 finalOutput =
905 bufferization::ToBufferOp::create(ib, outType, finalOutput, false);
906 }
907
908 rewriter.replaceOp(op, finalOutput);
909 return success();
910 }
911};
912
913struct ConvertUpdateHaloOp : public OpConversionPattern<UpdateHaloOp> {
914 using OpConversionPattern::OpConversionPattern;
915
916 LogicalResult
917 matchAndRewrite(UpdateHaloOp op, OpAdaptor adaptor,
918 ConversionPatternRewriter &rewriter) const override {
919
920 // The input/output memref is assumed to be in C memory order.
921 // Halos are exchanged as 2 blocks per dimension (one for each side: down
922 // and up). For each haloed dimension `d`, the exchanged blocks are
923 // expressed as multi-dimensional subviews. The subviews include potential
924 // halos of higher dimensions `dh > d`, no halos for the lower dimensions
925 // `dl < d` and for dimension `d` the currently exchanged halo only.
926 // By iterating form higher to lower dimensions this also updates the halos
927 // in the 'corners'.
928 // memref.subview is used to read and write the halo data from and to the
929 // local data. Because subviews and halos can have mixed dynamic and static
930 // shapes, OpFoldResults are used whenever possible.
931
932 auto haloSizes = getMixedValues(adaptor.getStaticHaloSizes(),
933 adaptor.getHaloSizes(), rewriter);
934 if (haloSizes.empty()) {
935 // no halos -> nothing to do
936 rewriter.replaceOp(op, adaptor.getDestination());
937 return success();
938 }
939
940 SymbolTableCollection symbolTableCollection;
941 Location loc = op.getLoc();
942
943 // convert a OpFoldResult into a Value
944 auto toValue = [&rewriter, &loc](OpFoldResult &v) -> Value {
945 if (auto value = dyn_cast<Value>(v))
946 return value;
947 return arith::ConstantOp::create(
948 rewriter, loc,
949 rewriter.getIndexAttr(
950 cast<IntegerAttr>(cast<Attribute>(v)).getInt()));
951 };
952
953 auto dest = adaptor.getDestination();
954 auto dstShape = cast<ShapedType>(dest.getType()).getShape();
955 Value array = dest;
956 if (isa<RankedTensorType>(array.getType())) {
957 // If the destination is a memref, we need to cast it to a tensor
958 auto mmemrefType = MemRefType::get(
959 dstShape, cast<ShapedType>(array.getType()).getElementType());
960 array =
961 bufferization::ToBufferOp::create(rewriter, loc, mmemrefType, array);
962 }
963 auto rank = cast<ShapedType>(array.getType()).getRank();
964 auto opSplitAxes = adaptor.getSplitAxes().getAxes();
965 auto grid = adaptor.getGrid();
966 auto gridOp = getGrid(op, symbolTableCollection);
967 // subviews need Index values
968 for (auto &sz : haloSizes) {
969 if (auto value = dyn_cast<Value>(sz))
970 sz = arith::IndexCastOp::create(rewriter, loc, rewriter.getIndexType(),
971 value)
972 .getResult();
973 }
974
975 // most of the offset/size/stride data is the same for all dims
976 SmallVector<OpFoldResult> offsets(rank, rewriter.getIndexAttr(0));
977 SmallVector<OpFoldResult> strides(rank, rewriter.getIndexAttr(1));
978 SmallVector<OpFoldResult> shape(rank), dimSizes(rank);
979 auto currHaloDim = -1; // halo sizes are provided for split dimensions only
980 // we need the actual shape to compute offsets and sizes
981 for (auto i = 0; i < rank; ++i) {
982 auto s = dstShape[i];
983 if (ShapedType::isDynamic(s))
984 shape[i] = memref::DimOp::create(rewriter, loc, array, s).getResult();
985 else
986 shape[i] = rewriter.getIndexAttr(s);
987
988 if ((size_t)i < opSplitAxes.size() && !opSplitAxes[i].empty()) {
989 ++currHaloDim;
990 // the offsets for lower dim sstarts after their down halo
991 offsets[i] = haloSizes[currHaloDim * 2];
992
993 // prepare shape and offsets of highest dim's halo exchange
994 Value _haloSz = arith::AddIOp::create(
995 rewriter, loc, toValue(haloSizes[currHaloDim * 2]),
996 toValue(haloSizes[currHaloDim * 2 + 1]));
997 // the halo shape of lower dims exlude the halos
998 dimSizes[i] =
999 arith::SubIOp::create(rewriter, loc, toValue(shape[i]), _haloSz)
1000 .getResult();
1001 } else {
1002 dimSizes[i] = shape[i];
1003 }
1004 }
1005
1006 auto tagAttr = rewriter.getI32IntegerAttr(91); // we just pick something
1007 auto tag = arith::ConstantOp::create(rewriter, loc, tagAttr);
1008 auto zeroAttr = rewriter.getI32IntegerAttr(0); // for detecting v<0
1009 auto zero = arith::ConstantOp::create(rewriter, loc, zeroAttr);
1010
1011 SmallVector<Type> indexResultTypes(gridOp.getShape().size(),
1012 rewriter.getIndexType());
1013 auto myMultiIndex =
1014 ProcessMultiIndexOp::create(rewriter, loc, indexResultTypes, grid)
1015 .getResult();
1016 // traverse all split axes from high to low dim
1017 for (ssize_t dim = opSplitAxes.size() - 1; dim >= 0; --dim) {
1018 auto splitAxes = opSplitAxes[dim];
1019 if (splitAxes.empty())
1020 continue;
1021 assert(currHaloDim >= 0 && (size_t)currHaloDim < haloSizes.size() / 2);
1022 // Get the linearized ids of the neighbors (down and up) for the
1023 // given split
1024 auto tmp = NeighborsLinearIndicesOp::create(rewriter, loc, grid,
1025 myMultiIndex, splitAxes)
1026 .getResults();
1027 // MPI operates on i32...
1028 Value neighbourIDs[2] = {
1029 arith::IndexCastOp::create(rewriter, loc, rewriter.getI32Type(),
1030 tmp[0]),
1031 arith::IndexCastOp::create(rewriter, loc, rewriter.getI32Type(),
1032 tmp[1])};
1033
1034 auto lowerRecvOffset = rewriter.getIndexAttr(0);
1035 auto lowerSendOffset = toValue(haloSizes[currHaloDim * 2]);
1036 auto upperRecvOffset =
1037 arith::SubIOp::create(rewriter, loc, toValue(shape[dim]),
1038 toValue(haloSizes[currHaloDim * 2 + 1]));
1039 auto upperSendOffset = arith::SubIOp::create(
1040 rewriter, loc, upperRecvOffset, toValue(haloSizes[currHaloDim * 2]));
1041
1042 Value commWorld = mpi::CommWorldOp::create(
1043 rewriter, loc, mpi::CommType::get(op->getContext()));
1044
1045 // Make sure we send/recv in a way that does not lead to a dead-lock.
1046 // The current approach is by far not optimal, this should be at least
1047 // be a red-black pattern or using MPI_sendrecv.
1048 // Also, buffers should be re-used.
1049 // Still using temporary contiguous buffers for MPI communication...
1050 // Still yielding a "serialized" communication pattern...
1051 auto genSendRecv = [&](bool upperHalo) {
1052 auto orgOffset = offsets[dim];
1053 dimSizes[dim] = upperHalo ? haloSizes[currHaloDim * 2 + 1]
1054 : haloSizes[currHaloDim * 2];
1055 // Check if we need to send and/or receive
1056 // Processes on the grid borders have only one neighbor
1057 auto to = upperHalo ? neighbourIDs[0] : neighbourIDs[1];
1058 auto from = upperHalo ? neighbourIDs[1] : neighbourIDs[0];
1059 auto hasFrom = arith::CmpIOp::create(
1060 rewriter, loc, arith::CmpIPredicate::sge, from, zero);
1061 auto hasTo = arith::CmpIOp::create(rewriter, loc,
1062 arith::CmpIPredicate::sge, to, zero);
1063 auto buffer = memref::AllocOp::create(
1064 rewriter, loc, dimSizes,
1065 cast<ShapedType>(array.getType()).getElementType());
1066 // if has neighbor: copy halo data from array to buffer and send
1067 scf::IfOp::create(
1068 rewriter, loc, hasTo, [&](OpBuilder &builder, Location loc) {
1069 offsets[dim] = upperHalo ? OpFoldResult(lowerSendOffset)
1070 : OpFoldResult(upperSendOffset);
1071 auto subview = memref::SubViewOp::create(
1072 builder, loc, array, offsets, dimSizes, strides);
1073 memref::CopyOp::create(builder, loc, subview, buffer);
1074 mpi::SendOp::create(builder, loc, TypeRange{}, buffer, tag, to,
1075 commWorld);
1076 scf::YieldOp::create(builder, loc);
1077 });
1078 // if has neighbor: receive halo data into buffer and copy to array
1079 scf::IfOp::create(
1080 rewriter, loc, hasFrom, [&](OpBuilder &builder, Location loc) {
1081 offsets[dim] = upperHalo ? OpFoldResult(upperRecvOffset)
1082 : OpFoldResult(lowerRecvOffset);
1083 mpi::RecvOp::create(builder, loc, TypeRange{}, buffer, tag, from,
1084 commWorld);
1085 auto subview = memref::SubViewOp::create(
1086 builder, loc, array, offsets, dimSizes, strides);
1087 memref::CopyOp::create(builder, loc, buffer, subview);
1088 scf::YieldOp::create(builder, loc);
1089 });
1090 memref::DeallocOp::create(rewriter, loc, buffer);
1091 offsets[dim] = orgOffset;
1092 };
1093
1094 auto doSendRecv = [&](int upOrDown) {
1095 OpFoldResult &v = haloSizes[currHaloDim * 2 + upOrDown];
1096 Value haloSz = dyn_cast<Value>(v);
1097 if (!haloSz)
1098 haloSz = arith::ConstantOp::create(
1099 rewriter, loc,
1100 rewriter.getI32IntegerAttr(
1101 cast<IntegerAttr>(cast<Attribute>(v)).getInt()));
1102 auto hasSize = arith::CmpIOp::create(
1103 rewriter, loc, arith::CmpIPredicate::sgt, haloSz, zero);
1104 scf::IfOp::create(rewriter, loc, hasSize,
1105 [&](OpBuilder &builder, Location loc) {
1106 genSendRecv(upOrDown > 0);
1107 scf::YieldOp::create(builder, loc);
1108 });
1109 };
1110
1111 doSendRecv(0);
1112 doSendRecv(1);
1113
1114 // the shape for lower dims include higher dims' halos
1115 dimSizes[dim] = shape[dim];
1116 // -> the offset for higher dims is always 0
1117 offsets[dim] = rewriter.getIndexAttr(0);
1118 // on to next halo
1119 --currHaloDim;
1120 }
1121
1122 if (isa<MemRefType>(op.getResult().getType())) {
1123 rewriter.replaceOp(op, array);
1124 } else {
1125 assert(isa<RankedTensorType>(op.getResult().getType()));
1126 rewriter.replaceOp(op, bufferization::ToTensorOp::create(
1127 rewriter, loc, op.getResult().getType(), array,
1128 /*restrict=*/true, /*writable=*/true));
1129 }
1130 return success();
1131 }
1132};
1133
1134struct ConvertShardToMPIPass
1135 : public impl::ConvertShardToMPIPassBase<ConvertShardToMPIPass> {
1136 using Base::Base;
1137
1138 /// Run the dialect converter on the module.
1139 void runOnOperation() override {
1140 auto *ctxt = &getContext();
1141 RewritePatternSet patterns(ctxt);
1142 ConversionTarget target(getContext());
1143
1144 // Define a type converter to convert shard::ShardingType,
1145 // mostly for use in return operations.
1146 TypeConverter typeConverter;
1147 typeConverter.addConversion([](Type type) { return type; });
1148
1149 // convert shard::ShardingType to a tuple of RankedTensorTypes
1150 typeConverter.addConversion(
1151 [](ShardingType type,
1152 SmallVectorImpl<Type> &results) -> std::optional<LogicalResult> {
1153 auto i16 = IntegerType::get(type.getContext(), 16);
1154 auto i64 = IntegerType::get(type.getContext(), 64);
1155 std::array<int64_t, 2> shp = {ShapedType::kDynamic,
1156 ShapedType::kDynamic};
1157 results.emplace_back(RankedTensorType::get(shp, i16));
1158 results.emplace_back(RankedTensorType::get(shp, i64)); // actually ?x2
1159 results.emplace_back(RankedTensorType::get(shp, i64));
1160 return success();
1161 });
1162
1163 // To 'extract' components, a UnrealizedConversionCastOp is expected
1164 // to define the input
1165 typeConverter.addTargetMaterialization(
1166 [&](OpBuilder &builder, TypeRange resultTypes, ValueRange inputs,
1167 Location loc) {
1168 // Expecting a single input.
1169 if (inputs.size() != 1 || !isa<TupleType>(inputs[0].getType()))
1170 return SmallVector<Value>();
1171 auto castOp = inputs[0].getDefiningOp<UnrealizedConversionCastOp>();
1172 // Expecting an UnrealizedConversionCastOp.
1173 if (!castOp)
1174 return SmallVector<Value>();
1175 // Fill a vector with elements of the tuple/castOp.
1176 SmallVector<Value> results;
1177 for (auto oprnd : castOp.getInputs()) {
1178 if (!isa<RankedTensorType>(oprnd.getType()))
1179 return SmallVector<Value>();
1180 results.emplace_back(oprnd);
1181 }
1182 return results;
1183 });
1184
1185 // No shard dialect should left after conversion...
1186 target.addIllegalDialect<shard::ShardDialect>();
1187 // ...except the global GridOp. GridShapeOp which will get folded later.
1188 target.addLegalOp<shard::GridOp, shard::GridShapeOp>();
1189 // Allow all the stuff that our patterns will convert to
1190 target.addLegalDialect<BuiltinDialect, mpi::MPIDialect, scf::SCFDialect,
1191 arith::ArithDialect, tensor::TensorDialect,
1192 bufferization::BufferizationDialect,
1193 linalg::LinalgDialect, memref::MemRefDialect,
1194 affine::AffineDialect, cf::ControlFlowDialect>();
1195 // Make sure the function signature, calls etc. are legal
1196 target.addDynamicallyLegalOp<func::FuncOp>([&](func::FuncOp op) {
1197 return typeConverter.isSignatureLegal(op.getFunctionType());
1198 });
1199 target.addDynamicallyLegalOp<func::CallOp, func::ReturnOp>(
1200 [&](Operation *op) { return typeConverter.isLegal(op); });
1201
1202 patterns.add<ConvertUpdateHaloOp, ConvertNeighborsLinearIndicesOp,
1203 ConvertGetShardingOp, ConvertShardingOp, ConvertShardShapeOp,
1204 ConvertAllGatherOp, ConvertAllReduceOp, ConvertReduceScatterOp,
1205 ConvertProcessLinearIndexOp>(typeConverter, ctxt);
1206 SymbolTableCollection stc;
1209
1210 populateFunctionOpInterfaceTypeConversionPattern<func::FuncOp>(
1211 patterns, typeConverter);
1212 populateCallOpTypeConversionPattern(patterns, typeConverter);
1213 populateReturnOpTypeConversionPattern(patterns, typeConverter);
1214
1215 (void)applyPartialConversion(getOperation(), target, std::move(patterns));
1216
1217 // Folding patterns cannot be mixed with conversion patterns -> extra pass.
1218 patterns.clear();
1219 SymbolTableCollection symbolTableCollection;
1220 mlir::shard::populateFoldingPatterns(patterns, symbolTableCollection);
1221 (void)applyPatternsGreedily(getOperation(), std::move(patterns));
1222 }
1223};
1224
1225} // namespace
return success()
b
Return true if permutation is a valid permutation of the outer_dims_perm (case OuterOrInnerPerm::Oute...
b getContext())
static void shardShape(const InShape &inShape, const GridShape &gridShape, const SplitAxes &splitAxes, OutShape &outShape, ArrayRef< int64_t > shardedDimsOffsets={}, ArrayRef< int64_t > haloSizes={})
Definition ShardOps.cpp:214
IntegerType getI32Type()
Definition Builders.cpp:67
IndexType getIndexType()
Definition Builders.cpp:55
static DenseIntElementsAttr get(const ShapedType &type, Arg &&arg)
Get an instance of a DenseIntElementsAttr with the given arguments.
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Definition Location.h:76
MLIRContext * getContext() const
Return the context this location is uniqued in.
Definition Location.h:86
This class helps build Operations.
Definition Builders.h:209
Instances of the Type class are uniqued, have an immutable identifier and an optional mutable compone...
Definition Types.h:74
This class provides an abstraction over the different types of ranges over Values.
Definition ValueRange.h:387
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition Value.h:96
Type getType() const
Return the type of this value.
Definition Value.h:105
Operation * getDefiningOp() const
If this value is the result of an operation, return the operation that defines it.
Definition Value.cpp:18
static ConstantIndexOp create(OpBuilder &builder, Location location, int64_t value)
Definition ArithOps.cpp:363
Value getReductionOp(AtomicRMWKind op, OpBuilder &builder, Location loc, Value lhs, Value rhs)
Returns the value obtained by applying the reduction operation kind associated with a binary AtomicRM...
bool isStaticShapeAndContiguousRowMajor(MemRefType type)
Returns true, if the memref type has static shapes and represents a contiguous chunk of memory.
detail::InFlightRemark failed(Location loc, RemarkOpts opts)
Report an optimization remark that failed.
Definition Remarks.h:717
void populateFoldingPatterns(RewritePatternSet &patterns, SymbolTableCollection &symbolTableCollection)
Definition Simplify.cpp:164
void populateAllSliceOpLoweringPatterns(RewritePatternSet &patterns, SymbolTableCollection &symbolTableCollection)
void populateProcessMultiIndexOpLoweringPatterns(RewritePatternSet &patterns, SymbolTableCollection &symbolTableCollection)
int16_t GridAxis
Definition ShardOps.h:27
TypedValue< IndexType > createProcessLinearIndex(ImplicitLocOpBuilder &builder, StringRef grid, ArrayRef< GridAxis > gridAxes={})
SmallVector< Value > getMixedAsValues(OpBuilder b, const Location &loc, llvm::ArrayRef< int64_t > statics, ValueRange dynamics, Type type=Type())
Converts a vector of OpFoldResults (ints) into vector of Values of the provided type.
Definition ShardOps.cpp:77
int64_t collectiveProcessGroupSize(GridAxesRange &&gridAxes, GridShapeRange &&gridShape)
Definition ShardOps.h:156
shard::GridOp getGrid(Operation *op, FlatSymbolRefAttr gridSymbol, SymbolTableCollection &symbolTableCollection)
Definition ShardOps.h:131
Include the generated interface declarations.
SmallVector< OpFoldResult > getMixedValues(ArrayRef< int64_t > staticValues, ValueRange dynamicValues, MLIRContext *context)
Return a vector of OpFoldResults with the same size a staticValues, but all elements for which Shaped...
void populateReturnOpTypeConversionPattern(RewritePatternSet &patterns, const TypeConverter &converter, PatternBenefit benefit=1)
Add a pattern to the given pattern list to rewrite return ops to use operands that have been legalize...
LogicalResult applyPatternsGreedily(Region &region, const FrozenRewritePatternSet &patterns, GreedyRewriteConfig config=GreedyRewriteConfig(), bool *changed=nullptr)
Rewrite ops in the given region, which must be isolated from above, by repeatedly applying the highes...
void populateCallOpTypeConversionPattern(RewritePatternSet &patterns, const TypeConverter &converter, PatternBenefit benefit=1)
Add a pattern to the given pattern list to convert the operand and result types of a CallOp with the ...