doxygen/XeGPUSgToLaneDistribute_8cpp_source.html

//===- XeGPUSgToLaneDistribute.cpp - XeGPU SG to Lane Pass ----------------===//

//

// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

// See https://llvm.org/LICENSE.txt for license information.

// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

//

//===----------------------------------------------------------------------===//

#include "mlir/Dialect/GPU/IR/GPUDialect.h"

#include "mlir/Dialect/Index/IR/IndexDialect.h"

#include "mlir/Dialect/Math/IR/Math.h"

#include "mlir/Dialect/MemRef/IR/MemRef.h"

#include "mlir/Dialect/SCF/Transforms/Patterns.h"

#include "mlir/Dialect/Utils/IndexingUtils.h"

#include "mlir/Dialect/Vector/IR/VectorOps.h"

#include "mlir/Dialect/XeGPU/IR/XeGPU.h"

#include "mlir/Dialect/XeGPU/Transforms/Passes.h"

#include "mlir/Dialect/XeGPU/Transforms/Transforms.h"

#include "mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h"

#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"

#include "mlir/Dialect/XeGPU/uArch/uArchCommon.h"

#include "mlir/IR/Builders.h"

#include "mlir/IR/BuiltinAttributes.h"

#include "mlir/IR/BuiltinOps.h"

#include "mlir/IR/BuiltinTypes.h"

#include "mlir/IR/MLIRContext.h"

#include "mlir/IR/Operation.h"

#include "mlir/IR/Value.h"

#include "mlir/IR/ValueRange.h"

#include "mlir/Transforms/DialectConversion.h"

#include "llvm/ADT/SetVector.h"

#include "llvm/Support/LogicalResult.h"

#include "llvm/Support/raw_ostream.h"

#include <optional>


namespace mlir {

namespace xegpu {

#define GEN_PASS_DEF_XEGPUSGTOLANEDISTRIBUTE

#include "mlir/Dialect/XeGPU/Transforms/Passes.h.inc"

} // namespace xegpu

} // namespace mlir


using namespace mlir;


#define DEBUG_TYPE "xegpu-sg-to-lane-distribute"

#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")


namespace {


/// Casts the vector value `v` to the vector type `expectedTy`.
///
/// Three cases are handled, in order:
///   1. `v` already has type `expectedTy`: `v` is returned unchanged and no op
///      is created.
///   2. Only the shape differs (same element type and same total number of
///      elements): a `vector.shape_cast` is created.
///   3. Anything else: a `builtin.unrealized_conversion_cast` is created and
///      its single result is returned.
static Value castValueTo(ConversionPatternRewriter &rewriter,
                         TypedValue<VectorType> v, VectorType expectedTy) {
  VectorType sourceTy = v.getType();
  // If the type matches, simply return the value itself.
  if (sourceTy == expectedTy)
    return v;
  // A shape cast is only valid when the element type is preserved, so a
  // matching element count alone is not enough to pick this path.
  const bool onlyShapeDiffers =
      sourceTy.getElementType() == expectedTy.getElementType() &&
      sourceTy.getNumElements() == expectedTy.getNumElements();
  if (onlyShapeDiffers)
    return vector::ShapeCastOp::create(rewriter, v.getLoc(), expectedTy, v);

  // Else create an unrealized cast.
  auto newOp = UnrealizedConversionCastOp::create(rewriter, v.getLoc(),
                                                  expectedTy, ValueRange{v});
  return newOp.getResult(0);
}


/// Returns true if the subgroup-level vector::MultiDimReductionOp `op` is in
/// the form expected by this file, i.e. all of the following hold:
///   - its result carries a temporary layout that is a subgroup layout,
///   - it has exactly one reduction dimension,
///   - its result is either an int/float scalar, or a vector whose lane-level
///     type can be computed from the layout.
static bool isValidSubgroupMultiReductionOp(vector::MultiDimReductionOp op) {
  auto layout = xegpu::getTemporaryLayout(op->getOpResult(0));
  // A missing or non-subgroup layout disqualifies the op.
  if (!(layout && layout.isForSubgroup()))
    return false;

  const bool hasSingleReductionDim = op.getReductionDims().size() == 1;

  // Scalar result (e.g., vector<32xf32> to f32): nothing to distribute, only
  // the reduction dimension count matters.
  if (op.getType().isIntOrFloat())
    return hasSingleReductionDim;

  auto vecResultTy = dyn_cast<VectorType>(op.getType());
  if (!vecResultTy)
    return false;

  // The vector result must be distributable to lanes using the layout.
  FailureOr<VectorType> laneResultTy =
      getDistVecTypeBasedOnLaneLayout(layout, vecResultTy);
  if (failed(laneResultTy))
    return false;
  return hasSingleReductionDim;
}


/// Returns true if the vector::MultiDimReductionOp `op` performs a lane-local
/// reduction, i.e. each lane reduces its own data. This is detected by
/// comparing the result vector type with the lane-level result vector type
/// computed from the result layout: the reduction is reported as lane-local
/// when the two types differ.
///
/// `op` must satisfy isValidSubgroupMultiReductionOp (asserted). Scalar
/// results are accepted by that check but have no vector type to compare, so
/// they are reported as not lane-local instead of dereferencing a null type.
static bool isReductionLaneLocal(vector::MultiDimReductionOp op) {
  // Must be valid MultiDimReductionOp.
  assert(isValidSubgroupMultiReductionOp(op) && "Expecting a valid subgroup "
                                                "MultiDimReductionOp");
  auto resTy = dyn_cast<VectorType>(op.getType());
  // Scalar (or otherwise non-vector) result: nothing is distributed.
  if (!resTy)
    return false;

  auto resLayout = xegpu::getTemporaryLayout(op->getOpResult(0));
  FailureOr<VectorType> resDistTypeOrFailure =
      getDistVecTypeBasedOnLaneLayout(resLayout, resTy);
  // Guarded explicitly because the assert above is compiled out in release
  // builds and `.value()` on a failure is invalid.
  if (failed(resDistTypeOrFailure))
    return false;
  return resTy != resDistTypeOrFailure.value();
}


/// Returns the indices of the dimensions whose size differs between
/// `originalType` and `distributedType`, in increasing order. Both types must
/// have the same rank (asserted).
static SmallVector<int64_t> getDistributedDims(VectorType originalType,
                                               VectorType distributedType) {
  assert(originalType.getRank() == distributedType.getRank() &&
         "original and distributed vector types must have the same rank");
  ArrayRef<int64_t> origShape = originalType.getShape();
  ArrayRef<int64_t> distShape = distributedType.getShape();

  SmallVector<int64_t> changedDims;
  for (auto [dim, origSize] : llvm::enumerate(origShape)) {
    // A dimension counts as distributed when its size changed.
    if (distShape[dim] != origSize)
      changedDims.push_back(dim);
  }
  return changedDims;
}


/// Rewrites a subgroup-level xegpu::CreateNdDescOp into a lane-level one.
/// The new op is built with the original operands and attributes; the only
/// change is that the layout is dropped from the tensor descriptor result
/// type. Ops whose descriptor type has no layout are not matched.
struct SgToLaneCreateNdDesc
    : public OpConversionPattern<xegpu::CreateNdDescOp> {
  using OpConversionPattern<xegpu::CreateNdDescOp>::OpConversionPattern;

  LogicalResult
  matchAndRewrite(xegpu::CreateNdDescOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    xegpu::TensorDescType sgDescTy = op.getType();
    // Without a layout there is nothing to strip.
    if (!sgDescTy.getLayout())
      return failure();

    auto laneDescTy = sgDescTy.dropLayouts();
    auto laneDescOp = xegpu::CreateNdDescOp::create(
        rewriter, op.getLoc(), laneDescTy, op.getOperands(), op->getAttrs());
    rewriter.replaceOp(op, laneDescOp.getResult());
    return success();
  }
};


/// Rewrites a subgroup-level xegpu::LoadNdOp into a lane-level LoadNdOp.
///
/// The new load produces the lane vector type derived from the tensor
/// descriptor; its result is then cast (see castValueTo) to the lane vector
/// type derived from the anchor layout, which is what replaces the original
/// result. The `packed` and `transpose` attributes of the new op are set from
/// the layout (and the target uArch for transpose).
///
/// The pattern does not match when the op has no anchor layout, when the
/// descriptor layout differs from the anchor layout, when no uArch can be
/// resolved for the op, or when either lane type cannot be computed.
struct SgToLaneLoadNd : public OpConversionPattern<xegpu::LoadNdOp> {
  using OpConversionPattern<xegpu::LoadNdOp>::OpConversionPattern;

  LogicalResult
  matchAndRewrite(xegpu::LoadNdOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    xegpu::DistributeLayoutAttr anchorLayout = op.getAnchorLayout();
    // No anchor layout: the op is not a distribution candidate.
    if (!anchorLayout)
      return failure();

    // The descriptor and the anchor must agree on the layout.
    auto descTy = op.getTensorDescType();
    if (descTy.getLayout() != anchorLayout)
      return rewriter.notifyMatchFailure(
          op, "conflicting layout attributes on tensor descriptor and anchor");

    // The target uArch is needed below to decide on the transpose attribute.
    const auto *targetUArch =
        xegpu::uArch::getUArch(xegpu::getChipStr(op).value_or(""));
    if (!targetUArch)
      return rewriter.notifyMatchFailure(
          op, "xegpu::LoadNdOp require target attribute attached to "
              "determine transpose "
              "requirement");

    // Type the lane-level load itself produces (from the descriptor) ...
    auto descLaneTy = xegpu::getDistributedVectorType(descTy);
    // ... and type the users expect (from the lane layout).
    auto layoutLaneTy =
        xegpu::getDistVecTypeBasedOnLaneLayout(anchorLayout, op.getType());
    if (failed(descLaneTy))
      return rewriter.notifyMatchFailure(
          op, "unable to compute the lane vector type for LoadNdOp");
    if (failed(layoutLaneTy))
      return rewriter.notifyMatchFailure(
          op, "unable to compute expected lane vector type from lane layout");

    auto laneLoad = xegpu::LoadNdOp::create(
        rewriter, op.getLoc(), descLaneTy.value(), adaptor.getTensorDesc(),
        op.getMixedOffsets(), op.getPackedAttr(), op.getTransposeAttr(),
        op.getL1HintAttr(), op.getL2HintAttr(), op.getL3HintAttr(),
        /**layout**/ nullptr);

    // Derive packed / transpose from the layout.
    auto laneLayout = cast<xegpu::LayoutAttr>(anchorLayout);
    laneLoad.setPacked(xegpu::requirePacked(laneLayout));
    if (xegpu::requireTranspose(laneLayout, targetUArch))
      laneLoad.setTranspose(
          DenseI64ArrayAttr::get(rewriter.getContext(), {1, 0}));

    Value replacement =
        castValueTo(rewriter, laneLoad.getResult(), layoutLaneTy.value());
    rewriter.replaceOp(op, replacement);
    return success();
  }
};


/// Rewrites a subgroup-level xegpu::StoreNdOp into a lane-level StoreNdOp.
///
/// The converted value operand is cast (see castValueTo) to the lane vector
/// type derived from the tensor descriptor before being stored. The original
/// op is erased.
///
/// The pattern does not match when the op has no anchor layout, when the
/// descriptor layout or the value layout differs from the anchor layout, or
/// when the lane type cannot be computed from the descriptor.
struct SgToLaneStoreNd : public OpConversionPattern<xegpu::StoreNdOp> {
  using OpConversionPattern<xegpu::StoreNdOp>::OpConversionPattern;

  LogicalResult
  matchAndRewrite(xegpu::StoreNdOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    xegpu::DistributeLayoutAttr anchorLayout = op.getAnchorLayout();
    // No anchor layout: the op is not a distribution candidate.
    if (!anchorLayout)
      return failure();

    // Descriptor layout must agree with the anchor layout.
    auto descTy = op.getTensorDescType();
    if (descTy.getLayout() != anchorLayout)
      return rewriter.notifyMatchFailure(
          op, "conflicting layout attributes on tensor descriptor and anchor");

    // So must the layout of the stored value (operand 0).
    auto storedValueLayout =
        xegpu::getDistributeLayoutAttr(op->getOpOperand(0));
    if (storedValueLayout != anchorLayout)
      return rewriter.notifyMatchFailure(
          op, "conflicting layout attributes on value and anchor");

    auto descLaneTy = xegpu::getDistributedVectorType(descTy);
    if (failed(descLaneTy))
      return rewriter.notifyMatchFailure(
          op,
          "unable to compute lane vector type for StoreNdOp value from tensor "
          "descriptor");

    // Bring the converted value to the type the lane-level store expects.
    Value laneValue = castValueTo(
        rewriter, cast<TypedValue<VectorType>>(adaptor.getValue()),
        descLaneTy.value());
    xegpu::StoreNdOp::create(rewriter, op.getLoc(), laneValue,
                             adaptor.getTensorDesc(), op.getMixedOffsets(),
                             op.getL1HintAttr(), op.getL2HintAttr(),
                             op.getL3HintAttr(), /**layout**/ nullptr);
    rewriter.eraseOp(op);
    return success();
  }
};


/// Distributes a subgroup-level Dpas op to lane-level Dpas op. All inputs
/// and output of lane-level Dpas op are 1D. Necessary casts are added to
/// convert the inputs and output to/from 1D.
///
/// The pattern does not match when any of the A, B or CD layouts is missing
/// (or is not an xegpu::LayoutAttr), when a lane vector type cannot be
/// computed, or - if a uArch with a SubgroupMatrixMultiplyAcc instruction is
/// available - when the packed bit width of the lane-level A or B operand is
/// not a multiple of the instruction's packed format bit size.
struct SgToLaneDpas : public OpConversionPattern<xegpu::DpasOp> {
  using OpConversionPattern<xegpu::DpasOp>::OpConversionPattern;

  LogicalResult
  matchAndRewrite(xegpu::DpasOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    // Check if the op has A, B and CD layouts attached. A plain `cast` would
    // assert on an absent attribute before the null check below could run, so
    // use the null-tolerant dynamic cast instead.
    auto layoutA =
        dyn_cast_if_present<xegpu::LayoutAttr>(op.getLayoutAAttr());
    auto layoutB =
        dyn_cast_if_present<xegpu::LayoutAttr>(op.getLayoutBAttr());
    auto layoutCd =
        dyn_cast_if_present<xegpu::LayoutAttr>(op.getLayoutCdAttr());
    if (!layoutA || !layoutB || !layoutCd)
      return failure();

    // Lane types the lane-level Dpas op itself operates on.
    auto laneResultTyOrFailure =
        xegpu::getDistributedVectorType(op.getType(), layoutCd);
    auto laneATypeOrFailure =
        xegpu::getDistributedVectorType(op.getLhs().getType(), layoutA);
    auto laneBTypeOrFailure =
        xegpu::getDistributedVectorType(op.getRhs().getType(), layoutB);
    // Lane type of the result as expected by the users of the original op.
    auto expectedLaneResultTyOrFailure =
        xegpu::getDistVecTypeBasedOnLaneLayout(layoutCd, op.getType());
    if (failed(laneResultTyOrFailure) || failed(laneATypeOrFailure) ||
        failed(laneBTypeOrFailure))
      return rewriter.notifyMatchFailure(
          op, "failed to calculate supported lane vector types for DpasOp "
              "from layouts");
    if (failed(expectedLaneResultTyOrFailure))
      return rewriter.notifyMatchFailure(
          op, "unable to compute expected lane vector type for DpasOp from "
              "lane layout");

    // Validate bit widths match uArch packed format requirements. The check
    // is skipped when no uArch or no matching instruction is available.
    const auto *uArch =
        xegpu::uArch::getUArch(xegpu::getChipStr(op).value_or(""));
    if (uArch) {
      const auto *uArchInstruction =
          dyn_cast<xegpu::uArch::SubgroupMatrixMultiplyAcc>(
              uArch->getInstruction(
                  xegpu::uArch::InstructionKind::SubgroupMatrixMultiplyAcc));
      if (uArchInstruction) {
        auto laneAType = laneATypeOrFailure.value();
        auto laneBType = laneBTypeOrFailure.value();
        // Calculate total packed bit width = element bit width * vector size
        unsigned aPackedBitWidth =
            laneAType.getElementTypeBitWidth() * laneAType.getNumElements();
        unsigned bPackedBitWidth =
            laneBType.getElementTypeBitWidth() * laneBType.getNumElements();
        unsigned expectedABitSize = uArchInstruction->getPackedFormatBitSizeA();
        unsigned expectedBBitSize = uArchInstruction->getPackedFormatBitSizeB();

        if (aPackedBitWidth % expectedABitSize != 0)
          return rewriter.notifyMatchFailure(
              op,
              "A operand packed bit width must be a multiple of uArch packed "
              "format requirement");
        if (bPackedBitWidth % expectedBBitSize != 0)
          return rewriter.notifyMatchFailure(
              op,
              "B operand packed bit width must be a multiple of uArch packed "
              "format requirement");
      }
    }

    // NOTE(review): the accumulator is cast unconditionally below; this
    // assumes the op always carries an accumulator operand - confirm.
    auto newOp = xegpu::DpasOp::create(
        rewriter, op->getLoc(), laneResultTyOrFailure.value(),
        castValueTo(rewriter, cast<TypedValue<VectorType>>(adaptor.getLhs()),
                    laneATypeOrFailure.value()),
        castValueTo(rewriter, cast<TypedValue<VectorType>>(adaptor.getRhs()),
                    laneBTypeOrFailure.value()),
        castValueTo(rewriter, cast<TypedValue<VectorType>>(adaptor.getAcc()),
                    laneResultTyOrFailure.value()),
        /** layoutA**/ nullptr,
        /** layoutB**/ nullptr, /** layoutCd**/ nullptr);
    // Cast the result back to the layout-derived lane type so users of the
    // original op see the type they expect.
    rewriter.replaceOp(op, castValueTo(rewriter, newOp.getResult(),
                                       expectedLaneResultTyOrFailure.value()));
    return success();
  }
};


/// Rewrites any single-result op with elementwise-mappable traits into the
/// same op producing the lane-level vector type.
///
/// The replacement op is created generically from the original op name, the
/// converted operands, and all original attributes except those whose value
/// is an xegpu::DistributeLayoutAttr. The result must be a vector carrying a
/// subgroup temporary layout, otherwise the pattern does not match.
struct SgToLaneElementWise : public ConversionPattern {
  // NOTE(review): `typeConverter` is accepted but not forwarded to the base
  // class - confirm this is intentional.
  SgToLaneElementWise(TypeConverter &typeConverter, MLIRContext *ctx)
      : ConversionPattern(MatchAnyOpTypeTag(), /*benefit=*/1, ctx) {}

  LogicalResult
  matchAndRewrite(Operation *op, ArrayRef<Value> operands,
                  ConversionPatternRewriter &rewriter) const override {
    // Restrict to elementwise-mappable ops that have exactly one result.
    const bool isSingleResultElementwise =
        OpTrait::hasElementwiseMappableTraits(op) && op->getNumResults() == 1;
    if (!isSingleResultElementwise)
      return failure();

    auto sgResultTy = dyn_cast<VectorType>(op->getResult(0).getType());
    if (!sgResultTy)
      return rewriter.notifyMatchFailure(
          op, "operation result is not a vector type");

    xegpu::DistributeLayoutAttr resultLayout =
        xegpu::getTemporaryLayout(llvm::cast<OpResult>(op->getResult(0)));
    if (!(resultLayout && resultLayout.isForSubgroup()))
      return rewriter.notifyMatchFailure(
          op, "operation result does not have subgroup distribute layout");

    auto laneResultTyOrFailure =
        xegpu::getDistVecTypeBasedOnLaneLayout(resultLayout, sgResultTy);
    if (failed(laneResultTyOrFailure))
      return rewriter.notifyMatchFailure(
          op, "unable to compute lane vector type from the layout");
    VectorType laneResultTy = laneResultTyOrFailure.value();

    // Describe the replacement: same op name, converted operands, lane type.
    OperationState laneOpState(op->getLoc(), op->getName());
    laneOpState.addOperands(operands);
    laneOpState.addTypes(laneResultTy);
    // Carry over every attribute that is not a distribute layout.
    for (auto namedAttr : op->getAttrs()) {
      if (isa<xegpu::DistributeLayoutAttr>(namedAttr.getValue()))
        continue;
      laneOpState.addAttribute(namedAttr.getName(), namedAttr.getValue());
    }

    Operation *laneOp = rewriter.create(laneOpState);
    rewriter.replaceOp(op, laneOp->getResult(0));
    return success();
  }
};


/// Distributes a subgroup-level arith ConstantOp to lane-level arith

/// ConstantOp.

///

/// Only vector-typed constants whose value is a DenseElementsAttr and whose

/// result carries a subgroup temporary layout are matched.

///

/// Splat constants are rebuilt with the lane-local vector type. Non-splat

/// constants are distributed by extracting each lane_data-sized block from

/// the full constant and inserting it at the correct position in the

/// distributed vector using insert_strided_slice.

struct SgToLaneArithConstant : public OpConversionPattern<arith::ConstantOp> {

  using OpConversionPattern<arith::ConstantOp>::OpConversionPattern;


  LogicalResult

  matchAndRewrite(arith::ConstantOp op, OpAdaptor adaptor,

                  ConversionPatternRewriter &rewriter) const override {

    // Scalar (non-vector) constants are not handled by this pattern.

    auto resultType = dyn_cast<VectorType>(op.getType());

    if (!resultType)

      return failure();


    // Only handle dense vector constants.

    auto denseAttr = dyn_cast<DenseElementsAttr>(op.getValue());

    if (!denseAttr)

      return rewriter.notifyMatchFailure(

          op, "only dense vector constants are supported");


    // The result must carry a subgroup-level temporary layout.

    xegpu::DistributeLayoutAttr layout =

        xegpu::getTemporaryLayout(llvm::cast<OpResult>(op.getResult()));

    if (!layout || !layout.isForSubgroup())

      return rewriter.notifyMatchFailure(

          op, "operation result does not have subgroup distribute layout");


    // Lane-level vector type derived from the layout.

    auto laneShapeOrFailure =

        xegpu::getDistVecTypeBasedOnLaneLayout(layout, resultType);


    if (failed(laneShapeOrFailure))

      return rewriter.notifyMatchFailure(

          op, "unable to compute lane vector type from the layout");


    VectorType newResultType = laneShapeOrFailure.value();

    Location loc = op.getLoc();


    // Splat constants: every lane gets the same value, so just rebuild the

    // splat with the distributed type.

    if (denseAttr.isSplat()) {

      auto scalarValue = denseAttr.getSplatValue<Attribute>();

      auto newDenseAttr = DenseElementsAttr::get(newResultType, scalarValue);

      auto newOp =

          arith::ConstantOp::create(rewriter, loc, newResultType, newDenseAttr);

      rewriter.replaceOp(op, newOp.getResult());

      return success();

    }


    // Non-splat constants: each lane extracts the elements it owns from the

    // full constant using the distributed coordinates from the layout.

    // A copy of the full (undistributed) constant is materialized so the

    // per-lane elements can be extracted from it below.

    auto fullConst =

        arith::ConstantOp::create(rewriter, loc, resultType, denseAttr);


    // The lane id is created without an upper bound (null attribute).

    Value laneId = gpu::LaneIdOp::create(rewriter, loc, rewriter.getIndexType(),

                                         /*upperBound=*/mlir::IntegerAttr());

    auto maybeCoordsVec = layout.computeDistributedCoords(

        rewriter, loc, laneId, resultType.getShape());

    if (failed(maybeCoordsVec))

      return rewriter.notifyMatchFailure(

          op, "failed to compute distributed coordinates from layout");


    SmallVector<SmallVector<Value>> coordsVec = maybeCoordsVec.value();

    SmallVector<int64_t> laneData = layout.getEffectiveLaneDataAsInt();

    ArrayRef<int64_t> distShape = newResultType.getShape();

    int64_t rank = newResultType.getRank();


    // Each lane owns one lane_data-sized block per distribution unit.

    // computeDistributedCoords returns those block starts in row-major order

    // over the block grid (distShape / laneData).

    // NOTE(review): assumes laneData has `rank` entries and each entry evenly

    // divides the matching distShape entry - confirm against the layout

    // verifier.

    SmallVector<int64_t> blockGridShape(rank);

    for (int64_t d = 0; d < rank; d++)

      blockGridShape[d] = distShape[d] / laneData[d];

    SmallVector<int64_t> blockGridStrides = computeStrides(blockGridShape);


    // Vector type of one lane_data-shaped block.

    auto blockType = VectorType::get(laneData, newResultType.getElementType());

    // Unit tile: used to enumerate every element offset inside a block.

    SmallVector<int64_t> unitTile(rank, 1);

    // Unit strides for insert_strided_slice.

    SmallVector<int64_t> strides(rank, 1);


    // Start from a zero-filled lane-level vector and fill it block by block.

    Value result = arith::ConstantOp::create(

        rewriter, loc, newResultType, rewriter.getZeroAttr(newResultType));


    for (auto [blockIdx, blockStart] : llvm::enumerate(coordsVec)) {

      // Gather the block's elements from the full constant. The block start is

      // lane-dynamic (it is computed from the lane id), so the elements are

      // extracted one by one (row-major over lane_data) at dynamic positions.

      SmallVector<Value> blockElems;

      for (SmallVector<int64_t> off :

           StaticTileOffsetRange(laneData, unitTile)) {

        // Position in the full constant = block start + static offset inside

        // the block, per dimension.

        SmallVector<OpFoldResult> pos(rank);

        for (int64_t d = 0; d < rank; d++)

          pos[d] = getAsOpFoldResult(arith::AddIOp::create(

              rewriter, loc, blockStart[d],

              arith::ConstantIndexOp::create(rewriter, loc, off[d])));

        blockElems.push_back(vector::ExtractOp::create(

            rewriter, loc, fullConst.getResult(), pos));

      }


      // Rebuild the block keeping its lane_data shape, then place it with

      // insert_strided_slice so the block keeps its orientation in the

      // distributed vector (e.g. a [2, 1] block stays a vertical 2x1 slice).

      Value block =

          vector::FromElementsOp::create(rewriter, loc, blockType, blockElems);

      // Convert the linear block index into a position in the block grid,

      // then scale by lane_data to get the element offsets in the result.

      SmallVector<int64_t> blockGridPos =

          delinearize(blockIdx, blockGridStrides);

      SmallVector<int64_t> offsets(rank);

      for (int64_t d = 0; d < rank; d++)

        offsets[d] = blockGridPos[d] * laneData[d];

      result = vector::InsertStridedSliceOp::create(rewriter, loc, block,

                                                    result, offsets, strides);

    }


    rewriter.replaceOp(op, result);

    return success();

  }

};


/// Rewrites a subgroup-level xegpu::PrefetchNdOp into a lane-level
/// PrefetchNdOp. The new op uses the converted tensor descriptor, keeps the
/// original offsets and cache hints, and carries no layout. The original op is
/// erased. Ops without an anchor layout are not matched.
struct SgToLanePrefetchNd : public OpConversionPattern<xegpu::PrefetchNdOp> {
  using OpConversionPattern<xegpu::PrefetchNdOp>::OpConversionPattern;

  LogicalResult
  matchAndRewrite(xegpu::PrefetchNdOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    xegpu::DistributeLayoutAttr anchorLayout = op.getAnchorLayout();
    // No anchor layout: the op is not a distribution candidate.
    if (!anchorLayout)
      return failure();

    // Re-create the prefetch on the converted descriptor, without a layout.
    xegpu::PrefetchNdOp::create(
        rewriter, op.getLoc(), adaptor.getTensorDesc(), op.getMixedOffsets(),
        op.getL1HintAttr(), op.getL2HintAttr(), op.getL3HintAttr(),
        /**layout**/ nullptr);
    rewriter.eraseOp(op);
    return success();
  }
};


/// Rewrites a subgroup-level LoadGather (xegpu.load) op into a lane-level one.
///
/// The lane-level load always works on 1D vectors: the converted offsets and
/// mask are flattened to 1D, the load produces a 1D result, and that result is
/// cast back to the lane vector type derived from the layout.
///
/// All result dimensions in front of the trailing one (no chunk size) or the
/// trailing two (chunk size other than 1) must be unit dimensions.
///
/// Example 1 (1D, no chunk size):
///   layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>
///   %mask = producer_op : vector<16xi1>
///   %offset = producer_op : vector<16xindex>
///   %0 = xegpu.load %src[%offset], %mask : memref<256xf16>,
///     vector<16xindex>, vector<16xi1> -> vector<16xf16>
/// Distributed to:
///   %mask = producer_op : vector<1xi1>
///   %offset = producer_op : vector<1xindex>
///   %0 = xegpu.load %src[%offset], %mask : memref<256xf16>,
///     vector<1xindex>, vector<1xi1> -> vector<1xf16>
///
/// Example 2 (2D with chunk size, same mask & offset):
///   layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>
///   %0 = xegpu.load %src[%offset], %mask <{chunk_size=8}> :
///     memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
/// Distributed to:
///   %0 = xegpu.load %src[%offset], %mask <{chunk_size=8}> :
///     memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16>
///
/// Example 3 (3D with leading unit dims):
///   layout = #xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>
///   %mask = producer_op : vector<1x1x16xi1>
///   %offset = producer_op : vector<1x1x16xindex>
///   %0 = xegpu.load %src[%offset], %mask : memref<256xf16>,
///     vector<1x1x16xindex>, vector<1x1x16xi1> -> vector<1x1x16xf16>
/// Distributed to:
///   %mask = producer_op : vector<1x1x1xi1>
///   %offset = producer_op : vector<1x1x1xindex>
///   %0 = xegpu.load %src[%offset], %mask : memref<256xf16>,
///     vector<1xindex>, vector<1xi1> -> vector<1xf16>
struct SgToLaneLoadGather : public OpConversionPattern<xegpu::LoadGatherOp> {
  using OpConversionPattern<xegpu::LoadGatherOp>::OpConversionPattern;

  LogicalResult
  matchAndRewrite(xegpu::LoadGatherOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    xegpu::DistributeLayoutAttr anchorLayout = op.getAnchorLayout();
    if (!anchorLayout)
      return failure();

    VectorType sgResultTy = op.getValueType();
    if (!sgResultTy)
      return failure();

    // Only the trailing 1 (no chunk) or 2 (chunked) dimensions may be
    // non-unit; everything in front of them must be 1.
    int chunkSize = op.getChunkSize().value_or(1);
    int effectiveVecRank = (chunkSize == 1) ? 1 : 2;
    ArrayRef<int64_t> leadingDims = sgResultTy.getShape().take_front(
        sgResultTy.getRank() - effectiveVecRank);
    if (llvm::any_of(leadingDims, [](int64_t d) { return d != 1; }))
      return rewriter.notifyMatchFailure(
          op, "Only unit dimensions allowed for the leading "
              "dimensions of the load vector!");

    auto laneResultTyOrFailure =
        xegpu::getDistVecTypeBasedOnLaneLayout(anchorLayout, sgResultTy);
    if (failed(laneResultTyOrFailure))
      return rewriter.notifyMatchFailure(
          op, "unable to compute expected lane vector type from lane layout");
    VectorType laneResultTy = laneResultTyOrFailure.value();
    VectorType laneResultTy1D = VectorType::get(
        {laneResultTy.getNumElements()}, laneResultTy.getElementType());

    // Casts a converted vector operand to the 1D vector type with the same
    // element type and element count.
    auto flattenTo1D = [&](Value operand) -> Value {
      auto vecTy = cast<VectorType>(operand.getType());
      VectorType flatTy =
          VectorType::get({vecTy.getNumElements()}, vecTy.getElementType());
      return castValueTo(rewriter, cast<TypedValue<VectorType>>(operand),
                         flatTy);
    };

    // Flatten offsets and mask to 1D to match the 1D result type.
    Value laneOffsets = flattenTo1D(adaptor.getOffsets());
    Value laneMask = flattenTo1D(adaptor.getMask());

    auto laneLoad = xegpu::LoadGatherOp::create(
        rewriter, op.getLoc(), laneResultTy1D, adaptor.getSource(),
        laneOffsets, laneMask, op.getChunkSizeAttr(), op.getL1HintAttr(),
        op.getL2HintAttr(), op.getL3HintAttr(), /*layout=*/nullptr,
        /*contiguity=*/nullptr);

    // Restore the layout-derived lane shape; castValueTo returns its input
    // untouched when the 1D type already is the expected type.
    Value replacement = castValueTo(
        rewriter, cast<TypedValue<VectorType>>(laneLoad->getResult(0)),
        laneResultTy);
    rewriter.replaceOp(op, replacement);
    return success();
  }
};


/// Rewrites a subgroup-level vector.reduction into lane-level code. The
/// converted (per-lane) source vector is handed to xegpu::subgroupReduction,
/// which is expected to combine the values across the lanes of the subgroup;
/// if the op has an accumulator, it is folded into the reduced value
/// afterwards.
///
/// Requirements checked by the pattern: the source has a subgroup layout, the
/// source vector is rank 1 and so is the layout, a target uArch is available,
/// the lane layout size equals the uArch subgroup size and divides the vector
/// length, and the result type is an int or float.
struct SgToLaneVectorReduction
    : public OpConversionPattern<vector::ReductionOp> {
  using OpConversionPattern<vector::ReductionOp>::OpConversionPattern;

  LogicalResult
  matchAndRewrite(vector::ReductionOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    auto sourceLayout = xegpu::getDistributeLayoutAttr(op.getVector());
    // Without a subgroup layout there is nothing to distribute.
    if (!(sourceLayout && sourceLayout.isForSubgroup()))
      return failure();

    VectorType sourceTy = op.getSourceVectorType();
    // Only rank 1 vectors supported.
    if (sourceTy.getRank() != 1)
      return rewriter.notifyMatchFailure(
          op, "Only rank 1 reductions can be distributed.");
    // Lane layout must have the same rank as the vector.
    if (sourceLayout.getRank() != sourceTy.getRank())
      return rewriter.notifyMatchFailure(
          op, "Layout rank does not match vector rank.");

    // The layout provides the number of lanes; the uArch provides the
    // subgroup size it is validated against.
    int64_t laneCount = sourceLayout.getEffectiveLaneLayoutAsInt()[0];
    const auto *targetUArch =
        xegpu::uArch::getUArch(xegpu::getChipStr(op).value_or(""));
    if (!targetUArch)
      return rewriter.notifyMatchFailure(
          op, "xegpu::ReductionOp require target attribute attached to "
              "determine subgroup size");

    // The lane count must be the subgroup size and must evenly divide the
    // vector length.
    const bool laneCountIsSubgroupSize =
        laneCount == targetUArch->getSubgroupSize();
    if (!laneCountIsSubgroupSize || sourceTy.getShape()[0] % laneCount != 0)
      return rewriter.notifyMatchFailure(op,
                                         "Invalid layout or reduction vector "
                                         "dimension must match subgroup size.");

    if (!op.getType().isIntOrFloat())
      return rewriter.notifyMatchFailure(
          op, "Reduction distribution currently only supports floats and "
              "integer types.");

    // Reduce the per-lane portion of the vector across the subgroup.
    Value reduced =
        xegpu::subgroupReduction(op.getLoc(), rewriter, adaptor.getVector(),
                                 op.getKind(), laneCount);

    // Fold in the accumulator, if any.
    if (Value acc = adaptor.getAcc())
      reduced = vector::makeArithReduction(rewriter, op.getLoc(), op.getKind(),
                                           reduced, acc);

    rewriter.replaceOp(op, reduced);
    return success();
  }
};


/// Distributes a subgroup-level vector.multi_reduction op to lane level.
///
/// Exactly one reduction dimension is supported, and for sources with rank
/// greater than 2 all dimensions in front of the trailing two must be unit
/// dimensions; otherwise the pattern reports a match failure. Three cases are
/// handled:
///   - Scalar (int/float) result: the converted source is reduced with
///     xegpu::subgroupReduction and the accumulator, if present, is folded in.
///   - Lane-local reduction (see isReductionLaneLocal): lowered with
///     xegpu::lowerToVectorReductions.
///   - Otherwise: lowered with xegpu::lowerCrossLaneReductionToShuffles.
struct SgToLaneMultiDimReduction
    : public OpConversionPattern<vector::MultiDimReductionOp> {
  using OpConversionPattern<vector::MultiDimReductionOp>::OpConversionPattern;

  LogicalResult
  matchAndRewrite(vector::MultiDimReductionOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    ArrayRef<int64_t> reductionDims = op.getReductionDims();
    // Reported as a match failure rather than an assert so that release
    // builds never index `reductionDims[0]` on an unexpected op.
    if (reductionDims.size() != 1)
      return rewriter.notifyMatchFailure(
          op, "Expecting single reduction dimension for subgroup multi "
              "reduction op");

    // For rank > 2, ensure leading dimensions are unit.
    VectorType sourceType = op.getSourceVectorType();
    int64_t rank = sourceType.getRank();
    if (rank > 2) {
      ArrayRef<int64_t> shape = sourceType.getShape();
      if (llvm::any_of(shape.take_front(rank - 2),
                       [](int64_t d) { return d != 1; }))
        return rewriter.notifyMatchFailure(
            op, "only unit leading dimensions are supported for "
                "multi_reduction with rank > 2");
    }

    const int64_t reductionDim = reductionDims[0];
    // Size of the reduced dimension in the original (subgroup-level) source.
    const int64_t reductionDimSize = sourceType.getShape()[reductionDim];

    Value result;
    if (op.getType().isIntOrFloat()) {
      // Handle scalar result: full reduction of a distributed vector to a
      // scalar. First do a local vector reduction, then cross-lane shuffles.
      result =
          xegpu::subgroupReduction(op.getLoc(), rewriter, adaptor.getSource(),
                                   op.getKind(), reductionDimSize);
      // Combine with accumulator if present.
      if (adaptor.getAcc())
        result = vector::makeArithReduction(rewriter, op.getLoc(), op.getKind(),
                                            result, adaptor.getAcc());
    } else if (isReductionLaneLocal(op)) {
      // For lane-local reduction, lower to a sequence of vector.reduction ops
      // over 1D slices extracted from the distributed source vector. This is
      // required so we don't have 2D source vectors at xegpu-linearize.
      result = xegpu::lowerToVectorReductions(
          cast<TypedValue<VectorType>>(adaptor.getSource()),
          cast<TypedValue<VectorType>>(adaptor.getAcc()), op.getKind(),
          reductionDim, op.getLoc(), rewriter);
    } else {
      // The reduction dimension is spread over lanes: reduce across lanes.
      result = xegpu::lowerCrossLaneReductionToShuffles(
          cast<TypedValue<VectorType>>(adaptor.getSource()),
          cast<TypedValue<VectorType>>(adaptor.getAcc()), op.getKind(),
          reductionDim, reductionDimSize, op.getLoc(), rewriter);
    }
    rewriter.replaceOp(op, result);
    return success();
  }
};


/// Computes the per-lane coordinates used by the distributed matrix load and
/// store patterns when subgroup_block_io is not requested.
///
/// A `gpu.lane_id` is materialized and passed, together with `payloadShape`,
/// to `layout.computeDistributedCoords`. Exactly one coordinate set is
/// expected back; it is combined with `origOffsets` through
/// `xegpu::addWithRightAligned`.
///
/// Returns the combined coordinates as SSA values, or an empty vector when the
/// layout fails to produce distributed coordinates.
static SmallVector<Value> computeDistributedCoordsForMatrixOp(
    ConversionPatternRewriter &rewriter, Location loc,
    xegpu::DistributeLayoutAttr layout, ArrayRef<int64_t> payloadShape,
    ValueRange origOffsets) {
  Value laneId = gpu::LaneIdOp::create(rewriter, loc, rewriter.getIndexType(),
                                       /*upperBound=*/mlir::IntegerAttr());
  auto laneCoordSets =
      layout.computeDistributedCoords(rewriter, loc, laneId, payloadShape);
  // An empty result is the failure signal understood by the callers.
  if (failed(laneCoordSets))
    return {};
  assert(laneCoordSets.value().size() == 1 &&
         "Expected one set of distributed offsets");

  // Fold the lane-relative coordinates onto the op's original offsets.
  auto laneCoords = getAsOpFoldResult(laneCoordSets.value()[0]);
  auto baseOffsets = getAsOpFoldResult(origOffsets);
  SmallVector<OpFoldResult> combined =
      xegpu::addWithRightAligned(rewriter, loc, laneCoords, baseOffsets);
  return llvm::map_to_vector(combined, llvm::CastTo<Value>);
}


/// Rewrites a subgroup-level xegpu LoadMatrix op into its lane-level form.
///
/// The result vector type is shrunk according to the op's layout. Unless the
/// op carries the subgroup_block_io attribute, the offsets are replaced by
/// per-lane coordinates; otherwise the original offsets are reused. The new op
/// is created without a layout and with all constant-offset slots marked
/// dynamic, since every offset is passed as an SSA value.
struct SgToLaneLoadMatrix : public OpConversionPattern<xegpu::LoadMatrixOp> {
  using OpConversionPattern<xegpu::LoadMatrixOp>::OpConversionPattern;

  LogicalResult
  matchAndRewrite(xegpu::LoadMatrixOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    // Ops that carry no layout are not handled by this pattern.
    auto layout = op.getLayoutAttr();
    if (!layout)
      return failure();

    auto sgResultTy = dyn_cast<VectorType>(op.getResult().getType());
    if (!sgResultTy)
      return rewriter.notifyMatchFailure(
          op, "the matrix op payload must be a vector type");

    auto loc = op.getLoc();
    auto mixedOffsets = op.getMixedOffsets();
    if (mixedOffsets.empty())
      return rewriter.notifyMatchFailure(op, "the load op must have offsets");

    FailureOr<VectorType> laneResultTy =
        getDistVecTypeBasedOnLaneLayout(layout, sgResultTy);
    if (failed(laneResultTy))
      return rewriter.notifyMatchFailure(
          op, "Failed to distribute matrix op payload based on layout.");

    // Start from the original offsets (materialized as values); they are
    // used unchanged when subgroup_block_io is set.
    SmallVector<Value> coords = vector::getAsValues(rewriter, loc, mixedOffsets);
    if (!op.getSubgroupBlockIoAttr()) {
      coords = computeDistributedCoordsForMatrixOp(
          rewriter, loc, layout, sgResultTy.getShape(), coords);
      if (coords.empty())
        return rewriter.notifyMatchFailure(
            op, "Failed to compute distributed coordinates.");
    }

    // All offsets are dynamic operands on the new op.
    SmallVector<int64_t> dynamicSlots(op.getConstOffsets().size(),
                                      ShapedType::kDynamic);
    auto laneLoad = xegpu::LoadMatrixOp::create(
        rewriter, loc, *laneResultTy, adaptor.getMemDesc(), ValueRange(coords),
        rewriter.getDenseI64ArrayAttr(dynamicSlots),
        op.getSubgroupBlockIoAttr(), xegpu::DistributeLayoutAttr{});
    rewriter.replaceOp(op, laneLoad.getResult());
    return success();
  }
};


/// Lane-level distribution of a subgroup-level vector.transpose.
///
/// Both the source operand and the result must carry a temporary layout, and
/// the result layout must be the lane-level transpose of the source layout
/// under the op's permutation. The transpose is re-created on the already
/// converted source and its result is cast to the distributed result type.
struct SgToLaneVectorTranspose
    : public OpConversionPattern<vector::TransposeOp> {
  using OpConversionPattern<vector::TransposeOp>::OpConversionPattern;

  LogicalResult
  matchAndRewrite(vector::TransposeOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    xegpu::DistributeLayoutAttr srcLayout =
        xegpu::getTemporaryLayout(op->getOpOperand(0));
    xegpu::DistributeLayoutAttr dstLayout =
        xegpu::getTemporaryLayout(op->getOpResult(0));
    if (!(srcLayout && dstLayout))
      return rewriter.notifyMatchFailure(
          op, "the source or result vector of the transpose op lacks layout "
              "attribute");

    // The two layouts have to agree with the permutation at lane level.
    ArrayRef<int64_t> perm = op.getPermutation();
    const bool layoutsAreTransposed =
        dstLayout.isTransposeOf(srcLayout, perm, xegpu::LayoutKind::Lane);
    if (!layoutsAreTransposed)
      return rewriter.notifyMatchFailure(
          op, "the source or result vector layouts must be transposes of "
              "each other");

    FailureOr<VectorType> laneResultTy =
        getDistVecTypeBasedOnLaneLayout(dstLayout, op.getResultVectorType());
    if (failed(laneResultTy))
      return rewriter.notifyMatchFailure(
          op, "Failed to distribute the result vector type in "
              "vector::Transpose op");

    auto laneTranspose = vector::TransposeOp::create(
        rewriter, op.getLoc(), adaptor.getVector(), perm);
    Value replacement =
        castValueTo(rewriter, laneTranspose.getResult(), laneResultTy.value());
    rewriter.replaceOp(op, replacement);
    return success();
  }
};


/// Lane-level distribution of a subgroup-level vector.bitcast.
/// Bitcast only impacts the innermost dimension of the source/result vectors.
/// The op is re-created on the converted source with the result type derived
/// from the result's temporary layout.
struct SgToLaneVectorBitcast : public OpConversionPattern<vector::BitCastOp> {
  using OpConversionPattern<vector::BitCastOp>::OpConversionPattern;

  LogicalResult
  matchAndRewrite(vector::BitCastOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    xegpu::DistributeLayoutAttr dstLayout =
        xegpu::getTemporaryLayout(op->getOpResult(0));
    if (!dstLayout)
      return rewriter.notifyMatchFailure(
          op, "result vector of the bitcast op lacks layout attribute");

    FailureOr<VectorType> laneResultTy =
        getDistVecTypeBasedOnLaneLayout(dstLayout, op.getResultVectorType());
    if (failed(laneResultTy))
      return rewriter.notifyMatchFailure(
          op, "Failed to distribute the result vector type in "
              "vector::BitCast op");

    auto laneBitcast = vector::BitCastOp::create(
        rewriter, op.getLoc(), laneResultTy.value(), adaptor.getSource());
    rewriter.replaceOp(op, laneBitcast.getResult());
    return success();
  }
};


/// Lane-level distribution of vector.create_mask and vector.constant_mask.
///
/// The coordinates owned by each lane are obtained from
/// `computeDistributedCoords()`. Every coordinate is compared against the
/// corresponding original mask bound with `arith.cmpi slt`, and the comparisons
/// of one element are and-ed together, so an element is set only when ALL of
/// its dimensions satisfy `coord[i] < bound[i]`. The resulting i1 values are
/// then packed into the distributed mask vector.
///
/// Example (1D):
///   layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>
///   %mask = vector.create_mask %m0 : vector<16xi1>
/// For lane k, computeDistributedCoords gives coord = [k], so:
///   %in_bounds = arith.cmpi slt, %coord, %m0  →  i1
///   %mask = vector.broadcast %in_bounds : i1 to vector<1xi1>
///
/// Example (2D):
///   layout = #xegpu.layout<lane_layout = [8, 2], lane_data = [1, 1]>
///   %mask = vector.create_mask %m0, %m1 : vector<8x4xi1>
/// Each WI owns a 1x2 slice. computeDistributedCoords returns 2 coords:
///   [[r0, c0], [r0, c1]]
/// For each coord: in_bounds = (r < m0) && (c < m1)
///   %mask = vector.from_elements %bit0, %bit1 : vector<1x2xi1>
template <typename OpType,
          typename = std::enable_if_t<llvm::is_one_of<
              OpType, vector::CreateMaskOp, vector::ConstantMaskOp>::value>>
struct SgToLaneCreateMask : public OpConversionPattern<OpType> {
  using OpConversionPattern<OpType>::OpConversionPattern;

  LogicalResult
  matchAndRewrite(OpType op, typename OpType::Adaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    xegpu::DistributeLayoutAttr layout =
        xegpu::getTemporaryLayout(op->getOpResult(0));
    if (!layout || !layout.isForSubgroup())
      return rewriter.notifyMatchFailure(
          op, "operation result does not have subgroup distribute layout");

    VectorType sgMaskTy = op.getType();
    FailureOr<VectorType> laneMaskTyOrFailure =
        getDistVecTypeBasedOnLaneLayout(layout, sgMaskTy);
    if (failed(laneMaskTyOrFailure))
      return rewriter.notifyMatchFailure(
          op, "unable to compute lane vector type from the layout");
    VectorType laneMaskTy = *laneMaskTyOrFailure;
    Location loc = op.getLoc();

    // Gather one bound value per mask dimension: the operands for
    // create_mask, freshly built index constants for constant_mask.
    SmallVector<Value> bounds;
    if constexpr (std::is_same_v<OpType, vector::CreateMaskOp>) {
      for (Value bound : op.getOperands())
        bounds.push_back(bound);
    } else {
      for (auto staticBound : op.getMaskDimSizesAttr().asArrayRef())
        bounds.push_back(
            arith::ConstantIndexOp::create(rewriter, loc, staticBound)
                .getResult());
    }

    // Ask the layout which coordinates this lane owns.
    ArrayRef<int64_t> sgShape = sgMaskTy.getShape();
    Value laneId = gpu::LaneIdOp::create(rewriter, loc, rewriter.getIndexType(),
                                         /*upperBound=*/mlir::IntegerAttr());
    auto coordSetsOrFailure =
        layout.computeDistributedCoords(rewriter, loc, laneId, sgShape);
    if (failed(coordSetsOrFailure))
      return rewriter.notifyMatchFailure(
          op, "failed to compute distributed coordinates from layout");

    SmallVector<SmallVector<Value>> coordSets = coordSetsOrFailure.value();
    int64_t numElements = laneMaskTy.getNumElements();
    assert(static_cast<int64_t>(coordSets.size()) == numElements &&
           "number of coordinate sets must match number of distributed "
           "elements");

    // One mask bit per owned element: the conjunction of the per-dimension
    // `coord < bound` tests, seeded with a constant true.
    Value alwaysTrue =
        arith::ConstantIntOp::create(rewriter, loc, /*value=*/1, /*width=*/1);
    SmallVector<Value> maskBits;
    for (ArrayRef<Value> elementCoords : coordSets) {
      Value allInBounds = alwaysTrue;
      size_t dim = 0;
      for (Value coord : elementCoords) {
        Value isBelowBound =
            arith::CmpIOp::create(rewriter, loc, arith::CmpIPredicate::slt,
                                  coord, bounds[dim]);
        allInBounds =
            arith::AndIOp::create(rewriter, loc, allInBounds, isBelowBound);
        ++dim;
      }
      maskBits.push_back(allInBounds);
    }

    // A single bit is broadcast; several bits are assembled element-wise.
    if (numElements == 1) {
      rewriter.replaceOp(
          op, vector::BroadcastOp::create(rewriter, loc, laneMaskTy, maskBits[0])
                  .getResult());
      return success();
    }
    rewriter.replaceOp(
        op, vector::FromElementsOp::create(rewriter, loc, laneMaskTy, maskBits)
                .getResult());
    return success();
  }
};


/// Rewrites a subgroup-level xegpu StoreMatrix op into its lane-level form.
///
/// The stored data is cast to the vector type derived from the op's layout.
/// Unless the op carries the subgroup_block_io attribute, the offsets are
/// replaced by per-lane coordinates; otherwise the original offsets are
/// reused. The new op is created without a layout and with all
/// constant-offset slots marked dynamic, and the original op is erased.
struct SgToLaneStoreMatrix : public OpConversionPattern<xegpu::StoreMatrixOp> {
  using OpConversionPattern<xegpu::StoreMatrixOp>::OpConversionPattern;

  LogicalResult
  matchAndRewrite(xegpu::StoreMatrixOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    // Ops that carry no layout are not handled by this pattern.
    auto layout = op.getLayoutAttr();
    if (!layout)
      return failure();

    auto sgDataTy = dyn_cast<VectorType>(op.getData().getType());
    if (!sgDataTy)
      return rewriter.notifyMatchFailure(
          op, "the matrix op payload must be a vector type");

    auto loc = op.getLoc();
    auto mixedOffsets = op.getMixedOffsets();
    if (mixedOffsets.empty())
      return rewriter.notifyMatchFailure(op, "the store op must have offsets");

    FailureOr<VectorType> laneDataTy =
        getDistVecTypeBasedOnLaneLayout(layout, sgDataTy);
    if (failed(laneDataTy))
      return rewriter.notifyMatchFailure(
          op, "Failed to distribute matrix op payload based on layout.");

    // Start from the original offsets (materialized as values); they are
    // used unchanged when subgroup_block_io is set.
    SmallVector<Value> coords = vector::getAsValues(rewriter, loc, mixedOffsets);
    if (!op.getSubgroupBlockIoAttr()) {
      coords = computeDistributedCoordsForMatrixOp(
          rewriter, loc, layout, sgDataTy.getShape(), coords);
      if (coords.empty())
        return rewriter.notifyMatchFailure(
            op, "Failed to compute distributed coordinates.");
    }

    // All offsets are dynamic operands on the new op.
    SmallVector<int64_t> dynamicSlots(op.getConstOffsets().size(),
                                      ShapedType::kDynamic);
    DenseI64ArrayAttr dynamicSlotsAttr =
        rewriter.getDenseI64ArrayAttr(dynamicSlots);

    Value laneData = castValueTo(
        rewriter, cast<TypedValue<VectorType>>(adaptor.getData()),
        laneDataTy.value());
    xegpu::StoreMatrixOp::create(rewriter, loc, TypeRange{}, laneData,
                                 adaptor.getMemDesc(), ValueRange(coords),
                                 dynamicSlotsAttr, op.getSubgroupBlockIoAttr(),
                                 xegpu::DistributeLayoutAttr{});
    rewriter.eraseOp(op);
    return success();
  }
};


/// Distributes a subgroup-level StoreScatter (xegpu.store) op to
/// lane-level.
///
/// The stored value, the offsets and the mask are all flattened to 1D vectors
/// before the lane-level store is created. The pattern therefore only matches
/// when the converted offsets and mask operands are present and vector typed;
/// any other form is reported as a match failure before IR is created.
///
/// Example 1 (1D, no chunk size):
///   layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>
///   %mask = producer_op : vector<16xi1>
///   %offset = producer_op : vector<16xindex>
///   xegpu.store %payload, %src[%offset], %mask : vector<16xf16>,
///     memref<256xf16>, vector<16xindex>, vector<16xi1>
/// Distributed to:
///   %mask = producer_op : vector<1xi1>
///   %offset = producer_op : vector<1xindex>
///   xegpu.store %payload, %src[%offset], %mask : vector<1xf16>,
///     memref<256xf16>, vector<1xindex>, vector<1xi1>
///
/// Example 2 (2D with chunk size, same mask & offset):
///   layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>
///   xegpu.store %payload, %src[%offset], %mask <{chunk_size=8}> :
///     vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
/// Distributed to:
///   xegpu.store %payload, %src[%offset], %mask <{chunk_size=8}> :
///     vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
///
/// Example 3 (3D with leading unit dims):
///   layout = #xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>
///   %mask = producer_op : vector<1x1x16xi1>
///   %offset = producer_op : vector<1x1x16xindex>
///   xegpu.store %payload, %src[%offset], %mask : vector<1x1x16xf16>,
///     memref<256xf16>, vector<1x1x16xindex>, vector<1x1x16xi1>
/// Distributed to:
///   %mask = producer_op : vector<1x1x1xi1>
///   %offset = producer_op : vector<1x1x1xindex>
///   xegpu.store %payload, %src[%offset], %mask : vector<1xf16>,
///     memref<256xf16>, vector<1xindex>, vector<1xi1>
struct SgToLaneStoreScatter
    : public OpConversionPattern<xegpu::StoreScatterOp> {
  using OpConversionPattern<xegpu::StoreScatterOp>::OpConversionPattern;

  LogicalResult
  matchAndRewrite(xegpu::StoreScatterOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    xegpu::DistributeLayoutAttr layout = op.getAnchorLayout();
    if (!layout)
      return failure();

    VectorType origValueTy = op.getValueType();
    if (!origValueTy)
      return failure();

    // Check that all leading dimensions are unit dimensions. With a chunk
    // size the trailing two dims carry data, otherwise only the last one.
    int chunkSize = op.getChunkSize().value_or(1);
    int effectiveVecRank = (chunkSize == 1) ? 1 : 2;
    ArrayRef<int64_t> shape = origValueTy.getShape();
    if (llvm::any_of(shape.take_front(origValueTy.getRank() - effectiveVecRank),
                     [](int64_t d) { return d != 1; }))
      return rewriter.notifyMatchFailure(
          op, "Only unit dimensions allowed for the leading "
              "dimensions of the store vector!");

    auto distValueTyOrFailure =
        xegpu::getDistVecTypeBasedOnLaneLayout(layout, origValueTy);
    if (failed(distValueTyOrFailure))
      return rewriter.notifyMatchFailure(
          op, "unable to compute expected lane vector type from lane layout");

    // The flattening below casts the offsets and mask types to VectorType.
    // Reject a missing operand or a non-vector type here, before any IR is
    // created, instead of asserting inside cast<>.
    // NOTE(review): presumably the op also permits absent or scalar
    // offsets/mask; confirm against the StoreScatterOp definition.
    Value distOffsets = adaptor.getOffsets();
    auto distOffsetsTy =
        distOffsets ? dyn_cast<VectorType>(distOffsets.getType()) : VectorType();
    if (!distOffsetsTy)
      return rewriter.notifyMatchFailure(
          op, "store op must have a vector of offsets");

    Value distMask = adaptor.getMask();
    auto distMaskTy =
        distMask ? dyn_cast<VectorType>(distMask.getType()) : VectorType();
    if (!distMaskTy)
      return rewriter.notifyMatchFailure(op, "store op must have a vector mask");

    VectorType distValueTy = distValueTyOrFailure.value();
    VectorType distValueTy1D = VectorType::get({distValueTy.getNumElements()},
                                               distValueTy.getElementType());

    Value distValue = adaptor.getValue();
    if (distValue.getType() != distValueTy1D)
      distValue = castValueTo(rewriter, cast<TypedValue<VectorType>>(distValue),
                              distValueTy1D);

    // Flatten offsets and mask to 1D to match the 1D value type.
    VectorType offsetsTy1D = VectorType::get({distOffsetsTy.getNumElements()},
                                             distOffsetsTy.getElementType());
    distOffsets = castValueTo(
        rewriter, cast<TypedValue<VectorType>>(distOffsets), offsetsTy1D);

    VectorType maskTy1D = VectorType::get({distMaskTy.getNumElements()},
                                          distMaskTy.getElementType());
    distMask =
        castValueTo(rewriter, cast<TypedValue<VectorType>>(distMask), maskTy1D);

    // Re-create the store on the lane-level operands; cache hints and chunk
    // size are carried over, the layout is dropped.
    Value distDest = adaptor.getDest();
    xegpu::StoreScatterOp::create(rewriter, op.getLoc(), distValue, distDest,
                                  distOffsets, distMask, op.getChunkSizeAttr(),
                                  op.getL1HintAttr(), op.getL2HintAttr(),
                                  op.getL3HintAttr(), /*layout=*/nullptr,
                                  /*contiguity=*/nullptr);
    rewriter.eraseOp(op);
    return success();
  }
};


/// Lane-level distribution of vector.step.
/// The layout must have exactly 1 effective lane dimension.
/// The step op is resolved completely: for every lane_data-sized block owned
/// by the lane, the block's start coordinate and its consecutive successors
/// are emitted, and all values are packed with vector.from_elements.
struct SgToLaneVectorStep : public OpConversionPattern<vector::StepOp> {
  using OpConversionPattern<vector::StepOp>::OpConversionPattern;

  LogicalResult
  matchAndRewrite(vector::StepOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    xegpu::DistributeLayoutAttr resultLayout =
        xegpu::getTemporaryLayout(op->getResult(0));
    if (!resultLayout || !resultLayout.isForSubgroup())
      return rewriter.notifyMatchFailure(
          op, "the result vector of the step op lacks subgroup layout");

    auto loc = op.getLoc();
    auto sgStepTy = op.getResult().getType();
    auto laneStepTyOrFailure =
        xegpu::getDistVecTypeBasedOnLaneLayout(resultLayout, sgStepTy);
    if (failed(laneStepTyOrFailure))
      return rewriter.notifyMatchFailure(
          op, "unable to compute lane vector type from the layout");
    VectorType laneStepTy = *laneStepTyOrFailure;

    Value laneId = gpu::LaneIdOp::create(rewriter, loc, rewriter.getIndexType(),
                                         /*upperBound=*/mlir::IntegerAttr());
    auto blockCoordsOrFailure = resultLayout.computeDistributedCoords(
        rewriter, loc, laneId, sgStepTy.getShape());
    if (failed(blockCoordsOrFailure))
      return rewriter.notifyMatchFailure(
          op, "failed to compute lane data block coordinates");

    auto blockCoordSets = blockCoordsOrFailure.value();
    auto blockLength = resultLayout.getEffectiveLaneDataAsInt()[0];
    assert(static_cast<int64_t>(blockCoordSets.size()) ==
           laneStepTy.getNumElements() / blockLength);

    // For each lane_data block, reconstruct its sub-range from the range of
    // the SG-level vector.step. Example: vector.step
    // {slice<layout<lane_layout=[2,4,2], lane_data=[1,2,1]>, dims=[0,2]>} :
    // vector<16xindex>
    // Each logical lane holds 4 elements as 2 blocks of 2 elements each.
    // The blocks are round-robin distributed, so logical lane id 0
    // holds values [0,1, 8,9].
    SmallVector<Value> stepVals;
    for (auto &blockCoords : blockCoordSets) {
      auto blockStart = blockCoords[0];
      stepVals.push_back(blockStart);
      // Remaining elements of the block are start + 1, start + 2, ...
      for (int delta = 1; delta < blockLength; ++delta) {
        auto deltaCst = arith::ConstantIndexOp::create(rewriter, loc, delta);
        stepVals.push_back(
            arith::AddIOp::create(rewriter, loc, blockStart, deltaCst));
      }
    }
    assert(static_cast<int64_t>(stepVals.size()) ==
               laneStepTy.getNumElements() &&
           "Expecting the number of step values to match the number of "
           "elements in the vector");

    auto laneStep =
        vector::FromElementsOp::create(rewriter, loc, laneStepTy, stepVals);
    rewriter.replaceOp(op, laneStep);
    return success();
  }
};


/// Distributes a subgroup-level vector.extract op to lane-level. Only
/// handles sub-vector extraction (result is VectorType, not scalar).
///
/// The pattern requires a subgroup layout on the result whose effective lane
/// layout is non-empty and unit in every dimension except the innermost one.
/// The extract is then re-created on the converted source with the original
/// position.
struct SgToLaneVectorExtract : public OpConversionPattern<vector::ExtractOp> {
  using OpConversionPattern<vector::ExtractOp>::OpConversionPattern;

  LogicalResult
  matchAndRewrite(vector::ExtractOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    // Only handle vector results (not scalar extraction).
    auto resultType = dyn_cast<VectorType>(op.getType());
    if (!resultType)
      return rewriter.notifyMatchFailure(op, "scalar extract not supported");

    xegpu::DistributeLayoutAttr layout =
        xegpu::getTemporaryLayout(op->getOpResult(0));
    if (!layout || !layout.isForSubgroup())
      return rewriter.notifyMatchFailure(
          op, "the result vector of the extract op lacks subgroup layout");

    // This implementation assumes distribution only happens on the innermost
    // dimension. Verify that lane_layout[0...n-2] are all unit.
    auto laneLayout = layout.getEffectiveLaneLayoutAsInt();
    // drop_back(1) below is only valid on a non-empty lane layout, so reject
    // layouts that carry no lane layout at all.
    if (laneLayout.empty())
      return rewriter.notifyMatchFailure(
          op, "result of vector.extract lacks distribution layout");
    if (llvm::any_of(ArrayRef<int64_t>(laneLayout).drop_back(1),
                     [](int64_t v) { return v != 1; }))
      return rewriter.notifyMatchFailure(
          op, "only innermost dimension distribution is supported for "
              "vector.extract");

    auto newOp = vector::ExtractOp::create(
        rewriter, op.getLoc(), adaptor.getSource(), op.getMixedPosition());
    rewriter.replaceOp(op, newOp.getResult());
    return success();
  }
};


/// Lane-level distribution of a subgroup-level vector.shape_cast.
/// The op is re-created on the converted source, with the result type derived
/// from the subgroup layout attached to the original result.
struct SgToLaneVectorShapeCast
    : public OpConversionPattern<vector::ShapeCastOp> {
  using OpConversionPattern<vector::ShapeCastOp>::OpConversionPattern;

  LogicalResult
  matchAndRewrite(vector::ShapeCastOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    xegpu::DistributeLayoutAttr dstLayout =
        xegpu::getTemporaryLayout(op->getOpResult(0));
    if (!dstLayout || !dstLayout.isForSubgroup())
      return rewriter.notifyMatchFailure(
          op, "the result vector of the shape_cast op lacks subgroup layout");

    auto laneResultTy = xegpu::getDistVecTypeBasedOnLaneLayout(
        dstLayout, op.getResultVectorType());
    if (failed(laneResultTy))
      return rewriter.notifyMatchFailure(
          op, "failed to get distributed vector type for result");

    auto laneShapeCast = vector::ShapeCastOp::create(
        rewriter, op.getLoc(), laneResultTy.value(), adaptor.getSource());
    rewriter.replaceOp(op, laneShapeCast);
    return success();
  }
};


/// Distributes a subgroup-level vector.extract_strided_slice op to
/// lane-level. If the result is distributed, the offsets and sizes are
/// adjusted to match the distributed types.
///
/// The distributed result type is derived from the result's temporary layout.
/// The op's sizes/offsets/strides are first padded to the source rank (full
/// dimension size, zero offset, unit stride). When the result is distributed
/// along exactly one dimension, the size in that dimension is replaced by the
/// distributed result size and the offset is divided by the effective
/// subgroup size. The match fails when more than one dimension is
/// distributed, when no uArch can be found, when the source lacks a lane
/// layout, when the source size or the offset along the distributed dimension
/// is not a multiple of the effective subgroup size, or when the source
/// lane_data along that dimension is not 1.
struct SgToLaneVectorExtractStridedSlice
    : public OpConversionPattern<vector::ExtractStridedSliceOp> {
  using OpConversionPattern<vector::ExtractStridedSliceOp>::OpConversionPattern;

  LogicalResult
  matchAndRewrite(vector::ExtractStridedSliceOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    // The result must carry a subgroup-level layout.
    xegpu::DistributeLayoutAttr resultLayout =
        xegpu::getTemporaryLayout(op->getOpResult(0));
    if (!resultLayout || !resultLayout.isForSubgroup())
      return failure();

    VectorType resultType = op.getType();
    auto distResultTyOrFailure =
        xegpu::getDistVecTypeBasedOnLaneLayout(resultLayout, resultType);
    if (failed(distResultTyOrFailure))
      return rewriter.notifyMatchFailure(
          op, "unable to compute distributed vector type from lane layout");
    VectorType distResultTy = *distResultTyOrFailure;

    // NOTE(review): presumably the dimensions whose size differs between the
    // original and the distributed result type; confirm in getDistributedDims.
    SmallVector<int64_t> distributedDims =
        getDistributedDims(resultType, distResultTy);

    // Collect updated sizes, offsets, strides. Pad to full source rank.
    int64_t sourceRank = op.getSourceVectorType().getRank();
    SmallVector<Attribute> updatedSizes =
        llvm::map_to_vector(op.getSizes(), [](Attribute attr) { return attr; });
    SmallVector<Attribute> updatedOffsets = llvm::map_to_vector(
        op.getOffsets(), [](Attribute attr) { return attr; });
    SmallVector<Attribute> updatedStrides = llvm::map_to_vector(
        op.getStrides(), [](Attribute attr) { return attr; });
    // Trailing dims not mentioned by the op are taken whole: full size,
    // offset 0, stride 1.
    for (int64_t i = op.getSizes().size(); i < sourceRank; ++i) {
      updatedSizes.push_back(
          rewriter.getI64IntegerAttr(op.getSourceVectorType().getDimSize(i)));
      updatedOffsets.push_back(rewriter.getI64IntegerAttr(0));
      updatedStrides.push_back(rewriter.getI64IntegerAttr(1));
    }

    // If the result is distributed, adjust offsets and sizes in the
    // distributed dimension.
    if (!distributedDims.empty()) {
      if (distributedDims.size() != 1)
        return rewriter.notifyMatchFailure(
            op, "only single dimension distribution is supported");
      int64_t distDim = distributedDims[0];
      // The subgroup size comes from the uArch looked up via the chip string
      // associated with the op (empty string when none is found).
      const auto *uArch =
          xegpu::uArch::getUArch(xegpu::getChipStr(op).value_or(""));
      if (!uArch)
        return rewriter.notifyMatchFailure(
            op, "target attribute required to determine subgroup size");
      int subgroupSize = uArch->getSubgroupSize();
      auto sourceLayout = xegpu::getTemporaryLayout(op->getOpOperand(0));
      if (!sourceLayout || sourceLayout.getEffectiveLaneLayoutAsInt().empty())
        return rewriter.notifyMatchFailure(
            op, "source of extract_strided_slice lacks distribution layout");
      int sourceDistrDimSize = op.getSourceVectorType().getShape()[distDim];
      auto laneLayout = sourceLayout.getEffectiveLaneLayoutAsInt();
      // Effective subgroup size needs to be adjusted if laneLayout along
      // the distributed dimension is smaller than subgroup size.
      if (laneLayout[distDim] < subgroupSize &&
          subgroupSize % laneLayout[distDim] == 0)
        subgroupSize = laneLayout[distDim];
      if (sourceDistrDimSize % subgroupSize != 0)
        return rewriter.notifyMatchFailure(
            op, "source size along distributed dim is not a multiple of "
                "subgroup size");
      auto sourceLaneData = sourceLayout.getEffectiveLaneDataAsInt();
      // Only check lane_data for the distributed dimension. Non-distributed
      // dimensions may have non-unit lane_data (e.g., packed layouts).
      if (distDim < static_cast<int64_t>(sourceLaneData.size()) &&
          sourceLaneData[distDim] != 1)
        return rewriter.notifyMatchFailure(
            op, "expecting unit lane data along the distributed dimension");
      int64_t distrDimOffset =
          cast<IntegerAttr>(updatedOffsets[distDim]).getInt();
      // The offset is divided by the (effective) subgroup size below, so it
      // must be an exact multiple of it.
      if (distrDimOffset % subgroupSize != 0)
        return rewriter.notifyMatchFailure(
            op, "offset along distributed dim is not a multiple of "
                "subgroup size");
      // Adjust sizes and offsets for the distributed dimension.
      updatedSizes[distDim] =
          rewriter.getI64IntegerAttr(distResultTy.getDimSize(distDim));
      updatedOffsets[distDim] =
          rewriter.getI64IntegerAttr(distrDimOffset / subgroupSize);
    }

    // Re-create the slice on the converted source with the adjusted
    // attributes and the distributed result type.
    auto newOp = vector::ExtractStridedSliceOp::create(
        rewriter, op.getLoc(), distResultTy, adaptor.getSource(),
        ArrayAttr::get(rewriter.getContext(), updatedOffsets),
        ArrayAttr::get(rewriter.getContext(), updatedSizes),
        ArrayAttr::get(rewriter.getContext(), updatedStrides));
    rewriter.replaceOp(op, newOp.getResult());
    return success();
  }
};


/// This pattern distributes a subgroup-level `vector.broadcast` op to

/// lane-level. The pattern supports three cases:

///

/// 1) Broadcast a low-rank vector to high-rank vector: The low-rank input

///    vector must have a slice layout of the result. If the distributed source

///    and target vector types are identical, this lowers to a no-op; otherwise,

///    it remains a broadcast but operates on distributed vectors.

///

/// 2) Broadcast a same-rank vector with identical layouts for source and

///    target: The source vector must have unit dimensions, and lane_data must

///    be unit size for those unit dims. This always lowers to a no-op.

///

/// 3) Broadcast a scalar with no layout: This always lowers to a broadcast

///    from scalar to distributed result type.

///

/// Example 1 (low-rank to high-rank broadcast):

/// ```

///   %0 = "some_op"() {layout_result_0 =

///     #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,

///     dims = [0]>} : () -> vector<16xf16>

///   %1 = vector.broadcast %0 {layout_result_0 =

///     #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}

///     : vector<16xf16> to vector<16x16xf16>

/// ```

/// is distributed to:

/// ```

///   %0 = "some_op"() : () -> vector<1xf16>

///   %1 = vector.broadcast %0 : vector<1xf16> to vector<16x1xf16>

/// ```

///

/// Example 2 (same-rank broadcast, no-op):

/// ```

///   %0 = "some_op"() {layout_result_0 =

///     #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}

///     : () -> vector<16x1xf16>

///   %1 = vector.broadcast %0 {layout_result_0 =

///     #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}

///     : vector<16x1xf16> to vector<16x16xf16>

/// ```

/// is distributed to (no-op, source already matches distributed result type):

/// ```

///   %0 = "some_op"() : () -> vector<16x1xf16>

///   // broadcast is eliminated, %0 is used directly

/// ```

///

/// Example 3 (scalar to vector broadcast):

/// ```

///   %0 = "some_op"() : () -> f16

///   %1 = vector.broadcast %0 {layout_result_0 =

///     #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}

///     : f16 to vector<16x16xf16>

/// ```

/// is distributed to:

/// ```

///   %0 = "some_op"() : f16

///   %1 = vector.broadcast %0 : f16 to vector<16x1xf16>

/// ```

struct SgToLaneBroadcast : public OpConversionPattern<vector::BroadcastOp> {
  using OpConversionPattern<vector::BroadcastOp>::OpConversionPattern;

  /// Rewrites a subgroup-level `vector.broadcast` whose result carries a
  /// subgroup distribute layout into its lane-level form.
  ///
  /// The result type is distributed with
  /// `xegpu::getDistVecTypeBasedOnLaneLayout`. If the adapted source already
  /// has that type the op is replaced by the source; otherwise a new
  /// broadcast to the distributed type is created.
  ///
  /// Returns failure when the result has no subgroup layout, when a
  /// same-rank vector source has no layout, when a scalar source has a
  /// layout, or when the result type cannot be distributed.
  LogicalResult
  matchAndRewrite(vector::BroadcastOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    xegpu::DistributeLayoutAttr resultLayout =
        xegpu::getTemporaryLayout(cast<OpResult>(op.getResult()));
    if (!resultLayout || !resultLayout.isForSubgroup())
      return rewriter.notifyMatchFailure(
          op, "result does not have subgroup distribute layout");

    VectorType destType = op.getResultVectorType();
    // Null when the broadcast source is a scalar.
    VectorType sourceType = dyn_cast<VectorType>(op.getSourceType());

    xegpu::DistributeLayoutAttr sourceLayout =
        xegpu::getTemporaryLayout(op->getOpOperand(0));

    if (sourceType) {
      int64_t rankDiff = destType.getRank() - sourceType.getRank();
      if (rankDiff > 0) {
        // Case 1: Low-rank to high-rank broadcast. A mismatch is only
        // reported as a warning; the rewrite still proceeds.
        if (!sourceLayout || !sourceLayout.isSliceOf(resultLayout))
          op.emitWarning(
              "broadcast source layout must be a slice of result layout");
      } else if (rankDiff == 0) {
        // Case 2: Same-rank broadcast. The source layout is dereferenced
        // below, so a missing layout must be rejected first instead of
        // calling into a null attribute.
        if (!sourceLayout)
          return rewriter.notifyMatchFailure(
              op, "same-rank broadcast source must have a layout attribute");
        auto broadcastUnitDimsSet = op.computeBroadcastedUnitDims();
        SmallVector<int64_t> broadcastUnitDims(broadcastUnitDimsSet.begin(),
                                               broadcastUnitDimsSet.end());
        assert(sourceLayout.isEqualTo(
                   sourceLayout.setUnitDimData(broadcastUnitDims)) &&
               "The sg_data for unit dimensions should be set as 1");
        // NOTE(review): the updated layout is not read again in this
        // function; kept as in the original.
        sourceLayout = sourceLayout.setUnitDimLayout(broadcastUnitDims);
      }
    } else {
      // Case 3: Scalar to vector broadcast.
      if (sourceLayout)
        return rewriter.notifyMatchFailure(
            op, "broadcast from scalar must not have a layout attribute");
    }

    auto destDistType =
        xegpu::getDistVecTypeBasedOnLaneLayout(resultLayout, destType);
    if (failed(destDistType))
      return rewriter.notifyMatchFailure(
          op, "failed to distribute the result vector type");

    Value source = adaptor.getSource();
    // If the adapted source already matches the dest dist type, it's a no-op.
    if (source.getType() == destDistType.value()) {
      rewriter.replaceOp(op, source);
      return success();
    }

    auto newOp = vector::BroadcastOp::create(rewriter, op.getLoc(),
                                             destDistType.value(), source);
    rewriter.replaceOp(op, newOp);
    return success();
  }
};


/// Lane-level distribution of a subgroup-level `vector.insert_strided_slice`.
///
/// The destination type is distributed according to the result's lane
/// layout. When exactly one destination dimension is distributed, the offset
/// along that dimension is divided by the subgroup size so that it indexes
/// into the distributed (per-lane) destination. Distribution over more than
/// one dimension is rejected.
struct SgToLaneVectorInsertStridedSlice
    : public OpConversionPattern<vector::InsertStridedSliceOp> {
  using OpConversionPattern<vector::InsertStridedSliceOp>::OpConversionPattern;

  LogicalResult
  matchAndRewrite(vector::InsertStridedSliceOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    xegpu::DistributeLayoutAttr resultLayout =
        xegpu::getTemporaryLayout(op->getOpResult(0));
    if (!resultLayout || !resultLayout.isForSubgroup())
      return failure();

    VectorType sgDestTy = op.getDestVectorType();
    auto laneDestTyOrFailure =
        xegpu::getDistVecTypeBasedOnLaneLayout(resultLayout, sgDestTy);
    if (failed(laneDestTyOrFailure))
      return rewriter.notifyMatchFailure(
          op, "unable to compute distributed vector type from lane layout");
    VectorType laneDestTy = *laneDestTyOrFailure;

    SmallVector<int64_t> distributedDims =
        getDistributedDims(sgDestTy, laneDestTy);

    // Start from the original offsets; at most one entry is rewritten below.
    SmallVector<Attribute> newOffsets(op.getOffsets().getValue());

    if (distributedDims.size() > 1)
      return rewriter.notifyMatchFailure(
          op, "only single dimension distribution is supported");

    if (distributedDims.size() == 1) {
      const int64_t destDim = distributedDims.front();

      const auto *uArch =
          xegpu::uArch::getUArch(xegpu::getChipStr(op).value_or(""));
      if (!uArch)
        return rewriter.notifyMatchFailure(
            op, "target attribute required to determine subgroup size");
      int sgSize = uArch->getSubgroupSize();

      VectorType sgSrcTy = op.getSourceVectorType();
      // Map the distributed dest dim onto the source: the source aligns with
      // the trailing dims of the dest, so a negative index means the
      // distributed dim is not covered by the source.
      const int64_t srcDim =
          destDim - (sgDestTy.getRank() - sgSrcTy.getRank());
      if (srcDim < 0)
        return rewriter.notifyMatchFailure(
            op, "distributed dimension must be in the last k dims of dest");

      auto destLayout = xegpu::getTemporaryLayout(op->getOpOperand(1));
      auto sourceLayout = xegpu::getTemporaryLayout(op->getOpOperand(0));
      const bool layoutsUsable =
          destLayout && sourceLayout &&
          !destLayout.getEffectiveLaneLayoutAsInt().empty() &&
          !sourceLayout.getEffectiveLaneLayoutAsInt().empty();
      if (!layoutsUsable)
        return rewriter.notifyMatchFailure(
            op, "source or dest of insert_strided_slice lacks distribution "
                "layout");

      // Only the distributed dimension needs unit lane_data; other
      // dimensions may legitimately carry non-unit lane_data (e.g. packed
      // layouts).
      auto nonUnitLaneDataAt = [](ArrayRef<int64_t> laneData, int64_t dim) {
        return dim < static_cast<int64_t>(laneData.size()) &&
               laneData[dim] != 1;
      };
      auto destLaneData = destLayout.getEffectiveLaneDataAsInt();
      auto sourceLaneData = sourceLayout.getEffectiveLaneDataAsInt();
      if (nonUnitLaneDataAt(destLaneData, destDim) ||
          nonUnitLaneDataAt(sourceLaneData, srcDim))
        return rewriter.notifyMatchFailure(
            op, "expecting unit lane data along the distributed dimension");

      if (sgSrcTy.getDimSize(srcDim) % sgSize != 0)
        return rewriter.notifyMatchFailure(
            op, "source distributed dim size is not a multiple of "
                "subgroup size");

      const int64_t sgOffset =
          cast<IntegerAttr>(op.getOffsets()[destDim]).getInt();
      if (sgOffset % sgSize != 0)
        return rewriter.notifyMatchFailure(
            op, "offset along distributed dim is not a multiple of "
                "subgroup size");

      // Rescale the offset into the per-lane destination.
      newOffsets[destDim] = rewriter.getI64IntegerAttr(sgOffset / sgSize);
    }

    ArrayAttr offsetsAttr = ArrayAttr::get(rewriter.getContext(), newOffsets);
    auto laneInsert = vector::InsertStridedSliceOp::create(
        rewriter, op.getLoc(), laneDestTy, adaptor.getValueToStore(),
        adaptor.getDest(), offsetsAttr, op.getStrides());
    rewriter.replaceOp(op, laneInsert.getResult());
    return success();
  }
};


/// Distributes a subgroup-level vector.insert op to lane-level. Only
/// handles sub-vector insertion (value to store is VectorType, not scalar).
///
/// The pattern requires the result to carry a subgroup layout whose lane
/// layout is non-empty and has unit entries in every dimension except the
/// innermost one. The op is then recreated on the adapted (lane-level)
/// operands with the original insertion position.
struct SgToLaneVectorInsert : public OpConversionPattern<vector::InsertOp> {
  using OpConversionPattern<vector::InsertOp>::OpConversionPattern;

  LogicalResult
  matchAndRewrite(vector::InsertOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    // Only handle vector value-to-store (not scalar insertion).
    auto valueType = dyn_cast<VectorType>(op.getValueToStoreType());
    if (!valueType)
      return rewriter.notifyMatchFailure(op, "scalar insert not supported");

    xegpu::DistributeLayoutAttr layout =
        xegpu::getTemporaryLayout(op->getOpResult(0));
    if (!layout || !layout.isForSubgroup())
      return failure();

    auto laneLayout = layout.getEffectiveLaneLayoutAsInt();
    // `drop_back(1)` below is only valid on a non-empty range, so reject a
    // layout that carries no lane layout instead of indexing out of bounds.
    if (laneLayout.empty())
      return rewriter.notifyMatchFailure(
          op, "result of vector.insert lacks distribution layout");

    // Verify that every dimension other than the innermost one has a unit
    // lane_layout, i.e. only the innermost dimension is distributed.
    if (llvm::any_of(ArrayRef<int64_t>(laneLayout).drop_back(1),
                     [](int64_t v) { return v != 1; }))
      return rewriter.notifyMatchFailure(
          op, "only innermost dimension distribution is supported for "
              "vector.insert");

    auto newOp = vector::InsertOp::create(
        rewriter, op.getLoc(), adaptor.getValueToStore(), adaptor.getDest(),
        op.getMixedPosition());
    rewriter.replaceOp(op, newOp.getResult());
    return success();
  }
};


/// Builds the IR that redistributes `src` for a `convert_layout` which only
/// shrinks the `lane_layout` of the outer (distributed) dimension from
/// `currentLaneNum` to `targetLaneNum` lanes.
///
/// The source is flattened and bitcast to a 1-D vector of i32, each i32 is
/// passed through `gpu.shuffle`, the shuffled words are reassembled into a
/// vector of the source type, and finally `vector.shuffle` concatenates the
/// source with the reassembled vector along the outer dimension (doubling
/// it).
///
/// Only halving the lane count is supported. Returns failure, without
/// creating any op, when `src` is not a rank-2 vector, when the lane counts
/// are not in a 2:1 ratio, or when the total bit width of `src` is not a
/// multiple of 32.
static FailureOr<Value>
shuffleDataAsLaneLayoutChange(ConversionPatternRewriter &rewriter, Location loc,
                              Value src, int64_t currentLaneNum,
                              int64_t targetLaneNum) {
  auto vecTy = dyn_cast<VectorType>(src.getType());
  if (!vecTy || vecTy.getRank() != 2)
    return failure();

  // Only a factor-of-two reduction of the lane count is handled.
  const bool halvesLanes =
      targetLaneNum > 0 && currentLaneNum == targetLaneNum * 2;
  if (!halvesLanes)
    return failure();

  // The data is moved as i32 words, so it must fill whole 32-bit words.
  const int64_t totalBits =
      vecTy.getNumElements() * vecTy.getElementTypeBitWidth();
  if (totalBits % 32 != 0)
    return failure();

  Type wordTy = rewriter.getI32Type();
  const int64_t numWords = totalBits / 32;
  VectorType wordVecTy = VectorType::get({numWords}, wordTy);

  // Zero-initialized accumulator for the shuffled words.
  Value gathered = arith::ConstantOp::create(
      rewriter, loc,
      DenseElementsAttr::get(wordVecTy, IntegerAttr::get(wordTy, 0)));

  // View the source as a flat vector of i32 words.
  VectorType flatTy =
      VectorType::get({vecTy.getNumElements()}, vecTy.getElementType());
  Value flatSrc = vector::ShapeCastOp::create(rewriter, loc, flatTy, src);
  Value srcWords = vector::BitCastOp::create(rewriter, loc, wordVecTy, flatSrc);

  // Shuffle word by word: extract, shuffle across lanes, insert.
  // NOTE(review): the shuffle is issued with offset 0 and width
  // `targetLaneNum` in UP mode; confirm this selects the intended partner
  // lane.
  for (int64_t word = 0; word != numWords; ++word) {
    Value local = vector::ExtractOp::create(rewriter, loc, srcWords, word);
    auto shuffled = gpu::ShuffleOp::create(rewriter, loc, local, 0,
                                           targetLaneNum, gpu::ShuffleMode::UP);
    gathered = vector::InsertOp::create(rewriter, loc, shuffled.getResult(0),
                                        gathered, word);
  }

  // Restore the original element type and shape of the shuffled data.
  gathered = vector::BitCastOp::create(rewriter, loc, flatTy, gathered);
  gathered = vector::ShapeCastOp::create(rewriter, loc, vecTy, gathered);

  // Identity mask over both operands: concatenation along the outer dim.
  const int64_t concatRows = vecTy.getShape()[0] * 2;
  SmallVector<int64_t> mask;
  mask.reserve(concatRows);
  for (int64_t row = 0; row < concatRows; ++row)
    mask.push_back(row);

  Value concatenated =
      vector::ShuffleOp::create(rewriter, loc, src, gathered, mask);
  return concatenated;
}


/// Folds a subgroup-level ConvertLayout op with compatible lane layouts.
///
/// Three cases are handled:
///  - scalar results: the op folds to its (adapted) source;
///  - input and target layouts that are compatible at lane level for the
///    result shape: the op folds to its adapted source;
///  - rank-2 layouts with the same order and lane_data whose inner
///    lane_layout is 1 and whose outer lane_layout differs: the data is
///    redistributed with `shuffleDataAsLaneLayoutChange`.
/// Anything else is reported as a match failure.
struct SgToLaneConvertLayout
    : public OpConversionPattern<xegpu::ConvertLayoutOp> {
  using OpConversionPattern<xegpu::ConvertLayoutOp>::OpConversionPattern;

  LogicalResult
  matchAndRewrite(xegpu::ConvertLayoutOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    auto inputLayout = op.getEffectiveInputLayout();
    auto targetLayout = op.getTargetLayoutAttr();
    Type valType = op.getResult().getType();

    // Scalars are not distributed. Use the adapted operand, like the vector
    // fold below, so the replacement is the already-remapped value.
    if (valType.isIntOrFloat()) {
      rewriter.replaceOp(op, adaptor.getSource());
      return success();
    }

    auto resShape = cast<VectorType>(valType).getShape();
    SmallVector<int64_t> resShapeVec(resShape.begin(), resShape.end());

    // Equivalent layouts: the convert_layout is a no-op and folds to its
    // source.
    if (inputLayout.isCompatibleWith(targetLayout, resShapeVec,
                                     xegpu::LayoutKind::Lane)) {
      rewriter.replaceOp(op, adaptor.getSource());
      return success();
    }

    // Handle the special case where the conversion redistributes a value
    // across a fraction of the subgroup: the lane_layout shrinks along the
    // outer (distributed) dimension while lane_data stays the same. Only a
    // pure outer-dimension lane_layout change is supported, so the inner
    // lane_layout must be unit (making the outer dim the only distributed one)
    // and the outer lane_layout must be genuinely distributed (> 1), which
    // also rules out the degenerate [1, 1] layout.
    if (inputLayout.getEffectiveOrderAsInt() ==
            targetLayout.getEffectiveOrderAsInt() &&
        inputLayout.getRank() == 2 && targetLayout.getRank() == 2) {
      auto laneLayout = inputLayout.getEffectiveLaneLayoutAsInt();
      auto targetLaneLayout = targetLayout.getEffectiveLaneLayoutAsInt();
      auto laneData = inputLayout.getEffectiveLaneDataAsInt();
      auto targetLaneData = targetLayout.getEffectiveLaneDataAsInt();
      if (laneLayout.size() == 2 && targetLaneLayout.size() == 2 &&
          laneData == targetLaneData && laneLayout[1] == 1 &&
          targetLaneLayout[1] == 1 && laneLayout[0] > 1 &&
          laneLayout[0] != targetLaneLayout[0]) {
        // The helper validates its inputs before creating any op, so a
        // failure here leaves the IR untouched and falls through to the
        // generic match failure.
        FailureOr<Value> res = shuffleDataAsLaneLayoutChange(
            rewriter, op.getLoc(), adaptor.getSource(), laneLayout[0],
            targetLaneLayout[0]);
        if (succeeded(res)) {
          rewriter.replaceOp(op, *res);
          return success();
        }
      }
    }

    return rewriter.notifyMatchFailure(
        op, "lowering incompatible convert_layout not yet supported");
  }
};


/// Distributes `vector.interleave` by recreating it on the adapted
/// (lane-level) operands; no layout-specific handling is performed.
struct SgToLaneVectorInterleave
    : public OpConversionPattern<vector::InterleaveOp> {
  using OpConversionPattern<vector::InterleaveOp>::OpConversionPattern;

  LogicalResult
  matchAndRewrite(vector::InterleaveOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    // The new op is built at the original op's location and replaces it.
    rewriter.replaceOpWithNewOp<vector::InterleaveOp>(op, adaptor.getLhs(),
                                                      adaptor.getRhs());
    return success();
  }
};


/// Distributes `vector.deinterleave` by recreating it on the adapted
/// (lane-level) source; all results of the original op are replaced by the
/// results of the new op.
struct SgToLaneVectorDeinterleave
    : public OpConversionPattern<vector::DeinterleaveOp> {
  using OpConversionPattern<vector::DeinterleaveOp>::OpConversionPattern;

  LogicalResult
  matchAndRewrite(vector::DeinterleaveOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    // The new op is built at the original op's location and replaces it.
    rewriter.replaceOpWithNewOp<vector::DeinterleaveOp>(op,
                                                        adaptor.getSource());
    return success();
  }
};


/// Distributes a subgroup-level `xegpu.dpas_mx` op to lane level.
///
/// The pattern requires a known uArch that supports
/// `SubgroupScaledMatrixMultiplyAcc` and `xegpu::LayoutAttr` layouts for A,
/// B and the accumulator/result (plus the scale layouts when scale operands
/// are present). Operands are cast to the 1-D distributed types computed
/// from those layouts, a new `DpasMxOp` without layout attributes is
/// created, and its result is cast to the distributed N-D type derived from
/// the lane layout.
struct SgToLaneDpasMx : public OpConversionPattern<xegpu::DpasMxOp> {
  using OpConversionPattern<xegpu::DpasMxOp>::OpConversionPattern;

  LogicalResult
  matchAndRewrite(xegpu::DpasMxOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    const auto *uArch =
        xegpu::uArch::getUArch(xegpu::getChipStr(op).value_or(""));
    if (!uArch)
      return failure();
    if (!uArch->isSupportedInstruction(
            xegpu::uArch::InstructionKind::SubgroupScaledMatrixMultiplyAcc))
      return rewriter.notifyMatchFailure(
          op, "target uArch does not support scaled subgroup mma");
    // Check if the op has A, B and CD layouts attached. `cast<>` would assert
    // on a missing (null) or differently-typed attribute before the check
    // below could run, so use the null-tolerant checked cast instead.
    auto layoutA = dyn_cast_if_present<xegpu::LayoutAttr>(op.getLayoutAAttr());
    auto layoutB = dyn_cast_if_present<xegpu::LayoutAttr>(op.getLayoutBAttr());
    auto layoutCd =
        dyn_cast_if_present<xegpu::LayoutAttr>(op.getLayoutCdAttr());
    if (!layoutA || !layoutB || !layoutCd)
      return rewriter.notifyMatchFailure(
          op, "missing required layout attributes for DpasMxOp distribution");

    // Retrieve expected types, according to anchor layouts.
    auto expected1DTypeResult =
        xegpu::getDistributedVectorType(op.getType(), layoutCd);
    auto expected1DTypeA =
        xegpu::getDistributedVectorType(op.getA().getType(), layoutA);
    auto expected1DTypeB =
        xegpu::getDistributedVectorType(op.getB().getType(), layoutB);

    // Scale operands are optional; their distributed types are only computed
    // (and later used) when the corresponding operand is present.
    VectorType expected1DTypeScaleA, expected1DTypeScaleB;
    if (op.getScaleA()) {
      auto layoutScaleA =
          dyn_cast_if_present<xegpu::LayoutAttr>(op.getLayoutAScaleAttr());
      if (!layoutScaleA)
        return rewriter.notifyMatchFailure(
            op, "missing layout attribute for scale A");
      auto expected1DTypeScaleAOrFailure = xegpu::getDistributedVectorType(
          cast<VectorType>(op.getScaleA().getType()), layoutScaleA);
      if (failed(expected1DTypeScaleAOrFailure))
        return rewriter.notifyMatchFailure(
            op, "failed to calculate expected 1D vector type for scale A");
      expected1DTypeScaleA = expected1DTypeScaleAOrFailure.value();
    }
    if (op.getScaleB()) {
      auto layoutScaleB =
          dyn_cast_if_present<xegpu::LayoutAttr>(op.getLayoutBScaleAttr());
      if (!layoutScaleB)
        return rewriter.notifyMatchFailure(
            op, "missing layout attribute for scale B");
      auto expected1DTypeScaleBOrFailure = xegpu::getDistributedVectorType(
          cast<VectorType>(op.getScaleB().getType()), layoutScaleB);
      if (failed(expected1DTypeScaleBOrFailure))
        return rewriter.notifyMatchFailure(
            op, "failed to calculate expected 1D vector type for scale B");
      expected1DTypeScaleB = expected1DTypeScaleBOrFailure.value();
    }

    auto expectedNDTypeResult =
        xegpu::getDistVecTypeBasedOnLaneLayout(layoutCd, op.getType());
    if (failed(expected1DTypeResult) || failed(expected1DTypeA) ||
        failed(expected1DTypeB))
      return rewriter.notifyMatchFailure(
          op,
          "failed to calculate supported workitem 1D vector types for DpasOp "
          "from layouts");
    if (failed(expectedNDTypeResult))
      return rewriter.notifyMatchFailure(
          op, "unable to compute expected workitem vector type for DpasOp from "
              "lane layout");

    // Validate bit widths match uArch packed format requirements
    const auto *uArchInstruction = dyn_cast<
        xegpu::uArch::SubgroupScaledMatrixMultiplyAcc>(uArch->getInstruction(
        xegpu::uArch::InstructionKind::SubgroupScaledMatrixMultiplyAcc));
    assert(uArchInstruction);
    auto wiAType = expected1DTypeA.value();
    auto wiBType = expected1DTypeB.value();
    // Calculate total packed bit width = element bit width * vector size
    unsigned aPackedBitWidth =
        wiAType.getElementTypeBitWidth() * wiAType.getNumElements();
    unsigned bPackedBitWidth =
        wiBType.getElementTypeBitWidth() * wiBType.getNumElements();
    if (aPackedBitWidth % uArchInstruction->getPackedFormatBitSizeA())
      return rewriter.notifyMatchFailure(
          op, "A operand packed bit width must be a multiple of uArch packed "
              "format requirement");
    if (bPackedBitWidth % uArchInstruction->getPackedFormatBitSizeB())
      return rewriter.notifyMatchFailure(
          op, "B operand packed bit width must be a multiple of uArch packed "
              "format requirement");

    // Build the lane-level op on operands cast to their 1-D distributed
    // types; optional operands are forwarded as null when absent, and all
    // layout attributes are dropped.
    auto newOp = xegpu::DpasMxOp::create(
        rewriter, op->getLoc(), expected1DTypeResult.value(),
        castValueTo(rewriter, cast<TypedValue<VectorType>>(adaptor.getA()),
                    expected1DTypeA.value()),
        castValueTo(rewriter, cast<TypedValue<VectorType>>(adaptor.getB()),
                    expected1DTypeB.value()),
        op.getAcc()
            ? castValueTo(rewriter,
                          cast<TypedValue<VectorType>>(adaptor.getAcc()),
                          expected1DTypeResult.value())
            : nullptr,

        op.getScaleA()
            ? castValueTo(rewriter,
                          cast<TypedValue<VectorType>>(adaptor.getScaleA()),
                          expected1DTypeScaleA)
            : nullptr,
        op.getScaleB()
            ? castValueTo(rewriter,
                          cast<TypedValue<VectorType>>(adaptor.getScaleB()),
                          expected1DTypeScaleB)
            : nullptr,
        /** layoutA**/ nullptr,
        /** layoutB**/ nullptr, /** layoutCd**/ nullptr,
        /** layoutAScale**/ nullptr, /** layoutBScale**/ nullptr);
    // Explicitly set the new types to enable correct type materializations.
    rewriter.replaceOp(op, castValueTo(rewriter, newOp.getResult(),
                                       expectedNDTypeResult.value()));
    return success();
  }
};


/// Pass that distributes subgroup-level XeGPU/vector IR to lane level. All
/// of the work happens in `runOnOperation`, which is defined out of line.
struct XeGPUSgToLaneDistributePass final
    : public xegpu::impl::XeGPUSgToLaneDistributeBase<
          XeGPUSgToLaneDistributePass> {
  void runOnOperation() override;
};


} // namespace


void XeGPUSgToLaneDistributePass::runOnOperation() {


  // Recover temporary operand layouts for usage in patterns.

  Operation *root = getOperation();

  if (!xegpu::recoverTemporaryLayouts(root)) {

    signalPassFailure();

    return;

  }


  // Collect existing UnrealizedConversionCastOps. These must be preserved.

  llvm::SmallSetVector<UnrealizedConversionCastOp, 8> existingCasts;

  root->walk(

      [&](UnrealizedConversionCastOp castOp) { existingCasts.insert(castOp); });

  // Perform a structural type conversion to convert structural ops to have WI

  // types. This will insert UnrealizedConversionCastOps to make the IR

  // valid.

  {

    ConversionTarget target(getContext());

    TypeConverter typeConverter;

    RewritePatternSet patterns(&getContext());

    // Source (N:1) and target (1:1) materializations using

    // UnrealizedConversionCastOp.

    auto materializeCast = [](OpBuilder &builder, Type type, ValueRange inputs,

                              Location loc) -> Value {

      return UnrealizedConversionCastOp::create(builder, loc, type, inputs)

          .getResult(0);

    };

    typeConverter.addSourceMaterialization(materializeCast);

    typeConverter.addTargetMaterialization(materializeCast);

    xegpu::populateXeGPUSgToLaneDistributeTypeConversions(typeConverter, root);

    scf::populateSCFStructuralTypeConversionsAndLegality(typeConverter,

                                                         patterns, target);

    xegpu::populateXeGPUSgToLaneDistributeTypeConversionAndLegality(

        typeConverter, patterns, target, root);

    target.addLegalOp<UnrealizedConversionCastOp>();

    (void)applyPartialConversion(root, target, std::move(patterns));

  }

  // Fold cancelling cast chains and erase dead casts.

  xegpu::cleanupUnrealizedConversionCasts(root, existingCasts);

  xegpu::removeTemporaryLayoutAttrs(getOperation());

}


/// Registers the type conversions used by the subgroup-to-lane distribution
/// on `typeConverter`:
///  - an identity conversion for every type;
///  - `TensorDescType`: the layout attribute, if any, is dropped;
///  - `VectorType`: registered through `xegpu::addVectorTypeConversion`,
///    using the lane-layout-based distributed shape and block-argument types
///    precomputed from `topLevelOp`.
void xegpu::populateXeGPUSgToLaneDistributeTypeConversions(
    TypeConverter &typeConverter, Operation *topLevelOp) {
  // Default: leave the type unchanged. The conversions registered after this
  // one handle TensorDescType and VectorType specifically.
  typeConverter.addConversion([](Type type) -> Type { return type; });

  // TensorDescType: strip the layout attribute when present.
  typeConverter.addConversion([](TensorDescType descTy) -> Type {
    if (!descTy.getLayoutAttr())
      return descTy;
    return descTy.dropLayouts();
  });

  // VectorType: 1:1 shape-changing conversion based on the lane layout.
  // The callback yields the distributed shape with a count of 1, or an empty
  // shape with a count of 0 when the type cannot be distributed. A
  // precomputed map supplies the types of SCF loop block arguments (see
  // precomputeLoopBlockArgTypes for the rationale).
  auto laneShapeAndCount = [](VectorType vecTy,
                              xegpu::DistributeLayoutAttr layout)
      -> std::pair<SmallVector<int64_t>, int> {
    auto laneTy = getDistVecTypeBasedOnLaneLayout(layout, vecTy);
    if (succeeded(laneTy))
      return {SmallVector<int64_t>(laneTy->getShape()), 1};
    return {{}, 0};
  };

  auto blockArgTypes =
      xegpu::precomputeLoopBlockArgTypes(topLevelOp, laneShapeAndCount);
  xegpu::addVectorTypeConversion(typeConverter, laneShapeAndCount,
                                 std::move(blockArgTypes));
}


/// Sets up everything needed to run the subgroup-to-lane distribution as a
/// dialect conversion: registers the type conversions on `typeConverter`,
/// declares on `target` which ops still need to be rewritten (in general,
/// ops that still carry a layout), and adds the distribution patterns to
/// `patterns`. Ops not mentioned here are treated as legal.
void xegpu::populateXeGPUSgToLaneDistributeTypeConversionAndLegality(
    TypeConverter &typeConverter, RewritePatternSet &patterns,
    ConversionTarget &target, Operation *topLevelOp) {
  populateXeGPUSgToLaneDistributeTypeConversions(typeConverter, topLevelOp);

  // Shared predicate: legal once result #0 carries no temporary layout.
  auto lacksResultLayout = [](Operation *op) -> bool {
    return !xegpu::getTemporaryLayout(op->getOpResult(0));
  };

  // CreateNdDescOp is legal only if its result type has no layout attribute.
  target.addDynamicallyLegalOp<xegpu::CreateNdDescOp>(
      [](xegpu::CreateNdDescOp op) { return !op.getType().getLayoutAttr(); });

  // XeGPU dialect: convert_layout is always illegal; any other op is legal
  // unless it is an anchor op that still has an anchor layout.
  target.addDynamicallyLegalDialect<xegpu::XeGPUDialect>([](Operation *op) {
    if (isa<xegpu::ConvertLayoutOp>(op))
      return false;
    if (auto anchorOp = dyn_cast<AnchorLayoutInterface>(op))
      return !anchorOp.getAnchorLayout();
    return true;
  });

  // Arith constants: non-vector constants are legal; vector constants are
  // legal only without a temporary layout attribute.
  target.addDynamicallyLegalOp<arith::ConstantOp>(
      [](arith::ConstantOp op) -> bool {
        if (!isa<VectorType>(op.getResult().getType()))
          return true;
        return !xegpu::getTemporaryLayout(dyn_cast<OpResult>(op.getResult()));
      });

  // Math and arith dialects: only elementwise-mappable ops with a single
  // vector result, whose operands are all vectors of the result's shape and
  // whose result has a layout, are left to be rewritten.
  target.addDynamicallyLegalDialect<math::MathDialect, arith::ArithDialect>(
      [](Operation *op) -> std::optional<bool> {
        if (!OpTrait::hasElementwiseMappableTraits(op))
          return true;
        if (op->getNumResults() != 1)
          return true;

        auto resultTy = dyn_cast<VectorType>(op->getResult(0).getType());
        if (!resultTy)
          return true;

        // Any non-vector or differently-shaped operand makes the op legal.
        for (Value operand : op->getOperands()) {
          auto operandTy = dyn_cast<VectorType>(operand.getType());
          if (!operandTy || operandTy.getShape() != resultTy.getShape())
            return true;
        }
        return !xegpu::getTemporaryLayout(dyn_cast<OpResult>(op->getResult(0)));
      });

  // vector::ReductionOp is legal only if its source has no distribute layout
  // attribute.
  target.addDynamicallyLegalOp<vector::ReductionOp>(
      [](vector::ReductionOp op) -> bool {
        return !xegpu::getDistributeLayoutAttr(op.getVector());
      });

  // vector::MultiDimReductionOp is legal unless it is a valid subgroup
  // multi-reduction.
  target.addDynamicallyLegalOp<vector::MultiDimReductionOp>(
      [](vector::MultiDimReductionOp op) -> bool {
        return !isValidSubgroupMultiReductionOp(op);
      });

  // Vector ops whose legality depends only on the layout of result #0.
  target.addDynamicallyLegalOp<vector::CreateMaskOp, vector::ConstantMaskOp,
                               vector::TransposeOp, vector::BitCastOp,
                               vector::ShapeCastOp, vector::StepOp,
                               vector::BroadcastOp>(lacksResultLayout);

  // vector.extract producing a scalar is always legal; vector results follow
  // the layout rule.
  target.addDynamicallyLegalOp<vector::ExtractOp>(
      [](vector::ExtractOp op) -> bool {
        if (!isa<VectorType>(op.getType()))
          return true;
        return !xegpu::getTemporaryLayout(op->getOpResult(0));
      });

  target.addDynamicallyLegalOp<vector::InsertOp, vector::ExtractStridedSliceOp,
                               vector::InsertStridedSliceOp,
                               vector::InterleaveOp, vector::DeinterleaveOp>(
      lacksResultLayout);

  // Everything else is legal.
  target.markUnknownOpDynamicallyLegal([](Operation *op) { return true; });

  patterns.add<
      SgToLaneCreateNdDesc, SgToLaneLoadNd, SgToLaneStoreNd, SgToLaneDpas,
      SgToLaneElementWise, SgToLaneArithConstant, SgToLanePrefetchNd,
      SgToLaneLoadGather, SgToLaneStoreScatter, SgToLaneVectorReduction,
      SgToLaneMultiDimReduction, SgToLaneVectorExtract, SgToLaneVectorInsert,
      SgToLaneVectorExtractStridedSlice, SgToLaneVectorInsertStridedSlice,
      SgToLaneLoadMatrix, SgToLaneStoreMatrix, SgToLaneConvertLayout,
      SgToLaneVectorTranspose, SgToLaneVectorBitcast, SgToLaneVectorStep,
      SgToLaneVectorShapeCast, SgToLaneBroadcast,
      SgToLaneCreateMask<vector::CreateMaskOp>,
      SgToLaneCreateMask<vector::ConstantMaskOp>, SgToLaneVectorDeinterleave,
      SgToLaneVectorInterleave, SgToLaneDpasMx>(typeConverter,
                                                patterns.getContext());
}


indices
indices
Definition AffineAnalysis.cpp:262

success
return success()

Builders.h

BuiltinOps.h

DialectConversion.h

Passes.h

GPUDialect.h

Operation.h

IndexDialect.h

IndexingUtils.h

ValueRange
b ValueRange
Definition LinalgTransformOps.cpp:2158

target
target
Definition LinalgTransformOps.cpp:2155

result
result
Definition LinalgTransformOps.cpp:2153

getContext
b getContext())

MLIRContext.h

Patterns.h

ValueRange.h

Value.h

VectorOps.h

XeGPULayoutImpl.h

XeGPUUtils.h

ConversionPattern

ConversionTarget

TypeConverter

int64_t

llvm::ArrayRef
Definition LLVM.h:40

llvm::SmallVector
Definition LLVM.h:64

mlir::Attribute
Attributes are known-constant values of operations.
Definition Attributes.h:25

mlir::DenseElementsAttr::get
static DenseElementsAttr get(ShapedType type, ArrayRef< Attribute > values)
Constructs a dense elements attribute from an array of element values.
Definition BuiltinAttributes.cpp:872

mlir::Location
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Definition Location.h:76

mlir::MLIRContext
MLIRContext is the top-level object for a collection of MLIR operations.
Definition MLIRContext.h:63

mlir::Operation
Operation is the basic unit of execution within MLIR.
Definition Operation.h:87

mlir::Operation::getAttrs
ArrayRef< NamedAttribute > getAttrs()
Return all of the attributes on this operation.
Definition Operation.h:537

mlir::Operation::getResult
OpResult getResult(unsigned idx)
Get the 'idx'th result of this operation.
Definition Operation.h:432

mlir::Operation::getLoc
Location getLoc()
The source location the operation was defined or derived from.
Definition Operation.h:240

mlir::Operation::getName
OperationName getName()
The name of an operation is the key identifier for it.
Definition Operation.h:115

mlir::Operation::walk
std::enable_if_t< llvm::function_traits< std::decay_t< FnT > >::num_args==1, RetT > walk(FnT &&callback)
Walk the operation by calling the callback for each nested operation (including this one),...
Definition Operation.h:822

mlir::Operation::getNumResults
unsigned getNumResults()
Return the number of results held by this operation.
Definition Operation.h:429

mlir::RewritePatternSet
Definition PatternMatch.h:822

mlir::RewritePatternSet::getContext
MLIRContext * getContext() const
Definition PatternMatch.h:837

mlir::RewritePatternSet::add
RewritePatternSet & add(ConstructorArg &&arg, ConstructorArgs &&...args)
Add an instance of each of the pattern types 'Ts' to the pattern list with the given arguments.
Definition PatternMatch.h:861

mlir::StaticTileOffsetRange
A range-style iterator that allows for iterating over the offsets of all potential tiles of size tile...
Definition IndexingUtils.h:376

mlir::TypeRange
This class provides an abstraction over the various different ranges of value types.
Definition TypeRange.h:40

mlir::Type
Instances of the Type class are uniqued, have an immutable identifier and an optional mutable compone...
Definition Types.h:74

mlir::Type::isIntOrFloat
bool isIntOrFloat() const
Return true if this is an integer (of any signedness) or a float type.
Definition Types.cpp:118

mlir::ValueRange
This class provides an abstraction over the different types of ranges over Values.
Definition ValueRange.h:389

mlir::Value
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition Value.h:96

mlir::Value::getType
Type getType() const
Return the type of this value.
Definition Value.h:105

mlir::arith::ConstantIndexOp::create
static ConstantIndexOp create(OpBuilder &builder, Location location, int64_t value)
Definition ArithOps.cpp:397

mlir::arith::ConstantIntOp::create
static ConstantIntOp create(OpBuilder &builder, Location location, int64_t value, unsigned width)
Definition ArithOps.cpp:296

mlir::detail::DenseArrayAttrImpl< int64_t >::get
static DenseArrayAttrImpl get(MLIRContext *context, ArrayRef< int64_t > content)

Math.h

MemRef.h

XeGPU.h

Transforms.h

BuiltinAttributes.h

BuiltinTypes.h

mlir::OpTrait::hasElementwiseMappableTraits
bool hasElementwiseMappableTraits(Operation *op)
Together, Elementwise, Scalarizable, Vectorizable, and Tensorizable provide an easy way for scalar op...
Definition Operation.cpp:1399

mlir::scf::populateSCFStructuralTypeConversionsAndLegality
void populateSCFStructuralTypeConversionsAndLegality(const TypeConverter &typeConverter, RewritePatternSet &patterns, ConversionTarget &target, PatternBenefit benefit=1)
Populates patterns for SCF structural type conversions and sets up the provided ConversionTarget with...
Definition StructuralTypeConversions.cpp:267

mlir::shape
Definition ShapeMappingAnalysis.h:20

mlir::vector::makeArithReduction
Value makeArithReduction(OpBuilder &b, Location loc, CombiningKind kind, Value v1, Value acc, arith::FastMathFlagsAttr fastmath=nullptr, Value mask=nullptr)
Returns the result value of reducing two scalar/vector values with the corresponding arith operation.

mlir::vector::getAsValues
SmallVector< Value > getAsValues(OpBuilder &builder, Location loc, ArrayRef< OpFoldResult > foldResults)
Convert foldResults into Values.
Definition VectorOps.cpp:375

mlir::xegpu::uArch::InstructionKind::SubgroupMatrixMultiplyAcc
@ SubgroupMatrixMultiplyAcc
Definition uArchBase.h:40

mlir::xegpu::uArch::InstructionKind::SubgroupScaledMatrixMultiplyAcc
@ SubgroupScaledMatrixMultiplyAcc
Definition uArchBase.h:42

mlir::xegpu::uArch::getUArch
const uArch * getUArch(llvm::StringRef archName)
Definition uArchCommon.h:24

mlir::xegpu
Definition XeGPU.h:25

mlir::xegpu::populateXeGPUSgToLaneDistributeTypeConversionAndLegality
void populateXeGPUSgToLaneDistributeTypeConversionAndLegality(TypeConverter &typeConverter, RewritePatternSet &patterns, ConversionTarget &target, Operation *topLevelOp)
Defines type conversions and legality for XeGPU subgroup to lane distribution and appends the require...
Definition XeGPUSgToLaneDistribute.cpp:1957

mlir::xegpu::requirePacked
bool requirePacked(const DistributeLayoutAttr layout)
Helper function to check if the layout is packed.

mlir::xegpu::removeTemporaryLayoutAttrs
void removeTemporaryLayoutAttrs(Operation *op)
Removes the temporary layout attributes for each OpOperand and OpResult of the given operation.
Definition XeGPULayoutImpl.cpp:405

mlir::xegpu::LayoutKind::Lane
@ Lane
Definition XeGPU.h:32

mlir::xegpu::subgroupReduction
Value subgroupReduction(Location loc, OpBuilder &builder, Value input, vector::CombiningKind kind, uint32_t size)
Given an input value representing per-lane data, this function returns the result after performing a ...
Definition XeGPUUtils.cpp:549

mlir::xegpu::recoverTemporaryLayouts
bool recoverTemporaryLayouts(Operation *rootOp)
Attach layout attributes to all vector-type operands of operations within the given operation's neste...
Definition XeGPULayoutImpl.cpp:351

mlir::xegpu::getDistVecTypeBasedOnLaneLayout
FailureOr< VectorType > getDistVecTypeBasedOnLaneLayout(DistributeLayoutAttr layout, VectorType originalType)
Helper function to get distributed vector type for a source vector type according to the lane_layout.

mlir::xegpu::lowerToVectorReductions
Value lowerToVectorReductions(TypedValue< VectorType > src, TypedValue< VectorType > acc, vector::CombiningKind kind, int64_t reductionDim, Location loc, PatternRewriter &rewriter)
Given a src and an acc arguments from a vector::MultiDimReductionOp, lower to a set of vector::Reduc...
Definition XeGPUUtils.cpp:564

mlir::xegpu::requireTranspose
bool requireTranspose(const DistributeLayoutAttr layout, const uArch::uArch *uArch)
Helper function to check if the layout requires a transpose effect.

mlir::xegpu::getDistributeLayoutAttr
DistributeLayoutAttr getDistributeLayoutAttr(const Value value)
Retrieves the DistributeLayoutAttr associated with a given Value.
Definition XeGPUUtils.cpp:149

mlir::xegpu::precomputeLoopBlockArgTypes
DenseMap< Value, SmallVector< Type > > precomputeLoopBlockArgTypes(Operation *topLevelOp, SubShapeAndCountFn getSubShapeAndCount)
Pre-computes distributed VectorType mappings for every value carried through an SCF loop under topLev...
Definition XeGPUUtils.cpp:879

mlir::xegpu::getChipStr
std::optional< std::string > getChipStr(Operation *op)
Retrieves the chip string from the XeVM target attribute of the parent GPU module operation.
Definition XeGPUUtils.cpp:474

mlir::xegpu::addVectorTypeConversion
void addVectorTypeConversion(TypeConverter &converter, SubShapeAndCountFn getSubShapeAndCount, DenseMap< Value, SmallVector< Type > > loopArgTypes)
Adds a context-aware VectorType conversion to converter (1:1 shape-changing or 1:N,...
Definition XeGPUUtils.cpp:945

mlir::xegpu::getTemporaryLayout
DistributeLayoutAttr getTemporaryLayout(const T &operandOrResult)
Get and set the distribute layout attribute for non-anchor operations (and offsets/masks of load/store op...

mlir::xegpu::populateXeGPUSgToLaneDistributeTypeConversions
void populateXeGPUSgToLaneDistributeTypeConversions(TypeConverter &typeConverter, Operation *topLevelOp)
Define only the type conversions needed for XeGPU subgroup to lane distribution.
Definition XeGPUSgToLaneDistribute.cpp:1927

mlir::xegpu::lowerCrossLaneReductionToShuffles
Value lowerCrossLaneReductionToShuffles(TypedValue< VectorType > src, TypedValue< VectorType > acc, vector::CombiningKind kind, int64_t reductionDim, int64_t reductionSize, Location loc, PatternRewriter &rewriter)
Lowers cross-lane reductions to shuffle operations on a 2D vector.
Definition XeGPUUtils.cpp:641

mlir::xegpu::cleanupUnrealizedConversionCasts
void cleanupUnrealizedConversionCasts(Operation *root, const llvm::SmallSetVector< UnrealizedConversionCastOp, 8 > &existingCasts)
Cleans up UnrealizedConversionCastOps inserted during SCF structural type conversion and/or XeGPU unr...
Definition XeGPUUtils.cpp:986

mlir::xegpu::addWithRightAligned
SmallVector< OpFoldResult > addWithRightAligned(OpBuilder &builder, Location loc, ArrayRef< OpFoldResult > lhs, ArrayRef< OpFoldResult > rhs)
Generates element-wise addition ops of two arrays with automatic alignment.
Definition XeGPUUtils.cpp:518

mlir::xegpu::getDistributedVectorType
FailureOr< VectorType > getDistributedVectorType(xegpu::TensorDescType tdescTy)
If tensor descriptor has a layout attribute it is used in SIMT mode.
Definition XeGPUUtils.cpp:44

mlir
Include the generated interface declarations.
Definition ABIRewriteContext.h:29

mlir::DenseI64ArrayAttr
detail::DenseArrayAttrImpl< int64_t > DenseI64ArrayAttr
Definition BuiltinAttributes.h:765

mlir::computeStrides
SmallVector< int64_t > computeStrides(ArrayRef< int64_t > sizes)
Definition IndexingUtils.h:47

mlir::delinearize
SmallVector< int64_t > delinearize(int64_t linearIndex, ArrayRef< int64_t > strides)
Given the strides together with a linear index in the dimension space, return the vector-space offset...
Definition IndexingUtils.cpp:97

mlir::TypedValue
std::conditional_t< std::is_same_v< Ty, mlir::Type >, mlir::Value, detail::TypedValue< Ty > > TypedValue
If Ty is mlir::Type this will select Value instead of having a wrapper around it.
Definition Value.h:494

mlir::getAsOpFoldResult
OpFoldResult getAsOpFoldResult(Value val)
Given a value, try to extract a constant Attribute.
Definition StaticValueUtils.cpp:95

mlir::OperationState
This represents an operation in an abstracted form, suitable for use with the builder APIs.
Definition OperationSupport.h:966

mlir::OperationState::addOperands
void addOperands(ValueRange newOperands)
Definition OperationSupport.cpp:206

mlir::OperationState::addAttribute
void addAttribute(StringRef name, Attribute attr)
Add an attribute with the specified name.
Definition OperationSupport.h:1086

mlir::OperationState::addTypes
void addTypes(ArrayRef< Type > newTypes)
Definition OperationSupport.h:1076

mlir::xegpu::uArch::SubgroupScaledMatrixMultiplyAcc
Definition uArchBase.h:406

uArchCommon.h