MLIR 23.0.0git
X86Utils.cpp
Go to the documentation of this file.
1//===- X86Utils.cpp - MLIR Utilities for X86Ops -------------------------===//
2//
3// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
10
17#include "mlir/IR/Types.h"
18
19#include "llvm/ADT/TypeSwitch.h"
20#include "llvm/Support/Casting.h"
21
22#include "llvm/ADT/ArrayRef.h"
23#include <cassert>
24
25namespace mlir {
26namespace x86 {
27
28static FailureOr<SmallVector<mlir::utils::IteratorType>>
30 if (!map.isProjectedPermutation())
31 return failure();
33 map.getNumDims(), mlir::utils::IteratorType::reduction);
34 for (auto expr : map.getResults())
35 if (auto dim = dyn_cast<AffineDimExpr>(expr))
36 iterators[dim.getPosition()] = mlir::utils::IteratorType::parallel;
37 return iterators;
38}
39
40// Returns true if the operation is in VNNI layout.
41// Optionally, the check can be constrained to a specific VNNI blocking factor.
43 std::optional<unsigned> blockingFactor) {
44 // Narrow down type operations - VNNI only applies to contractions.
45 FailureOr<linalg::ContractionDimensions> dims =
46 linalg::inferContractionDims(indexingMaps);
47 if (failed(dims))
48 return false;
49
50 auto matA = op->getOperand(0);
51 auto matB = op->getOperand(1);
52 auto typeA = dyn_cast<ShapedType>(matA.getType());
53 auto typeB = dyn_cast<ShapedType>(matB.getType());
54 unsigned rankA = typeA.getRank();
55 unsigned rankB = typeB.getRank();
56 // VNNI format requires at least 1 parallel and 2 reduction dimensions.
57 if (rankA < 3 || rankB < 3)
58 return false;
59
60 // At least two reduction dimensions are expected:
61 // one for the VNNI factor and one for the K dimension
62 if (dims->k.size() < 2)
63 return false;
64
65 // Validate affine maps - VNNI computation should be defined by the two
66 // innermost reduction iterators.
67 // The input matrix dimensions layout must match the following:
68 // - matrix A - [...][K/vnniFactor][vnniFactor]
69 // - matrix B - [...][K/vnniFactor][N][vnniFactor]
70 auto maybeIters = inferIteratorsFromOutMap(indexingMaps[2] /* outs */);
71 if (failed(maybeIters))
72 return false;
73 SmallVector<mlir::utils::IteratorType> iteratorTypes = *maybeIters;
74 AffineMap mapA = indexingMaps[0];
75 AffineMap mapB = indexingMaps[1];
76
77 auto vnniDimA = dyn_cast<AffineDimExpr>(mapA.getResult(rankA - 1));
78 auto vnniDimB = dyn_cast<AffineDimExpr>(mapB.getResult(rankB - 1));
79 if (!vnniDimA || !vnniDimB || vnniDimA != vnniDimB ||
80 iteratorTypes[vnniDimA.getPosition()] !=
81 mlir::utils::IteratorType::reduction)
82 return false;
83 auto redDimA = dyn_cast<AffineDimExpr>(mapA.getResult(rankA - 2));
84 auto redDimB = dyn_cast<AffineDimExpr>(mapB.getResult(rankB - 3));
85 if (!redDimA || !redDimB || redDimA != redDimB ||
86 iteratorTypes[redDimA.getPosition()] !=
87 mlir::utils::IteratorType::reduction)
88 return false;
89 auto parallelDimB = dyn_cast<AffineDimExpr>(mapB.getResult(rankB - 2));
90 if (!parallelDimB || iteratorTypes[parallelDimB.getPosition()] !=
91 mlir::utils::IteratorType::parallel)
92 return false;
93
94 // VNNI factor must be:
95 // - the innermost inputs' dimension
96 // - statically known
97 // - multiple of 2 or equal to the specified factor
98 auto vnniDimSize = typeB.getShape().back();
99 if (vnniDimSize == ShapedType::kDynamic || vnniDimSize == 0 ||
100 vnniDimSize % 2 != 0)
101 return false;
102 if (typeA.getShape().back() != vnniDimSize)
103 return false;
104 if (blockingFactor && vnniDimSize != *blockingFactor)
105 return false;
106
107 // The split reduction dimension size should also match.
108 if (typeA.getShape().end()[-2] != typeB.getShape().end()[-3])
109 return false;
110
111 return true;
112}
113
118
119inline ShuffleMasks getShuffleMasks(int64_t nonUnitDimAcc, bool isInt8Avx2) {
120 // We only support these two layouts for now.
121 assert((nonUnitDimAcc == 8 || nonUnitDimAcc == 16) &&
122 "Unsupported nonUnitDimAcc value");
123
124 // Do interleaving between two <8xf32> targeting AVX2.
125 static constexpr int64_t maskLo8[] = {0, 8, 1, 9, 2, 10, 3, 11};
126 static constexpr int64_t maskHi8[] = {4, 12, 5, 13, 6, 14, 7, 15};
127
128 // Do interleaving between two <8xi32> targeting AVX2.
129 static constexpr int64_t maskLo8_avx2_int8[] = {0, 1, 2, 3, 8, 9, 10, 11};
130 static constexpr int64_t maskHi8_avx2_int8[] = {4, 5, 6, 7, 12, 13, 14, 15};
131
132 // Shuffle two <16xf32/i32> as below targeting AVX512.
133 static constexpr int64_t maskLo16[] = {0, 1, 2, 3, 16, 17, 18, 19,
134 4, 5, 6, 7, 20, 21, 22, 23};
135 static constexpr int64_t maskHi16[] = {8, 9, 10, 11, 24, 25, 26, 27,
136 12, 13, 14, 15, 28, 29, 30, 31};
137
138 if (nonUnitDimAcc == 16)
139 return {maskLo16, maskHi16};
140
141 if (isInt8Avx2)
142 return {maskLo8_avx2_int8, maskHi8_avx2_int8};
143
144 return {maskLo8, maskHi8};
145}
146
147// This function walks backward from a value to locate its originating
148// vector read-like operation (`vector.transfer_read` or `vector.load`).
149// It follows simple forwarding through unary ops and across `scf.for`
150// loop iter-arguments, while stopping if layout-transforming ops such
151// as `shape_cast` or `shuffle` are encountered. The traversal returns
152// the read-like defining operation or `nullptr` if no valid source
153// is found.
155 while (true) {
156 // Case 1: Value defined by an operation
157 if (Operation *defOp = v.getDefiningOp()) {
158 if (isa<vector::TransferReadOp, vector::LoadOp>(defOp))
159 return defOp;
160
161 return nullptr;
162 }
163
164 // Case 2: BlockArgument (scf.for iter_arg)
165 if (auto barg = dyn_cast<BlockArgument>(v)) {
166 auto *parentOp = barg.getOwner()->getParentOp();
167
168 if (auto forOp = dyn_cast<scf::ForOp>(parentOp)) {
169 unsigned argNum = barg.getArgNumber();
170
171 // arg0 = induction variable (not an iter_arg)
172 if (argNum == 0)
173 return nullptr;
174
175 unsigned iterIdx = argNum - 1;
176 v = forOp.getInitArgs()[iterIdx];
177 continue;
178 }
179
180 return nullptr;
181 }
182
183 return nullptr;
184 }
185}
186
187// This function recursively traces a value through its uses to find
188// a downstream vector write-like operation (`vector.transfer_write`
189// or `vector.store`). It transparently follows values across `scf.for`
190// and `scf.yield` boundaries while stopping if layout-altering ops such
191// as `shape_cast` or `shuffle` are encountered. The traversal returns
192// the matching write-like user. Returns `nullptr` if none is found or
193// the value has multiple users.
195
196 if (v.getNumUses() > 1)
197 return nullptr;
198
199 for (OpOperand &use : v.getUses()) {
200 Operation *user = use.getOwner();
201
202 // --- TERMINAL OPS ---
203 if (isa<vector::TransferWriteOp>(user) || isa<vector::StoreOp>(user))
204 return user;
205
206 if (isa<vector::ShapeCastOp, vector::ShuffleOp>(user))
207 return nullptr;
208
209 // --- SCF YIELD ---
210 if (auto yield = dyn_cast<scf::YieldOp>(user)) {
211 Operation *parent = yield->getParentOp();
212 unsigned idx = use.getOperandNumber();
213 if (auto *res =
215 return res;
216 continue;
217 }
218
219 // --- SCF FOR ---
220 if (auto forOp = dyn_cast<scf::ForOp>(user)) {
221 unsigned idx = use.getOperandNumber();
222 if (auto *res = traceToVectorWriteLikeUserOperation(forOp.getResult(idx)))
223 return res;
224 continue;
225 }
226
227 // --- GENERIC CASE ---
228 for (Value res : user->getResults()) {
229 if (auto *found = traceToVectorWriteLikeUserOperation(res))
230 return found;
231 }
232 }
233
234 return nullptr;
235}
236
237// This function packs the accumulator of two flat BF16 vector.contract
238// operations into VNNI packed and are then replaced in their respective
239// contraction ops, enabling post-read layout or packing transformations.
240// TODO: replace all use with the packed value along with contration
241// and for op.
243 Operation *opB,
244 vector::ContractionOp contractA,
245 vector::ContractionOp contractB,
246 int64_t nonUnitDimAcc, VectorType accTy) {
247
248 if (!isa<vector::TransferReadOp, vector::LoadOp>(opA) ||
249 !isa<vector::TransferReadOp, vector::LoadOp>(opB)) {
250 return failure();
251 }
252
253 Operation *insertAfter = opA->isBeforeInBlock(opB) ? opB : opA;
254
255 rewriter.setInsertionPointAfter(insertAfter);
256 Location loc = insertAfter->getLoc();
257
258 auto elemTy = accTy.getElementType();
259 auto flatTy = VectorType::get(nonUnitDimAcc, elemTy);
260
261 auto castA =
262 vector::ShapeCastOp::create(rewriter, loc, flatTy, opA->getResult(0));
263 auto castB =
264 vector::ShapeCastOp::create(rewriter, loc, flatTy, opB->getResult(0));
265
266 auto masks = getShuffleMasks(
267 nonUnitDimAcc, (elemTy.isSignlessInteger(32) && nonUnitDimAcc == 8));
268
269 auto shuffleLo = vector::ShuffleOp::create(rewriter, loc, flatTy, castA,
270 castB, masks.maskLo);
271 auto shuffleHi = vector::ShuffleOp::create(rewriter, loc, flatTy, castA,
272 castB, masks.maskHi);
273
274 auto newAccA = vector::ShapeCastOp::create(rewriter, loc, accTy, shuffleLo);
275 auto newAccB = vector::ShapeCastOp::create(rewriter, loc, accTy, shuffleHi);
276
277 rewriter.replaceUsesWithIf(
278 opA->getResult(0), newAccA.getResult(), [&](OpOperand &use) {
279 return isa<vector::ContractionOp, scf::ForOp>(use.getOwner());
280 });
281
282 rewriter.replaceUsesWithIf(
283 opB->getResult(0), newAccB.getResult(), [&](OpOperand &use) {
284 return isa<vector::ContractionOp, scf::ForOp>(use.getOwner());
285 });
286
287 return success();
288}
289
290// This function shuffles the vectors written by vector.contract operation
291// as a flat layout structure before they are stored.
293 Operation *opA, Operation *opB,
294 int64_t nonUnitDimAcc,
295 VectorType accTy) {
296 // Helper to extract vector operand from write-like ops
297 auto getWrittenVector = [](Operation *op) -> Value {
298 if (auto write = dyn_cast<vector::TransferWriteOp>(op))
299 return write.getVector();
300 if (auto store = dyn_cast<vector::StoreOp>(op))
301 return store.getValueToStore();
302 return nullptr;
303 };
304
305 Value vecA = getWrittenVector(opA);
306 Value vecB = getWrittenVector(opB);
307
308 if (!vecA || !vecB)
309 return failure();
310
311 // Decide insertion point and location
312 Operation *insertBefore = opA->isBeforeInBlock(opB) ? opA : opB;
313
314 rewriter.setInsertionPoint(insertBefore);
315 Location loc = insertBefore->getLoc();
316
317 auto elemTy = accTy.getElementType();
318 auto flatTy = VectorType::get(nonUnitDimAcc, elemTy);
319
320 // Flatten vectors
321 auto castA = vector::ShapeCastOp::create(rewriter, loc, flatTy, vecA);
322 auto castB = vector::ShapeCastOp::create(rewriter, loc, flatTy, vecB);
323
324 // TODO: derive shuffle masks instead of hard-coding
325 auto masks = getShuffleMasks(
326 nonUnitDimAcc, (elemTy.isSignlessInteger(32) && nonUnitDimAcc == 8));
327
328 auto shuffledLo = vector::ShuffleOp::create(rewriter, loc, flatTy, castA,
329 castB, masks.maskLo);
330 auto shuffledHi = vector::ShuffleOp::create(rewriter, loc, flatTy, castA,
331 castB, masks.maskHi);
332
333 // Cast back to accumulator type
334 auto newVecA = vector::ShapeCastOp::create(rewriter, loc, accTy, shuffledLo);
335 auto newVecB = vector::ShapeCastOp::create(rewriter, loc, accTy, shuffledHi);
336
337 // Update write operands in place via the rewriter to notify it of changes.
338 rewriter.modifyOpInPlace(opA,
339 [&]() { opA->setOperand(0, newVecA.getResult()); });
340 rewriter.modifyOpInPlace(opB,
341 [&]() { opB->setOperand(0, newVecB.getResult()); });
342
343 return success();
344}
345
346// Return true if vector.contract operations matches on below conditions:
347// (1) - the unitDim operand Lhs or Rhs should be same,
348// (2) - the defining source memref should be same for nonUnitDim
349// operation,
350// (3) - the nonUnit dim offset difference between the
351// vector.contracts should be 8 or 16.
352bool validatePairVectorContract(vector::ContractionOp contractOp,
353 vector::ContractionOp pairContOp,
354 bool rhsHasMultipleNonUnitDims,
355 int64_t nonUnitDimValue) {
356 if (contractOp == pairContOp)
357 return false;
358
359 if (rhsHasMultipleNonUnitDims &&
360 !(contractOp.getLhs() == pairContOp.getLhs()))
361 return false;
362
363 if (!rhsHasMultipleNonUnitDims &&
364 !(contractOp.getRhs() == pairContOp.getRhs()))
365 return false;
366
367 auto nonUnitOperand =
368 rhsHasMultipleNonUnitDims ? contractOp.getRhs() : contractOp.getLhs();
369 auto nonUnitOperandPairContOp =
370 rhsHasMultipleNonUnitDims ? pairContOp.getRhs() : pairContOp.getLhs();
371
372 Value srcBuff;
374 llvm::TypeSwitch<Operation *>(nonUnitOperand.getDefiningOp())
375 .Case<vector::TransferReadOp, vector::LoadOp>([&](auto readOp) {
376 srcBuff = readOp.getOperand(0);
377 indexVals = SmallVector<OpFoldResult>(readOp.getIndices().begin(),
378 readOp.getIndices().end());
379 })
380 .Case<vector::ShapeCastOp>([&](vector::ShapeCastOp op) {
381 srcBuff = op.getSource();
382 indexVals.clear();
383 });
384
385 Value srcBuffPairContOp;
386 SmallVector<OpFoldResult> indexValsPairContOp;
387 llvm::TypeSwitch<Operation *>(nonUnitOperandPairContOp.getDefiningOp())
388 .Case<vector::TransferReadOp, vector::LoadOp>([&](auto readOp) {
389 srcBuffPairContOp = readOp.getOperand(0);
390 indexValsPairContOp = SmallVector<OpFoldResult>(
391 readOp.getIndices().begin(), readOp.getIndices().end());
392 })
393 .Case<vector::ShapeCastOp>([&](vector::ShapeCastOp op) {
394 srcBuffPairContOp = op.getSource();
395 indexVals.clear();
396 });
397
398 if (!srcBuff || !srcBuffPairContOp)
399 return false;
400
401 auto shuffleLw = srcBuff.getDefiningOp<vector::ShuffleOp>();
402 auto shuffleHw = srcBuffPairContOp.getDefiningOp<vector::ShuffleOp>();
403
404 if (shuffleLw && shuffleHw)
405 return shuffleLw.getV1() == shuffleHw.getV1() &&
406 shuffleLw.getV2() == shuffleHw.getV2();
407
408 if (srcBuff != srcBuffPairContOp)
409 return false;
410
411 bool oneConstantOffset = false;
412 for (size_t i = 0; i < indexVals.size(); i++) {
413
414 if (indexVals[i] == indexValsPairContOp[i])
415 continue;
416
417 auto v0 = getConstantIntValue(indexVals[i]);
418 auto v1 = getConstantIntValue(indexValsPairContOp[i]);
419
420 if (!v0 || !v1)
421 return false;
422
423 if ((*v1 - *v0) != nonUnitDimValue)
424 return false;
425
426 oneConstantOffset = true;
427 }
428
429 return oneConstantOffset;
430}
431
432} // namespace x86
433} // namespace mlir
return success()
A multi-dimensional affine map. Affine maps are immutable, like Types, and they are uniqued.
Definition AffineMap.h:46
bool isProjectedPermutation(bool allowZeroInResults=false) const
Returns true if the AffineMap represents a subset (i.e. a projection) of a symbol-less permutation map.
unsigned getNumDims() const
ArrayRef< AffineExpr > getResults() const
AffineExpr getResult(unsigned idx) const
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Definition Location.h:76
void setInsertionPoint(Block *block, Block::iterator insertPoint)
Set the insertion point to the specified location.
Definition Builders.h:400
void setInsertionPointAfter(Operation *op)
Sets the insertion point to the node after the specified operation, which will cause subsequent inser...
Definition Builders.h:414
This class represents an operand of an operation.
Definition Value.h:254
Operation is the basic unit of execution within MLIR.
Definition Operation.h:88
Value getOperand(unsigned idx)
Definition Operation.h:376
void setOperand(unsigned idx, Value value)
Definition Operation.h:377
bool isBeforeInBlock(Operation *other)
Given an operation 'other' that is within the same parent block, return whether the current operation...
OpResult getResult(unsigned idx)
Get the 'idx'th result of this operation.
Definition Operation.h:433
Location getLoc()
The source location the operation was defined or derived from.
Definition Operation.h:241
Operation * getParentOp()
Returns the closest surrounding operation that contains this operation or nullptr if this is a top-le...
Definition Operation.h:252
result_range getResults()
Definition Operation.h:441
A special type of RewriterBase that coordinates the application of a rewrite pattern on the current I...
virtual void replaceUsesWithIf(Value from, Value to, function_ref< bool(OpOperand &)> functor, bool *allUsesReplaced=nullptr)
Find uses of from and replace them with to if the functor returns true.
void modifyOpInPlace(Operation *root, CallableT &&callable)
This method is a utility wrapper around an in-place modification of an operation.
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition Value.h:96
use_range getUses() const
Returns a range of all uses, which is useful for iterating over all uses.
Definition Value.h:188
unsigned getNumUses() const
This method computes the number of uses of this Value.
Definition Value.cpp:52
Operation * getDefiningOp() const
If this value is the result of an operation, return the operation that defines it.
Definition Value.cpp:18
FailureOr< ContractionDimensions > inferContractionDims(LinalgOp linalgOp)
Find at least 2 parallel (m and n) and 1 reduction (k) dimension candidates that form a matmul subcom...
LogicalResult shuffleBeforeWriteLikeOp(PatternRewriter &rewriter, Operation *opA, Operation *opB, int64_t nonUnitDimAcc, VectorType accTy)
Definition X86Utils.cpp:292
Operation * traceToVectorWriteLikeUserOperation(Value v)
Definition X86Utils.cpp:194
static FailureOr< SmallVector< mlir::utils::IteratorType > > inferIteratorsFromOutMap(AffineMap map)
Definition X86Utils.cpp:29
bool isInVnniLayout(Operation *op, llvm::ArrayRef< AffineMap > indexingMaps, std::optional< unsigned > blockingFactor=std::nullopt)
Definition X86Utils.cpp:42
Operation * traceToVectorReadLikeParentOperation(Value v)
Definition X86Utils.cpp:154
ShuffleMasks getShuffleMasks(int64_t nonUnitDimAcc, bool isInt8Avx2)
Definition X86Utils.cpp:119
LogicalResult shuffleAfterReadLikeOp(PatternRewriter &rewriter, Operation *opA, Operation *opB, vector::ContractionOp contractA, vector::ContractionOp contractB, int64_t nonUnitDimAcc, VectorType accTy)
Definition X86Utils.cpp:242
bool validatePairVectorContract(vector::ContractionOp contractOp, vector::ContractionOp pairContOp, bool rhsHasMultipleNonUnitDims, int64_t nonUnitDimValue)
Definition X86Utils.cpp:352
Include the generated interface declarations.
std::optional< int64_t > getConstantIntValue(OpFoldResult ofr)
If ofr is a constant integer or an IntegerAttr, return the integer.
llvm::ArrayRef< int64_t > maskHi
Definition X86Utils.cpp:116
llvm::ArrayRef< int64_t > maskLo
Definition X86Utils.cpp:115