#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"

#define GEN_PASS_DEF_XEGPUSUBGROUPDISTRIBUTE
#include "mlir/Dialect/XeGPU/Transforms/Passes.h.inc"

#define DEBUG_TYPE "xegpu-subgroup-distribute"
#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")

/// Attribute name used to mark unrealized conversion casts that reconcile
/// SIMD and SIMT tensor descriptor types; these casts are resolved at the end
/// of the pass.
static const char *const resolveSIMTTypeMismatch =
    "resolve_simt_type_mismatch";

static constexpr unsigned regularPatternBenefit = 1;
static constexpr unsigned highPatternBenefit = 2;
static FailureOr<VectorType>
getDistVecTypeBasedOnLaneLayout(xegpu::DistributeLayoutAttr layout,
                                VectorType originalType) {
  assert((isa<xegpu::LayoutAttr>(layout) || isa<xegpu::SliceAttr>(layout)) &&
         "Expecting a valid layout.");
  SmallVector<int64_t> effectiveLaneLayout =
      layout.getEffectiveLaneLayoutAsInt();
  assert(static_cast<size_t>(originalType.getRank()) >=
             effectiveLaneLayout.size() &&
         "Rank of the original vector type should be greater or equal to the "
         "size of the lane layout to distribute the vector type.");
  // Only the trailing dimensions covered by the lane layout are distributed.
  SmallVector<int64_t> distributedShape =
      llvm::to_vector(originalType.getShape());
  unsigned distributionStart =
      originalType.getRank() - effectiveLaneLayout.size();
  for (auto [i, dim] : llvm::enumerate(originalType.getShape())) {
    if (i < distributionStart)
      continue;
    // Each distributed dimension must be divided evenly across lanes.
    if (dim % effectiveLaneLayout[i - distributionStart] != 0)
      return failure();
    distributedShape[i] = dim / effectiveLaneLayout[i - distributionStart];
  }
  return VectorType::get(distributedShape, originalType.getElementType());
}
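// Worked example: with lane_layout = [1, 16], a warp-level vector<16x16xf16>
// distributes to vector<16x1xf16> per lane (16 / 1 = 16, 16 / 16 = 1).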
template <typename T>
static Value resolveDistributedTy(Value orig, T expected,
                                  PatternRewriter &rewriter) {
  // No mismatch, nothing to do.
  if (orig.getType() == expected)
    return orig;
  // Vector types are reconciled with a shape cast.
  if (isa<VectorType>(orig.getType())) {
    auto castOp =
        vector::ShapeCastOp::create(rewriter, orig.getLoc(), expected, orig);
    return castOp.getResult();
  }
  // Tensor descriptor types are reconciled with an unrealized conversion cast
  // that is marked (using the constant defined above) and resolved at the end
  // of the pass.
  if (isa<xegpu::TensorDescType>(orig.getType())) {
    auto castOp = UnrealizedConversionCastOp::create(rewriter, orig.getLoc(),
                                                     expected, orig);
    castOp->setAttr(resolveSIMTTypeMismatch, rewriter.getUnitAttr());
    return castOp.getResult(0);
  }
  llvm_unreachable("Unsupported type for reconciliation");
}
static bool requirePacked(const xegpu::LayoutAttr layout) {
  if (!layout)
    return false;
  auto laneData = layout.getEffectiveLaneDataAsInt();
  if (laneData.size() != 2)
    return false;
  return laneData[0] != 1;
}
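// Example: lane_data = [2, 1] (two rows owned per lane) requires a packed
// (vnni) load; lane_data = [1, 2] does not.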
static bool requireTranspose(const xegpu::LayoutAttr layout,
                             const uArch *uArch) {
  if (!layout)
    return false;
  auto laneLayout = layout.getEffectiveLaneLayoutAsInt();
  if (laneLayout.size() != 2)
    return false;
  // A [subgroupSize, 1] lane layout distributes along columns and therefore
  // requires a transposed load. (Final check reconstructed in this listing.)
  return laneLayout[0] == uArch->getSubgroupSize() && laneLayout[1] == 1;
}
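/// Move the body of a gpu.func into a gpu.warp_execute_on_lane_0 region so
/// that the distribution patterns below can run. (The struct name and the
/// head of matchAndRewrite are reconstructed here; this is the pattern
/// registered by populateXeGPUMoveFuncBodyToWarpOpPatterns.)
struct MoveFuncBodyToWarpOp : public OpRewritePattern<gpu::GPUFuncOp> {
  using OpRewritePattern<gpu::GPUFuncOp>::OpRewritePattern;
  LogicalResult matchAndRewrite(gpu::GPUFuncOp gpuFuncOp,
                                PatternRewriter &rewriter) const override {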
    // The target (chip) attribute is required to determine the warp size.
    // (Reconstruction assumption: the warp size comes from the target uArch.)
    std::optional<std::string> chipStr = xegpu::getChipStr(gpuFuncOp);
    if (!chipStr)
      return rewriter.notifyMatchFailure(
          gpuFuncOp,
          "Subgroup distribution requires target attribute attached "
          "to set the warp size");
    const int warpSize = getUArch(chipStr.value())->getSubgroupSize();
    // If the function contains only a bare return, there is nothing to move.
    if (llvm::all_of(gpuFuncOp.getBody().getOps(), [](Operation &op) {
          return isa<gpu::ReturnOp>(op) && !op.getNumOperands();
        }))
      return failure();
    // If the function body is already wrapped in a warp op, skip.
    if (llvm::any_of(gpuFuncOp.getBody().getOps(), [](Operation &op) {
          return isa<gpu::WarpExecuteOnLane0Op>(op);
        }))
      return failure();
    // Create a new function with the same signature and attributes.
    SmallVector<Type> workgroupAttributionsTypes =
        llvm::map_to_vector(gpuFuncOp.getWorkgroupAttributions(),
                            [](BlockArgument arg) { return arg.getType(); });
    SmallVector<Type> privateAttributionsTypes =
        llvm::map_to_vector(gpuFuncOp.getPrivateAttributions(),
                            [](BlockArgument arg) { return arg.getType(); });
    auto newGpuFunc = gpu::GPUFuncOp::create(
        rewriter, gpuFuncOp.getLoc(), gpuFuncOp.getName(),
        gpuFuncOp.getFunctionType(), workgroupAttributionsTypes,
        privateAttributionsTypes);
    newGpuFunc->setAttrs(gpuFuncOp->getAttrs());
    // Create a warp op spanning the whole function body.
    rewriter.setInsertionPointToEnd(&newGpuFunc.getBody().front());
    auto laneId = gpu::LaneIdOp::create(rewriter, newGpuFunc.getLoc(),
                                        /*upperBound=*/mlir::IntegerAttr());
    ArrayRef<Type> gpuFuncResultType = gpuFuncOp.getFunctionType().getResults();
    auto warpOp = gpu::WarpExecuteOnLane0Op::create(
        rewriter, laneId.getLoc(), gpuFuncResultType, laneId, warpSize,
        newGpuFunc.getArguments(), newGpuFunc.getArgumentTypes());
    Block &warpBodyBlock = warpOp.getBodyRegion().front();
    // Replace the gpu.return of the original function with a gpu.yield.
    auto origReturnOp =
        cast<gpu::ReturnOp>(gpuFuncOp.getBlocks().back().getTerminator());
    rewriter.setInsertionPointAfter(origReturnOp);
    gpu::YieldOp::create(rewriter, origReturnOp.getLoc(),
                         origReturnOp.getOperands());
    rewriter.eraseOp(origReturnOp);
    // Move the original function body into the warp op region.
    rewriter.inlineRegionBefore(gpuFuncOp.getBody(), warpOp.getBodyRegion(),
                                warpOp.getBodyRegion().begin());
    rewriter.eraseBlock(&warpBodyBlock);
    // Return the warp op results from the new function.
    rewriter.setInsertionPointToEnd(&newGpuFunc.getBody().front());
    gpu::ReturnOp::create(rewriter, newGpuFunc.getLoc(), warpOp.getResults());
    rewriter.replaceOp(gpuFuncOp, newGpuFunc);
    return success();
  }
};
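/// Distribute a create_nd_tdesc op feeding a result of the enclosing warp op:
/// the descriptor is recreated outside the warp op with its layout dropped,
/// then reconciled with the warp op result type.
struct CreateNdDescDistribution final : public gpu::WarpDistributionPattern {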
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    OpOperand *operand =
        getWarpResult(warpOp, llvm::IsaPred<xegpu::CreateNdDescOp>);
    if (!operand)
      return rewriter.notifyMatchFailure(
          warpOp, "warp result is not a xegpu::CreateNdDesc op");
    auto descOp = operand->get().getDefiningOp<xegpu::CreateNdDescOp>();
    unsigned operandIdx = operand->getOperandNumber();

    xegpu::LayoutAttr layout = descOp.getType().getLayoutAttr();
    if (!layout)
      return rewriter.notifyMatchFailure(
          descOp, "the tensor descriptor lacks layout attribute");
    if (descOp.getMixedOffsets().size())
      return rewriter.notifyMatchFailure(
          descOp, "xegpu::CreateNdDescOp must not have offsets");

    SmallVector<size_t> newRetIndices;
    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, descOp->getOperands(), descOp.getOperandTypes(),
        newRetIndices);
    SmallVector<Value> newDescOperands = llvm::map_to_vector(
        newRetIndices, [&](size_t i) { return newWarpOp.getResult(i); });
    // Recreate the descriptor outside the warp op with the layout dropped.
    rewriter.setInsertionPointAfter(newWarpOp);
    xegpu::TensorDescType distributedTensorDescTy =
        descOp.getType().dropLayouts();
    Value newDescOp = xegpu::CreateNdDescOp::create(
        rewriter, newWarpOp.getLoc(), distributedTensorDescTy, newDescOperands,
        descOp->getAttrs());

    Value distributedVal = newWarpOp.getResult(operandIdx);
    // Resolve the distributed type with the original type.
    Value typeResolved =
        resolveDistributedTy(newDescOp, distributedVal.getType(), rewriter);
    rewriter.replaceAllUsesWith(distributedVal, typeResolved);
    return success();
  }
};
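/// Distribute a store_nd op found at the end of the warp op region: the
/// payload, tensor descriptor, and offsets are yielded from the warp op with
/// their distributed types, and the store is recreated outside.
struct StoreNdDistribution final : public gpu::WarpDistributionPattern {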
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    gpu::YieldOp yield = warpOp.getTerminator();
    Operation *lastNode = yield->getPrevNode();
    auto storeOp = dyn_cast_or_null<xegpu::StoreNdOp>(lastNode);
    if (!storeOp)
      return failure();
    SmallVector<OpFoldResult> offsets = storeOp.getMixedOffsets();
    if (offsets.empty())
      return rewriter.notifyMatchFailure(storeOp,
                                         "the store op must have offsets");
    SmallVector<Value> offsetsAsValues =
        vector::getAsValues(rewriter, storeOp.getLoc(), offsets);
    SmallVector<Type> offsetTypes = llvm::to_vector(
        llvm::map_range(offsetsAsValues, [](Value v) { return v.getType(); }));
    xegpu::TensorDescType tensorDescTy = storeOp.getTensorDescType();
    xegpu::LayoutAttr layout = tensorDescTy.getLayoutAttr();
    if (!layout)
      return rewriter.notifyMatchFailure(
          storeOp, "the source tensor descriptor lacks layout attribute");

    FailureOr<VectorType> distributedTypeByWarpOpOrFailure =
        getDistVecTypeBasedOnLaneLayout(layout, storeOp.getValueType());
    if (failed(distributedTypeByWarpOpOrFailure))
      return rewriter.notifyMatchFailure(storeOp,
                                         "Failed to distribute the type");
    VectorType distributedTypeByWarpOp =
        distributedTypeByWarpOpOrFailure.value();

    SmallVector<size_t> newRetIndices;
    SmallVector<Value> newYieldedValues = {storeOp.getValue(),
                                           storeOp.getTensorDesc()};
    SmallVector<Type> newYieldedTypes = {distributedTypeByWarpOp, tensorDescTy};
    newYieldedValues.append(offsetsAsValues.begin(), offsetsAsValues.end());
    newYieldedTypes.append(offsetTypes.begin(), offsetTypes.end());
    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, newYieldedValues, newYieldedTypes, newRetIndices);

    // Create a new store op outside the warp op with distributed operands.
    rewriter.setInsertionPointAfter(newWarpOp);
    SmallVector<Value> newStoreOperands;
    FailureOr<VectorType> storeNdDistributedValueTyOrFailure =
        xegpu::getDistributedVectorType(storeOp.getTensorDescType());
    if (failed(storeNdDistributedValueTyOrFailure))
      return rewriter.notifyMatchFailure(
          storeOp, "Failed to get distributed vector type for the store op");
    newStoreOperands.push_back(resolveDistributedTy(
        newWarpOp.getResult(newRetIndices[0]),
        storeNdDistributedValueTyOrFailure.value(), rewriter));
    // The tensor descriptor is distributed by dropping its layout.
    xegpu::TensorDescType distributedTensorDescTy =
        storeOp.getTensorDescType().dropLayouts();
    newStoreOperands.push_back(
        resolveDistributedTy(newWarpOp.getResult(newRetIndices[1]),
                             distributedTensorDescTy, rewriter));
    // Collect the distributed offsets.
    for (size_t i = 2; i < newRetIndices.size(); ++i)
      newStoreOperands.push_back(newWarpOp.getResult(newRetIndices[i]));

    xegpu::StoreNdOp::create(rewriter, newWarpOp.getLoc(), TypeRange{},
                             newStoreOperands, storeOp->getAttrs());
    rewriter.eraseOp(storeOp);
    return success();
  }
};
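/// Distribute a load_nd op that produces the last warp op result. The load is
/// recreated outside the warp op with the SIMT-level vector type; the packed
/// and transpose flags are derived from the layout and the target uArch.
struct LoadNdDistribution final : public gpu::WarpDistributionPattern {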
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    OpOperand *operand = getWarpResult(warpOp, [&](Operation *op) {
      if (!isa<xegpu::LoadNdOp>(op))
        return false;
      // The load op must be the last operation before the terminator.
      gpu::YieldOp yield = warpOp.getTerminator();
      return yield->getPrevNode() == op;
    });
    if (!operand)
      return rewriter.notifyMatchFailure(
          warpOp, "warp result is not a xegpu::LoadNd op");
    auto loadOp = operand->get().getDefiningOp<xegpu::LoadNdOp>();
    unsigned operandIdx = operand->getOperandNumber();
    // The target uArch decides the packed/transpose settings below.
    std::optional<std::string> chipStr = xegpu::getChipStr(loadOp);
    const uArch *uArch = chipStr ? getUArch(chipStr.value()) : nullptr;
    if (!uArch)
      return rewriter.notifyMatchFailure(
          loadOp, "xegpu::LoadNdOp require target attribute attached to "
                  "determine transpose requirement");
    SmallVector<OpFoldResult> offsets = loadOp.getMixedOffsets();
    if (offsets.empty())
      return rewriter.notifyMatchFailure(loadOp,
                                         "the load op must have offsets");
    SmallVector<Value> offsetsAsValues =
        vector::getAsValues(rewriter, loadOp.getLoc(), offsets);
    SmallVector<Type> offsetTypes = llvm::to_vector(
        llvm::map_range(offsetsAsValues, [](Value v) { return v.getType(); }));

    xegpu::TensorDescType tensorDescTy = loadOp.getTensorDescType();
    xegpu::LayoutAttr layout = tensorDescTy.getLayoutAttr();
    if (!layout)
      return rewriter.notifyMatchFailure(
          loadOp, "the source tensor descriptor lacks layout attribute");

    VectorType distributedTypeByWarpOp =
        cast<VectorType>(warpOp.getResult(operandIdx).getType());

    SmallVector<size_t> newRetIndices;
    SmallVector<Value> newYieldedValues = {loadOp.getTensorDesc()};
    SmallVector<Type> newYieldedTypes = {tensorDescTy};
    newYieldedValues.append(offsetsAsValues.begin(), offsetsAsValues.end());
    newYieldedTypes.append(offsetTypes.begin(), offsetTypes.end());
    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, newYieldedValues, newYieldedTypes, newRetIndices);

    rewriter.setInsertionPointAfter(newWarpOp);
    FailureOr<VectorType> loadNdDistValueTyOrFailure =
        xegpu::getDistributedVectorType(loadOp.getTensorDescType());
    if (failed(loadNdDistValueTyOrFailure))
      return rewriter.notifyMatchFailure(
          loadOp, "Failed to get distributed vector type for the load op");
    xegpu::TensorDescType distributedTensorDescTy =
        loadOp.getTensorDescType().dropLayouts();
    SmallVector<Value> newLoadOperands{
        resolveDistributedTy(newWarpOp.getResult(newRetIndices[0]),
                             distributedTensorDescTy, rewriter)};
    // Collect the distributed offsets.
    for (size_t i = 1; i < newRetIndices.size(); ++i)
      newLoadOperands.push_back(newWarpOp.getResult(newRetIndices[i]));
    auto newLoadOp = xegpu::LoadNdOp::create(
        rewriter, newWarpOp.getLoc(), loadNdDistValueTyOrFailure.value(),
        newLoadOperands, loadOp->getAttrs());
    // Set the packed and transpose attributes as required by the layout.
    newLoadOp.setPacked(requirePacked(layout));
    if (requireTranspose(layout, uArch))
      newLoadOp.setTranspose(
          DenseI64ArrayAttr::get(rewriter.getContext(), {1, 0}));
    Value distributedVal = newWarpOp.getResult(operandIdx);
    Value tyResolvedVal = resolveDistributedTy(
        newLoadOp.getResult(), distributedTypeByWarpOp, rewriter);
    rewriter.replaceAllUsesWith(distributedVal, tyResolvedVal);
    return success();
  }
};
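/// Distribute a dpas op: the A and B operands (and the optional accumulator)
/// are yielded from the warp op with their distributed types, and the dpas is
/// recreated outside with the SIMT-level operand types.
struct DpasDistribution final : public gpu::WarpDistributionPattern {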
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    OpOperand *operand = getWarpResult(warpOp, llvm::IsaPred<xegpu::DpasOp>);
    if (!operand)
      return rewriter.notifyMatchFailure(
          warpOp, "warp result is not a xegpu::Dpas op");
    auto dpasOp = operand->get().getDefiningOp<xegpu::DpasOp>();
    unsigned operandIdx = operand->getOperandNumber();
    std::string layoutAName = xegpu::getLayoutName(dpasOp->getOpOperand(0));
    std::string layoutBName = xegpu::getLayoutName(dpasOp->getOpOperand(1));
    std::string layoutCName = xegpu::getLayoutName(dpasOp->getOpResult(0));
    xegpu::LayoutAttr layoutA =
        dpasOp->getAttrOfType<xegpu::LayoutAttr>(layoutAName);
    xegpu::LayoutAttr layoutB =
        dpasOp->getAttrOfType<xegpu::LayoutAttr>(layoutBName);
    xegpu::LayoutAttr layoutOut =
        dpasOp->getAttrOfType<xegpu::LayoutAttr>(layoutCName);
    if (!layoutA || !layoutB || !layoutOut)
      return rewriter.notifyMatchFailure(
          dpasOp,
          "the xegpu::Dpas op lacks layout attribute for A, B or output");

    FailureOr<VectorType> distLhsTypeByWarpOpOrFailure =
        getDistVecTypeBasedOnLaneLayout(layoutA, dpasOp.getLhsType());
    FailureOr<VectorType> distRhsTypeByWarpOpOrFailure =
        getDistVecTypeBasedOnLaneLayout(layoutB, dpasOp.getRhsType());
    FailureOr<VectorType> distResultTypeByWarpOpOrFailure =
        getDistVecTypeBasedOnLaneLayout(layoutOut, dpasOp.getResultType());
    if (failed(distLhsTypeByWarpOpOrFailure) ||
        failed(distRhsTypeByWarpOpOrFailure) ||
        failed(distResultTypeByWarpOpOrFailure))
      return rewriter.notifyMatchFailure(
          dpasOp,
          "Failed to distribute the A, B or output types in xegpu::Dpas op");

    llvm::SmallVector<Value, 3> newYieldValues{dpasOp.getLhs(),
                                               dpasOp.getRhs()};
    llvm::SmallVector<Type, 3> newYieldTypes{
        distLhsTypeByWarpOpOrFailure.value(),
        distRhsTypeByWarpOpOrFailure.value()};
    // The accumulator operand is optional.
    if (dpasOp.getAcc()) {
      newYieldValues.push_back(dpasOp.getAcc());
      newYieldTypes.push_back(distResultTypeByWarpOpOrFailure.value());
    }
    SmallVector<size_t> newRetIndices;
    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, newYieldValues, newYieldTypes, newRetIndices);

    // Expected SIMT-level operand types. (Reconstruction assumption: the
    // layout-based overload of xegpu::getDistributedVectorType.)
    FailureOr<VectorType> expectedDistLhsTyOrFailure =
        xegpu::getDistributedVectorType(dpasOp.getLhsType(), layoutA);
    FailureOr<VectorType> expectedDistRhsTyOrFailure =
        xegpu::getDistributedVectorType(dpasOp.getRhsType(), layoutB);
    FailureOr<VectorType> expectedDistResultTyOrFailure =
        xegpu::getDistributedVectorType(dpasOp.getResultType(), layoutOut);
    if (failed(expectedDistLhsTyOrFailure) ||
        failed(expectedDistRhsTyOrFailure) ||
        failed(expectedDistResultTyOrFailure))
      return rewriter.notifyMatchFailure(
          dpasOp,
          "Failed to get distributed vector type for the dpas operands.");

    // Create a new dpas op outside the warp op.
    rewriter.setInsertionPointAfter(newWarpOp);
    SmallVector<Value> newDpasOperands;
    SmallVector<VectorType> newDpasOperandExpectedTypes;
    // Resolve the distributed types with the original types.
    newDpasOperandExpectedTypes.push_back(expectedDistLhsTyOrFailure.value());
    newDpasOperandExpectedTypes.push_back(expectedDistRhsTyOrFailure.value());
    VectorType distributedResultTy = expectedDistResultTyOrFailure.value();
    if (dpasOp.getAcc())
      newDpasOperandExpectedTypes.push_back(distributedResultTy);

    for (unsigned i = 0; i < newRetIndices.size(); i++) {
      newDpasOperands.push_back(
          resolveDistributedTy(newWarpOp.getResult(newRetIndices[i]),
                               newDpasOperandExpectedTypes[i], rewriter));
    }
    auto newDpasOp = xegpu::DpasOp::create(rewriter, newWarpOp->getLoc(),
                                           distributedResultTy, newDpasOperands,
                                           dpasOp->getAttrs());
    Value distributedVal = newWarpOp.getResult(operandIdx);
    // Resolve the output type.
    Value typeResolved =
        resolveDistributedTy(newDpasOp.getResult(),
                             distResultTypeByWarpOpOrFailure.value(), rewriter);
    rewriter.replaceAllUsesWith(distributedVal, typeResolved);
    return success();
  }
};
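/// Distribute a prefetch_nd op at the end of the warp op region: only the
/// tensor descriptor (with its layout dropped) and the offsets are needed to
/// recreate the prefetch outside the warp op.
struct PrefetchNdDistribution final : public gpu::WarpDistributionPattern {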
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    gpu::YieldOp yield = warpOp.getTerminator();
    Operation *lastNode = yield->getPrevNode();
    auto prefetchOp = dyn_cast_or_null<xegpu::PrefetchNdOp>(lastNode);
    if (!prefetchOp)
      return failure();
    SmallVector<OpFoldResult> offsets = prefetchOp.getMixedOffsets();
    if (offsets.empty())
      return rewriter.notifyMatchFailure(prefetchOp,
                                         "the prefetch op must have offsets");
    SmallVector<Value> offsetsAsValues =
        vector::getAsValues(rewriter, prefetchOp.getLoc(), offsets);
    SmallVector<Type> offsetTypes = llvm::to_vector(
        llvm::map_range(offsetsAsValues, [](Value v) { return v.getType(); }));
    xegpu::LayoutAttr layout = prefetchOp.getTensorDescType().getLayoutAttr();
    if (!layout)
      return rewriter.notifyMatchFailure(
          prefetchOp, "the source tensor descriptor lacks layout attribute");

    SmallVector<Value> newYieldValues = {prefetchOp.getTensorDesc()};
    SmallVector<Type> newYieldTypes = {prefetchOp.getTensorDescType()};
    newYieldValues.append(offsetsAsValues.begin(), offsetsAsValues.end());
    newYieldTypes.append(offsetTypes.begin(), offsetTypes.end());
    SmallVector<size_t> newRetIndices;
    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, newYieldValues, newYieldTypes, newRetIndices);

    xegpu::TensorDescType newTensorDescTy =
        prefetchOp.getTensorDescType().dropLayouts();
    rewriter.setInsertionPointAfter(newWarpOp);
    SmallVector<Value> newPrefetchOperands = {resolveDistributedTy(
        newWarpOp.getResult(newRetIndices[0]), newTensorDescTy, rewriter)};
    // Collect the distributed offsets.
    for (size_t i = 1; i < newRetIndices.size(); ++i)
      newPrefetchOperands.push_back(newWarpOp.getResult(newRetIndices[i]));
    xegpu::PrefetchNdOp::create(rewriter, newWarpOp.getLoc(), TypeRange{},
                                newPrefetchOperands, prefetchOp->getAttrs());
    rewriter.eraseOp(prefetchOp);
    return success();
  }
};
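/// Hoist a gpu.barrier found at the end of the warp op region out of it; the
/// barrier is uniform across lanes and needs no distribution.
struct GpuBarrierDistribution final : public gpu::WarpDistributionPattern {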
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    gpu::YieldOp yield = warpOp.getTerminator();
    Operation *lastNode = yield->getPrevNode();
    // The last node must be a gpu::BarrierOp.
    auto barrierOp = dyn_cast_or_null<gpu::BarrierOp>(lastNode);
    if (!barrierOp)
      return failure();
    // Move the barrier op outside of the warp op.
    rewriter.setInsertionPointAfter(warpOp);
    gpu::BarrierOp::create(rewriter, barrierOp.getLoc(),
                           barrierOp->getResultTypes(),
                           barrierOp->getOperands(), barrierOp->getAttrs());
    rewriter.eraseOp(barrierOp);
    return success();
  }
};
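/// Distribute a scattered store op (registered below as StoreDistribution):
/// the payload, offsets, and mask are distributed by their layout attributes,
/// and the distributed payload is additionally flattened to 1D.
struct StoreDistribution final : public gpu::WarpDistributionPattern {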
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    Operation *lastNode = warpOp.getTerminator()->getPrevNode();
    auto storeScatterOp = dyn_cast_or_null<xegpu::StoreScatterOp>(lastNode);
    if (!storeScatterOp)
      return failure();
    auto offsets = storeScatterOp.getOffsets();
    if (!offsets || !isa<VectorType>(offsets.getType()))
      return rewriter.notifyMatchFailure(
          storeScatterOp, "Store op must have a vector of offsets argument");
    VectorType offsetsTy = cast<VectorType>(offsets.getType());
    VectorType maskTy = cast<VectorType>(storeScatterOp.getMask().getType());
    if (offsetsTy.getRank() != 1 || maskTy.getRank() != 1)
      return rewriter.notifyMatchFailure(storeScatterOp,
                                         "Expected 1D offsets and mask vector");
    VectorType storeVecTy = cast<VectorType>(storeScatterOp.getValueType());
    if (storeVecTy.getRank() > 2)
      return rewriter.notifyMatchFailure(
          storeScatterOp, "Expected at most 2D result at SG level");

    std::string layoutPayloadName =
        xegpu::getLayoutName(storeScatterOp->getOpOperand(0));
    std::string layoutOffsetsName =
        xegpu::getLayoutName(storeScatterOp->getOpOperand(2));
    std::string layoutMaskName =
        xegpu::getLayoutName(storeScatterOp->getOpOperand(3));

    xegpu::LayoutAttr layoutPayload =
        storeScatterOp->getAttrOfType<xegpu::LayoutAttr>(layoutPayloadName);
    xegpu::LayoutAttr layoutOffsets =
        storeScatterOp->getAttrOfType<xegpu::LayoutAttr>(layoutOffsetsName);
    xegpu::LayoutAttr layoutMask =
        storeScatterOp->getAttrOfType<xegpu::LayoutAttr>(layoutMaskName);

    FailureOr<VectorType> distStoreVecByWarpOpOrFailure =
        getDistVecTypeBasedOnLaneLayout(layoutPayload, storeVecTy);
    FailureOr<VectorType> distOffsetsByWarpOpOrFailure =
        getDistVecTypeBasedOnLaneLayout(layoutOffsets, offsetsTy);
    FailureOr<VectorType> distMaskByWarpOpOrFailure =
        getDistVecTypeBasedOnLaneLayout(layoutMask, maskTy);
    if (failed(distStoreVecByWarpOpOrFailure) ||
        failed(distOffsetsByWarpOpOrFailure) ||
        failed(distMaskByWarpOpOrFailure)) {
      return rewriter.notifyMatchFailure(
          storeScatterOp,
          "Some vector operands have no layouts, using defaults instead.");
    }
    VectorType distPayloadTyByWarpOp = distStoreVecByWarpOpOrFailure.value();
    // The distributed payload is flattened to 1D with the same element count.
    VectorType expectedPayloadTy =
        VectorType::get({distPayloadTyByWarpOp.getNumElements()},
                        distPayloadTyByWarpOp.getElementType());

    SmallVector<size_t> newRetIndices;
    SmallVector<Value> operands = storeScatterOp->getOperands();
    SmallVector<Type> operandTypesToYield = {
        distPayloadTyByWarpOp, operands[1].getType(),
        distOffsetsByWarpOpOrFailure.value(),
        distMaskByWarpOpOrFailure.value()};

    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, operands, operandTypesToYield, newRetIndices);
    SmallVector<Value> newStoreScatterOpOperands = llvm::map_to_vector(
        newRetIndices, [&](size_t idx) { return newWarpOp.getResult(idx); });

    rewriter.setInsertionPointAfter(newWarpOp);
    newStoreScatterOpOperands[0] = resolveDistributedTy(
        newStoreScatterOpOperands[0], expectedPayloadTy, rewriter);
    xegpu::StoreScatterOp newOp = xegpu::StoreScatterOp::create(
        rewriter, newWarpOp.getLoc(), TypeRange{}, newStoreScatterOpOperands,
        storeScatterOp->getAttrs());
    // Drop the now-stale layout attributes from the new op. (Reconstruction
    // assumption for the elided line.)
    xegpu::removeLayoutAttrs(newOp);
    rewriter.eraseOp(storeScatterOp);
    return success();
  }
};
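/// Helper for the load/store matrix patterns below: combine the layout's
/// per-lane coordinates with the original offsets. The signature is
/// reconstructed from the two call sites; treat it as a sketch.
static SmallVector<Value> computeDistributedCoordinatesForMatrixOp(
    PatternRewriter &rewriter, Location loc,
    xegpu::DistributeLayoutAttr layout, Value laneId,
    ArrayRef<int64_t> payloadShape, ValueRange origOffsets) {
  SmallVector<Value> newCoords;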
  auto maybeCoords =
      layout.computeDistributedCoords(rewriter, loc, laneId, payloadShape);
  if (failed(maybeCoords))
    return newCoords;
  assert(maybeCoords.value().size() == 1 &&
         "Expected one set of distributed offsets");
  // Add the per-lane coordinates to the original offsets, right-aligned.
  SmallVector<OpFoldResult> ofrVec = xegpu::addWithRightAligned(
      rewriter, loc, getAsOpFoldResult(maybeCoords.value()[0]),
      getAsOpFoldResult(origOffsets));
  newCoords = llvm::map_to_vector(ofrVec, llvm::CastTo<Value>);
  return newCoords;
}
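/// Distribute a load_matrix op at the end of the warp op region: the payload
/// is distributed by its layout attribute and, unless the op is a subgroup
/// block IO, the offsets are adjusted to per-lane coordinates.
struct LoadMatrixDistribution final : public gpu::WarpDistributionPattern {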
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    gpu::YieldOp yield = warpOp.getTerminator();
    Operation *lastNode = yield->getPrevNode();
    auto matrixOp = dyn_cast_or_null<xegpu::LoadMatrixOp>(lastNode);
    if (!matrixOp)
      return failure();
    OpOperand *producedByLastLoad = getWarpResult(warpOp, [&](Operation *op) {
      // Check that the yield operand is produced by this (last) load op.
      return isa<xegpu::LoadMatrixOp>(op) && matrixOp == op;
    });
    if (!producedByLastLoad)
      return rewriter.notifyMatchFailure(
          warpOp, "The last op is not xegpu::LoadMatrixOp");
    unsigned operandIdx = producedByLastLoad->getOperandNumber();

    VectorType sgPayloadTy =
        dyn_cast<VectorType>(matrixOp.getResult().getType());
    VectorType warpResultTy =
        cast<VectorType>(warpOp.getResult(operandIdx).getType());
    if (!sgPayloadTy)
      return rewriter.notifyMatchFailure(
          matrixOp, "the matrix op payload must be a vector type");

    auto loc = matrixOp.getLoc();
    auto offsets = matrixOp.getMixedOffsets();
    if (offsets.empty())
      return rewriter.notifyMatchFailure(matrixOp,
                                         "the load op must have offsets");
    SmallVector<Value> offsetsAsValues =
        vector::getAsValues(rewriter, loc, offsets);

    auto layout = matrixOp.getLayoutAttr();
    if (!layout)
      return rewriter.notifyMatchFailure(
          matrixOp, "the matrix operation lacks layout attribute");

    FailureOr<VectorType> distPayloadByWarpOpOrFailure =
        getDistVecTypeBasedOnLaneLayout(layout, sgPayloadTy);
    if (failed(distPayloadByWarpOpOrFailure))
      return rewriter.notifyMatchFailure(
          matrixOp, "Failed to distribute matrix op payload based on layout.");

    SmallVector<Value> operands = {matrixOp.getMemDesc()};
    const unsigned offsetsStartIdx = operands.size();
    operands.append(offsetsAsValues);

    SmallVector<Type> operandTypes = llvm::to_vector(
        llvm::map_range(operands, [](Value v) { return v.getType(); }));

    SmallVector<size_t> newRetIndices;
    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, operands, operandTypes, newRetIndices);
    SmallVector<Value> newOperands = llvm::map_to_vector(
        newRetIndices, [&](size_t idx) { return newWarpOp.getResult(idx); });

    rewriter.setInsertionPointAfter(newWarpOp);
    SmallVector<int64_t> newConstOffsets{matrixOp.getConstOffsets()};
    std::fill(newConstOffsets.begin(), newConstOffsets.end(),
              ShapedType::kDynamic);
    DenseI64ArrayAttr newConstOffsetsAttr =
        rewriter.getDenseI64ArrayAttr(newConstOffsets);
    ValueRange currentOffsets =
        ValueRange(newOperands).drop_front(offsetsStartIdx);

    SmallVector<Value> newCoords = currentOffsets;
    // Unless the op performs a subgroup block IO, compute per-lane coords.
    if (!matrixOp.getSubgroupBlockIoAttr()) {
      newCoords = computeDistributedCoordinatesForMatrixOp(
          rewriter, loc, layout, newWarpOp.getLaneid(), sgPayloadTy.getShape(),
          currentOffsets);
    }
    xegpu::LoadMatrixOp newOp = xegpu::LoadMatrixOp::create(
        rewriter, newWarpOp.getLoc(), *distPayloadByWarpOpOrFailure,
        newOperands[0], ValueRange(newCoords), newConstOffsetsAttr,
        matrixOp.getSubgroupBlockIoAttr(), xegpu::DistributeLayoutAttr{});
    // Resolve the distributed type with the warp op result type.
    rewriter.replaceAllUsesWith(
        newWarpOp.getResult(operandIdx),
        resolveDistributedTy(newOp.getResult(), warpResultTy, rewriter));
    return success();
  }
};
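/// Distribute a store_matrix op at the end of the warp op region; the payload
/// is yielded with its distributed type and, unless the op is a subgroup
/// block IO, the offsets are adjusted to per-lane coordinates.
struct StoreMatrixDistribution final : public gpu::WarpDistributionPattern {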
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    gpu::YieldOp yield = warpOp.getTerminator();
    Operation *lastNode = yield->getPrevNode();
    auto matrixOp = dyn_cast_or_null<xegpu::StoreMatrixOp>(lastNode);
    if (!matrixOp)
      return failure();

    VectorType sgPayloadTy = dyn_cast<VectorType>(matrixOp.getData().getType());
    if (!sgPayloadTy)
      return rewriter.notifyMatchFailure(
          matrixOp, "the matrix op payload must be a vector type");

    auto loc = matrixOp.getLoc();
    auto offsets = matrixOp.getMixedOffsets();
    if (offsets.empty())
      return rewriter.notifyMatchFailure(matrixOp,
                                         "the store op must have offsets");
    SmallVector<Value> offsetsAsValues =
        vector::getAsValues(rewriter, loc, offsets);

    auto layout = matrixOp.getLayoutAttr();
    if (!layout)
      return rewriter.notifyMatchFailure(
          matrixOp, "the matrix operation lacks layout attribute");

    FailureOr<VectorType> distPayloadByWarpOpOrFailure =
        getDistVecTypeBasedOnLaneLayout(layout, sgPayloadTy);
    if (failed(distPayloadByWarpOpOrFailure))
      return rewriter.notifyMatchFailure(
          matrixOp, "Failed to distribute matrix op payload based on layout.");

    SmallVector<Value> operands = {matrixOp.getData(), matrixOp.getMemDesc()};
    const unsigned offsetsStartIdx = operands.size();
    operands.append(offsetsAsValues);

    SmallVector<Type> operandTypes = llvm::to_vector(
        llvm::map_range(operands, [](Value v) { return v.getType(); }));
    // The payload is yielded with its distributed type.
    operandTypes[0] = *distPayloadByWarpOpOrFailure;

    SmallVector<size_t> newRetIndices;
    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, operands, operandTypes, newRetIndices);
    SmallVector<Value> newOperands = llvm::map_to_vector(
        newRetIndices, [&](size_t idx) { return newWarpOp.getResult(idx); });

    rewriter.setInsertionPointAfter(newWarpOp);
    SmallVector<int64_t> newConstOffsets{matrixOp.getConstOffsets()};
    std::fill(newConstOffsets.begin(), newConstOffsets.end(),
              ShapedType::kDynamic);
    DenseI64ArrayAttr newConstOffsetsAttr =
        rewriter.getDenseI64ArrayAttr(newConstOffsets);
    ValueRange currentOffsets =
        ValueRange(newOperands).drop_front(offsetsStartIdx);

    SmallVector<Value> newCoords = currentOffsets;
    // Unless the op performs a subgroup block IO, compute per-lane coords.
    if (!matrixOp.getSubgroupBlockIoAttr()) {
      newCoords = computeDistributedCoordinatesForMatrixOp(
          rewriter, loc, layout, newWarpOp.getLaneid(), sgPayloadTy.getShape(),
          currentOffsets);
    }
    xegpu::StoreMatrixOp::create(
        rewriter, loc, TypeRange{}, newOperands[0], newOperands[1],
        ValueRange(newCoords), newConstOffsetsAttr,
        matrixOp.getSubgroupBlockIoAttr(), xegpu::DistributeLayoutAttr{});
    rewriter.eraseOp(matrixOp);
    return success();
  }
};
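/// Distribute a gather load op (registered below as LoadDistribution): the
/// offsets and mask are distributed by their layout attributes, and the
/// distributed result is flattened to a 1D vector.
struct LoadDistribution final : public gpu::WarpDistributionPattern {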
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    OpOperand *producedByLastLoad = getWarpResult(warpOp, [&](Operation *op) {
      // Check that the yield operand is produced by the last load op in the
      // warp op body.
      return isa<xegpu::LoadGatherOp>(op) &&
             warpOp.getTerminator()->getPrevNode() == op;
    });
    if (!producedByLastLoad)
      return rewriter.notifyMatchFailure(
          warpOp, "The last op is not xegpu::LoadGatherOp");
    auto loadGatherOp =
        producedByLastLoad->get().getDefiningOp<xegpu::LoadGatherOp>();

    auto offsets = loadGatherOp.getOffsets();
    if (!offsets || !isa<VectorType>(offsets.getType()) ||
        !isa<VectorType>(loadGatherOp.getMask().getType()))
      return rewriter.notifyMatchFailure(
          loadGatherOp,
          "Load op must have a vector arguments for offsets and mask");
    VectorType offsetsTy = cast<VectorType>(offsets.getType());
    VectorType maskTy = cast<VectorType>(loadGatherOp.getMask().getType());
    if (offsetsTy.getRank() != 1 || maskTy.getRank() != 1)
      return rewriter.notifyMatchFailure(loadGatherOp,
                                         "Expected 1D offsets and mask vector");

    std::string layoutOffsetsName =
        xegpu::getLayoutName(loadGatherOp->getOpOperand(1));
    std::string layoutMaskName =
        xegpu::getLayoutName(loadGatherOp->getOpOperand(2));

    xegpu::LayoutAttr layoutOffsets =
        loadGatherOp->getAttrOfType<xegpu::LayoutAttr>(layoutOffsetsName);
    xegpu::LayoutAttr layoutMask =
        loadGatherOp->getAttrOfType<xegpu::LayoutAttr>(layoutMaskName);

    FailureOr<VectorType> distOffsetsByWarpOpOrFailure =
        getDistVecTypeBasedOnLaneLayout(layoutOffsets, offsetsTy);
    FailureOr<VectorType> distMaskByWarpOpOrFailure =
        getDistVecTypeBasedOnLaneLayout(layoutMask, maskTy);
    if (failed(distOffsetsByWarpOpOrFailure) ||
        failed(distMaskByWarpOpOrFailure)) {
      return rewriter.notifyMatchFailure(
          loadGatherOp,
          "Some vector operands have no layouts, using defaults instead.");
    }

    SmallVector<size_t> newRetIndices;
    SmallVector<Value> operands = loadGatherOp->getOperands();
    SmallVector<Type> operandTypesToYield = {
        operands[0].getType(), distOffsetsByWarpOpOrFailure.value(),
        distMaskByWarpOpOrFailure.value()};

    unsigned operandIdx = producedByLastLoad->getOperandNumber();
    VectorType distResultTy =
        cast<VectorType>(warpOp.getResult(operandIdx).getType());
    // The distributed load is always 1D, with the same element count.
    VectorType loadVecTy = VectorType::get({distResultTy.getNumElements()},
                                           distResultTy.getElementType());

    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, operands, operandTypesToYield, newRetIndices);

    SmallVector<Value> newLoadGatherOperands = llvm::map_to_vector(
        newRetIndices, [&](size_t idx) { return newWarpOp.getResult(idx); });

    rewriter.setInsertionPointAfter(newWarpOp);
    xegpu::LoadGatherOp newOp = xegpu::LoadGatherOp::create(
        rewriter, newWarpOp.getLoc(), loadVecTy, newLoadGatherOperands,
        loadGatherOp->getAttrs());
    // Resolve the distributed type with the original type.
    Value distributedVal = newWarpOp.getResult(operandIdx);
    rewriter.replaceAllUsesWith(
        distributedVal,
        resolveDistributedTy(newOp.getResult(), distResultTy, rewriter));
    return success();
  }
};
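/// Lower a 2D multi-dim reduction to per-slice vector.reduction ops. For
/// example, reducing vector<16x32xf32> along dim 1 extracts 16 rows, reduces
/// each row to a scalar against the matching accumulator element, and inserts
/// it into the result vector. The leading src/acc parameters of the signature
/// are reconstructed from the call sites below.
static Value lowerToVectorReductions(TypedValue<VectorType> src,
                                     TypedValue<VectorType> acc,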
                                     vector::CombiningKind kind,
                                     int64_t reductionDim, Location loc,
                                     PatternRewriter &rewriter) {
  // Expecting a 2D source vector.
  assert(src.getType().getRank() == 2 && "expected a 2D source vector");
  VectorType sourceType = src.getType();
  int64_t sourceH = sourceType.getShape()[0];
  int64_t sourceW = sourceType.getShape()[1];
  int nSlices = (reductionDim == 0) ? sourceW : sourceH;
  // Create a zero-initialized vector to hold the final reduction result.
  TypedAttr zeroAttr = rewriter.getZeroAttr(sourceType.getElementType());
  Value reductionResult = arith::ConstantOp::create(
      rewriter, loc, acc.getType(),
      DenseElementsAttr::get(acc.getType(), zeroAttr));
  // For each slice of the source, extract the slice vector, do a reduction
  // and insert the reduced value back into the result vector.
  for (int i = 0; i < nSlices; ++i) {
    SmallVector<int64_t, 2> sliceOffsets, sliceSizes;
    if (reductionDim == 1) {
      sliceOffsets = {i, 0};
      sliceSizes = {1, sourceW};
    } else {
      sliceOffsets = {0, i};
      sliceSizes = {sourceH, 1};
    }
    vector::ExtractStridedSliceOp extractOp =
        vector::ExtractStridedSliceOp::create(rewriter, loc, src, sliceOffsets,
                                              sliceSizes, {1, 1});
    int64_t nSliceElements = extractOp.getResult().getType().getNumElements();
    vector::ShapeCastOp slice = vector::ShapeCastOp::create(
        rewriter, loc,
        VectorType::get({nSliceElements}, sourceType.getElementType()),
        extractOp.getResult());
    Value accExtract = vector::ExtractOp::create(rewriter, loc, acc, i);
    Value reduction = vector::ReductionOp::create(
        rewriter, loc, kind, slice.getResult(), accExtract);
    reductionResult =
        vector::InsertOp::create(rewriter, loc, reduction, reductionResult, i);
  }
  return reductionResult;
}
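/// Distribute a vector.multi_dim_reduction. Two cases are handled: the
/// reduction dimension is lane-local (each lane reduces its own slices, so
/// the result is distributed), or it spans lanes (the reduction is lowered
/// inside the warp op body and the result is broadcast).
struct VectorMultiReductionDistribution : public gpu::WarpDistributionPattern {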
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    OpOperand *yieldOperand =
        getWarpResult(warpOp, llvm::IsaPred<vector::MultiDimReductionOp>);
    if (!yieldOperand)
      return failure();
    auto reductionOp =
        cast<vector::MultiDimReductionOp>(yieldOperand->get().getDefiningOp());
    unsigned operandIdx = yieldOperand->getOperandNumber();
    VectorType sourceType = reductionOp.getSourceVectorType();
    // Only 2D vectors are supported.
    if (sourceType.getRank() != 2)
      return rewriter.notifyMatchFailure(warpOp,
                                         "Only 2D reductions are supported.");
    ArrayRef<int64_t> reductionDims = reductionOp.getReductionDims();
    // Only a single reduction dimension is supported.
    if (reductionDims.size() != 1)
      return rewriter.notifyMatchFailure(
          warpOp, "Only 1 reduction dimension is supported.");
    int64_t reductionDim = reductionDims[0];
    VectorType distributedResultType =
        cast<VectorType>(warpOp.getResult(operandIdx).getType());
    VectorType resultType = cast<VectorType>(reductionOp.getType());
    xegpu::DistributeLayoutAttr sourceLayout =
        xegpu::getDistributeLayoutAttr(reductionOp.getSource());

    FailureOr<VectorType> sourceDistTypeOrFailure =
        getDistVecTypeBasedOnLaneLayout(sourceLayout, sourceType);
    if (failed(sourceDistTypeOrFailure))
      return rewriter.notifyMatchFailure(
          warpOp, "Failed to distribute the source vector type.");
    VectorType sourceDistType = sourceDistTypeOrFailure.value();
    // Only single-dimension distribution is supported.
    bool dim0Distributed =
        sourceDistType.getShape()[0] != sourceType.getShape()[0];
    bool dim1Distributed =
        sourceDistType.getShape()[1] != sourceType.getShape()[1];
    if (dim0Distributed && dim1Distributed)
      return rewriter.notifyMatchFailure(
          warpOp, "Expecting source to be distributed in a single dimension.");
    int64_t sourceDistDim = dim0Distributed ? 0 : (dim1Distributed ? 1 : -1);
    if (sourceDistDim == -1)
      return rewriter.notifyMatchFailure(
          warpOp, "Expecting a distributed source vector.");
    bool resultDistributed =
        distributedResultType.getNumElements() < resultType.getNumElements();
    // The reduction is lane-local if each lane owns a full slice along the
    // reduction dimension.
    bool isReductionLaneLocal = (sourceDistDim == 0 && reductionDim == 1) ||
                                (sourceDistDim == 1 && reductionDim == 0);
    if (isReductionLaneLocal && !resultDistributed)
      return rewriter.notifyMatchFailure(
          warpOp, "Expecting a distributed result for lane-local reduction.");

    if (!isReductionLaneLocal && resultDistributed)
      return rewriter.notifyMatchFailure(
          warpOp,
          "Expecting a broadcasted result for non-lane-local reduction.");

    // Lane-local case: yield the source and acc from the warp op and lower
    // the reduction outside of it.
    if (isReductionLaneLocal) {
      SmallVector<size_t> newRetIndices;
      auto newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
          rewriter, warpOp, {reductionOp.getSource(), reductionOp.getAcc()},
          {sourceDistType, distributedResultType}, newRetIndices);
      rewriter.setInsertionPointAfter(newWarpOp);
      Value result = lowerToVectorReductions(
          cast<TypedValue<VectorType>>(newWarpOp.getResult(newRetIndices[0])),
          cast<TypedValue<VectorType>>(newWarpOp.getResult(newRetIndices[1])),
          reductionOp.getKind(), reductionDim, reductionOp.getLoc(), rewriter);
      rewriter.replaceAllUsesWith(newWarpOp.getResult(operandIdx), result);
      return success();
    }
    // Non-lane-local case: lower the reduction inside the warp op body; the
    // resulting vector.reduction ops are distributed by upstream patterns.
    rewriter.setInsertionPoint(reductionOp);
    Value result = lowerToVectorReductions(
        cast<TypedValue<VectorType>>(reductionOp.getSource()),
        cast<TypedValue<VectorType>>(reductionOp.getAcc()),
        reductionOp.getKind(), reductionDim, reductionOp.getLoc(), rewriter);
    rewriter.replaceOp(reductionOp, result);
    return success();
  }
};
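/// Distribute a vector.shape_cast whose source and result layouts are slices
/// of one another; the cast is recreated outside the warp op with the
/// distributed source and result types. Registered with a higher benefit so
/// it runs before the generic patterns.
struct VectorShapeCastDistribution : public gpu::WarpDistributionPattern {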
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    OpOperand *yieldOperand =
        getWarpResult(warpOp, llvm::IsaPred<vector::ShapeCastOp>);
    if (!yieldOperand)
      return failure();
    auto shapeCastOp =
        cast<vector::ShapeCastOp>(yieldOperand->get().getDefiningOp());
    unsigned operandNumber = yieldOperand->getOperandNumber();
    auto resultDistTy =
        cast<VectorType>(warpOp.getResult(operandNumber).getType());
    xegpu::DistributeLayoutAttr sourceLayout =
        xegpu::getDistributeLayoutAttr(shapeCastOp.getSource());
    xegpu::DistributeLayoutAttr resultLayout =
        xegpu::getDistributeLayoutAttr(shapeCastOp.getResult());
    if (!sourceLayout || !resultLayout)
      return rewriter.notifyMatchFailure(
          warpOp,
          "the source or result of shape_cast op lacks distribution layout");

    // Only rank-changing shape casts where one layout is a slice of the other
    // are supported.
    int64_t sourceRank = shapeCastOp.getSourceVectorType().getRank();
    int64_t resultRank = shapeCastOp.getResultVectorType().getRank();
    if (sourceRank < resultRank && !sourceLayout.isSliceOf(resultLayout))
      return rewriter.notifyMatchFailure(
          warpOp, "shape_cast is rank increasing but source layout is not a "
                  "slice of result layout");
    if (sourceRank > resultRank && !resultLayout.isSliceOf(sourceLayout))
      return rewriter.notifyMatchFailure(
          warpOp, "shape_cast is rank reducing but result layout is not a "
                  "slice of source layout");

    FailureOr<VectorType> sourceDistTypeOrFailure =
        getDistVecTypeBasedOnLaneLayout(sourceLayout,
                                        shapeCastOp.getSourceVectorType());
    if (failed(sourceDistTypeOrFailure))
      return rewriter.notifyMatchFailure(
          warpOp, "failed to get distributed vector type for source");
    VectorType sourceDistType = sourceDistTypeOrFailure.value();
    // Yield the shape_cast source from the warp op and recreate the cast
    // outside with the distributed types.
    SmallVector<size_t> newRetIndices;
    auto newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, {shapeCastOp.getSource()}, {sourceDistType},
        newRetIndices);
    rewriter.setInsertionPointAfter(newWarpOp);
    Value source = newWarpOp.getResult(newRetIndices[0]);
    Value newShapeCast = vector::ShapeCastOp::create(
        rewriter, shapeCastOp.getLoc(), resultDistTy, source);
    rewriter.replaceAllUsesWith(newWarpOp.getResult(operandNumber),
                                newShapeCast);
    return success();
  }
};
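/// Distribute a memref.extract_aligned_pointer_as_index op: the pointer is
/// uniform across lanes, so the op is simply recreated outside the warp op.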
struct MemrefExtractAlignedPointerAsIndexDistribution final
    : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    OpOperand *operand = getWarpResult(
        warpOp, llvm::IsaPred<memref::ExtractAlignedPointerAsIndexOp>);
    if (!operand)
      return rewriter.notifyMatchFailure(
          warpOp,
          "warp result is not a memref::MemrefExtractAlignedPointerAsIndex op");
    auto extractOp =
        operand->get().getDefiningOp<memref::ExtractAlignedPointerAsIndexOp>();
    unsigned operandIdx = operand->getOperandNumber();
    SmallVector<size_t> newRetIndices;
    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, extractOp.getSource(),
        TypeRange{extractOp.getSource().getType()}, newRetIndices);
    rewriter.setInsertionPointAfter(newWarpOp);
    auto newExtractOp = memref::ExtractAlignedPointerAsIndexOp::create(
        rewriter, newWarpOp.getLoc(), extractOp.getType(),
        newWarpOp.getResult(newRetIndices[0]));
    Value distributedVal = newWarpOp.getResult(operandIdx);
    rewriter.replaceAllUsesWith(distributedVal, newExtractOp.getResult());
    return success();
  }
};
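/// Distribute a vector.bitcast: the source is yielded with its distributed
/// type and the bitcast is recreated outside the warp op.
struct VectorBitcastDistribution final : public gpu::WarpDistributionPattern {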
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    OpOperand *operand =
        getWarpResult(warpOp, llvm::IsaPred<vector::BitCastOp>);
    if (!operand)
      return rewriter.notifyMatchFailure(
          warpOp, "warp result is not a vector::BitCast op");
    auto bitcastOp = operand->get().getDefiningOp<vector::BitCastOp>();
    unsigned operandIdx = operand->getOperandNumber();
    VectorType distributedSourceType =
        getDistVecTypeBasedOnLaneLayout(
            xegpu::getDistributeLayoutAttr(bitcastOp.getSource()),
            bitcastOp.getSourceVectorType())
            .value_or(VectorType());
    if (!distributedSourceType)
      return rewriter.notifyMatchFailure(
          bitcastOp, "Failed to distribute the source vector type in "
                     "vector::BitCast op");
    VectorType distributedResultType =
        cast<VectorType>(warpOp.getResult(operandIdx).getType());
    SmallVector<size_t> newRetIndices;
    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, bitcastOp.getSource(),
        TypeRange{distributedSourceType}, newRetIndices);
    rewriter.setInsertionPointAfter(newWarpOp);
    auto newBitcastOp = vector::BitCastOp::create(
        rewriter, newWarpOp.getLoc(), distributedResultType,
        newWarpOp.getResult(newRetIndices[0]));
    Value distributedVal = newWarpOp.getResult(operandIdx);
    rewriter.replaceAllUsesWith(distributedVal, newBitcastOp.getResult());
    return success();
  }
};
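/// Distribute a 2D vector.transpose whose result layout is the transpose of
/// its source layout; the transpose is recreated outside the warp op on the
/// distributed source.
struct VectorTransposeDistribution final : public gpu::WarpDistributionPattern {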
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    OpOperand *operand =
        getWarpResult(warpOp, llvm::IsaPred<vector::TransposeOp>);
    if (!operand)
      return rewriter.notifyMatchFailure(
          warpOp, "warp result is not a vector::Transpose op");
    auto transposeOp = operand->get().getDefiningOp<vector::TransposeOp>();
    unsigned operandIdx = operand->getOperandNumber();
    xegpu::DistributeLayoutAttr sourceLayout =
        xegpu::getDistributeLayoutAttr(transposeOp.getVector());
    xegpu::DistributeLayoutAttr resultLayout =
        xegpu::getDistributeLayoutAttr(transposeOp.getResult());
    if (!sourceLayout || !resultLayout)
      return rewriter.notifyMatchFailure(
          transposeOp,
          "the source or result vector of the transpose op lacks layout "
          "attribute");
    int64_t sourceRank = transposeOp.getSourceVectorType().getRank();
    int64_t resultRank = transposeOp.getResultVectorType().getRank();
    // Only 2D transposes are supported.
    if (sourceRank != 2 || resultRank != 2)
      return rewriter.notifyMatchFailure(
          transposeOp, "the source or result vector of the transpose op "
                       "does not have 2D layout");
    ArrayRef<int64_t> perm = transposeOp.getPermutation();
    // The result layout must be the transpose of the source layout.
    if (!resultLayout.isTransposeOf(sourceLayout, perm))
      return rewriter.notifyMatchFailure(
          transposeOp,
          "the source or result vector layouts must be 2D transposes of each "
          "other");
    FailureOr<VectorType> distributedSourceTypeOrFailure =
        getDistVecTypeBasedOnLaneLayout(sourceLayout,
                                        transposeOp.getSourceVectorType());
    if (failed(distributedSourceTypeOrFailure))
      return rewriter.notifyMatchFailure(
          transposeOp, "Failed to distribute the source vector type in "
                       "vector::Transpose op");
    SmallVector<size_t> newRetIndices;
    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, transposeOp.getVector(),
        TypeRange{distributedSourceTypeOrFailure.value()}, newRetIndices);
    rewriter.setInsertionPointAfter(newWarpOp);
    auto newTransposeOp = vector::TransposeOp::create(
        rewriter, newWarpOp.getLoc(), newWarpOp.getResult(newRetIndices[0]),
        perm);
    Value distributedVal = newWarpOp.getResult(operandIdx);
    rewriter.replaceAllUsesWith(distributedVal, newTransposeOp.getResult());
    return success();
  }
};
struct XeGPUSubgroupDistributePass final
    : public xegpu::impl::XeGPUSubgroupDistributeBase<
          XeGPUSubgroupDistributePass> {
  void runOnOperation() override;
};
} // namespace

void xegpu::populateXeGPUSubgroupDistributePatterns(
    RewritePatternSet &patterns) {
  patterns.add<CreateNdDescDistribution, StoreNdDistribution,
               LoadNdDistribution, DpasDistribution, PrefetchNdDistribution,
               GpuBarrierDistribution, VectorMultiReductionDistribution,
               LoadDistribution, StoreDistribution, VectorTransposeDistribution,
               VectorBitcastDistribution, LoadMatrixDistribution,
               StoreMatrixDistribution,
               MemrefExtractAlignedPointerAsIndexDistribution>(
      patterns.getContext(), regularPatternBenefit);
  patterns.add<VectorShapeCastDistribution>(patterns.getContext(),
                                            highPatternBenefit);
}
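/// The pass proceeds in four steps: (1) check that every vector operand
/// carries a distribution layout, (2) wrap each gpu.func body in a warp op,
/// (3) greedily apply the distribution patterns, and (4) resolve the
/// unrealized casts inserted by resolveDistributedTy.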
void XeGPUSubgroupDistributePass::runOnOperation() {
  // Step 1: Every vector operand must carry a distribution layout (matrix
  // ops carry theirs on the op itself).
  getOperation()->walk([&](Operation *op) {
    for (OpOperand &operand : op->getOpOperands()) {
      // Layouts are needed for vector type operands only.
      if (!isa<VectorType>(operand.get().getType()))
        continue;
      if (isa<xegpu::LoadMatrixOp, xegpu::StoreMatrixOp>(op))
        continue;
      auto layout = xegpu::getDistributeLayoutAttr(operand.get());
      if (!layout) {
        op->emitError("Could not find layout attribute for operand ")
            << operand.getOperandNumber() << " of operation " << op->getName();
        signalPassFailure();
        return WalkResult::interrupt();
      }
      xegpu::setDistributeLayoutAttr(operand, layout);
    }
    return WalkResult::advance();
  });
  // Step 2: Move the body of each gpu.func inside a warp op.
  {
    RewritePatternSet patterns(&getContext());
    xegpu::populateXeGPUMoveFuncBodyToWarpOpPatterns(patterns);
    if (failed(applyPatternsGreedily(getOperation(), std::move(patterns))))
      signalPassFailure();
  }
  // Move scalar uniform code (index ops, constants, ...) outside of the warp
  // op; it needs no distribution.
  getOperation()->walk([&](Operation *op) {
    if (auto warpOp = dyn_cast<gpu::WarpExecuteOnLane0Op>(op))
      vector::moveScalarUniformCode(warpOp);
  });
  // Step 3: Apply subgroup-to-workitem distribution patterns.
  RewritePatternSet patterns(&getContext());
  xegpu::populateXeGPUSubgroupDistributePatterns(patterns);
  // distributionFn maps a vector value to the affine map describing which of
  // its dimensions are distributed across lanes.
  auto distributionFn = [](Value val) {
    VectorType vecType = dyn_cast<VectorType>(val.getType());
    int64_t vecRank = vecType ? vecType.getRank() : 0;
    if (vecRank == 0)
      return AffineMap::get(val.getContext());
    xegpu::DistributeLayoutAttr layout = xegpu::getDistributeLayoutAttr(val);
    assert(layout.getRank() == vecRank &&
           "Expecting vector and layout rank to match");
    // A dimension is distributed if the lane layout places more than one
    // lane on it and divides the dimension size evenly.
    SmallVector<unsigned int> distributedDims;
    for (auto [i, v] : llvm::enumerate(layout.getEffectiveLaneLayoutAsInt())) {
      if (v > 1 && vecType.getShape()[i] % v == 0)
        distributedDims.push_back(i);
    }
    return AffineMap::getMultiDimMapWithTargets(vecRank, distributedDims,
                                                val.getContext());
  };
  // Lane-to-lane shuffles are not needed for this distribution.
  auto shuffleFn = [](Location loc, OpBuilder &builder, Value val, Value srcIdx,
                      int64_t warpSz) { return Value(); };
  // Butterfly-shuffle based warp reduction used for vector.reduction
  // distribution.
  auto warpReduction = [](Location loc, OpBuilder &builder, Value input,
                          vector::CombiningKind kind, uint32_t size) {
    // First reduce on a single lane.
    Value laneVal = vector::ReductionOp::create(builder, loc, kind, input);
    // Then combine across lanes with XOR shuffles.
    for (uint64_t i = 1; i < size; i <<= 1) {
      Value shuffled = gpu::ShuffleOp::create(builder, loc, laneVal, i,
                                              /*width=*/size,
                                              /*mode=*/gpu::ShuffleMode::XOR)
                           .getShuffleResult();
      laneVal =
          vector::makeArithReduction(builder, loc, kind, laneVal, shuffled);
    }
    return laneVal;
  };
  vector::populateDistributeReduction(patterns, warpReduction,
                                      regularPatternBenefit);
  vector::populatePropagateWarpVectorDistributionPatterns(
      patterns, distributionFn, shuffleFn,
      regularPatternBenefit);
  if (failed(applyPatternsGreedily(getOperation(), std::move(patterns))))
    signalPassFailure();
  // Step 4: Resolve the unrealized conversion casts inserted by
  // resolveDistributedTy. Skip the cleanup if any warp op remains, since
  // distribution was then incomplete.
  bool foundWarpOp = false;
  getOperation()->walk([&](gpu::WarpExecuteOnLane0Op warpOp) {
    foundWarpOp = true;
    return WalkResult::interrupt();
  });
  if (foundWarpOp)
    return;

  getOperation()->walk([&](mlir::UnrealizedConversionCastOp op) {
    // Only casts marked by this pass are resolved here. (Reconstruction
    // assumption: the filter uses the marker attribute defined above.)
    if (!op->hasAttr(resolveSIMTTypeMismatch))
      return WalkResult::skip();

    Value input = op.getOperand(0);
    Value output = op.getResult(0);
    xegpu::TensorDescType inputDescType =
        mlir::dyn_cast<xegpu::TensorDescType>(input.getType());
    xegpu::TensorDescType outputDescType =
        mlir::dyn_cast<xegpu::TensorDescType>(output.getType());
    assert(inputDescType && outputDescType &&
           "Unrealized conversion cast must have tensor descriptor types");

    // tensor_desc<shape, layout> -> tensor_desc<shape> conversions: rewrite
    // the (loop) block argument to the layout-free type.
    if (inputDescType.getLayout()) {
      auto argument = mlir::dyn_cast<mlir::BlockArgument>(input);
      if (argument) {
        argument.setType(output.getType());
        output.replaceAllUsesWith(argument);
        if (auto loopOp = mlir::dyn_cast<mlir::LoopLikeOpInterface>(
                argument.getOwner()->getParentOp())) {
          auto result = loopOp.getTiedLoopResult(argument);
          result.setType(output.getType());
        }
      }
    }
    // tensor_desc<shape> -> tensor_desc<shape, layout> conversions: forward
    // the input directly.
    if (outputDescType.getLayout())
      output.replaceAllUsesWith(input);

    if (op->use_empty())
      op->erase();
    return WalkResult::advance();
  });
}