enum class CuSparseFormat { kNone, kCOO, kCSR, kCSC, kBSR };
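// Marks the outermost module as a GPU container module, as required by the
// GPU dialect before gpu.module ops can be inserted into it.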
static void markAsGPUContainer(ModuleOp topModule) {
  topModule->setAttr(gpu::GPUDialect::getContainerModuleAttrName(),
                     UnitAttr::get(topModule->getContext()));
}
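// Returns the single GPU module in which all generated sparse kernels are
// placed, creating one (and marking the container module) on first use.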
static gpu::GPUModuleOp genGPUModule(OpBuilder &builder, ModuleOp topModule) {
  for (auto op : topModule.getBodyRegion().getOps<gpu::GPUModuleOp>())
    return op; // existing GPU module
  markAsGPUContainer(topModule);
  builder.setInsertionPointToStart(topModule.getBody());
  return builder.create<gpu::GPUModuleOp>(topModule->getLoc(),
                                          "sparse_kernels");
}
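// Outlines a new GPU kernel function with a unique "kernelN" name and one
// argument per outlined scalar/buffer value, marked with the gpu.kernel
// attribute.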
static gpu::GPUFuncOp genGPUFunc(OpBuilder &builder, gpu::GPUModuleOp gpuModule,
                                 SmallVectorImpl<Value> &args) {
  // Pick a kernel name that does not clash with previously generated kernels.
  unsigned kernelNumber = 0;
  SmallString<16> kernelName;
  do {
    kernelName.clear();
    ("kernel" + Twine(kernelNumber++)).toStringRef(kernelName);
  } while (gpuModule.lookupSymbol(kernelName));
  // Insert a new kernel with the given arguments into the module.
  builder.setInsertionPointToStart(gpuModule.getBody());
  SmallVector<Type> argsTp;
  for (auto arg : args)
    argsTp.push_back(arg.getType());
  FunctionType type = FunctionType::get(gpuModule->getContext(), argsTp, {});
  auto gpuFunc =
      builder.create<gpu::GPUFuncOp>(gpuModule->getLoc(), kernelName, type);
  gpuFunc->setAttr(gpu::GPUDialect::getKernelFuncAttrName(),
                   builder.getUnitAttr());
  return gpuFunc;
}
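// Emits the asynchronous gpu.launch_func for the outlined kernel, using a
// one-dimensional grid with `numThreads` threads per block.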
static Value genLaunchGPUFunc(OpBuilder &builder, gpu::GPUFuncOp gpuFunc,
                              SmallVectorImpl<Value> &args,
                              SmallVectorImpl<Value> &tokens,
                              unsigned numThreads) {
  Location loc = gpuFunc->getLoc();
  Value none = TypedValue<::mlir::IntegerType>{};
  Value one = constantIndex(builder, loc, 1);
  Value numT = constantIndex(builder, loc, numThreads);
  gpu::KernelDim3 gridSize = {one, one, one};
  gpu::KernelDim3 blckSize = {numT, one, one};
  return builder
      .create<gpu::LaunchFuncOp>(loc, gpuFunc, gridSize, blckSize,
                                 /*dynSharedMemSz*/ none, args,
                                 builder.getType<gpu::AsyncTokenType>(), tokens)
      .getAsyncToken();
}
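// Host registration makes a host buffer directly accessible to device
// kernels; the companion helper unregisters the buffer again. The memref is
// first cast to an unranked memref, as expected by gpu.host_register.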
static Value genHostRegisterMemref(OpBuilder &builder, Location loc,
                                   Value mem) {
  MemRefType memTp = cast<MemRefType>(mem.getType());
  UnrankedMemRefType resTp =
      UnrankedMemRefType::get(memTp.getElementType(), /*memorySpace=*/0);
  Value cast = builder.create<memref::CastOp>(loc, resTp, mem);
  builder.create<gpu::HostRegisterOp>(loc, cast);
  return cast;
}

static void genHostUnregisterMemref(OpBuilder &builder, Location loc,
                                    Value cast) {
  builder.create<gpu::HostUnregisterOp>(loc, cast);
}
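// Asynchronous helpers: device operations are chained through async tokens.
// genFirstWait (used below) starts a fresh chain, while the blocking wait
// joins all outstanding chains before host code may touch the data again.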
static void genBlockingWait(OpBuilder &builder, Location loc,
                            ValueRange operands) {
  builder.create<gpu::WaitOp>(loc, Type(), operands);
}
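// Allocates device memory shaped like the given host memref, preserving any
// dynamic dimension sizes.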
static gpu::AllocOp genAllocMemRef(OpBuilder &builder, Location loc, Value mem,
                                   Value token) {
  auto tp = cast<ShapedType>(mem.getType());
  auto elemTp = tp.getElementType();
  auto shape = tp.getShape();
  auto memTp = MemRefType::get(shape, elemTp);
  SmallVector<Value> dynamicSizes;
  for (unsigned r = 0, rank = tp.getRank(); r < rank; r++) {
    if (shape[r] == ShapedType::kDynamic) {
      Value dimOp = linalg::createOrFoldDimOp(builder, loc, mem, r);
      dynamicSizes.push_back(dimOp);
    }
  }
  return builder.create<gpu::AllocOp>(loc, TypeRange({memTp, token.getType()}),
                                      token, dynamicSizes, ValueRange());
}
static Value genHostBuffer(OpBuilder &builder, Location loc, Type type,
                           Value size) {
  const auto memTp = MemRefType::get({ShapedType::kDynamic}, type);
  return builder.create<memref::AllocOp>(loc, memTp, size).getResult();
}
static gpu::AllocOp genAllocBuffer(OpBuilder &builder, Location loc,
                                   Value size, Value token) {
  return genAllocBuffer(builder, loc, builder.getI8Type(), size, token);
}
static Value genDeallocMemRef(OpBuilder &builder, Location loc, Value mem,
                              Value token) {
  return builder.create<gpu::DeallocOp>(loc, token.getType(), token, mem)
      .getAsyncToken();
}
static Value genCopyMemRef(OpBuilder &builder, Location loc, Value dst,
                           Value src, Value token) {
  return builder.create<gpu::MemcpyOp>(loc, token.getType(), token, dst, src)
      .getAsyncToken();
}
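// Convenience helper that pairs a device allocation with an asynchronous
// host-to-device copy of the given buffer; the copy token is collected so a
// later blocking wait can synchronize all transfers at once.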
static Value genAllocCopy(OpBuilder &builder, Location loc, Value b,
                          SmallVectorImpl<Value> &tokens) {
  Value firstToken = genFirstWait(builder, loc);
  auto alloc = genAllocMemRef(builder, loc, b, firstToken);
  Value devMem = alloc.getResult(0);
  Value depToken = alloc.getAsyncToken(); // copy-after-alloc
  tokens.push_back(genCopyMemRef(builder, loc, devMem, b, depToken));
  return devMem;
}
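// Exposes a tensor operand as a memref via bufferization, so that it can be
// copied to the device with the helpers above.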
static Value genTensorToMemref(PatternRewriter &rewriter, Location loc,
                               Value tensor) {
  auto tensorType = llvm::cast<ShapedType>(tensor.getType());
  auto memrefType =
      MemRefType::get(tensorType.getShape(), tensorType.getElementType());
  return rewriter.create<bufferization::ToMemrefOp>(loc, memrefType, tensor);
}
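// Passes the outlined parameters into the kernel: scalars are passed by
// value, buffers are either host-registered (for the assumed output, when
// requested) or copied to freshly allocated device buffers.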
static Value genParametersIn(OpBuilder &builder, Location loc,
                             SmallVectorImpl<Value> &scalars,
                             SmallVectorImpl<Value> &buffers,
                             SmallVectorImpl<Value> &args,
                             SmallVectorImpl<Value> &tokens,
                             bool useHostRegistrationForOut) {
  Value out;
  for (Value s : scalars)
    args.push_back(s);
  for (Value b : buffers) {
    if (useHostRegistrationForOut) {
      out = genHostRegisterMemref(builder, loc, b);
      args.push_back(b);
      useHostRegistrationForOut = false;
      continue;
    }
    args.push_back(genAllocCopy(builder, loc, b, tokens));
  }
  return out;
}
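// Finalizes the outlined parameters after the kernel runs: the assumed output
// buffer is either unregistered or copied back, and every device buffer is
// deallocated on its own token chain.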
static void genParametersOut(OpBuilder &builder, Location loc, Value out,
                             Value kernelToken, SmallVectorImpl<Value> &scalars,
                             SmallVectorImpl<Value> &buffers,
                             SmallVectorImpl<Value> &args,
                             SmallVectorImpl<Value> &tokens) {
  unsigned base = scalars.size();
  for (unsigned i = base, e = args.size(); i < e; i++) {
    Value firstToken;
    if (i == base) {
      // Assumed output parameter: unregister or copy-out.
      if (out) {
        genHostUnregisterMemref(builder, loc, out);
        out = Value();
        continue;
      }
      firstToken =
          genCopyMemRef(builder, loc, buffers[0], args[i], kernelToken);
    } else {
      firstToken = genFirstWait(builder, loc);
    }
    tokens.push_back(genDeallocMemRef(builder, loc, args[i], firstToken));
  }
}
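// Populates the body of the outlined GPU kernel: constants are re-generated
// inside the kernel, scalars and buffers are re-captured as block arguments,
// and the original 1-D parallel loop becomes a block/grid-strided scf.for:
//   row = blockIdx.x * blockDim.x + threadIdx.x;  inc = blockDim.x * gridDim.x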
static void genGPUCode(PatternRewriter &rewriter, gpu::GPUFuncOp gpuFunc,
                       scf::ParallelOp forallOp,
                       SmallVectorImpl<Value> &constants,
                       SmallVectorImpl<Value> &scalars,
                       SmallVectorImpl<Value> &buffers) {
  Location loc = gpuFunc->getLoc();
  Block &block = gpuFunc.getBody().front();
  rewriter.setInsertionPointToStart(&block);

  // Re-generate the constants, recapture all arguments.
  unsigned arg = 0;
  IRMapping irMap;
  for (Value c : constants)
    irMap.map(c, rewriter.clone(*c.getDefiningOp())->getResult(0));
  for (Value s : scalars)
    irMap.map(s, block.getArgument(arg++));
  for (Value b : buffers)
    irMap.map(b, block.getArgument(arg++));

  // Assume a 1-dimensional grid/block configuration (only x dimension).
  Value bid = rewriter.create<gpu::BlockIdOp>(loc, gpu::Dimension::x);
  Value bsz = rewriter.create<gpu::BlockDimOp>(loc, gpu::Dimension::x);
  Value tid = rewriter.create<gpu::ThreadIdOp>(loc, gpu::Dimension::x);
  Value gsz = rewriter.create<gpu::GridDimOp>(loc, gpu::Dimension::x);
  Value mul = rewriter.create<arith::MulIOp>(loc, bid, bsz);
  Value row = rewriter.create<arith::AddIOp>(loc, mul, tid);
  Value inc = rewriter.create<arith::MulIOp>(loc, bsz, gsz);

  // Iterate the computational space with a grid-strided loop, since the
  // total number of threads is divorced from the problem size.
  Value upper = irMap.lookup(forallOp.getUpperBound()[0]);
  scf::ForOp forOp = rewriter.create<scf::ForOp>(loc, row, upper, inc);
  rewriter.eraseBlock(forOp.getBody());
  rewriter.cloneRegionBefore(forallOp.getRegion(), forOp.getRegion(),
                             forOp.getRegion().begin(), irMap);

  // Done.
  rewriter.setInsertionPointAfter(forOp);
  rewriter.create<gpu::ReturnOp>(gpuFunc->getLoc());
}
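// The matchers below recognize the admissible computations inside a
// linalg.generic body: a plain a+b / a*b of the block arguments, a summation
// of products (SpMV/SpMM), and a sum reduction of a multiplication guarded by
// a sparse_tensor.unary/reduce pair (SDDMM).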
static bool matchAddOfArgs(Block *block, Value val) {
  if (auto *def = val.getDefiningOp()) {
    if (isa<arith::AddFOp, arith::AddIOp>(def)) {
      Value a = block->getArguments()[0];
      Value b = block->getArguments()[1];
      return (def->getOperand(0) == a && def->getOperand(1) == b) ||
             (def->getOperand(0) == b && def->getOperand(1) == a);
    }
  }
  return false;
}
static bool matchMulOfArgs(Block *block, Value val) {
  if (auto *def = val.getDefiningOp()) {
    if (isa<arith::MulFOp, arith::MulIOp>(def)) {
      Value a = block->getArguments()[0];
      Value b = block->getArguments()[1];
      return (def->getOperand(0) == a && def->getOperand(1) == b) ||
             (def->getOperand(0) == b && def->getOperand(1) == a);
    }
  }
  return false;
}
static bool matchSumOfMultOfArgs(linalg::GenericOp op) {
  auto yieldOp = cast<linalg::YieldOp>(op.getRegion().front().getTerminator());
  if (auto *def = yieldOp.getOperand(0).getDefiningOp()) {
    if (isa<arith::AddFOp, arith::AddIOp>(def)) {
      Value x = op.getBlock()->getArguments()[2];
      return (def->getOperand(0) == x &&
              matchMulOfArgs(op.getBlock(), def->getOperand(1))) ||
             (def->getOperand(1) == x &&
              matchMulOfArgs(op.getBlock(), def->getOperand(0)));
    }
  }
  return false;
}
static bool matchSumReductionOfMulUnary(linalg::GenericOp op) {
  auto yieldOp = cast<linalg::YieldOp>(op.getRegion().front().getTerminator());
  // The linalg yields a custom reduce result.
  Value s_out = op.getBlock()->getArguments()[2];
  if (auto redOp =
          yieldOp.getOperand(0).getDefiningOp<sparse_tensor::ReduceOp>()) {
    // The reduce consumes the output.
    Value other;
    if (s_out == redOp->getOperand(0))
      other = redOp->getOperand(1);
    else if (s_out == redOp->getOperand(1))
      other = redOp->getOperand(0);
    else
      return false;
    // The reduce also consumes an unary which consumes the output as well
    // and does not define an absent value.
    if (auto unOp = other.getDefiningOp<sparse_tensor::UnaryOp>()) {
      if (s_out != unOp->getOperand(0) || !unOp.getAbsentRegion().empty())
        return false;
      // And the bodies are as expected.
      auto yieldUn = cast<sparse_tensor::YieldOp>(
          unOp.getRegion(0).front().getTerminator());
      auto yieldRed = cast<sparse_tensor::YieldOp>(
          redOp.getRegion().front().getTerminator());
      return matchMulOfArgs(op.getBlock(), yieldUn.getOperand(0)) &&
             matchAddOfArgs(&redOp.getRegion().front(), yieldRed.getOperand(0));
    }
  }
  return false;
}
static bool isDenseTensor(Value v) {
  auto sTp = getSparseTensorType(v);
  return sTp.getDimRank() == sTp.getLvlRank() && sTp.isAllDense();
}
         isAdmissibleMetaData(aTp);
  SmallVector<unsigned> dims = getBlockSize(aTp.getDimToLvl());
  assert(dims.size() == 2);
  return dims[0] == dims[1] && dims[0] > 1;
static bool isConversionInto24(Value v) {
  if (auto cnv = v.getDefiningOp<ConvertOp>()) {
    Value a = cnv.getResult();
    Value d = cnv.getSource();
    SparseTensorType aTp = getSparseTensorType(a);
    return isDenseTensor(d) && isAdmissible24(aTp);
  }
  return false;
}
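// Chooses the cuSPARSE storage format (COO, CSR, CSC, or BSR) that fits the
// operand types, or kNone if the kernel cannot be mapped to the library.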
static CuSparseFormat getCuSparseFormat(SparseTensorType aTp,
                                        SparseTensorType bTp,
                                        SparseTensorType cTp, bool enableRT,
                                        bool isMatVec) {
  // The other operands must have a dense type.
  if (bTp.hasEncoding() || cTp.hasEncoding())
    return CuSparseFormat::kNone;
  // Now check for a suitable type on the main sparse operand.
  if (isAdmissibleCOO(aTp))
#ifdef CUSPARSE_COO_AOS
    return isMatVec ? CuSparseFormat::kCOO : CuSparseFormat::kNone;
#else
    return enableRT ? CuSparseFormat::kCOO : CuSparseFormat::kNone;
#endif
  if (isAdmissibleCSR(aTp))
    return CuSparseFormat::kCSR;
  if (isAdmissibleCSC(aTp))
    return CuSparseFormat::kCSC;
  if (isAdmissibleBSR(aTp))
    return CuSparseFormat::kBSR;
  return CuSparseFormat::kNone;
}
static Value genFirstPosOrCrds(OpBuilder &builder, Location loc, Value a,
                               CuSparseFormat format, bool enableRT) {
  if (format == CuSparseFormat::kCOO) {
    // Library uses SoA COO, direct IR uses AoS COO.
    if (enableRT)
      return builder.create<ToCoordinatesOp>(loc, a, 0);
    return builder.create<ToCoordinatesBufferOp>(loc, a);
  }
  // Formats CSR/CSC and BSR use positions at 1.
  return builder.create<ToPositionsOp>(loc, a, 1);
}
static Value genSecondCrds(OpBuilder &builder, Location loc, Value a,
                           CuSparseFormat format, bool enableRT) {
  bool isCOO = format == CuSparseFormat::kCOO;
  if (isCOO && !enableRT)
    return Value(); // nothing needed
  // Formats CSR/CSC and BSR use coordinates at 1.
  return builder.create<ToCoordinatesOp>(loc, a, 1);
}
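// Materializes the gpu sparse-matrix handle for the chosen format, extracting
// block sizes for BSR from the dimToLvl mapping.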
static Operation *genSpMat(OpBuilder &builder, Location loc,
                           SparseTensorType &aTp, Type handleTp, Type tokenTp,
                           Value token, Value sz1, Value sz2, Value nseA,
                           Value rowA, Value colA, Value valA,
                           CuSparseFormat format, bool enableRT) {
  if (format == CuSparseFormat::kCOO) {
    // Library uses SoA COO, direct IR uses AoS COO.
    if (enableRT) {
      assert(colA);
      return builder.create<gpu::CreateCooOp>(loc, handleTp, tokenTp, token,
                                              sz1, sz2, nseA, rowA, colA, valA);
    }
#ifdef CUSPARSE_COO_AOS
    assert(!colA);
    return builder.create<gpu::CreateCooAoSOp>(loc, handleTp, tokenTp, token,
                                               sz1, sz2, nseA, rowA, valA);
#else
    llvm_unreachable("gpu::CreateCooAoSOp is deprecated");
#endif
  }
  assert(colA);
  if (format == CuSparseFormat::kCSR)
    return builder.create<gpu::CreateCsrOp>(loc, handleTp, tokenTp, token, sz1,
                                            sz2, nseA, rowA, colA, valA);
  if (format == CuSparseFormat::kCSC)
    return builder.create<gpu::CreateCscOp>(loc, handleTp, tokenTp, token, sz1,
                                            sz2, nseA, rowA, colA, valA);
  // BSR requires a bit more work since all sizes need to be expressed in
  // terms of blocks (#block-rows, #block-cols, #nonzero-blocks).
  assert(format == CuSparseFormat::kBSR);
  SmallVector<unsigned> dims = getBlockSize(aTp.getDimToLvl());
  assert(dims.size() == 2 && dims[0] == dims[1]);
  uint64_t b = dims[0];
  Value bSz = constantIndex(builder, loc, b);
  Value bRows = builder.create<arith::DivUIOp>(loc, sz1, bSz);
  Value bCols = builder.create<arith::DivUIOp>(loc, sz2, bSz);
  Value bNum = builder.create<arith::DivUIOp>(
      loc, nseA, builder.create<arith::MulIOp>(loc, bSz, bSz));
  return builder.create<gpu::CreateBsrOp>(loc, handleTp, tokenTp, token, bRows,
                                          bCols, bNum, bSz, bSz, rowA, colA,
                                          valA);
}
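// Match and rewrite a SpMV kernel (y = A x): the sparse operand's positions,
// coordinates, and values plus the dense vectors are copied to the device,
// cuSPARSE handles are created, gpu.spmv_buffer_size/gpu.spmv perform the
// computation, and the result vector is copied back to the host.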
static LogicalResult rewriteSpMV(PatternRewriter &rewriter,
                                 linalg::GenericOp op, bool enableRT) {
  Location loc = op.getLoc();
  Value a = op.getOperand(0);
  Value x = op.getOperand(1);
  Value y = op.getOperand(2); // we have y = Ax
  SmallVector<Value> tokens;

  // Only admissible sparse matrix format and dense vectors (no BSR).
  SparseTensorType aTp = getSparseTensorType(a);
  SparseTensorType xTp = getSparseTensorType(x);
  SparseTensorType yTp = getSparseTensorType(y);
  auto format = getCuSparseFormat(aTp, xTp, yTp, enableRT, /*isMatVec=*/true);
  if (format == CuSparseFormat::kNone || format == CuSparseFormat::kBSR)
    return failure();

  // Start sparse kernel and copy data from host to device.
  Value nseA = rewriter.create<NumberOfEntriesOp>(loc, a);
  Value szY = linalg::createOrFoldDimOp(rewriter, loc, a, 0);
  Value szX = linalg::createOrFoldDimOp(rewriter, loc, a, 1);
  Value memR = genFirstPosOrCrds(rewriter, loc, a, format, enableRT);
  Value memC = genSecondCrds(rewriter, loc, a, format, enableRT); // or empty
  Value memV = rewriter.create<ToValuesOp>(loc, a);
  Value rowA = genAllocCopy(rewriter, loc, memR, tokens);
  Value colA = memC ? genAllocCopy(rewriter, loc, memC, tokens) : Value();
  Value valA = genAllocCopy(rewriter, loc, memV, tokens);
  Value memX = genTensorToMemref(rewriter, loc, x);
  Value vecX = genAllocCopy(rewriter, loc, memX, tokens);
  Value memY = genTensorToMemref(rewriter, loc, y);
  Value vecY = genAllocCopy(rewriter, loc, memY, tokens);
  genBlockingWait(rewriter, loc, tokens);
  tokens.clear();

  // Create sparse environment and sparse matrix/dense vector handles.
  Type indexTp = rewriter.getIndexType();
  Type dnTensorHandleTp = rewriter.getType<gpu::SparseDnTensorHandleType>();
  Type spmatHandleTp = rewriter.getType<gpu::SparseSpMatHandleType>();
  Type tokenTp = rewriter.getType<gpu::AsyncTokenType>();
  Value token = genFirstWait(rewriter, loc);
  Operation *spGenA =
      genSpMat(rewriter, loc, aTp, spmatHandleTp, tokenTp, token, szY, szX,
               nseA, rowA, colA, valA, format, enableRT);
  Value spMatA = spGenA->getResult(0);
  token = spGenA->getResult(1);
  auto dvecX = rewriter.create<gpu::CreateDnTensorOp>(
      loc, dnTensorHandleTp, tokenTp, token, vecX, szX);
  Value dnX = dvecX.getResult(0);
  token = dvecX.getAsyncToken();
  auto dvecY = rewriter.create<gpu::CreateDnTensorOp>(
      loc, dnTensorHandleTp, tokenTp, token, vecY, szY);
  Value dnY = dvecY.getResult(0);
  token = dvecY.getAsyncToken();
  auto dnYType = llvm::cast<ShapedType>(y.getType()).getElementType();

  // Precompute buffersize for SpMV.
  auto bufferComp = rewriter.create<gpu::SpMVBufferSizeOp>(
      loc, indexTp, tokenTp, token, spMatA, dnX, dnY,
      /*computeType=*/dnYType);
  Value bufferSz = bufferComp.getResult(0);
  token = bufferComp.getAsyncToken();
  auto buf = genAllocBuffer(rewriter, loc, bufferSz, token);
  Value buffer = buf.getResult(0);
  token = buf.getAsyncToken();

  // Perform the SpMV.
  auto spmvComp = rewriter.create<gpu::SpMVOp>(
      loc, tokenTp, token, spMatA, dnX, dnY, /*computeType=*/dnYType, buffer);
  token = spmvComp.getAsyncToken();

  // Copy data back to host and free all the resources.
  token = rewriter.create<gpu::DestroySpMatOp>(loc, tokenTp, token, spMatA)
              .getAsyncToken();
  token = rewriter.create<gpu::DestroyDnTensorOp>(loc, tokenTp, token, dnX)
              .getAsyncToken();
  token = rewriter.create<gpu::DestroyDnTensorOp>(loc, tokenTp, token, dnY)
              .getAsyncToken();
  token = genDeallocMemRef(rewriter, loc, rowA, token);
  if (colA)
    token = genDeallocMemRef(rewriter, loc, colA, token);
  token = genDeallocMemRef(rewriter, loc, valA, token);
  token = genDeallocMemRef(rewriter, loc, buffer, token);
  token = genDeallocMemRef(rewriter, loc, vecX, token);
  token = genCopyMemRef(rewriter, loc, memY, vecY, token);
  token = genDeallocMemRef(rewriter, loc, vecY, token);
  tokens.push_back(token);
  genBlockingWait(rewriter, loc, tokens);
  tokens.clear();

  // Done.
  rewriter.replaceOpWithNewOp<bufferization::ToTensorOp>(op, memY);
  return success();
}
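// Match and rewrite a SpMM kernel (C = A B) with a sparse A and dense B, C,
// following the same copy-in / create-handles / compute / copy-out scheme as
// SpMV above.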
static LogicalResult rewriteSpMM(PatternRewriter &rewriter,
                                 linalg::GenericOp op, bool enableRT) {
  Location loc = op.getLoc();
  Value a = op.getOperand(0);
  Value b = op.getOperand(1);
  Value c = op.getOperand(2); // we have C = AB
  SmallVector<Value> tokens;

  // Only admissible sparse matrix format and dense matrices (no BSR).
  SparseTensorType aTp = getSparseTensorType(a);
  SparseTensorType bTp = getSparseTensorType(b);
  SparseTensorType cTp = getSparseTensorType(c);
  auto format = getCuSparseFormat(aTp, bTp, cTp, enableRT, /*isMatVec=*/false);
  if (format == CuSparseFormat::kNone || format == CuSparseFormat::kBSR)
    return failure();

  // Start sparse kernel and copy data from host to device.
  Value nseA = rewriter.create<NumberOfEntriesOp>(loc, a);
  Value szm = linalg::createOrFoldDimOp(rewriter, loc, a, 0);
  Value szk = linalg::createOrFoldDimOp(rewriter, loc, a, 1);
  Value szn = linalg::createOrFoldDimOp(rewriter, loc, b, 1);
  Value memR = genFirstPosOrCrds(rewriter, loc, a, format, enableRT);
  Value memC = genSecondCrds(rewriter, loc, a, format, enableRT); // or empty
  Value memV = rewriter.create<ToValuesOp>(loc, a);
  Value rowA = genAllocCopy(rewriter, loc, memR, tokens);
  Value colA = memC ? genAllocCopy(rewriter, loc, memC, tokens) : Value();
  Value valA = genAllocCopy(rewriter, loc, memV, tokens);
  Value bufB = genTensorToMemref(rewriter, loc, b);
  Value matB = genAllocCopy(rewriter, loc, bufB, tokens);
  Value bufC = genTensorToMemref(rewriter, loc, c);
  Value matC = genAllocCopy(rewriter, loc, bufC, tokens);
  genBlockingWait(rewriter, loc, tokens);
  tokens.clear();

  // Create sparse environment and sparse matrix/dense matrix handles.
  Type indexTp = rewriter.getIndexType();
  Type dnTensorHandleTp = rewriter.getType<gpu::SparseDnTensorHandleType>();
  Type spMatHandleTp = rewriter.getType<gpu::SparseSpMatHandleType>();
  Type tokenTp = rewriter.getType<gpu::AsyncTokenType>();
  Value token = genFirstWait(rewriter, loc);
  Operation *spGenA =
      genSpMat(rewriter, loc, aTp, spMatHandleTp, tokenTp, token, szm, szk,
               nseA, rowA, colA, valA, format, enableRT);
  Value spMatA = spGenA->getResult(0);
  token = spGenA->getResult(1);
  auto dmatB = rewriter.create<gpu::CreateDnTensorOp>(
      loc, dnTensorHandleTp, tokenTp, token, matB,
      SmallVector<Value>{szk, szn});
  Value dnB = dmatB.getResult(0);
  token = dmatB.getAsyncToken();
  auto dmatC = rewriter.create<gpu::CreateDnTensorOp>(
      loc, dnTensorHandleTp, tokenTp, token, matC,
      SmallVector<Value>{szm, szn});
  Value dnC = dmatC.getResult(0);
  token = dmatC.getAsyncToken();
  auto dmatCType = llvm::cast<ShapedType>(c.getType()).getElementType();

  // Precompute buffersize for SpMM.
  auto bufferComp = rewriter.create<gpu::SpMMBufferSizeOp>(
      loc, indexTp, tokenTp, token, spMatA, dnB, dnC,
      /*computeType=*/dmatCType);
  Value bufferSz = bufferComp.getResult(0);
  token = bufferComp.getAsyncToken();
  auto buf = genAllocBuffer(rewriter, loc, bufferSz, token);
  Value buffer = buf.getResult(0);
  token = buf.getAsyncToken();
  auto dnCType = llvm::cast<ShapedType>(c.getType()).getElementType();

  // Perform the SpMM.
  auto spmmComp = rewriter.create<gpu::SpMMOp>(
      loc, tokenTp, token, spMatA, dnB, dnC, /*computeType=*/dnCType, buffer);
  token = spmmComp.getAsyncToken();

  // Copy data back to host and free all the resources.
  token = rewriter.create<gpu::DestroySpMatOp>(loc, tokenTp, token, spMatA)
              .getAsyncToken();
  token = rewriter.create<gpu::DestroyDnTensorOp>(loc, tokenTp, token, dnB)
              .getAsyncToken();
  token = rewriter.create<gpu::DestroyDnTensorOp>(loc, tokenTp, token, dnC)
              .getAsyncToken();
  token = genDeallocMemRef(rewriter, loc, rowA, token);
  if (colA)
    token = genDeallocMemRef(rewriter, loc, colA, token);
  token = genDeallocMemRef(rewriter, loc, valA, token);
  token = genDeallocMemRef(rewriter, loc, buffer, token);
  token = genDeallocMemRef(rewriter, loc, matB, token);
  token = genCopyMemRef(rewriter, loc, bufC, matC, token);
  token = genDeallocMemRef(rewriter, loc, matC, token);
  tokens.push_back(token);
  genBlockingWait(rewriter, loc, tokens);
  tokens.clear();

  // Done.
  rewriter.replaceOpWithNewOp<bufferization::ToTensorOp>(op, bufC);
  return success();
}
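// Match and rewrite a SpGEMM kernel (C = A B with all matrices sparse, CSR
// only): work-estimation and compute passes size the temporary buffers, the
// result sparsity structure of C is queried, and positions/coordinates/values
// are copied back to newly allocated host buffers.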
static LogicalResult rewriteSpGEMM(PatternRewriter &rewriter,
                                   linalg::GenericOp op, bool enableRT) {
  Location loc = op.getLoc();
  Value a = op.getOperand(0);
  Value b = op.getOperand(1);
  Value c = op.getOperand(2); // we have C = AB
  SmallVector<Value> tokens;

  // Only CSR <- CSR x CSR is supported.
  auto format = CuSparseFormat::kCSR;
  SparseTensorType aTp = getSparseTensorType(a);
  SparseTensorType bTp = getSparseTensorType(b);
  SparseTensorType cTp = getSparseTensorType(c);
  if (!isAdmissibleCSR(aTp) || !isAdmissibleCSR(bTp) || !isAdmissibleCSR(cTp))
    return failure();

  // Start sparse kernel and copy data from host to device.
  Type dnCType = cTp.getElementType();
  Value nseA = rewriter.create<NumberOfEntriesOp>(loc, a);
  Value nseB = rewriter.create<NumberOfEntriesOp>(loc, b);
  Value szm = linalg::createOrFoldDimOp(rewriter, loc, a, 0);
  Value szk = linalg::createOrFoldDimOp(rewriter, loc, a, 1);
  Value szn = linalg::createOrFoldDimOp(rewriter, loc, b, 1);
  Value amemR = genFirstPosOrCrds(rewriter, loc, a, format, enableRT);
  Value amemC = genSecondCrds(rewriter, loc, a, format, enableRT);
  Value amemV = rewriter.create<ToValuesOp>(loc, a);
  Value bmemR = genFirstPosOrCrds(rewriter, loc, b, format, enableRT);
  Value bmemC = genSecondCrds(rewriter, loc, b, format, enableRT);
  Value bmemV = rewriter.create<ToValuesOp>(loc, b);
  Value rowA = genAllocCopy(rewriter, loc, amemR, tokens);
  Value colA = genAllocCopy(rewriter, loc, amemC, tokens);
  Value valA = genAllocCopy(rewriter, loc, amemV, tokens);
  Value rowB = genAllocCopy(rewriter, loc, bmemR, tokens);
  Value colB = genAllocCopy(rewriter, loc, bmemC, tokens);
  Value valB = genAllocCopy(rewriter, loc, bmemV, tokens);
  genBlockingWait(rewriter, loc, tokens);
  tokens.clear();

  // Create sparse environment and sparse matrix handles.
  Type indexTp = rewriter.getIndexType();
  Type spmatHandleTp = rewriter.getType<gpu::SparseSpMatHandleType>();
  Type descTp = rewriter.getType<gpu::SparseSpGEMMOpHandleType>();
  Type tokenTp = rewriter.getType<gpu::AsyncTokenType>();
  Value token = genFirstWait(rewriter, loc);
  Operation *spGenA =
      genSpMat(rewriter, loc, aTp, spmatHandleTp, tokenTp, token, szm, szk,
               nseA, rowA, colA, valA, format, enableRT);
  Value spMatA = spGenA->getResult(0);
  token = spGenA->getResult(1);
  Operation *spGenB =
      genSpMat(rewriter, loc, bTp, spmatHandleTp, tokenTp, token, szk, szn,
               nseB, rowB, colB, valB, format, enableRT);
  Value spMatB = spGenB->getResult(0);
  token = spGenB->getResult(1);

  // Sparse matrix C materializes (also assumes beta == 0).
  Value zero = constantIndex(rewriter, loc, 0);
  Value one = constantIndex(rewriter, loc, 1);
  Value mplus1 = rewriter.create<arith::AddIOp>(loc, szm, one);
  auto e1 = genAllocBuffer(rewriter, loc, cTp.getPosType(), mplus1, token);
  Value rowC = e1.getResult(0);
  token = e1.getAsyncToken();
  auto e2 = genAllocBuffer(rewriter, loc, cTp.getCrdType(), zero, token);
  Value colC = e2.getResult(0);
  token = e2.getAsyncToken();
  auto e3 = genAllocBuffer(rewriter, loc, dnCType, zero, token);
  Value valC = e3.getResult(0);
  token = e3.getAsyncToken();
  Operation *spGenC =
      genSpMat(rewriter, loc, cTp, spmatHandleTp, tokenTp, token, szm, szn,
               zero, rowC, colC, valC, format, enableRT);
  Value spMatC = spGenC->getResult(0);
  token = spGenC->getResult(1);

  // Precompute buffersizes for the SpGEMM work estimation.
  Operation *descOp =
      rewriter.create<gpu::SpGEMMCreateDescrOp>(loc, descTp, tokenTp, token);
  Value desc = descOp->getResult(0);
  token = descOp->getResult(1);
  Operation *work1 = rewriter.create<gpu::SpGEMMWorkEstimationOrComputeOp>(
      loc, indexTp, tokenTp, token, desc, gpu::TransposeMode::NON_TRANSPOSE,
      gpu::TransposeMode::NON_TRANSPOSE, spMatA, spMatB, spMatC, dnCType, zero,
      valC, gpu::SpGEMMWorkEstimationOrComputeKind::WORK_ESTIMATION);
  Value bufferSz1 = work1->getResult(0);
  token = work1->getResult(1);
  auto buf1 = genAllocBuffer(rewriter, loc, bufferSz1, token);
  Value buffer1 = buf1.getResult(0);
  token = buf1.getAsyncToken();
  Operation *work2 = rewriter.create<gpu::SpGEMMWorkEstimationOrComputeOp>(
      loc, indexTp, tokenTp, token, desc, gpu::TransposeMode::NON_TRANSPOSE,
      gpu::TransposeMode::NON_TRANSPOSE, spMatA, spMatB, spMatC, dnCType,
      bufferSz1, buffer1,
      gpu::SpGEMMWorkEstimationOrComputeKind::WORK_ESTIMATION);
  token = work2->getResult(1);

  // Compute step.
  Operation *compute1 = rewriter.create<gpu::SpGEMMWorkEstimationOrComputeOp>(
      loc, indexTp, tokenTp, token, desc, gpu::TransposeMode::NON_TRANSPOSE,
      gpu::TransposeMode::NON_TRANSPOSE, spMatA, spMatB, spMatC, dnCType, zero,
      valC, gpu::SpGEMMWorkEstimationOrComputeKind::COMPUTE);
  Value bufferSz2 = compute1->getResult(0);
  token = compute1->getResult(1);
  auto buf2 = genAllocBuffer(rewriter, loc, bufferSz2, token);
  Value buffer2 = buf2.getResult(0);
  token = buf2.getAsyncToken();
  Operation *compute2 = rewriter.create<gpu::SpGEMMWorkEstimationOrComputeOp>(
      loc, indexTp, tokenTp, token, desc, gpu::TransposeMode::NON_TRANSPOSE,
      gpu::TransposeMode::NON_TRANSPOSE, spMatA, spMatB, spMatC, dnCType,
      bufferSz2, buffer2, gpu::SpGEMMWorkEstimationOrComputeKind::COMPUTE);
  token = compute2->getResult(1);

  // Get the actual sizes of C and allocate its coordinates/values.
  Operation *sizes = rewriter.create<gpu::SpMatGetSizeOp>(
      loc, indexTp, indexTp, indexTp, tokenTp, token, spMatC);
  Value nnz = sizes->getResult(2);
  token = sizes->getResult(3);
  auto a2 = genAllocBuffer(rewriter, loc, cTp.getCrdType(), nnz, token);
  colC = a2.getResult(0);
  token = a2.getAsyncToken();
  auto a3 = genAllocBuffer(rewriter, loc, dnCType, nnz, token);
  valC = a3.getResult(0);
  token = a3.getAsyncToken();

  // Update C with the new pointers and copy the final product into C.
  Operation *update = rewriter.create<gpu::SetCsrPointersOp>(
      loc, tokenTp, token, spMatC, rowC, colC, valC);
  token = update->getResult(0);
  Operation *copy = rewriter.create<gpu::SpGEMMCopyOp>(
      loc, tokenTp, token, desc, gpu::TransposeMode::NON_TRANSPOSE,
      gpu::TransposeMode::NON_TRANSPOSE, spMatA, spMatB, spMatC, dnCType);
  token = copy->getResult(0);

  // Allocate buffers on host.
  Value rowH = genHostBuffer(rewriter, loc, cTp.getPosType(), mplus1);
  Value colH = genHostBuffer(rewriter, loc, cTp.getCrdType(), nnz);
  Value valH = genHostBuffer(rewriter, loc, dnCType, nnz);

  // Copy data back to host and free all the resources.
  token = rewriter.create<gpu::SpGEMMDestroyDescrOp>(loc, tokenTp, token, desc)
              .getAsyncToken();
  token = rewriter.create<gpu::DestroySpMatOp>(loc, tokenTp, token, spMatA)
              .getAsyncToken();
  token = rewriter.create<gpu::DestroySpMatOp>(loc, tokenTp, token, spMatB)
              .getAsyncToken();
  token = rewriter.create<gpu::DestroySpMatOp>(loc, tokenTp, token, spMatC)
              .getAsyncToken();
  token = genCopyMemRef(rewriter, loc, rowH, rowC, token);
  token = genCopyMemRef(rewriter, loc, colH, colC, token);
  token = genCopyMemRef(rewriter, loc, valH, valC, token);
  token = genDeallocMemRef(rewriter, loc, rowA, token);
  token = genDeallocMemRef(rewriter, loc, colA, token);
  token = genDeallocMemRef(rewriter, loc, valA, token);
  token = genDeallocMemRef(rewriter, loc, rowB, token);
  token = genDeallocMemRef(rewriter, loc, colB, token);
  token = genDeallocMemRef(rewriter, loc, valB, token);
  token = genDeallocMemRef(rewriter, loc, rowC, token);
  token = genDeallocMemRef(rewriter, loc, colC, token);
  token = genDeallocMemRef(rewriter, loc, valC, token);
  token = genDeallocMemRef(rewriter, loc, buffer1, token);
  token = genDeallocMemRef(rewriter, loc, buffer2, token);
  tokens.push_back(token);
  genBlockingWait(rewriter, loc, tokens);
  tokens.clear();

  // Done: expose the resulting positions/coordinates/values as tensors.
  Value vt = rewriter.create<bufferization::ToTensorOp>(loc, valH);
  Value rt = rewriter.create<bufferization::ToTensorOp>(loc, rowH);
  Value ct = rewriter.create<bufferization::ToTensorOp>(loc, colH);
  return success();
}
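// Match and rewrite a 2:4 structured-sparsity SpMM: the dense operand A is
// viewed through its convert op, pruned/compressed on the device
// (PRUNE_AND_CHECK), and multiplied with B into C using the three buffers
// reported by gpu.spmm_buffer_size.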
static LogicalResult rewrite2To4SpMM(PatternRewriter &rewriter,
                                     linalg::GenericOp op) {
  Location loc = op.getLoc();
  Value A = op.getOperand(0);
  Value B = op.getOperand(1);
  Value C = op.getOperand(2); // we have C = AB
  SmallVector<Value> tokens;

  // View the original dense operand A through its convert op, since pruning
  // and compression must happen before the library ops are visible.
  auto cnv = A.getDefiningOp<ConvertOp>();
  assert(cnv);
  A = cnv.getSource();

  // All inputs should be dense tensors.
  if (!isDenseTensor(A) || !isDenseTensor(B) || !isDenseTensor(C))
    return failure();

  // Start sparse kernel and copy data from host to device.
  Value bufA = genTensorToMemref(rewriter, loc, A);
  Value matA = genAllocCopy(rewriter, loc, bufA, tokens);
  Value bufB = genTensorToMemref(rewriter, loc, B);
  Value matB = genAllocCopy(rewriter, loc, bufB, tokens);
  Value bufC = genTensorToMemref(rewriter, loc, C);
  Value matC = genAllocCopy(rewriter, loc, bufC, tokens);
  genBlockingWait(rewriter, loc, tokens);
  tokens.clear();

  // Create sparse environment and sparse matrix/dense matrix handles.
  Value szm = linalg::createOrFoldDimOp(rewriter, loc, matA, 0);
  Value szk = linalg::createOrFoldDimOp(rewriter, loc, matB, 0);
  Value szn = linalg::createOrFoldDimOp(rewriter, loc, matC, 1);
  Type indexTp = rewriter.getIndexType();
  Type dnTensorHandleTp = rewriter.getType<gpu::SparseDnTensorHandleType>();
  Type spMatHandleTp = rewriter.getType<gpu::SparseSpMatHandleType>();
  Type tokenTp = rewriter.getType<gpu::AsyncTokenType>();
  Value token = genFirstWait(rewriter, loc);
  Operation *spGenA = rewriter.create<gpu::Create2To4SpMatOp>(
      loc, spMatHandleTp, tokenTp, token, szm, szk,
      gpu::Prune2To4SpMatFlag::PRUNE_AND_CHECK, matA);
  Value spMatA = spGenA->getResult(0);
  token = spGenA->getResult(1);
  auto dmatB = rewriter.create<gpu::CreateDnTensorOp>(
      loc, dnTensorHandleTp, tokenTp, token, matB,
      SmallVector<Value>{szk, szn});
  Value dnB = dmatB.getResult(0);
  token = dmatB.getAsyncToken();
  auto dmatC = rewriter.create<gpu::CreateDnTensorOp>(
      loc, dnTensorHandleTp, tokenTp, token, matC,
      SmallVector<Value>{szm, szn});
  Value dnC = dmatC.getResult(0);
  token = dmatC.getAsyncToken();
  auto dmatCType = llvm::cast<ShapedType>(matC.getType()).getElementType();

  // Precompute buffersizes for the SpMM (three buffers for 2:4 sparsity).
  SmallVector<Type> bufferTypes_{indexTp, indexTp, indexTp};
  TypeRange bufferTypes(bufferTypes_);
  auto bufferComp = rewriter.create<gpu::SpMMBufferSizeOp>(
      loc, bufferTypes, tokenTp, token, gpu::TransposeMode::NON_TRANSPOSE,
      gpu::TransposeMode::NON_TRANSPOSE, spMatA, dnB, dnC,
      /*computeType=*/dmatCType);
  token = bufferComp.getAsyncToken();

  // Allocate the buffers on the device.
  Value bufferSz1 = bufferComp.getResult(0);
  auto buf1 = genAllocBuffer(rewriter, loc, bufferSz1, token);
  Value buffer1 = buf1.getResult(0);
  token = buf1.getAsyncToken();
  Value bufferSz2 = bufferComp.getResult(1);
  auto buf2 = genAllocBuffer(rewriter, loc, bufferSz2, token);
  Value buffer2 = buf2.getResult(0);
  token = buf2.getAsyncToken();
  Value bufferSz3 = bufferComp.getResult(2);
  auto buf3 = genAllocBuffer(rewriter, loc, bufferSz3, token);
  Value buffer3 = buf3.getResult(0);
  token = buf3.getAsyncToken();

  // Perform the SpMM.
  auto dnCType = llvm::cast<ShapedType>(matC.getType()).getElementType();
  auto spmmComp = rewriter.create<gpu::SpMMOp>(
      loc, tokenTp, token, spMatA, dnB, dnC, /*computeType=*/dnCType,
      SmallVector<Value>{buffer1, buffer2, buffer3});
  token = spmmComp.getAsyncToken();

  // Copy data back to host and free all the resources.
  token = rewriter.create<gpu::DestroySpMatOp>(loc, tokenTp, token, spMatA)
              .getAsyncToken();
  token = rewriter.create<gpu::DestroyDnTensorOp>(loc, tokenTp, token, dnB)
              .getAsyncToken();
  token = rewriter.create<gpu::DestroyDnTensorOp>(loc, tokenTp, token, dnC)
              .getAsyncToken();
  token = genDeallocMemRef(rewriter, loc, buffer1, token);
  token = genDeallocMemRef(rewriter, loc, buffer2, token);
  token = genDeallocMemRef(rewriter, loc, buffer3, token);
  token = genDeallocMemRef(rewriter, loc, matA, token);
  token = genDeallocMemRef(rewriter, loc, matB, token);
  token = genCopyMemRef(rewriter, loc, bufC, matC, token);
  token = genDeallocMemRef(rewriter, loc, matC, token);
  tokens.push_back(token);
  genBlockingWait(rewriter, loc, tokens);
  tokens.clear();

  // Done.
  rewriter.replaceOpWithNewOp<bufferization::ToTensorOp>(op, bufC);
  return success();
}
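// Match and rewrite a SDDMM kernel: dense A and B are copied in, the sparse
// output C provides the sampling pattern, and its values are updated in place
// on the device before being copied back.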
static LogicalResult rewriteSDDMM(PatternRewriter &rewriter,
                                  linalg::GenericOp op, bool enableRT) {
  Location loc = op.getLoc();
  Value a = op.getOperand(0);
  Value b = op.getOperand(1);
  Value c = op.getOperand(2);
  SmallVector<Value> tokens;

  // Only admissible sparse matrix format (no COO or CSC) and dense matrices.
  SparseTensorType aTp = getSparseTensorType(a);
  SparseTensorType bTp = getSparseTensorType(b);
  SparseTensorType cTp = getSparseTensorType(c);
  auto format = getCuSparseFormat(cTp, bTp, aTp, enableRT, /*isMatVec=*/false);
  if (format == CuSparseFormat::kNone || format == CuSparseFormat::kCOO ||
      format == CuSparseFormat::kCSC)
    return failure();

  // The SDDMM operates in place on C.
  // Start sparse kernel and copy data from host to device.
  Value nseC = rewriter.create<NumberOfEntriesOp>(loc, c);
  Value szm = linalg::createOrFoldDimOp(rewriter, loc, a, 0);
  Value szk = linalg::createOrFoldDimOp(rewriter, loc, a, 1);
  Value szn = linalg::createOrFoldDimOp(rewriter, loc, b, 1);
  Value bufA = genTensorToMemref(rewriter, loc, a);
  Value matA = genAllocCopy(rewriter, loc, bufA, tokens);
  Value bufB = genTensorToMemref(rewriter, loc, b);
  Value matB = genAllocCopy(rewriter, loc, bufB, tokens);
  Value memR = genFirstPosOrCrds(rewriter, loc, c, format, enableRT);
  Value memC = genSecondCrds(rewriter, loc, c, format, enableRT); // or empty
  Value memV = rewriter.create<ToValuesOp>(loc, c);
  Value rowC = genAllocCopy(rewriter, loc, memR, tokens);
  Value colC = memC ? genAllocCopy(rewriter, loc, memC, tokens) : Value();
  Value valC = genAllocCopy(rewriter, loc, memV, tokens);
  genBlockingWait(rewriter, loc, tokens);
  tokens.clear();

  // Create sparse environment and sparse matrix/dense matrix handles.
  Type indexTp = rewriter.getIndexType();
  Type dnMatHandleTp = rewriter.getType<gpu::SparseDnTensorHandleType>();
  Type spMatHandleTp = rewriter.getType<gpu::SparseSpMatHandleType>();
  Type tokenTp = rewriter.getType<gpu::AsyncTokenType>();
  Value token = genFirstWait(rewriter, loc);
  auto dmatA = rewriter.create<gpu::CreateDnTensorOp>(
      loc, dnMatHandleTp, tokenTp, token, matA, SmallVector<Value>{szm, szk});
  Value dnA = dmatA.getResult(0);
  token = dmatA.getAsyncToken();
  auto dmatB = rewriter.create<gpu::CreateDnTensorOp>(
      loc, dnMatHandleTp, tokenTp, token, matB, SmallVector<Value>{szk, szn});
  Value dnB = dmatB.getResult(0);
  token = dmatB.getAsyncToken();
  Operation *spGenC =
      genSpMat(rewriter, loc, cTp, spMatHandleTp, tokenTp, token, szm, szn,
               nseC, rowC, colC, valC, format, enableRT);
  Value spMatC = spGenC->getResult(0);
  token = spGenC->getResult(1);
  auto dnCType = llvm::cast<ShapedType>(c.getType()).getElementType();

  // Precompute buffersize for SDDMM.
  auto bufferComp = rewriter.create<gpu::SDDMMBufferSizeOp>(
      loc, indexTp, tokenTp, token, dnA, dnB, spMatC, dnCType);
  Value bufferSz = bufferComp.getResult(0);
  token = bufferComp.getAsyncToken();
  auto buf = genAllocBuffer(rewriter, loc, bufferSz, token);
  Value buffer = buf.getResult(0);
  token = buf.getAsyncToken();

  // Perform the SDDMM.
  auto sddmmComp = rewriter.create<gpu::SDDMMOp>(loc, tokenTp, token, dnA, dnB,
                                                 spMatC, dnCType, buffer);
  token = sddmmComp.getAsyncToken();

  // Copy data back to host and free all the resources.
  token = rewriter.create<gpu::DestroyDnTensorOp>(loc, tokenTp, token, dnA)
              .getAsyncToken();
  token = rewriter.create<gpu::DestroyDnTensorOp>(loc, tokenTp, token, dnB)
              .getAsyncToken();
  token = rewriter.create<gpu::DestroySpMatOp>(loc, tokenTp, token, spMatC)
              .getAsyncToken();
  token = genDeallocMemRef(rewriter, loc, buffer, token);
  token = genDeallocMemRef(rewriter, loc, matA, token);
  token = genDeallocMemRef(rewriter, loc, matB, token);
  token = genDeallocMemRef(rewriter, loc, rowC, token);
  if (colC)
    token = genDeallocMemRef(rewriter, loc, colC, token);
  token = genCopyMemRef(rewriter, loc, memV, valC, token);
  token = genDeallocMemRef(rewriter, loc, valC, token);
  tokens.push_back(token);
  genBlockingWait(rewriter, loc, tokens);
  tokens.clear();

  // Done.
  rewriter.replaceOpWithNewOp<sparse_tensor::LoadOp>(op, c);
  return success();
}
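// Proof-of-concept rewriter that outlines each admissible outermost
// scf.parallel loop produced by the sparsifier into a GPU kernel, passing all
// loop-invariant scalars and buffers as kernel parameters.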
struct ForallRewriter : public OpRewritePattern<scf::ParallelOp> {
  ForallRewriter(MLIRContext *context, unsigned nT)
      : OpRewritePattern(context), numThreads(nT) {}

  LogicalResult matchAndRewrite(scf::ParallelOp forallOp,
                                PatternRewriter &rewriter) const override {
    // Only accept the outermost, reduction-free, unit-step forall loops
    // generated by the sparsifier.
    if (!forallOp->hasAttr(LoopEmitter::getLoopEmitterLoopAttrName()) ||
        forallOp.getNumReductions() != 0 || forallOp.getNumLoops() != 1 ||
        !matchPattern(forallOp.getLowerBound()[0], m_Zero()) ||
        !matchPattern(forallOp.getStep()[0], m_One()))
      return failure();
    // Collect every value that is computed outside the parallel loop.
    SetVector<Value> invariants; // stable iteration!
    forallOp.walk([&](Operation *op) {
      for (OpOperand &o : op->getOpOperands()) {
        Value val = o.get();
        Block *block;
        if (auto arg = dyn_cast<BlockArgument>(val))
          block = arg.getOwner();
        else
          block = val.getDefiningOp()->getBlock();
        if (!forallOp.getRegion().findAncestorBlockInRegion(*block))
          invariants.insert(val);
      }
    });
    // Outline the loop-invariant values as kernel parameters.
    SmallVector<Value> constants;
    SmallVector<Value> scalars;
    SmallVector<Value> buffers;
    for (Value val : invariants) {
      Type tp = val.getType();
      if (val.getDefiningOp<arith::ConstantOp>())
        constants.push_back(val);
      else if (isa<FloatType>(tp) || tp.isIntOrIndex())
        scalars.push_back(val);
      else if (isa<MemRefType>(tp))
        buffers.push_back(val);
      else
        return failure(); // don't know how to share
    }
    // Pass the outlined values in, outline the kernel, and launch it.
    Location loc = forallOp->getLoc();
    SmallVector<Value> args;
    SmallVector<Value> tokens;
    Value out = genParametersIn(rewriter, loc, scalars, buffers, args, tokens,
                                /*useHostRegistrationForOut=*/false);
    // Set up the GPU module and construct the GPU function.
    auto saveIp = rewriter.saveInsertionPoint();
    ModuleOp topModule = forallOp->getParentOfType<ModuleOp>();
    auto gpuModule = genGPUModule(rewriter, topModule);
    auto gpuFunc = genGPUFunc(rewriter, gpuModule, args);
    genGPUCode(rewriter, gpuFunc, forallOp, constants, scalars, buffers);
    // Launch the kernel, blocking on all open tokens first.
    rewriter.restoreInsertionPoint(saveIp);
    genBlockingWait(rewriter, loc, tokens);
    tokens.clear();
    Value kernelToken =
        genLaunchGPUFunc(rewriter, gpuFunc, args, tokens, numThreads);
    // Finalize the outlined arguments.
    genParametersOut(rewriter, loc, out, kernelToken, scalars, buffers, args,
                     tokens);
    genBlockingWait(rewriter, loc, tokens);
    rewriter.eraseOp(forallOp);
    return success();
  }

private:
  unsigned numThreads;
};
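// Rewriter that recognizes selected linalg.generic kernels on sparse/dense
// operands and forwards them to the specialized cuSPARSE-based rewrites
// above (SpMV, SpMM, SpGEMM, 2:4 SpMM, SDDMM).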
struct LinalgOpRewriter : public OpRewritePattern<linalg::GenericOp> {
  LinalgOpRewriter(MLIRContext *context, bool rt)
      : OpRewritePattern(context), enableRT(rt) {}

  LogicalResult matchAndRewrite(linalg::GenericOp op,
                                PatternRewriter &rewriter) const override {
    if (op.getNumDpsInits() != 1)
      return failure(); // reject multi-output

    const unsigned numLoops = op.getNumLoops();
    const unsigned numTensors = op->getNumOperands();
    const auto iteratorTypes = op.getIteratorTypesArray();
    SmallVector<AffineMap, 4> maps = op.getIndexingMapsArray();

    using MapList = ArrayRef<ArrayRef<AffineExpr>>;
    auto infer = [&](MapList m) {
      return AffineMap::inferFromExprList(m, op.getContext());
    };
    AffineExpr i, j, k;
    bindDims(getContext(), i, j, k);

    // Recognize a SpMV kernel.
    if (numLoops == 2 && numTensors == 3 &&
        linalg::isParallelIterator(iteratorTypes[0]) &&
        linalg::isReductionIterator(iteratorTypes[1]) &&
        maps == infer({{i, j}, {j}, {i}}) && matchSumOfMultOfArgs(op)) {
      return rewriteSpMV(rewriter, op, enableRT);
    }

    // Recognize a SpGEMM, 2:4-SpMM, or SpMM kernel.
    if (numLoops == 3 && numTensors == 3 &&
        linalg::isParallelIterator(iteratorTypes[0]) &&
        linalg::isParallelIterator(iteratorTypes[1]) &&
        linalg::isReductionIterator(iteratorTypes[2]) &&
        maps == infer({{i, k}, {k, j}, {i, j}}) && matchSumOfMultOfArgs(op)) {
      if (!isDenseTensor(op.getOperand(0)) && !isDenseTensor(op.getOperand(1)))
        return rewriteSpGEMM(rewriter, op, enableRT);
      if (isConversionInto24(op.getOperand(0)))
        return rewrite2To4SpMM(rewriter, op);
      return rewriteSpMM(rewriter, op, enableRT);
    }

    // Recognize a SDDMM kernel.
    if (numLoops == 3 && numTensors == 3 &&
        linalg::isParallelIterator(iteratorTypes[0]) &&
        linalg::isParallelIterator(iteratorTypes[1]) &&
        linalg::isReductionIterator(iteratorTypes[2]) &&
        maps == infer({{i, k}, {k, j}, {i, j}}) &&
        matchSumReductionOfMulUnary(op)) {
      return rewriteSDDMM(rewriter, op, enableRT);
    }

    return failure();
  }

private:
  bool enableRT;
};
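// Public entry point that registers the codegen rewrite pattern with a
// pattern set.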
void mlir::populateSparseGPUCodegenPatterns(RewritePatternSet &patterns,
                                            unsigned numThreads) {
  patterns.add<ForallRewriter>(patterns.getContext(), numThreads);
}