// The sparse storage formats supported by the cuSPARSE-based rewriting.
enum class CuSparseFormat {
  kNone,
  kCOO,
  kCSR,
  kCSC,
  kBSR,
};
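/// Marks the given top module as a GPU container module.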
static void markAsGPUContainer(ModuleOp topModule) {
  topModule->setAttr(gpu::GPUDialect::getContainerModuleAttrName(),
                     UnitAttr::get(topModule->getContext()));
}
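/// Finds an existing gpu.module in the top module, or creates a new one.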
static gpu::GPUModuleOp genGPUModule(OpBuilder &builder, ModuleOp topModule) {
  for (auto op : topModule.getBodyRegion().getOps<gpu::GPUModuleOp>())
    return op; // existing GPU module found
  markAsGPUContainer(topModule);
  builder.setInsertionPointToStart(topModule.getBody());
  return builder.create<gpu::GPUModuleOp>(topModule->getLoc(),
                                          "sparse_kernels");
}
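/// Constructs a new GPU kernel ("kernel0", "kernel1", ...) in the given
/// module, with one parameter per outlined value.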
static gpu::GPUFuncOp genGPUFunc(OpBuilder &builder, gpu::GPUModuleOp gpuModule,
                                 SmallVectorImpl<Value> &args) {
  // Generate a unique kernel name within the GPU module.
  unsigned kernelNumber = 0;
  SmallString<16> kernelName;
  do {
    kernelName.clear();
    ("kernel" + Twine(kernelNumber++)).toStringRef(kernelName);
  } while (gpuModule.lookupSymbol(kernelName));
  // Then insert a new kernel with the given argument types into the module.
  builder.setInsertionPointToStart(gpuModule.getBody());
  SmallVector<Type> argsTp;
  for (Value arg : args)
    argsTp.push_back(arg.getType());
  FunctionType type = FunctionType::get(gpuModule->getContext(), argsTp, {});
  auto gpuFunc =
      builder.create<gpu::GPUFuncOp>(gpuModule->getLoc(), kernelName, type);
  gpuFunc->setAttr(gpu::GPUDialect::getKernelFuncAttrName(),
                   builder.getUnitAttr());
  return gpuFunc;
}
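/// Generates code that launches the kernel asynchronously (here with a 1-D
/// launch configuration derived from `numThreads`), returning the launch token.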
static Value genLaunchGPUFunc(OpBuilder &builder, gpu::GPUFuncOp gpuFunc,
                              SmallVectorImpl<Value> &args,
                              SmallVectorImpl<Value> &tokens,
                              unsigned numThreads) {
  Location loc = gpuFunc->getLoc();
  Value one = constantIndex(builder, loc, 1);
  Value numT = constantIndex(builder, loc, numThreads);
  gpu::KernelDim3 gridSize = {one, one, one};
  gpu::KernelDim3 blckSize = {numT, one, one};
  return builder
      .create<gpu::LaunchFuncOp>(loc, gpuFunc, gridSize, blckSize,
                                 /*dynamicSharedMemorySize=*/Value(), args,
                                 builder.getType<gpu::AsyncTokenType>(), tokens)
      .getAsyncToken();
}
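/// Maps the provided host buffer into the device address space
/// (gpu.host_register), passing it as an unranked memref.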
static Value genHostRegisterMemref(OpBuilder &builder, Location loc,
                                   Value mem) {
  MemRefType memTp = cast<MemRefType>(mem.getType());
  UnrankedMemRefType resTp =
      UnrankedMemRefType::get(memTp.getElementType(), /*memorySpace=*/0);
  Value cast = builder.create<memref::CastOp>(loc, resTp, mem);
  builder.create<gpu::HostRegisterOp>(loc, cast);
  return cast;
}
// Unregisters a previously host-registered buffer.
static void genHostUnregisterMemref(OpBuilder &builder, Location loc,
                                    Value cast) {
  builder.create<gpu::HostUnregisterOp>(loc, cast);
}
// Generates a GPU wait that blocks on all given async tokens.
static void genBlockingWait(OpBuilder &builder, Location loc,
                            ValueRange operands) {
  builder.create<gpu::WaitOp>(loc, Type(), operands);
}
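/// Allocates a device buffer with the same shape as the given host memref,
/// chaining the allocation on the incoming async token.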
static gpu::AllocOp genAllocMemRef(OpBuilder &builder, Location loc, Value mem,
                                   Value token) {
  auto tp = cast<ShapedType>(mem.getType());
  auto elemTp = tp.getElementType();
  auto shape = tp.getShape();
  auto memTp = MemRefType::get(shape, elemTp);
  SmallVector<Value> dynamicSizes;
  for (unsigned r = 0, rank = tp.getRank(); r < rank; r++) {
    if (shape[r] == ShapedType::kDynamic) {
      Value dimOp = linalg::createOrFoldDimOp(builder, loc, mem, r);
      dynamicSizes.push_back(dimOp);
    }
  }
  return builder.create<gpu::AllocOp>(loc, TypeRange({memTp, token.getType()}),
                                      token, dynamicSizes, ValueRange());
}
// Allocates a typed buffer of the given size on the host.
static Value genHostBuffer(OpBuilder &builder, Location loc, Type type,
                           Value size) {
  const auto memTp = MemRefType::get({ShapedType::kDynamic}, type);
  return builder.create<memref::AllocOp>(loc, memTp, size).getResult();
}
// Allocates a plain byte buffer on the device (delegates to the typed
// genAllocBuffer overload).
static gpu::AllocOp genAllocBuffer(OpBuilder &builder, Location loc, Value size,
                                   Value token) {
  return genAllocBuffer(builder, loc, builder.getI8Type(), size, token);
}
// Deallocates a device buffer asynchronously.
static Value genDeallocMemRef(OpBuilder &builder, Location loc, Value mem,
                              Value token) {
  return builder.create<gpu::DeallocOp>(loc, token.getType(), token, mem)
      .getAsyncToken();
}
// Copies a memref from src to dst asynchronously.
static Value genCopyMemRef(OpBuilder &builder, Location loc, Value dst,
                           Value src, Value token) {
  return builder.create<gpu::MemcpyOp>(loc, token.getType(), token, dst, src)
      .getAsyncToken();
}
static Value genAllocCopy(OpBuilder &builder, Location loc, Value b,
                          SmallVectorImpl<Value> &tokens) {
  Value firstToken = genFirstWait(builder, loc);
  auto alloc = genAllocMemRef(builder, loc, b, firstToken);
  Value devMem = alloc.getResult(0);
  Value depToken = alloc.getAsyncToken(); // copy-after-alloc
  tokens.push_back(genCopyMemRef(builder, loc, devMem, b, depToken));
  return devMem;
}
// Generates a memref from the given tensor operand.
static Value genTensorToMemref(PatternRewriter &rewriter, Location loc,
                               Value tensor) {
  auto tensorType = llvm::cast<ShapedType>(tensor.getType());
  auto memrefType =
      MemRefType::get(tensorType.getShape(), tensorType.getElementType());
  return rewriter.create<bufferization::ToBufferOp>(loc, memrefType, tensor);
}
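/// Prepares the outlined kernel arguments on the device: scalars are passed
/// by value, buffers are either host-registered (for the output, when
/// requested) or allocated on the device and copied over.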
static Value genParametersIn(OpBuilder &builder, Location loc,
                             SmallVectorImpl<Value> &scalars,
                             SmallVectorImpl<Value> &buffers,
                             SmallVectorImpl<Value> &args,
                             SmallVectorImpl<Value> &tokens,
                             bool useHostRegistrationForOut) {
  Value out;
  // Scalars are passed by value.
  for (Value s : scalars)
    args.push_back(s);
  // Buffers are host-registered (output only) or allocated and copied.
  for (Value b : buffers) {
    if (useHostRegistrationForOut) {
      out = genHostRegisterMemref(builder, loc, b);
      args.push_back(b);
      useHostRegistrationForOut = false;
      continue;
    }
    args.push_back(genAllocCopy(builder, loc, b, tokens));
  }
  return out;
}
// Finalizes the outlined arguments: the output buffer is unregistered or
// copied back to the host, and all device copies are deallocated.
static void genParametersOut(OpBuilder &builder, Location loc, Value out,
                             Value kernelToken, SmallVectorImpl<Value> &scalars,
                             SmallVectorImpl<Value> &buffers,
                             SmallVectorImpl<Value> &args,
                             SmallVectorImpl<Value> &tokens) {
  unsigned base = scalars.size();
  for (unsigned i = base, e = args.size(); i < e; i++) {
    Value firstToken;
    if (i == base && out) {
      // The output buffer was host-registered; simply unregister it.
      genHostUnregisterMemref(builder, loc, out);
      continue;
    }
    if (i == base)
      // Copy the device output back into the original host buffer.
      firstToken =
          genCopyMemRef(builder, loc, buffers[0], args[i], kernelToken);
    else
      firstToken = genFirstWait(builder, loc);
    tokens.push_back(genDeallocMemRef(builder, loc, args[i], firstToken));
  }
}
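/// Clones the body of the scf.parallel loop into the GPU kernel as a
/// grid-stride loop over the original iteration space.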
static void genGPUCode(PatternRewriter &rewriter, gpu::GPUFuncOp gpuFunc,
                       scf::ParallelOp forallOp,
                       SmallVectorImpl<Value> &constants,
                       SmallVectorImpl<Value> &scalars,
                       SmallVectorImpl<Value> &buffers) {
  Location loc = gpuFunc->getLoc();
  Block &block = gpuFunc.getBody().front();
  rewriter.setInsertionPointToStart(&block);

  // Re-generate the constants and recapture all scalar/buffer arguments
  // as block arguments of the kernel.
  unsigned arg = 0;
  IRMapping irMap;
  for (Value c : constants)
    irMap.map(c, rewriter.clone(*c.getDefiningOp())->getResult(0));
  for (Value s : scalars)
    irMap.map(s, block.getArgument(arg++));
  for (Value b : buffers)
    irMap.map(b, block.getArgument(arg++));

  // Map the loop onto a grid-stride iteration:
  //   row = blockIdx.x * blockDim.x + threadIdx.x
  //   inc = blockDim.x * gridDim.x
  Value bid = rewriter.create<gpu::BlockIdOp>(loc, gpu::Dimension::x);
  Value bsz = rewriter.create<gpu::BlockDimOp>(loc, gpu::Dimension::x);
  Value tid = rewriter.create<gpu::ThreadIdOp>(loc, gpu::Dimension::x);
  Value gsz = rewriter.create<gpu::GridDimOp>(loc, gpu::Dimension::x);
  Value mul = rewriter.create<arith::MulIOp>(loc, bid, bsz);
  Value row = rewriter.create<arith::AddIOp>(loc, mul, tid);
  Value inc = rewriter.create<arith::MulIOp>(loc, bsz, gsz);

  // Rewrite the parallel loop as: for (r = row; r < upper; r += inc),
  // cloning the original loop body into the new scf.for.
  Value upper = irMap.lookup(forallOp.getUpperBound()[0]);
  scf::ForOp forOp = rewriter.create<scf::ForOp>(loc, row, upper, inc);
  rewriter.cloneRegionBefore(forallOp.getRegion(), forOp.getRegion(),
                             forOp.getRegion().begin(), irMap);
  // ... (merge the cloned block into the scf.for body) ...

  rewriter.setInsertionPointAfter(forOp);
  rewriter.create<gpu::ReturnOp>(gpuFunc->getLoc());
}
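/// Matches a binary addition of the two block arguments (in either order).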
static bool matchAddOfArgs(Block *block, Value val) {
  if (auto *def = val.getDefiningOp()) {
    if (isa<arith::AddFOp, arith::AddIOp>(def)) {
      Value a = block->getArguments()[0];
      Value b = block->getArguments()[1];
      return (def->getOperand(0) == a && def->getOperand(1) == b) ||
             (def->getOperand(0) == b && def->getOperand(1) == a);
    }
  }
  return false;
}
// Matches a binary multiplication of the two block arguments (in either order).
static bool matchMulOfArgs(Block *block, Value val) {
  if (auto *def = val.getDefiningOp()) {
    if (isa<arith::MulFOp, arith::MulIOp>(def)) {
      Value a = block->getArguments()[0];
      Value b = block->getArguments()[1];
      return (def->getOperand(0) == a && def->getOperand(1) == b) ||
             (def->getOperand(0) == b && def->getOperand(1) == a);
    }
  }
  return false;
}
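/// Matches the body of a linalg.generic that computes x += a * b, i.e. a
/// sum-of-products reduction into the output argument.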
static bool matchSumOfMultOfArgs(linalg::GenericOp op) {
  auto yieldOp = cast<linalg::YieldOp>(op.getRegion().front().getTerminator());
  if (auto *def = yieldOp.getOperand(0).getDefiningOp()) {
    if (isa<arith::AddFOp, arith::AddIOp>(def)) {
      Value x = op.getBlock()->getArguments()[2];
      return (def->getOperand(0) == x &&
              matchMulOfArgs(op.getBlock(), def->getOperand(1))) ||
             (def->getOperand(1) == x &&
              matchMulOfArgs(op.getBlock(), def->getOperand(0)));
    }
  }
  return false;
}
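/// Matches a custom sum reduction over a multiplication, expressed as a
/// sparse_tensor.unary/reduce pair (used by the SDDMM pattern).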
static bool matchSumReductionOfMulUnary(linalg::GenericOp op) {
  auto yieldOp = cast<linalg::YieldOp>(op.getRegion().front().getTerminator());
  // The linalg.generic yields a custom reduce result.
  Value s_out = op.getBlock()->getArguments()[2];
  if (auto redOp =
          yieldOp.getOperand(0).getDefiningOp<sparse_tensor::ReduceOp>()) {
    // The reduce consumes the output.
    Value other;
    if (s_out == redOp->getOperand(0))
      other = redOp->getOperand(1);
    else if (s_out == redOp->getOperand(1))
      other = redOp->getOperand(0);
    else
      return false;
    // The reduce also consumes an unary that consumes the output and does
    // not define an absent value.
    if (auto unOp = other.getDefiningOp<sparse_tensor::UnaryOp>()) {
      if (s_out != unOp->getOperand(0) || !unOp.getAbsentRegion().empty())
        return false;
      // And the bodies are as expected.
      auto yieldUn = cast<sparse_tensor::YieldOp>(
          unOp.getRegion(0).front().getTerminator());
      auto yieldRed = cast<sparse_tensor::YieldOp>(
          redOp.getRegion().front().getTerminator());
      return matchMulOfArgs(op.getBlock(), yieldUn.getOperand(0)) &&
             matchAddOfArgs(&redOp.getRegion().front(), yieldRed.getOperand(0));
    }
  }
  return false;
}
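/// Returns true when the value is an all-dense tensor (same dimension and
/// level rank, with every level dense).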
static bool isDenseTensor(Value v) {
  auto sTp = getSparseTensorType(v);
  return sTp.getDimRank() == sTp.getLvlRank() && sTp.isAllDense();
}
// ... (admissibility tests for the COO/CSR/CSC formats, each of which ends by
//      also requiring suitable position/coordinate metadata) ...
         isAdmissibleMetaData(aTp);
}

// ... (BSR admissibility: cuSPARSE only supports square, non-trivial blocks) ...
  SmallVector<unsigned> dims = getBlockSize(aTp.getDimToLvl());
  assert(dims.size() == 2);
  return dims[0] == dims[1] && dims[0] > 1;
}
// Tests whether the value is produced by a dense-to-2:4 sparse conversion.
static bool isConversionInto24(Value v) {
  if (auto cnv = v.getDefiningOp<ConvertOp>()) {
    Value a = cnv.getResult();
    Value d = cnv.getSource();
    SparseTensorType aTp = getSparseTensorType(a);
    return isDenseTensor(d) && isAdmissible24(aTp);
  }
  return false;
}
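/// Returns a cuSPARSE storage format for the sparse operand when the kernel
/// is admissible for library lowering, or kNone otherwise.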
static CuSparseFormat getCuSparseFormat(SparseTensorType aTp,
                                        SparseTensorType bTp,
                                        SparseTensorType cTp, bool enableRT,
                                        bool isMatVec) {
  // The other operands must not carry a sparse encoding.
  if (bTp.hasEncoding() || cTp.hasEncoding())
    return CuSparseFormat::kNone;
  // Now check for a suitable storage format of the main operand.
  if (isAdmissibleCOO(aTp))
#ifdef CUSPARSE_COO_AOS
    return isMatVec ? CuSparseFormat::kCOO : CuSparseFormat::kNone;
#else
    return enableRT ? CuSparseFormat::kCOO : CuSparseFormat::kNone;
#endif
  if (isAdmissibleCSR(aTp))
    return CuSparseFormat::kCSR;
  if (isAdmissibleCSC(aTp))
    return CuSparseFormat::kCSC;
  if (isAdmissibleBSR(aTp))
    return CuSparseFormat::kBSR;
  return CuSparseFormat::kNone;
}
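/// Generates the first meta-data buffer of the sparse matrix: the coordinate
/// buffer for COO, or the positions buffer for CSR/CSC/BSR.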
static Value genFirstPosOrCrds(OpBuilder &builder, Location loc, Value a,
                               CuSparseFormat format, bool enableRT) {
  if (format == CuSparseFormat::kCOO) {
    // Library uses SoA COO, direct IR uses AoS COO.
    if (enableRT)
      return builder.create<ToCoordinatesOp>(loc, a, 0);
    return builder.create<ToCoordinatesBufferOp>(loc, a);
  }
  // Formats CSR/CSC/BSR use positions at level 1.
  return builder.create<ToPositionsOp>(loc, a, 1);
}
// Generates the second coordinates buffer of the sparse matrix (or an empty
// value when the AoS COO representation already carries them).
static Value genSecondCrds(OpBuilder &builder, Location loc, Value a,
                           CuSparseFormat format, bool enableRT) {
  bool isCOO = format == CuSparseFormat::kCOO;
  if (isCOO && !enableRT)
    return Value(); // nothing needed
  return builder.create<ToCoordinatesOp>(loc, a, 1);
}
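/// Generates the GPU sparse-matrix handle for the selected format (COO, CSR,
/// CSC, or BSR), given the sizes and the copied meta-data/value buffers.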
static Operation *genSpMat(OpBuilder &builder, Location loc,
                           SparseTensorType &aTp, Type handleTp, Type tokenTp,
                           Value token, Value sz1, Value sz2, Value nseA,
                           Value rowA, Value colA, Value valA,
                           CuSparseFormat format, bool enableRT) {
  if (format == CuSparseFormat::kCOO) {
    // Library uses SoA COO, direct IR uses AoS COO.
    if (enableRT) {
      assert(colA);
      return builder.create<gpu::CreateCooOp>(loc, handleTp, tokenTp, token,
                                              sz1, sz2, nseA, rowA, colA, valA);
    }
#ifdef CUSPARSE_COO_AOS
    assert(!colA);
    return builder.create<gpu::CreateCooAoSOp>(loc, handleTp, tokenTp, token,
                                               sz1, sz2, nseA, rowA, valA);
#else
    llvm_unreachable("gpu::CreateCooAoSOp is deprecated");
#endif
  }
  assert(colA);
  if (format == CuSparseFormat::kCSR)
    return builder.create<gpu::CreateCsrOp>(loc, handleTp, tokenTp, token, sz1,
                                            sz2, nseA, rowA, colA, valA);
  if (format == CuSparseFormat::kCSC)
    return builder.create<gpu::CreateCscOp>(loc, handleTp, tokenTp, token, sz1,
                                            sz2, nseA, rowA, colA, valA);
  // BSR requires a bit more work, since all sizes must be expressed in terms
  // of blocks (#block-rows, #block-cols, #nonzero-blocks, block size).
  assert(format == CuSparseFormat::kBSR);
  SmallVector<unsigned> dims = getBlockSize(aTp.getDimToLvl());
  assert(dims.size() == 2 && dims[0] == dims[1]);
  uint64_t b = dims[0];
  Value bSz = constantIndex(builder, loc, b);
  Value bRows = builder.create<arith::DivUIOp>(loc, sz1, bSz);
  Value bCols = builder.create<arith::DivUIOp>(loc, sz2, bSz);
  Value bNum = builder.create<arith::DivUIOp>(
      loc, nseA, constantIndex(builder, loc, b * b));
  return builder.create<gpu::CreateBsrOp>(loc, handleTp, tokenTp, token, bRows,
                                          bCols, bNum, bSz, bSz, rowA, colA,
                                          valA);
}
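/// Match and rewrite an SpMV kernel (y = A * x with sparse A) into
/// asynchronous GPU sparse-library calls.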
static LogicalResult rewriteSpMV(PatternRewriter &rewriter,
                                 linalg::GenericOp op, bool enableRT) {
  Location loc = op.getLoc();
  Value a = op.getOperand(0);
  Value x = op.getOperand(1);
  Value y = op.getOperand(2); // we have y = Ax
  SmallVector<Value> tokens;

  // Only an admissible sparse matrix format and dense vectors (no BSR).
  SparseTensorType aTp = getSparseTensorType(a);
  SparseTensorType xTp = getSparseTensorType(x);
  SparseTensorType yTp = getSparseTensorType(y);
  auto format = getCuSparseFormat(aTp, xTp, yTp, enableRT, /*isMatVec=*/true);
  if (format == CuSparseFormat::kNone || format == CuSparseFormat::kBSR)
    return failure();

  // Start the sparse kernel and copy data from host to device.
  //   a : memR/memC/memV -> rowA,colA,valA
  //   x : memX           -> vecX
  //   y : memY           -> vecY
  Value nseA = rewriter.create<NumberOfEntriesOp>(loc, a);
  Value szY = linalg::createOrFoldDimOp(rewriter, loc, a, 0);
  Value szX = linalg::createOrFoldDimOp(rewriter, loc, a, 1);
  Value memR = genFirstPosOrCrds(rewriter, loc, a, format, enableRT);
  Value memC = genSecondCrds(rewriter, loc, a, format, enableRT); // or empty
  Value memV = rewriter.create<ToValuesOp>(loc, a);
  Value rowA = genAllocCopy(rewriter, loc, memR, tokens);
  Value colA = memC ? genAllocCopy(rewriter, loc, memC, tokens) : Value();
  Value valA = genAllocCopy(rewriter, loc, memV, tokens);
  Value memX = genTensorToMemref(rewriter, loc, x);
  Value vecX = genAllocCopy(rewriter, loc, memX, tokens);
  Value memY = genTensorToMemref(rewriter, loc, y);
  Value vecY = genAllocCopy(rewriter, loc, memY, tokens);
  genBlockingWait(rewriter, loc, tokens);
  tokens.clear();

  // Create the sparse matrix and dense vector handles.
  Type indexTp = rewriter.getIndexType();
  Type dnTensorHandleTp = rewriter.getType<gpu::SparseDnTensorHandleType>();
  Type spmatHandleTp = rewriter.getType<gpu::SparseSpMatHandleType>();
  Type tokenTp = rewriter.getType<gpu::AsyncTokenType>();
  Value token = genFirstWait(rewriter, loc);
  Operation *spGenA =
      genSpMat(rewriter, loc, aTp, spmatHandleTp, tokenTp, token, szY, szX,
               nseA, rowA, colA, valA, format, enableRT);
  Value spMatA = spGenA->getResult(0);
  token = spGenA->getResult(1);
  auto dvecX = rewriter.create<gpu::CreateDnTensorOp>(
      loc, dnTensorHandleTp, tokenTp, token, vecX, szX);
  Value dnX = dvecX.getResult(0);
  token = dvecX.getAsyncToken();
  auto dvecY = rewriter.create<gpu::CreateDnTensorOp>(
      loc, dnTensorHandleTp, tokenTp, token, vecY, szY);
  Value dnY = dvecY.getResult(0);
  token = dvecY.getAsyncToken();
  auto dnYType = llvm::cast<ShapedType>(y.getType()).getElementType();

  // Precompute the buffer size and allocate the work buffer for SpMV.
  auto bufferComp = rewriter.create<gpu::SpMVBufferSizeOp>(
      loc, indexTp, tokenTp, token, spMatA, dnX, dnY,
      /*computeType=*/dnYType);
  Value bufferSz = bufferComp.getResult(0);
  token = bufferComp.getAsyncToken();
  auto buf = genAllocBuffer(rewriter, loc, bufferSz, token);
  Value buffer = buf.getResult(0);
  token = buf.getAsyncToken();

  // Perform the SpMV.
  auto spmvComp = rewriter.create<gpu::SpMVOp>(
      loc, tokenTp, token, spMatA, dnX, dnY, dnYType, buffer);
  token = spmvComp.getAsyncToken();

  // Copy data back to host and free all the resources.
  token = rewriter.create<gpu::DestroySpMatOp>(loc, tokenTp, token, spMatA)
              .getAsyncToken();
  token = rewriter.create<gpu::DestroyDnTensorOp>(loc, tokenTp, token, dnX)
              .getAsyncToken();
  token = rewriter.create<gpu::DestroyDnTensorOp>(loc, tokenTp, token, dnY)
              .getAsyncToken();
  token = genDeallocMemRef(rewriter, loc, rowA, token);
  if (colA)
    token = genDeallocMemRef(rewriter, loc, colA, token);
  token = genDeallocMemRef(rewriter, loc, valA, token);
  token = genDeallocMemRef(rewriter, loc, buffer, token);
  token = genDeallocMemRef(rewriter, loc, vecX, token);
  token = genCopyMemRef(rewriter, loc, memY, vecY, token);
  token = genDeallocMemRef(rewriter, loc, vecY, token);
  tokens.push_back(token);
  genBlockingWait(rewriter, loc, tokens);
  tokens.clear();

  // ... (replace the linalg.generic with the updated output tensor) ...
  return success();
}
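/// Match and rewrite an SpMM kernel (C = A * B with sparse A) into
/// asynchronous GPU sparse-library calls.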
static LogicalResult rewriteSpMM(PatternRewriter &rewriter,
                                 linalg::GenericOp op, bool enableRT) {
  Location loc = op.getLoc();
  Value a = op.getOperand(0);
  Value b = op.getOperand(1);
  Value c = op.getOperand(2); // we have C = AB
  SmallVector<Value> tokens;

  // Only an admissible sparse matrix format and dense matrices (no BSR).
  SparseTensorType aTp = getSparseTensorType(a);
  SparseTensorType bTp = getSparseTensorType(b);
  SparseTensorType cTp = getSparseTensorType(c);
  auto format = getCuSparseFormat(aTp, bTp, cTp, enableRT, /*isMatVec=*/false);
  if (format == CuSparseFormat::kNone || format == CuSparseFormat::kBSR)
    return failure();

  // Start the sparse kernel and copy data from host to device.
  //   a : memR/memC/memV -> rowA,colA,valA
  //   b : bufB           -> matB
  //   c : bufC           -> matC
  Value nseA = rewriter.create<NumberOfEntriesOp>(loc, a);
  Value szm = linalg::createOrFoldDimOp(rewriter, loc, a, 0);
  Value szk = linalg::createOrFoldDimOp(rewriter, loc, a, 1);
  Value szn = linalg::createOrFoldDimOp(rewriter, loc, b, 1);
  Value memR = genFirstPosOrCrds(rewriter, loc, a, format, enableRT);
  Value memC = genSecondCrds(rewriter, loc, a, format, enableRT); // or empty
  Value memV = rewriter.create<ToValuesOp>(loc, a);
  Value rowA = genAllocCopy(rewriter, loc, memR, tokens);
  Value colA = memC ? genAllocCopy(rewriter, loc, memC, tokens) : Value();
  Value valA = genAllocCopy(rewriter, loc, memV, tokens);
  Value bufB = genTensorToMemref(rewriter, loc, b);
  Value matB = genAllocCopy(rewriter, loc, bufB, tokens);
  Value bufC = genTensorToMemref(rewriter, loc, c);
  Value matC = genAllocCopy(rewriter, loc, bufC, tokens);
  genBlockingWait(rewriter, loc, tokens);
  tokens.clear();

  // Create the sparse matrix and dense matrix handles.
  Type indexTp = rewriter.getIndexType();
  Type dnTensorHandleTp = rewriter.getType<gpu::SparseDnTensorHandleType>();
  Type spMatHandleTp = rewriter.getType<gpu::SparseSpMatHandleType>();
  Type tokenTp = rewriter.getType<gpu::AsyncTokenType>();
  Value token = genFirstWait(rewriter, loc);
  Operation *spGenA =
      genSpMat(rewriter, loc, aTp, spMatHandleTp, tokenTp, token, szm, szk,
               nseA, rowA, colA, valA, format, enableRT);
  Value spMatA = spGenA->getResult(0);
  token = spGenA->getResult(1);
  auto dmatB = rewriter.create<gpu::CreateDnTensorOp>(
      loc, dnTensorHandleTp, tokenTp, token, matB,
      SmallVector<Value>{szk, szn});
  Value dnB = dmatB.getResult(0);
  token = dmatB.getAsyncToken();
  auto dmatC = rewriter.create<gpu::CreateDnTensorOp>(
      loc, dnTensorHandleTp, tokenTp, token, matC,
      SmallVector<Value>{szm, szn});
  Value dnC = dmatC.getResult(0);
  token = dmatC.getAsyncToken();
  auto dmatCType = llvm::cast<ShapedType>(c.getType()).getElementType();

  // Precompute the buffer size and allocate the work buffer for SpMM.
  auto bufferComp = rewriter.create<gpu::SpMMBufferSizeOp>(
      loc, indexTp, tokenTp, token, spMatA, dnB, dnC,
      /*computeType=*/dmatCType);
  Value bufferSz = bufferComp.getResult(0);
  token = bufferComp.getAsyncToken();
  auto buf = genAllocBuffer(rewriter, loc, bufferSz, token);
  Value buffer = buf.getResult(0);
  token = buf.getAsyncToken();
  auto dnCType = llvm::cast<ShapedType>(c.getType()).getElementType();

  // Perform the SpMM.
  auto spmmComp = rewriter.create<gpu::SpMMOp>(
      loc, tokenTp, token, spMatA, dnB, dnC, dnCType, buffer);
  token = spmmComp.getAsyncToken();

  // Copy data back to host and free all the resources.
  token = rewriter.create<gpu::DestroySpMatOp>(loc, tokenTp, token, spMatA)
              .getAsyncToken();
  token = rewriter.create<gpu::DestroyDnTensorOp>(loc, tokenTp, token, dnB)
              .getAsyncToken();
  token = rewriter.create<gpu::DestroyDnTensorOp>(loc, tokenTp, token, dnC)
              .getAsyncToken();
  token = genDeallocMemRef(rewriter, loc, rowA, token);
  if (colA)
    token = genDeallocMemRef(rewriter, loc, colA, token);
  token = genDeallocMemRef(rewriter, loc, valA, token);
  token = genDeallocMemRef(rewriter, loc, buffer, token);
  token = genDeallocMemRef(rewriter, loc, matB, token);
  token = genCopyMemRef(rewriter, loc, bufC, matC, token);
  token = genDeallocMemRef(rewriter, loc, matC, token);
  tokens.push_back(token);
  genBlockingWait(rewriter, loc, tokens);
  tokens.clear();

  // ... (replace the linalg.generic with the updated output tensor) ...
  return success();
}
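/// Match and rewrite an SpGEMM kernel (CSR C = CSR A * CSR B) using the
/// multi-stage cuSPARSE SpGEMM API (work estimation, compute, copy).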
static LogicalResult rewriteSpGEMM(PatternRewriter &rewriter,
                                   linalg::GenericOp op, bool enableRT) {
  Location loc = op.getLoc();
  Value a = op.getOperand(0);
  Value b = op.getOperand(1);
  Value c = op.getOperand(2); // we have C = AB
  SmallVector<Value> tokens;

  // Only CSR <- CSR x CSR is supported.
  auto format = CuSparseFormat::kCSR;
  SparseTensorType aTp = getSparseTensorType(a);
  SparseTensorType bTp = getSparseTensorType(b);
  SparseTensorType cTp = getSparseTensorType(c);
  if (!isAdmissibleCSR(aTp) || !isAdmissibleCSR(bTp) || !isAdmissibleCSR(cTp))
    return failure();

  // Start the sparse kernel and copy data from host to device.
  //   a : amemR/amemC/amemV -> rowA,colA,valA
  //   b : bmemR/bmemC/bmemV -> rowB,colB,valB
  //   c : materializes on the device
  auto dnCType = cTp.getElementType();
  Value nseA = rewriter.create<NumberOfEntriesOp>(loc, a);
  Value nseB = rewriter.create<NumberOfEntriesOp>(loc, b);
  Value szm = linalg::createOrFoldDimOp(rewriter, loc, a, 0);
  Value szk = linalg::createOrFoldDimOp(rewriter, loc, a, 1);
  Value szn = linalg::createOrFoldDimOp(rewriter, loc, b, 1);
  Value amemR = genFirstPosOrCrds(rewriter, loc, a, format, enableRT);
  Value amemC = genSecondCrds(rewriter, loc, a, format, enableRT);
  Value amemV = rewriter.create<ToValuesOp>(loc, a);
  Value bmemR = genFirstPosOrCrds(rewriter, loc, b, format, enableRT);
  Value bmemC = genSecondCrds(rewriter, loc, b, format, enableRT);
  Value bmemV = rewriter.create<ToValuesOp>(loc, b);
  Value rowA = genAllocCopy(rewriter, loc, amemR, tokens);
  Value colA = genAllocCopy(rewriter, loc, amemC, tokens);
  Value valA = genAllocCopy(rewriter, loc, amemV, tokens);
  Value rowB = genAllocCopy(rewriter, loc, bmemR, tokens);
  Value colB = genAllocCopy(rewriter, loc, bmemC, tokens);
  Value valB = genAllocCopy(rewriter, loc, bmemV, tokens);
  genBlockingWait(rewriter, loc, tokens);
  tokens.clear();

  // Create the sparse matrix handles for A and B.
  Type indexTp = rewriter.getIndexType();
  Type spmatHandleTp = rewriter.getType<gpu::SparseSpMatHandleType>();
  Type descTp = rewriter.getType<gpu::SparseSpGEMMOpHandleType>();
  Type tokenTp = rewriter.getType<gpu::AsyncTokenType>();
  Value token = genFirstWait(rewriter, loc);
  Operation *spGenA =
      genSpMat(rewriter, loc, aTp, spmatHandleTp, tokenTp, token, szm, szk,
               nseA, rowA, colA, valA, format, enableRT);
  Value spMatA = spGenA->getResult(0);
  token = spGenA->getResult(1);
  Operation *spGenB =
      genSpMat(rewriter, loc, bTp, spmatHandleTp, tokenTp, token, szk, szn,
               nseB, rowB, colB, valB, format, enableRT);
  Value spMatB = spGenB->getResult(0);
  token = spGenB->getResult(1);

  // Sparse matrix C materializes (assuming beta == 0): start with empty
  // coordinate/value buffers and a positions buffer of length m + 1.
  Value zero = constantIndex(rewriter, loc, 0);
  Value one = constantIndex(rewriter, loc, 1);
  Value mplus1 = rewriter.create<arith::AddIOp>(loc, szm, one);
  auto e1 = genAllocBuffer(rewriter, loc, cTp.getPosType(), mplus1, token);
  Value rowC = e1.getResult(0);
  token = e1.getAsyncToken();
  auto e2 = genAllocBuffer(rewriter, loc, cTp.getCrdType(), zero, token);
  Value colC = e2.getResult(0); // no free needed
  token = e2.getAsyncToken();
  auto e3 = genAllocBuffer(rewriter, loc, dnCType, zero, token);
  Value valC = e3.getResult(0); // no free needed
  token = e3.getAsyncToken();
  Operation *spGenC =
      genSpMat(rewriter, loc, cTp, spmatHandleTp, tokenTp, token, szm, szn,
               zero, rowC, colC, valC, format, enableRT);
  Value spMatC = spGenC->getResult(0);
  token = spGenC->getResult(1);

  // Precompute the buffer sizes for SpGEMM (work estimation phase).
  Operation *descOp =
      rewriter.create<gpu::SpGEMMCreateDescrOp>(loc, descTp, tokenTp, token);
  Value desc = descOp->getResult(0);
  token = descOp->getResult(1);
  Operation *work1 = rewriter.create<gpu::SpGEMMWorkEstimationOrComputeOp>(
      loc, indexTp, tokenTp, token, desc, gpu::TransposeMode::NON_TRANSPOSE,
      gpu::TransposeMode::NON_TRANSPOSE, spMatA, spMatB, spMatC, dnCType, zero,
      valC, gpu::SpGEMMWorkEstimationOrComputeKind::WORK_ESTIMATION);
  Value bufferSz1 = work1->getResult(0);
  token = work1->getResult(1);
  auto buf1 = genAllocBuffer(rewriter, loc, bufferSz1, token);
  Value buffer1 = buf1.getResult(0);
  token = buf1.getAsyncToken();
  Operation *work2 = rewriter.create<gpu::SpGEMMWorkEstimationOrComputeOp>(
      loc, indexTp, tokenTp, token, desc, gpu::TransposeMode::NON_TRANSPOSE,
      gpu::TransposeMode::NON_TRANSPOSE, spMatA, spMatB, spMatC, dnCType,
      bufferSz1, buffer1,
      gpu::SpGEMMWorkEstimationOrComputeKind::WORK_ESTIMATION);
  token = work2->getResult(1);

  // Compute phase.
  Operation *compute1 = rewriter.create<gpu::SpGEMMWorkEstimationOrComputeOp>(
      loc, indexTp, tokenTp, token, desc, gpu::TransposeMode::NON_TRANSPOSE,
      gpu::TransposeMode::NON_TRANSPOSE, spMatA, spMatB, spMatC, dnCType, zero,
      valC, gpu::SpGEMMWorkEstimationOrComputeKind::COMPUTE);
  Value bufferSz2 = compute1->getResult(0);
  token = compute1->getResult(1);
  auto buf2 = genAllocBuffer(rewriter, loc, bufferSz2, token);
  Value buffer2 = buf2.getResult(0);
  token = buf2.getAsyncToken();
  Operation *compute2 = rewriter.create<gpu::SpGEMMWorkEstimationOrComputeOp>(
      loc, indexTp, tokenTp, token, desc, gpu::TransposeMode::NON_TRANSPOSE,
      gpu::TransposeMode::NON_TRANSPOSE, spMatA, spMatB, spMatC, dnCType,
      bufferSz2, buffer2, gpu::SpGEMMWorkEstimationOrComputeKind::COMPUTE);
  token = compute2->getResult(1);

  // Query the resulting sizes and allocate the final coordinate/value buffers.
  Operation *sizes = rewriter.create<gpu::SpMatGetSizeOp>(
      loc, indexTp, indexTp, indexTp, tokenTp, token, spMatC);
  Value nnz = sizes->getResult(2);
  token = sizes->getResult(3);
  auto a2 = genAllocBuffer(rewriter, loc, cTp.getCrdType(), nnz, token);
  colC = a2.getResult(0);
  token = a2.getAsyncToken();
  auto a3 = genAllocBuffer(rewriter, loc, dnCType, nnz, token);
  valC = a3.getResult(0);
  token = a3.getAsyncToken();

  // Update C with the new pointers and copy the final product into C.
  Operation *update = rewriter.create<gpu::SetCsrPointersOp>(
      loc, tokenTp, token, spMatC, rowC, colC, valC);
  token = update->getResult(0);
  Operation *copy = rewriter.create<gpu::SpGEMMCopyOp>(
      loc, tokenTp, token, desc, gpu::TransposeMode::NON_TRANSPOSE,
      gpu::TransposeMode::NON_TRANSPOSE, spMatA, spMatB, spMatC, dnCType);
  token = copy->getResult(0);

  // Allocate buffers on the host to receive the result.
  Value rowH = genHostBuffer(rewriter, loc, cTp.getPosType(), mplus1);
  Value colH = genHostBuffer(rewriter, loc, cTp.getCrdType(), nnz);
  Value valH = genHostBuffer(rewriter, loc, dnCType, nnz);

  // Copy data back to host and free all the resources.
  token = rewriter.create<gpu::SpGEMMDestroyDescrOp>(loc, tokenTp, token, desc)
              .getAsyncToken();
  token = rewriter.create<gpu::DestroySpMatOp>(loc, tokenTp, token, spMatA)
              .getAsyncToken();
  token = rewriter.create<gpu::DestroySpMatOp>(loc, tokenTp, token, spMatB)
              .getAsyncToken();
  token = rewriter.create<gpu::DestroySpMatOp>(loc, tokenTp, token, spMatC)
              .getAsyncToken();
  token = genCopyMemRef(rewriter, loc, rowH, rowC, token);
  token = genCopyMemRef(rewriter, loc, colH, colC, token);
  token = genCopyMemRef(rewriter, loc, valH, valC, token);
  token = genDeallocMemRef(rewriter, loc, rowA, token);
  token = genDeallocMemRef(rewriter, loc, colA, token);
  token = genDeallocMemRef(rewriter, loc, valA, token);
  token = genDeallocMemRef(rewriter, loc, rowB, token);
  token = genDeallocMemRef(rewriter, loc, colB, token);
  token = genDeallocMemRef(rewriter, loc, valB, token);
  token = genDeallocMemRef(rewriter, loc, rowC, token);
  token = genDeallocMemRef(rewriter, loc, colC, token);
  token = genDeallocMemRef(rewriter, loc, valC, token);
  token = genDeallocMemRef(rewriter, loc, buffer1, token);
  token = genDeallocMemRef(rewriter, loc, buffer2, token);
  tokens.push_back(token);
  genBlockingWait(rewriter, loc, tokens);
  tokens.clear();

  // Convert the host buffers into tensors and assemble the sparse output.
  Value vt = rewriter.create<bufferization::ToTensorOp>(
      loc, memref::getTensorTypeFromMemRefType(valH.getType()), valH);
  Value rt = rewriter.create<bufferization::ToTensorOp>(
      loc, memref::getTensorTypeFromMemRefType(rowH.getType()), rowH);
  Value ct = rewriter.create<bufferization::ToTensorOp>(
      loc, memref::getTensorTypeFromMemRefType(colH.getType()), colH);
  // ... (assemble rt/ct/vt into the sparse output and replace the linalg op)
  return success();
}
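/// Match and rewrite a 2:4 structured-sparsity SpMM kernel, pruning the dense
/// A operand on the device (cuSPARSELt path).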
static LogicalResult rewrite2To4SpMM(PatternRewriter &rewriter,
                                     linalg::GenericOp op) {
  Location loc = op.getLoc();
  Value A = op.getOperand(0);
  Value B = op.getOperand(1);
  Value C = op.getOperand(2); // we have C = AB
  SmallVector<Value> tokens;

  // The cuSPARSELt API currently only allows pruning and compression to
  // occur before the associated matmul, so consume the sparse conversion
  // and use the original dense input instead.
  auto cnv = A.getDefiningOp<ConvertOp>();
  assert(cnv);
  A = cnv.getSource();

  // All inputs should be dense tensors.
  if (!isDenseTensor(A) || !isDenseTensor(B) || !isDenseTensor(C))
    return failure();

  // Start the sparse kernel and copy data from host to device.
  //   a : bufA -> matA
  //   b : bufB -> matB
  //   c : bufC -> matC
  Value bufA = genTensorToMemref(rewriter, loc, A);
  Value matA = genAllocCopy(rewriter, loc, bufA, tokens);
  Value bufB = genTensorToMemref(rewriter, loc, B);
  Value matB = genAllocCopy(rewriter, loc, bufB, tokens);
  Value bufC = genTensorToMemref(rewriter, loc, C);
  Value matC = genAllocCopy(rewriter, loc, bufC, tokens);
  genBlockingWait(rewriter, loc, tokens);
  tokens.clear();

  // Create the pruned 2:4 sparse handle for A and the dense handles for B/C.
  Value szm = linalg::createOrFoldDimOp(rewriter, loc, matA, 0);
  Value szk = linalg::createOrFoldDimOp(rewriter, loc, matB, 0);
  Value szn = linalg::createOrFoldDimOp(rewriter, loc, matC, 1);
  Type indexTp = rewriter.getIndexType();
  Type dnTensorHandleTp = rewriter.getType<gpu::SparseDnTensorHandleType>();
  Type spMatHandleTp = rewriter.getType<gpu::SparseSpMatHandleType>();
  Type tokenTp = rewriter.getType<gpu::AsyncTokenType>();
  Value token = genFirstWait(rewriter, loc);
  Operation *spGenA = rewriter.create<gpu::Create2To4SpMatOp>(
      loc, spMatHandleTp, tokenTp, token, szm, szk,
      gpu::Prune2To4SpMatFlag::PRUNE_AND_CHECK, matA);
  Value spMatA = spGenA->getResult(0);
  token = spGenA->getResult(1);
  auto dmatB = rewriter.create<gpu::CreateDnTensorOp>(
      loc, dnTensorHandleTp, tokenTp, token, matB,
      SmallVector<Value>{szk, szn});
  Value dnB = dmatB.getResult(0);
  token = dmatB.getAsyncToken();
  auto dmatC = rewriter.create<gpu::CreateDnTensorOp>(
      loc, dnTensorHandleTp, tokenTp, token, matC,
      SmallVector<Value>{szm, szn});
  Value dnC = dmatC.getResult(0);
  token = dmatC.getAsyncToken();
  auto dmatCType = llvm::cast<ShapedType>(matC.getType()).getElementType();

  // Precompute the three buffer sizes needed by the 2:4 SpMM.
  SmallVector<Type> bufferTypes{indexTp, indexTp, indexTp};
  auto bufferComp = rewriter.create<gpu::SpMMBufferSizeOp>(
      loc, bufferTypes, tokenTp, token, gpu::TransposeMode::NON_TRANSPOSE,
      gpu::TransposeMode::NON_TRANSPOSE, spMatA, dnB, dnC,
      /*computeType=*/dmatCType);
  token = bufferComp.getAsyncToken();

  // Allocate the work buffers.
  Value bufferSz1 = bufferComp.getResult(0);
  auto buf1 = genAllocBuffer(rewriter, loc, bufferSz1, token);
  Value buffer1 = buf1.getResult(0);
  token = buf1.getAsyncToken();
  Value bufferSz2 = bufferComp.getResult(1);
  auto buf2 = genAllocBuffer(rewriter, loc, bufferSz2, token);
  Value buffer2 = buf2.getResult(0);
  token = buf2.getAsyncToken();
  Value bufferSz3 = bufferComp.getResult(2);
  auto buf3 = genAllocBuffer(rewriter, loc, bufferSz3, token);
  Value buffer3 = buf3.getResult(0);
  token = buf3.getAsyncToken();

  // Perform the SpMM.
  auto dnCType = llvm::cast<ShapedType>(matC.getType()).getElementType();
  auto spmmComp = rewriter.create<gpu::SpMMOp>(
      loc, tokenTp, token, spMatA, dnB, dnC, dnCType,
      SmallVector<Value>{buffer1, buffer2, buffer3});
  token = spmmComp.getAsyncToken();

  // Copy data back to host and free all the resources.
  token = rewriter.create<gpu::DestroySpMatOp>(loc, tokenTp, token, spMatA)
              .getAsyncToken();
  token = rewriter.create<gpu::DestroyDnTensorOp>(loc, tokenTp, token, dnB)
              .getAsyncToken();
  token = rewriter.create<gpu::DestroyDnTensorOp>(loc, tokenTp, token, dnC)
              .getAsyncToken();
  token = genDeallocMemRef(rewriter, loc, buffer1, token);
  token = genDeallocMemRef(rewriter, loc, buffer2, token);
  token = genDeallocMemRef(rewriter, loc, buffer3, token);
  token = genDeallocMemRef(rewriter, loc, matA, token);
  token = genDeallocMemRef(rewriter, loc, matB, token);
  token = genCopyMemRef(rewriter, loc, bufC, matC, token);
  token = genDeallocMemRef(rewriter, loc, matC, token);
  tokens.push_back(token);
  genBlockingWait(rewriter, loc, tokens);
  tokens.clear();

  // ... (replace the linalg.generic with the updated output tensor) ...
  return success();
}
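/// Match and rewrite an SDDMM kernel (sampled dense-dense matmul into a
/// sparse output C) into asynchronous GPU sparse-library calls.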
static LogicalResult rewriteSDDMM(PatternRewriter &rewriter,
                                  linalg::GenericOp op, bool enableRT) {
  Location loc = op.getLoc();
  Value a = op.getOperand(0);
  Value b = op.getOperand(1);
  Value c = op.getOperand(2);
  SmallVector<Value> tokens;

  // Only an admissible sparse matrix format for C (no COO or CSC) and
  // dense matrices for A and B.
  SparseTensorType aTp = getSparseTensorType(a);
  SparseTensorType bTp = getSparseTensorType(b);
  SparseTensorType cTp = getSparseTensorType(c);
  auto format = getCuSparseFormat(cTp, bTp, aTp, enableRT, /*isMatVec=*/false);
  if (format == CuSparseFormat::kNone || format == CuSparseFormat::kCOO ||
      format == CuSparseFormat::kCSC)
    return failure();

  // The SDDMM operates in place on C.
  // Start the sparse kernel and copy data from host to device.
  //   a : bufA -> matA
  //   b : bufB -> matB
  //   c : memR/memC/memV -> rowC,colC,valC
  Value nseC = rewriter.create<NumberOfEntriesOp>(loc, c);
  Value szm = linalg::createOrFoldDimOp(rewriter, loc, a, 0);
  Value szk = linalg::createOrFoldDimOp(rewriter, loc, a, 1);
  Value szn = linalg::createOrFoldDimOp(rewriter, loc, b, 1);
  Value bufA = genTensorToMemref(rewriter, loc, a);
  Value matA = genAllocCopy(rewriter, loc, bufA, tokens);
  Value bufB = genTensorToMemref(rewriter, loc, b);
  Value matB = genAllocCopy(rewriter, loc, bufB, tokens);
  Value memR = genFirstPosOrCrds(rewriter, loc, c, format, enableRT);
  Value memC = genSecondCrds(rewriter, loc, c, format, enableRT); // or empty
  Value memV = rewriter.create<ToValuesOp>(loc, c);
  Value rowC = genAllocCopy(rewriter, loc, memR, tokens);
  Value colC = memC ? genAllocCopy(rewriter, loc, memC, tokens) : Value();
  Value valC = genAllocCopy(rewriter, loc, memV, tokens);
  genBlockingWait(rewriter, loc, tokens);
  tokens.clear();

  // Create the dense matrix handles for A/B and the sparse handle for C.
  Type indexTp = rewriter.getIndexType();
  Type dnTensorHandleTp = rewriter.getType<gpu::SparseDnTensorHandleType>();
  Type spMatHandleTp = rewriter.getType<gpu::SparseSpMatHandleType>();
  Type tokenTp = rewriter.getType<gpu::AsyncTokenType>();
  Value token = genFirstWait(rewriter, loc);
  auto dmatA = rewriter.create<gpu::CreateDnTensorOp>(
      loc, dnTensorHandleTp, tokenTp, token, matA,
      SmallVector<Value>{szm, szk});
  Value dnA = dmatA.getResult(0);
  token = dmatA.getAsyncToken();
  auto dmatB = rewriter.create<gpu::CreateDnTensorOp>(
      loc, dnTensorHandleTp, tokenTp, token, matB,
      SmallVector<Value>{szk, szn});
  Value dnB = dmatB.getResult(0);
  token = dmatB.getAsyncToken();
  Operation *spGenC =
      genSpMat(rewriter, loc, cTp, spMatHandleTp, tokenTp, token, szm, szn,
               nseC, rowC, colC, valC, format, enableRT);
  Value spMatC = spGenC->getResult(0);
  token = spGenC->getResult(1);
  auto dnCType = llvm::cast<ShapedType>(c.getType()).getElementType();

  // Precompute the buffer size and allocate the work buffer for SDDMM.
  auto bufferComp = rewriter.create<gpu::SDDMMBufferSizeOp>(
      loc, indexTp, tokenTp, token, dnA, dnB, spMatC, dnCType);
  Value bufferSz = bufferComp.getResult(0);
  token = bufferComp.getAsyncToken();
  auto buf = genAllocBuffer(rewriter, loc, bufferSz, token);
  Value buffer = buf.getResult(0);
  token = buf.getAsyncToken();

  // Perform the SDDMM.
  auto sddmmComp = rewriter.create<gpu::SDDMMOp>(loc, tokenTp, token, dnA, dnB,
                                                 spMatC, dnCType, buffer);
  token = sddmmComp.getAsyncToken();

  // Copy data back to host and free all the resources.
  token = rewriter.create<gpu::DestroyDnTensorOp>(loc, tokenTp, token, dnA)
              .getAsyncToken();
  token = rewriter.create<gpu::DestroyDnTensorOp>(loc, tokenTp, token, dnB)
              .getAsyncToken();
  token = rewriter.create<gpu::DestroySpMatOp>(loc, tokenTp, token, spMatC)
              .getAsyncToken();
  token = genDeallocMemRef(rewriter, loc, buffer, token);
  token = genDeallocMemRef(rewriter, loc, matA, token);
  token = genDeallocMemRef(rewriter, loc, matB, token);
  token = genDeallocMemRef(rewriter, loc, rowC, token);
  if (colC)
    token = genDeallocMemRef(rewriter, loc, colC, token);
  token = genCopyMemRef(rewriter, loc, memV, valC, token);
  token = genDeallocMemRef(rewriter, loc, valC, token);
  tokens.push_back(token);
  genBlockingWait(rewriter, loc, tokens);
  tokens.clear();

  // ... (reload the updated sparse tensor C and replace the linalg op) ...
  return success();
}
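// Proof-of-concept rewriter that outlines a sparsifier-generated scf.parallel
// loop into a GPU kernel and replaces the loop with an asynchronous launch
// (the ForallRewriter pattern).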
  LogicalResult matchAndRewrite(scf::ParallelOp forallOp,
                                PatternRewriter &rewriter) const override {
    // Reject inadmissible loops: only the 1-D, zero-based, unit-step parallel
    // loops emitted by the sparsifier are mapped onto GPU threads.
    if (!forallOp->hasAttr(LoopEmitter::getLoopEmitterLoopAttrName()) ||
        forallOp.getNumReductions() != 0 || forallOp.getNumLoops() != 1 ||
        !matchPattern(forallOp.getLowerBound()[0], m_Zero()) ||
        !matchPattern(forallOp.getStep()[0], m_One()))
      return failure();
    // Collect every value that is computed outside the parallel loop.
    SetVector<Value> invariants; // stable iteration!
    forallOp.walk([&](Operation *op) {
      for (OpOperand &o : op->getOpOperands()) {
        Value val = o.get();
        Block *block;
        if (auto arg = dyn_cast<BlockArgument>(val))
          block = arg.getOwner();
        else
          block = val.getDefiningOp()->getBlock();
        if (!forallOp.getRegion().findAncestorBlockInRegion(*block))
          invariants.insert(val);
      }
    });
    // Outline the invariants as constants, scalars, or buffers.
    SmallVector<Value> constants, scalars, buffers;
    for (Value val : invariants) {
      Type tp = val.getType();
      if (val.getDefiningOp<arith::ConstantOp>())
        constants.push_back(val);
      else if (isa<FloatType>(tp) || tp.isIntOrIndex())
        scalars.push_back(val);
      else if (isa<MemRefType>(tp))
        buffers.push_back(val);
      else
        return failure(); // don't know how to share
    }
    // Copy the parameters to the device, outline the loop body into a GPU
    // kernel, launch it, and copy the results back.
    Location loc = forallOp->getLoc();
    SmallVector<Value> args, tokens;
    Value out = genParametersIn(rewriter, loc, scalars, buffers, args, tokens,
                                /*useHostRegistrationForOut=*/false);
    auto saveIp = rewriter.saveInsertionPoint();
    ModuleOp topModule = forallOp->getParentOfType<ModuleOp>();
    auto gpuModule = genGPUModule(rewriter, topModule);
    auto gpuFunc = genGPUFunc(rewriter, gpuModule, args);
    genGPUCode(rewriter, gpuFunc, forallOp, constants, scalars, buffers);
    rewriter.restoreInsertionPoint(saveIp);
    genBlockingWait(rewriter, loc, tokens);
    tokens.clear();
    Value kernelToken =
        genLaunchGPUFunc(rewriter, gpuFunc, args, tokens, numThreads);
    genParametersOut(rewriter, loc, out, kernelToken, scalars, buffers, args,
                     tokens);
    genBlockingWait(rewriter, loc, tokens);
    rewriter.eraseOp(forallOp);
    return success();
  }

private:
  unsigned numThreads;
};
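// Rewriter that recognizes specific linalg.generic kernels (SpMV, SpMM,
// SpGEMM, 2:4 SpMM, SDDMM) and lowers them to cuSPARSE/cuSPARSELt library
// calls through the GPU dialect.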
  LogicalResult matchAndRewrite(linalg::GenericOp op,
                                PatternRewriter &rewriter) const override {
    if (op.getNumDpsInits() != 1)
      return failure(); // reject multi-output

    const unsigned numLoops = op.getNumLoops();
    const unsigned numTensors = op->getNumOperands();
    const auto iteratorTypes = op.getIteratorTypesArray();
    SmallVector<AffineMap, 4> maps = op.getIndexingMapsArray();

    using MapList = ArrayRef<ArrayRef<AffineExpr>>;
    auto infer = [&](MapList m) {
      return AffineMap::inferFromExprList(m, op.getContext());
    };
    AffineExpr i, j, k;
    bindDims(getContext(), i, j, k);

    // Recognize an SpMV kernel.
    if (numLoops == 2 && numTensors == 3 &&
        linalg::isParallelIterator(iteratorTypes[0]) &&
        linalg::isReductionIterator(iteratorTypes[1]) &&
        maps == infer({{i, j}, {j}, {i}}) && matchSumOfMultOfArgs(op)) {
      return rewriteSpMV(rewriter, op, enableRT);
    }

    // Recognize an SpGEMM, 2:4-SpMM, or SpMM kernel.
    if (numLoops == 3 && numTensors == 3 &&
        linalg::isParallelIterator(iteratorTypes[0]) &&
        linalg::isParallelIterator(iteratorTypes[1]) &&
        linalg::isReductionIterator(iteratorTypes[2]) &&
        maps == infer({{i, k}, {k, j}, {i, j}}) && matchSumOfMultOfArgs(op)) {
      if (!isDenseTensor(op.getOperand(0)) && !isDenseTensor(op.getOperand(1)))
        return rewriteSpGEMM(rewriter, op, enableRT);
      if (isConversionInto24(op.getOperand(0)))
        return rewrite2To4SpMM(rewriter, op);
      return rewriteSpMM(rewriter, op, enableRT);
    }

    // Recognize an SDDMM kernel.
    if (numLoops == 3 && numTensors == 3 &&
        linalg::isParallelIterator(iteratorTypes[0]) &&
        linalg::isParallelIterator(iteratorTypes[1]) &&
        linalg::isReductionIterator(iteratorTypes[2]) &&
        maps == infer({{i, k}, {k, j}, {i, j}}) &&
        matchSumReductionOfMulUnary(op)) {
      return rewriteSDDMM(rewriter, op, enableRT);
    }

    return failure();
  }

private:
  bool enableRT;
};
// Populates the GPU codegen patterns: outlines sparsifier parallel loops
// into GPU kernels using the given thread count.
void mlir::populateSparseGPUCodegenPatterns(RewritePatternSet &patterns,
                                            unsigned numThreads) {
  patterns.add<ForallRewriter>(patterns.getContext(), numThreads);
}