enum class CuSparseFormat {

static void markAsGPUContainer(ModuleOp topModule) {
  topModule->setAttr(gpu::GPUDialect::getContainerModuleAttrName(),

static gpu::GPUModuleOp genGPUModule(OpBuilder &builder, ModuleOp topModule) {
  for (auto op : topModule.getBodyRegion().getOps<gpu::GPUModuleOp>())
  markAsGPUContainer(topModule);
  return gpu::GPUModuleOp::create(builder, topModule->getLoc(),

static gpu::GPUFuncOp genGPUFunc(OpBuilder &builder, gpu::GPUModuleOp gpuModule,
  unsigned kernelNumber = 0;
    ("kernel" + Twine(kernelNumber++)).toStringRef(kernelName);
  } while (gpuModule.lookupSymbol(kernelName));
    argsTp.push_back(arg.getType());
      gpu::GPUFuncOp::create(builder, gpuModule->getLoc(), kernelName, type);
  gpuFunc->setAttr(gpu::GPUDialect::getKernelFuncAttrName(),
static Value genLaunchGPUFunc(OpBuilder &builder, gpu::GPUFuncOp gpuFunc,
                              unsigned numThreads) {
  return gpu::LaunchFuncOp::create(builder, loc, gpuFunc, gridSize, blckSize,

  MemRefType memTp = cast<MemRefType>(mem.getType());
  Value cast = memref::CastOp::create(builder, loc, resTp, mem);
  gpu::HostRegisterOp::create(builder, loc, cast);

  gpu::HostUnregisterOp::create(builder, loc, cast);

  return gpu::WaitOp::create(builder, loc, tokenType, ValueRange())

  gpu::WaitOp::create(builder, loc, Type(), operands);
  auto tp = cast<ShapedType>(mem.getType());
  auto elemTp = tp.getElementType();
  auto shape = tp.getShape();
  for (unsigned r = 0, rank = tp.getRank(); r < rank; r++) {
    if (shape[r] == ShapedType::kDynamic) {
      dynamicSizes.push_back(dimOp);
  return gpu::AllocOp::create(builder, loc, TypeRange({memTp, token.getType()}),

  return memref::AllocOp::create(builder, loc, memTp, size).getResult();

  return gpu::AllocOp::create(builder, loc, TypeRange({memTp, token.getType()}),

  return genAllocBuffer(builder, loc, builder.getI8Type(), size, token);

  return gpu::DeallocOp::create(builder, loc, token.getType(), token, mem)

  return gpu::MemcpyOp::create(builder, loc, token.getType(), token, dst, src)
  Value firstToken = genFirstWait(builder, loc);
  auto alloc = genAllocMemRef(builder, loc, b, firstToken);
  Value devMem = alloc.getResult(0);
  Value depToken = alloc.getAsyncToken();
  tokens.push_back(genCopyMemRef(builder, loc, devMem, b, depToken));

  auto tensorType = llvm::cast<ShapedType>(tensor.getType());
  return bufferization::ToBufferOp::create(rewriter, loc, memrefType, tensor);
                            bool useHostRegistrationForOut) {
  for (Value s : scalars)
  for (Value b : buffers) {
    if (useHostRegistrationForOut) {
      out = genHostRegisterMemref(builder, loc, b);
      useHostRegistrationForOut = false;
    args.push_back(genAllocCopy(builder, loc, b, tokens));

  unsigned base = scalars.size();
  for (unsigned i = base, e = args.size(); i < e; i++) {
      genHostUnregisterMemref(builder, loc, out);
      genCopyMemRef(builder, loc, buffers[0], args[i], kernelToken);
      firstToken = genFirstWait(builder, loc);
    tokens.push_back(genDeallocMemRef(builder, loc, args[i], firstToken));
static void genGPUCode(PatternRewriter &rewriter, gpu::GPUFuncOp gpuFunc,
                       scf::ParallelOp forallOp,
  for (Value c : constants)
  for (Value s : scalars)
  for (Value b : buffers)

  Value bid = gpu::BlockIdOp::create(rewriter, loc, gpu::Dimension::x);
  Value bsz = gpu::BlockDimOp::create(rewriter, loc, gpu::Dimension::x);
  Value tid = gpu::ThreadIdOp::create(rewriter, loc, gpu::Dimension::x);
  Value gsz = gpu::GridDimOp::create(rewriter, loc, gpu::Dimension::x);
  Value mul = arith::MulIOp::create(rewriter, loc, bid, bsz);
  Value row = arith::AddIOp::create(rewriter, loc, mul, tid);
  Value inc = arith::MulIOp::create(rewriter, loc, bsz, gsz);
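  // The values above implement a grid-stride loop: each GPU thread starts at
  // row = blockIdx.x * blockDim.x + threadIdx.x and advances by
  // inc = blockDim.x * gridDim.x, so all iterations below the loop's upper
  // bound are covered no matter how many blocks and threads are launched.
  // In CUDA-style pseudocode:
  //
  //   for (row = blockIdx.x * blockDim.x + threadIdx.x; row < upper;
  //        row += blockDim.x * gridDim.x) {
  //     ...body cloned from the scf.parallel region...
  //   }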
  Value upper = irMap.lookup(forallOp.getUpperBound()[0]);
  scf::ForOp forOp = scf::ForOp::create(rewriter, loc, row, upper, inc);
                             forOp.getRegion().begin(), irMap);
  gpu::ReturnOp::create(rewriter, gpuFunc->getLoc());
static bool matchAddOfArgs(Block *block, Value val) {
  if (isa<arith::AddFOp, arith::AddIOp>(def)) {
    return (def->getOperand(0) == a && def->getOperand(1) == b) ||
           (def->getOperand(0) == b && def->getOperand(1) == a);

static bool matchMulOfArgs(Block *block, Value val) {
  if (isa<arith::MulFOp, arith::MulIOp>(def)) {
    return (def->getOperand(0) == a && def->getOperand(1) == b) ||
           (def->getOperand(0) == b && def->getOperand(1) == a);

static bool matchSumOfMultOfArgs(linalg::GenericOp op) {
  auto yieldOp = cast<linalg::YieldOp>(op.getRegion().front().getTerminator());
  if (auto *def = yieldOp.getOperand(0).getDefiningOp()) {
    if (isa<arith::AddFOp, arith::AddIOp>(def)) {
      Value x = op.getBlock()->getArguments()[2];
      return (def->getOperand(0) == x &&
              matchMulOfArgs(op.getBlock(), def->getOperand(1))) ||
             (def->getOperand(1) == x &&
              matchMulOfArgs(op.getBlock(), def->getOperand(0)));
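// matchSumOfMultOfArgs thus accepts a linalg.generic body that yields
// x + a * b (in either operand order), where x is the output block argument
// and a, b are the two input block arguments. A body of roughly this shape
// would match (sketch; the f64 element type and value names are illustrative):
//
//   ^bb0(%a: f64, %b: f64, %x: f64):
//     %p = arith.mulf %a, %b : f64
//     %s = arith.addf %x, %p : f64
//     linalg.yield %s : f64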
static bool matchSumReductionOfMulUnary(linalg::GenericOp op) {
  auto yieldOp = cast<linalg::YieldOp>(op.getRegion().front().getTerminator());
  Value s_out = op.getBlock()->getArguments()[2];
  if (auto redOp =
          yieldOp.getOperand(0).getDefiningOp<sparse_tensor::ReduceOp>()) {
    if (s_out == redOp->getOperand(0))
      other = redOp->getOperand(1);
    else if (s_out == redOp->getOperand(1))
      other = redOp->getOperand(0);
    if (auto unOp = other.getDefiningOp<sparse_tensor::UnaryOp>()) {
      if (s_out != unOp->getOperand(0) || !unOp.getAbsentRegion().empty())
      auto yieldUn = cast<sparse_tensor::YieldOp>(
          unOp.getRegion(0).front().getTerminator());
      auto yieldRed = cast<sparse_tensor::YieldOp>(
          redOp.getRegion().front().getTerminator());
      return matchMulOfArgs(op.getBlock(), yieldUn.getOperand(0)) &&
             matchAddOfArgs(&redOp.getRegion().front(), yieldRed.getOperand(0));

static bool isDenseTensor(Value v) {
  return sTp.getDimRank() == sTp.getLvlRank() && sTp.isAllDense();

         isAdmissibleMetaData(aTp);

  assert(dims.size() == 2);
  return dims[0] == dims[1] && dims[0] > 1;
static bool isConversionInto24(Value v) {
  Value a = cnv.getResult();
  Value d = cnv.getSource();
  return isDenseTensor(d) && isAdmissible24(aTp);

    return CuSparseFormat::kNone;
  if (isAdmissibleCOO(aTp))
#ifdef CUSPARSE_COO_AOS
    return isMatVec ? CuSparseFormat::kCOO : CuSparseFormat::kNone;
#else
    return enableRT ? CuSparseFormat::kCOO : CuSparseFormat::kNone;
#endif
  if (isAdmissibleCSR(aTp))
    return CuSparseFormat::kCSR;
  if (isAdmissibleCSC(aTp))
    return CuSparseFormat::kCSC;
  if (isAdmissibleBSR(aTp))
    return CuSparseFormat::kBSR;
  return CuSparseFormat::kNone;
                               CuSparseFormat format, bool enableRT) {
  if (format == CuSparseFormat::kCOO) {
      return ToCoordinatesOp::create(builder, loc, a, 0);
    return ToCoordinatesBufferOp::create(builder, loc, a);
  return ToPositionsOp::create(builder, loc, a, 1);

                           CuSparseFormat format, bool enableRT) {
  bool isCOO = format == CuSparseFormat::kCOO;
  if (isCOO && !enableRT)
    return Value();
  return ToCoordinatesOp::create(builder, loc, a, 1);

                           CuSparseFormat format, bool enableRT) {
  if (format == CuSparseFormat::kCOO) {
      return gpu::CreateCooOp::create(builder, loc, handleTp, tokenTp, token,
                                      sz1, sz2, nseA, rowA, colA, valA);
#ifdef CUSPARSE_COO_AOS
    return gpu::CreateCooAoSOp::create(builder, loc, handleTp, tokenTp, token,
                                       sz1, sz2, nseA, rowA, valA);
#else
    llvm_unreachable("gpu::CreateCooAoSOp is deprecated");
#endif
  }
  if (format == CuSparseFormat::kCSR)
    return gpu::CreateCsrOp::create(builder, loc, handleTp, tokenTp, token, sz1,
                                    sz2, nseA, rowA, colA, valA);
  if (format == CuSparseFormat::kCSC)
    return gpu::CreateCscOp::create(builder, loc, handleTp, tokenTp, token, sz1,
                                    sz2, nseA, rowA, colA, valA);
  assert(format == CuSparseFormat::kBSR);
  assert(dims.size() == 2 && dims[0] == dims[1]);
  uint64_t b = dims[0];
  Value bRows = arith::DivUIOp::create(builder, loc, sz1, bSz);
  Value bCols = arith::DivUIOp::create(builder, loc, sz2, bSz);
  Value bNum = arith::DivUIOp::create(builder, loc, nseA,
  return gpu::CreateBsrOp::create(builder, loc, handleTp, tokenTp, token, bRows,
                                  bCols, bNum, bSz, bSz, rowA, colA, valA);
static LogicalResult rewriteSpMV(PatternRewriter &rewriter,
                                 linalg::GenericOp op, bool enableRT) {
  Value a = op.getOperand(0);
  Value x = op.getOperand(1);
  Value y = op.getOperand(2);
  auto format = getCuSparseFormat(aTp, xTp, yTp, enableRT, true);
  if (format == CuSparseFormat::kNone || format == CuSparseFormat::kBSR)

  Value nseA = NumberOfEntriesOp::create(rewriter, loc, a);
  Value memR = genFirstPosOrCrds(rewriter, loc, a, format, enableRT);
  Value memC = genSecondCrds(rewriter, loc, a, format, enableRT);
  Value memV = ToValuesOp::create(rewriter, loc, a);
  Value rowA = genAllocCopy(rewriter, loc, memR, tokens);
  Value colA = memC ? genAllocCopy(rewriter, loc, memC, tokens) : Value();
  Value valA = genAllocCopy(rewriter, loc, memV, tokens);
  Value memX = genTensorToMemref(rewriter, loc, x);
  Value vecX = genAllocCopy(rewriter, loc, memX, tokens);
  Value memY = genTensorToMemref(rewriter, loc, y);
  Value vecY = genAllocCopy(rewriter, loc, memY, tokens);
  genBlockingWait(rewriter, loc, tokens);

  Value token = genFirstWait(rewriter, loc);
      genSpMat(rewriter, loc, aTp, spmatHandleTp, tokenTp, token, szY, szX,
               nseA, rowA, colA, valA, format, enableRT);
  auto dvecX = gpu::CreateDnTensorOp::create(rewriter, loc, dnTensorHandleTp,
                                             tokenTp, token, vecX, szX);
  Value dnX = dvecX.getResult(0);
  token = dvecX.getAsyncToken();
  auto dvecY = gpu::CreateDnTensorOp::create(rewriter, loc, dnTensorHandleTp,
                                             tokenTp, token, vecY, szY);
  Value dnY = dvecY.getResult(0);
  token = dvecY.getAsyncToken();
  auto dnYType = llvm::cast<ShapedType>(y.getType()).getElementType();
  auto bufferComp = gpu::SpMVBufferSizeOp::create(
      rewriter, loc, indexTp, tokenTp, token, spMatA, dnX, dnY,
  Value bufferSz = bufferComp.getResult(0);
  token = bufferComp.getAsyncToken();
  auto buf = genAllocBuffer(rewriter, loc, bufferSz, token);
  Value buffer = buf.getResult(0);
  token = buf.getAsyncToken();
      gpu::SpMVOp::create(rewriter, loc, tokenTp, token, spMatA, dnX, dnY,
  token = spmvComp.getAsyncToken();
  token = gpu::DestroySpMatOp::create(rewriter, loc, tokenTp, token, spMatA)
  token = gpu::DestroyDnTensorOp::create(rewriter, loc, tokenTp, token, dnX)
  token = gpu::DestroyDnTensorOp::create(rewriter, loc, tokenTp, token, dnY)
  token = genDeallocMemRef(rewriter, loc, rowA, token);
  token = genDeallocMemRef(rewriter, loc, colA, token);
  token = genDeallocMemRef(rewriter, loc, valA, token);
  token = genDeallocMemRef(rewriter, loc, buffer, token);
  token = genDeallocMemRef(rewriter, loc, vecX, token);
  token = genCopyMemRef(rewriter, loc, memY, vecY, token);
  token = genDeallocMemRef(rewriter, loc, vecY, token);
  tokens.push_back(token);
  genBlockingWait(rewriter, loc, tokens);
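// The op sequence above mirrors the host-side flow of the cuSPARSE generic
// API for SpMV (a sketch of the eventual mapping; the actual lowering to
// library calls happens later, when these gpu ops are converted to runtime
// wrapper calls): create the sparse-matrix and dense-vector handles
// (cusparseCreateCsr / cusparseCreateCoo, cusparseCreateDnVec), query the
// workspace with cusparseSpMV_bufferSize, run cusparseSpMV to compute
// y = A * x, destroy the handles, and copy the result vector back to the host.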
static LogicalResult rewriteSpMM(PatternRewriter &rewriter,
                                 linalg::GenericOp op, bool enableRT) {
  Value a = op.getOperand(0);
  Value b = op.getOperand(1);
  Value c = op.getOperand(2);
  auto format = getCuSparseFormat(aTp, bTp, cTp, enableRT, false);
  if (format == CuSparseFormat::kNone || format == CuSparseFormat::kBSR)

  Value nseA = NumberOfEntriesOp::create(rewriter, loc, a);
  Value memR = genFirstPosOrCrds(rewriter, loc, a, format, enableRT);
  Value memC = genSecondCrds(rewriter, loc, a, format, enableRT);
  Value memV = ToValuesOp::create(rewriter, loc, a);
  Value rowA = genAllocCopy(rewriter, loc, memR, tokens);
  Value colA = memC ? genAllocCopy(rewriter, loc, memC, tokens) : Value();
  Value valA = genAllocCopy(rewriter, loc, memV, tokens);
  Value bufB = genTensorToMemref(rewriter, loc, b);
  Value matB = genAllocCopy(rewriter, loc, bufB, tokens);
  Value bufC = genTensorToMemref(rewriter, loc, c);
  Value matC = genAllocCopy(rewriter, loc, bufC, tokens);
  genBlockingWait(rewriter, loc, tokens);

  Value token = genFirstWait(rewriter, loc);
      genSpMat(rewriter, loc, aTp, spMatHandleTp, tokenTp, token, szm, szk,
               nseA, rowA, colA, valA, format, enableRT);
      gpu::CreateDnTensorOp::create(rewriter, loc, dnTensorHandleTp, tokenTp,
  Value dnB = dmatB.getResult(0);
  token = dmatB.getAsyncToken();
      gpu::CreateDnTensorOp::create(rewriter, loc, dnTensorHandleTp, tokenTp,
  Value dnC = dmatC.getResult(0);
  token = dmatC.getAsyncToken();
  auto dmatCType = llvm::cast<ShapedType>(c.getType()).getElementType();
  auto bufferComp = gpu::SpMMBufferSizeOp::create(
      rewriter, loc, indexTp, tokenTp, token, spMatA, dnB, dnC,
  Value bufferSz = bufferComp.getResult(0);
  token = bufferComp.getAsyncToken();
  auto buf = genAllocBuffer(rewriter, loc, bufferSz, token);
  Value buffer = buf.getResult(0);
  token = buf.getAsyncToken();
  auto dnCType = llvm::cast<ShapedType>(c.getType()).getElementType();
      gpu::SpMMOp::create(rewriter, loc, tokenTp, token, spMatA, dnB, dnC,
  token = spmmComp.getAsyncToken();
  token = gpu::DestroySpMatOp::create(rewriter, loc, tokenTp, token, spMatA)
  token = gpu::DestroyDnTensorOp::create(rewriter, loc, tokenTp, token, dnB)
  token = gpu::DestroyDnTensorOp::create(rewriter, loc, tokenTp, token, dnC)
  token = genDeallocMemRef(rewriter, loc, rowA, token);
  token = genDeallocMemRef(rewriter, loc, colA, token);
  token = genDeallocMemRef(rewriter, loc, valA, token);
  token = genDeallocMemRef(rewriter, loc, buffer, token);
  token = genDeallocMemRef(rewriter, loc, matB, token);
  token = genCopyMemRef(rewriter, loc, bufC, matC, token);
  token = genDeallocMemRef(rewriter, loc, matC, token);
  tokens.push_back(token);
  genBlockingWait(rewriter, loc, tokens);
static LogicalResult rewriteSpGEMM(PatternRewriter &rewriter,
                                   linalg::GenericOp op, bool enableRT) {
  Value a = op.getOperand(0);
  Value b = op.getOperand(1);
  Value c = op.getOperand(2);
  auto format = CuSparseFormat::kCSR;
  if (!isAdmissibleCSR(aTp) || !isAdmissibleCSR(bTp) || !isAdmissibleCSR(cTp))

  Value nseA = NumberOfEntriesOp::create(rewriter, loc, a);
  Value nseB = NumberOfEntriesOp::create(rewriter, loc, b);
  Value amemR = genFirstPosOrCrds(rewriter, loc, a, format, enableRT);
  Value amemC = genSecondCrds(rewriter, loc, a, format, enableRT);
  Value amemV = ToValuesOp::create(rewriter, loc, a);
  Value bmemR = genFirstPosOrCrds(rewriter, loc, b, format, enableRT);
  Value bmemC = genSecondCrds(rewriter, loc, b, format, enableRT);
  Value bmemV = ToValuesOp::create(rewriter, loc, b);
  Value rowA = genAllocCopy(rewriter, loc, amemR, tokens);
  Value colA = genAllocCopy(rewriter, loc, amemC, tokens);
  Value valA = genAllocCopy(rewriter, loc, amemV, tokens);
  Value rowB = genAllocCopy(rewriter, loc, bmemR, tokens);
  Value colB = genAllocCopy(rewriter, loc, bmemC, tokens);
  Value valB = genAllocCopy(rewriter, loc, bmemV, tokens);
  genBlockingWait(rewriter, loc, tokens);

  Value token = genFirstWait(rewriter, loc);
      genSpMat(rewriter, loc, aTp, spmatHandleTp, tokenTp, token, szm, szk,
               nseA, rowA, colA, valA, format, enableRT);
      genSpMat(rewriter, loc, bTp, spmatHandleTp, tokenTp, token, szk, szn,
               nseB, rowB, colB, valB, format, enableRT);
  Value mplus1 = arith::AddIOp::create(rewriter, loc, szm, one);
  auto e1 = genAllocBuffer(rewriter, loc, cTp.getPosType(), mplus1, token);
  Value rowC = e1.getResult(0);
  token = e1.getAsyncToken();
  auto e2 = genAllocBuffer(rewriter, loc, cTp.getCrdType(), zero, token);
  Value colC = e2.getResult(0);
  token = e2.getAsyncToken();
  auto e3 = genAllocBuffer(rewriter, loc, dnCType, zero, token);
  Value valC = e3.getResult(0);
  token = e3.getAsyncToken();
      genSpMat(rewriter, loc, cTp, spmatHandleTp, tokenTp, token, szm, szn,
               zero, rowC, colC, valC, format, enableRT);

      gpu::SpGEMMCreateDescrOp::create(rewriter, loc, descTp, tokenTp, token);
  Operation *work1 = gpu::SpGEMMWorkEstimationOrComputeOp::create(
      rewriter, loc, indexTp, tokenTp, token, desc,
      gpu::TransposeMode::NON_TRANSPOSE, gpu::TransposeMode::NON_TRANSPOSE,
      spMatA, spMatB, spMatC, dnCType, zero, valC,
      gpu::SpGEMMWorkEstimationOrComputeKind::WORK_ESTIMATION);
  auto buf1 = genAllocBuffer(rewriter, loc, bufferSz1, token);
  Value buffer1 = buf1.getResult(0);
  token = buf1.getAsyncToken();
  Operation *work2 = gpu::SpGEMMWorkEstimationOrComputeOp::create(
      rewriter, loc, indexTp, tokenTp, token, desc,
      gpu::TransposeMode::NON_TRANSPOSE, gpu::TransposeMode::NON_TRANSPOSE,
      spMatA, spMatB, spMatC, dnCType, bufferSz1, buffer1,
      gpu::SpGEMMWorkEstimationOrComputeKind::WORK_ESTIMATION);
  Operation *compute1 = gpu::SpGEMMWorkEstimationOrComputeOp::create(
      rewriter, loc, indexTp, tokenTp, token, desc,
      gpu::TransposeMode::NON_TRANSPOSE, gpu::TransposeMode::NON_TRANSPOSE,
      spMatA, spMatB, spMatC, dnCType, zero, valC,
      gpu::SpGEMMWorkEstimationOrComputeKind::COMPUTE);
  auto buf2 = genAllocBuffer(rewriter, loc, bufferSz2, token);
  Value buffer2 = buf2.getResult(0);
  token = buf2.getAsyncToken();
  Operation *compute2 = gpu::SpGEMMWorkEstimationOrComputeOp::create(
      rewriter, loc, indexTp, tokenTp, token, desc,
      gpu::TransposeMode::NON_TRANSPOSE, gpu::TransposeMode::NON_TRANSPOSE,
      spMatA, spMatB, spMatC, dnCType, bufferSz2, buffer2,
      gpu::SpGEMMWorkEstimationOrComputeKind::COMPUTE);
  Operation *sizes = gpu::SpMatGetSizeOp::create(
      rewriter, loc, indexTp, indexTp, indexTp, tokenTp, token, spMatC);
  auto a2 = genAllocBuffer(rewriter, loc, cTp.getCrdType(), nnz, token);
  colC = a2.getResult(0);
  token = a2.getAsyncToken();
  auto a3 = genAllocBuffer(rewriter, loc, dnCType, nnz, token);
  valC = a3.getResult(0);
  token = a3.getAsyncToken();
  Operation *update = gpu::SetCsrPointersOp::create(
      rewriter, loc, tokenTp, token, spMatC, rowC, colC, valC);
      rewriter, loc, tokenTp, token, desc, gpu::TransposeMode::NON_TRANSPOSE,
      gpu::TransposeMode::NON_TRANSPOSE, spMatA, spMatB, spMatC, dnCType);
  token = copy->getResult(0);
  Value valH = genHostBuffer(rewriter, loc, dnCType, nnz);
  token = gpu::SpGEMMDestroyDescrOp::create(rewriter, loc, tokenTp, token, desc)
  token = gpu::DestroySpMatOp::create(rewriter, loc, tokenTp, token, spMatA)
  token = gpu::DestroySpMatOp::create(rewriter, loc, tokenTp, token, spMatB)
  token = gpu::DestroySpMatOp::create(rewriter, loc, tokenTp, token, spMatC)
  token = genCopyMemRef(rewriter, loc, rowH, rowC, token);
  token = genCopyMemRef(rewriter, loc, colH, colC, token);
  token = genCopyMemRef(rewriter, loc, valH, valC, token);
  token = genDeallocMemRef(rewriter, loc, rowA, token);
  token = genDeallocMemRef(rewriter, loc, colA, token);
  token = genDeallocMemRef(rewriter, loc, valA, token);
  token = genDeallocMemRef(rewriter, loc, rowB, token);
  token = genDeallocMemRef(rewriter, loc, colB, token);
  token = genDeallocMemRef(rewriter, loc, valB, token);
  token = genDeallocMemRef(rewriter, loc, rowC, token);
  token = genDeallocMemRef(rewriter, loc, colC, token);
  token = genDeallocMemRef(rewriter, loc, valC, token);
  token = genDeallocMemRef(rewriter, loc, buffer1, token);
  token = genDeallocMemRef(rewriter, loc, buffer2, token);
  tokens.push_back(token);
  genBlockingWait(rewriter, loc, tokens);

  Value vt = bufferization::ToTensorOp::create(
  Value rt = bufferization::ToTensorOp::create(
  Value ct = bufferization::ToTensorOp::create(
static LogicalResult rewrite2To4SpMM(PatternRewriter &rewriter,
                                     linalg::GenericOp op) {
  Value A = op.getOperand(0);
  Value B = op.getOperand(1);
  Value C = op.getOperand(2);
  auto cnv = A.getDefiningOp<ConvertOp>();
  if (!isDenseTensor(A) || !isDenseTensor(B) || !isDenseTensor(C))

  Value bufA = genTensorToMemref(rewriter, loc, A);
  Value matA = genAllocCopy(rewriter, loc, bufA, tokens);
  Value bufB = genTensorToMemref(rewriter, loc, B);
  Value matB = genAllocCopy(rewriter, loc, bufB, tokens);
  Value bufC = genTensorToMemref(rewriter, loc, C);
  Value matC = genAllocCopy(rewriter, loc, bufC, tokens);
  genBlockingWait(rewriter, loc, tokens);

  Value token = genFirstWait(rewriter, loc);
  Operation *spGenA = gpu::Create2To4SpMatOp::create(
      rewriter, loc, spMatHandleTp, tokenTp, token, szm, szk,
      gpu::Prune2To4SpMatFlag::PRUNE_AND_CHECK, matA);
      gpu::CreateDnTensorOp::create(rewriter, loc, dnTensorHandleTp, tokenTp,
  Value dnB = dmatB.getResult(0);
  token = dmatB.getAsyncToken();
      gpu::CreateDnTensorOp::create(rewriter, loc, dnTensorHandleTp, tokenTp,
  Value dnC = dmatC.getResult(0);
  token = dmatC.getAsyncToken();
  auto dmatCType = llvm::cast<ShapedType>(matC.getType()).getElementType();
  auto bufferComp = gpu::SpMMBufferSizeOp::create(
      rewriter, loc, bufferTypes, tokenTp, token,
      gpu::TransposeMode::NON_TRANSPOSE, gpu::TransposeMode::NON_TRANSPOSE,
  token = bufferComp.getAsyncToken();
  Value bufferSz1 = bufferComp.getResult(0);
  auto buf1 = genAllocBuffer(rewriter, loc, bufferSz1, token);
  Value buffer1 = buf1.getResult(0);
  token = buf1.getAsyncToken();
  Value bufferSz2 = bufferComp.getResult(1);
  auto buf2 = genAllocBuffer(rewriter, loc, bufferSz2, token);
  Value buffer2 = buf2.getResult(0);
  token = buf2.getAsyncToken();
  Value bufferSz3 = bufferComp.getResult(2);
  auto buf3 = genAllocBuffer(rewriter, loc, bufferSz3, token);
  Value buffer3 = buf3.getResult(0);
  token = buf3.getAsyncToken();
  auto dnCType = llvm::cast<ShapedType>(matC.getType()).getElementType();
  auto spmmComp = gpu::SpMMOp::create(
      rewriter, loc, tokenTp, token, spMatA, dnB, dnC, dnCType,
  token = spmmComp.getAsyncToken();
  token = gpu::DestroySpMatOp::create(rewriter, loc, tokenTp, token, spMatA)
  token = gpu::DestroyDnTensorOp::create(rewriter, loc, tokenTp, token, dnB)
  token = gpu::DestroyDnTensorOp::create(rewriter, loc, tokenTp, token, dnC)
  token = genDeallocMemRef(rewriter, loc, buffer1, token);
  token = genDeallocMemRef(rewriter, loc, buffer2, token);
  token = genDeallocMemRef(rewriter, loc, buffer3, token);
  token = genDeallocMemRef(rewriter, loc, matA, token);
  token = genDeallocMemRef(rewriter, loc, matB, token);
  token = genCopyMemRef(rewriter, loc, bufC, matC, token);
  token = genDeallocMemRef(rewriter, loc, matC, token);
  tokens.push_back(token);
  genBlockingWait(rewriter, loc, tokens);
static LogicalResult rewriteSDDMM(PatternRewriter &rewriter,
                                  linalg::GenericOp op, bool enableRT) {
  Value a = op.getOperand(0);
  Value b = op.getOperand(1);
  Value c = op.getOperand(2);
  auto format = getCuSparseFormat(cTp, bTp, aTp, enableRT, false);
  if (format == CuSparseFormat::kNone || format == CuSparseFormat::kCOO ||
      format == CuSparseFormat::kCSC)

  Value nseC = NumberOfEntriesOp::create(rewriter, loc, c);
  Value bufA = genTensorToMemref(rewriter, loc, a);
  Value matA = genAllocCopy(rewriter, loc, bufA, tokens);
  Value bufB = genTensorToMemref(rewriter, loc, b);
  Value matB = genAllocCopy(rewriter, loc, bufB, tokens);
  Value memR = genFirstPosOrCrds(rewriter, loc, c, format, enableRT);
  Value memC = genSecondCrds(rewriter, loc, c, format, enableRT);
  Value memV = ToValuesOp::create(rewriter, loc, c);
  Value rowC = genAllocCopy(rewriter, loc, memR, tokens);
  Value colC = memC ? genAllocCopy(rewriter, loc, memC, tokens) : Value();
  Value valC = genAllocCopy(rewriter, loc, memV, tokens);
  genBlockingWait(rewriter, loc, tokens);

  Value token = genFirstWait(rewriter, loc);
      gpu::CreateDnTensorOp::create(rewriter, loc, dnMatHandleTp, tokenTp,
  Value dnA = dmatA.getResult(0);
  token = dmatA.getAsyncToken();
      gpu::CreateDnTensorOp::create(rewriter, loc, dnMatHandleTp, tokenTp,
  Value dnB = dmatB.getResult(0);
  token = dmatB.getAsyncToken();
      genSpMat(rewriter, loc, cTp, spMatHandleTp, tokenTp, token, szm, szn,
               nseC, rowC, colC, valC, format, enableRT);
  auto dnCType = llvm::cast<ShapedType>(c.getType()).getElementType();
  auto bufferComp = gpu::SDDMMBufferSizeOp::create(
      rewriter, loc, indexTp, tokenTp, token, dnA, dnB, spMatC, dnCType);
  Value bufferSz = bufferComp.getResult(0);
  token = bufferComp.getAsyncToken();
  auto buf = genAllocBuffer(rewriter, loc, bufferSz, token);
  Value buffer = buf.getResult(0);
  token = buf.getAsyncToken();
  auto sddmmComp = gpu::SDDMMOp::create(rewriter, loc, tokenTp, token, dnA, dnB,
                                        spMatC, dnCType, buffer);
  token = sddmmComp.getAsyncToken();
  token = gpu::DestroyDnTensorOp::create(rewriter, loc, tokenTp, token, dnA)
  token = gpu::DestroyDnTensorOp::create(rewriter, loc, tokenTp, token, dnB)
  token = gpu::DestroySpMatOp::create(rewriter, loc, tokenTp, token, spMatC)
  token = genDeallocMemRef(rewriter, loc, buffer, token);
  token = genDeallocMemRef(rewriter, loc, matA, token);
  token = genDeallocMemRef(rewriter, loc, matB, token);
  token = genDeallocMemRef(rewriter, loc, rowC, token);
  token = genDeallocMemRef(rewriter, loc, colC, token);
  token = genCopyMemRef(rewriter, loc, memV, valC, token);
  token = genDeallocMemRef(rewriter, loc, valC, token);
  tokens.push_back(token);
  genBlockingWait(rewriter, loc, tokens);
  LogicalResult matchAndRewrite(scf::ParallelOp forallOp,
        forallOp.getNumReductions() != 0 || forallOp.getNumLoops() != 1 ||

      Value val = o.get();
      if (auto arg = dyn_cast<BlockArgument>(val))
        block = arg.getOwner();
      else
        block = val.getDefiningOp()->getBlock();
      if (!forallOp.getRegion().findAncestorBlockInRegion(*block))
        invariants.insert(val);

    for (Value val : invariants) {
        constants.push_back(val);
        scalars.push_back(val);
      else if (isa<MemRefType>(tp))
        buffers.push_back(val);

    Value out = genParametersIn(rewriter, loc, scalars, buffers, args, tokens,
    ModuleOp topModule = forallOp->getParentOfType<ModuleOp>();
    auto gpuModule = genGPUModule(rewriter, topModule);
    auto gpuFunc = genGPUFunc(rewriter, gpuModule, args);
    genGPUCode(rewriter, gpuFunc, forallOp, constants, scalars, buffers);
    genBlockingWait(rewriter, loc, tokens);
        genLaunchGPUFunc(rewriter, gpuFunc, args, tokens, numThreads);
    genParametersOut(rewriter, loc, out, kernelToken, scalars, buffers, args,
    genBlockingWait(rewriter, loc, tokens);

  unsigned numThreads;
  LogicalResult matchAndRewrite(linalg::GenericOp op,
    if (op.getNumDpsInits() != 1)

    const unsigned numLoops = op.getNumLoops();
    const unsigned numTensors = op->getNumOperands();
    const auto iteratorTypes = op.getIteratorTypesArray();
    auto infer = [&](MapList m) {

    if (numLoops == 2 && numTensors == 3 &&
        maps == infer({{i, j}, {j}, {i}}) && matchSumOfMultOfArgs(op)) {
      return rewriteSpMV(rewriter, op, enableRT);

    if (numLoops == 3 && numTensors == 3 &&
        maps == infer({{i, k}, {k, j}, {i, j}}) && matchSumOfMultOfArgs(op)) {
      if (!isDenseTensor(op.getOperand(0)) && !isDenseTensor(op.getOperand(1)))
        return rewriteSpGEMM(rewriter, op, enableRT);
      if (isConversionInto24(op.getOperand(0)))
        return rewrite2To4SpMM(rewriter, op);
      return rewriteSpMM(rewriter, op, enableRT);

    if (numLoops == 3 && numTensors == 3 &&
        maps == infer({{i, k}, {k, j}, {i, j}}) &&
        matchSumReductionOfMulUnary(op)) {
      return rewriteSDDMM(rewriter, op, enableRT);
void mlir::populateSparseGPUCodegenPatterns(RewritePatternSet &patterns,
                                            unsigned numThreads) {
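// How the patterns defined in this file are meant to be consumed (a sketch,
// not part of this file; the pass context `ctx` and the thread count below
// are illustrative): a pass builds a RewritePatternSet, populates it through
// the two entry points populateSparseGPUCodegenPatterns and
// populateSparseGPULibgenPatterns, and then runs a greedy pattern-rewrite
// driver over the module.
//
//   RewritePatternSet patterns(ctx);
//   populateSparseGPUCodegenPatterns(patterns, /*numThreads=*/1024);
//   populateSparseGPULibgenPatterns(patterns, /*enableRT=*/true);
//   // ...hand `patterns` to the greedy pattern-rewrite driver...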