doxygen/SparseGPUCodegen_8cpp_source.html

 //===- SparseGPUCodegen.cpp - Generates GPU code --------------------------===//

 //

 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

 // See https://llvm.org/LICENSE.txt for license information.

 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

 //

 //===----------------------------------------------------------------------===//

 //

 // This is a prototype GPU codegenerator for the sparsifier.

 // The objective is to eventually use the right combination of

 // direct code generation and libary calls into vendor-specific

 // highly optimized sparse libraries (e.g. cuSparse for CUDA).

 //

 //===----------------------------------------------------------------------===//


 #include "Utils/CodegenUtils.h"

 #include "Utils/LoopEmitter.h"


 #include "mlir/Dialect/Bufferization/IR/Bufferization.h"

 #include "mlir/Dialect/GPU/IR/GPUDialect.h"

 #include "mlir/Dialect/Linalg/IR/Linalg.h"

 #include "mlir/Dialect/Linalg/Utils/Utils.h"

 #include "mlir/Dialect/MemRef/IR/MemRef.h"

 #include "mlir/Dialect/SCF/IR/SCF.h"

 #include "mlir/Dialect/SparseTensor/IR/SparseTensor.h"

 #include "mlir/Dialect/SparseTensor/IR/SparseTensorType.h"

 #include "mlir/Dialect/SparseTensor/Transforms/Passes.h"

 #include "mlir/IR/IRMapping.h"

 #include "mlir/IR/Matchers.h"


 using namespace mlir;

 using namespace mlir::sparse_tensor;


 namespace {


 // Sparse formats supported by cuSparse.

 enum class CuSparseFormat {

   kNone,

   kCOO,

   kCSR,

   kCSC,

   kBSR,

 };


 //===----------------------------------------------------------------------===//

 // Helper methods.

 //===----------------------------------------------------------------------===//


 /// Marks the given top module as a GPU container module.

 static void markAsGPUContainer(ModuleOp topModule) {

   topModule->setAttr(gpu::GPUDialect::getContainerModuleAttrName(),

                      UnitAttr::get(topModule->getContext()));

 }


 /// Constructs a new GPU module (for GPU kernels) inside the given top module,

 /// or returns an existing GPU module if one was built previously.

 static gpu::GPUModuleOp genGPUModule(OpBuilder &builder, ModuleOp topModule) {

   for (auto op : topModule.getBodyRegion().getOps<gpu::GPUModuleOp>())

     return op; // existing

   markAsGPUContainer(topModule);

   builder.setInsertionPointToStart(topModule.getBody());

   return builder.create<gpu::GPUModuleOp>(topModule->getLoc(),

                                           "sparse_kernels");

 }


 /// Constructs a new GPU kernel in the given GPU module.

 static gpu::GPUFuncOp genGPUFunc(OpBuilder &builder, gpu::GPUModuleOp gpuModule,

                                  SmallVectorImpl<Value> &args) {

   // Get a unique kernel name. Not very creative,

   // but we simply try kernel0, kernel1, etc.

   unsigned kernelNumber = 0;

   SmallString<16> kernelName;

   do {

     kernelName.clear();

     ("kernel" + Twine(kernelNumber++)).toStringRef(kernelName);

   } while (gpuModule.lookupSymbol(kernelName));

   // Then we insert a new kernel with given arguments into the module.

   builder.setInsertionPointToStart(gpuModule.getBody());

   SmallVector<Type> argsTp;

   for (auto arg : args)

     argsTp.push_back(arg.getType());

   FunctionType type = FunctionType::get(gpuModule->getContext(), argsTp, {});

   auto gpuFunc =

       builder.create<gpu::GPUFuncOp>(gpuModule->getLoc(), kernelName, type);

   gpuFunc->setAttr(gpu::GPUDialect::getKernelFuncAttrName(),

                    builder.getUnitAttr());

   return gpuFunc;

 }


 /// Constructs code to launch GPU kernel.

 static Value genLaunchGPUFunc(OpBuilder &builder, gpu::GPUFuncOp gpuFunc,

                               SmallVectorImpl<Value> &args,

                               SmallVectorImpl<Value> &tokens,

                               unsigned numThreads) {

   Location loc = gpuFunc->getLoc();

   Value none = TypedValue<::mlir::IntegerType>{};

   Value one = constantIndex(builder, loc, 1);

   Value numT = constantIndex(builder, loc, numThreads);

   gpu::KernelDim3 gridSize = {one, one, one};

   gpu::KernelDim3 blckSize = {numT, one, one};

   return builder

       .create<gpu::LaunchFuncOp>(loc, gpuFunc, gridSize, blckSize,

                                  /*dynSharedMemSz*/ none, args,

                                  builder.getType<gpu::AsyncTokenType>(), tokens)

       .getAsyncToken();

 }


 /// Maps the provided ranked host buffer into the device address space.

 /// Writes from the host are guaranteed to be visible to device kernels

 /// that are launched afterwards. Writes from the device are guaranteed

 /// to be visible on the host after synchronizing with the device kernel

 /// completion. Needs to cast the buffer to a unranked buffer.

 static Value genHostRegisterMemref(OpBuilder &builder, Location loc,

                                    Value mem) {

   MemRefType memTp = cast<MemRefType>(mem.getType());

   UnrankedMemRefType resTp =

       UnrankedMemRefType::get(memTp.getElementType(), /*memorySpace=*/0);

   Value cast = builder.create<memref::CastOp>(loc, resTp, mem);

   builder.create<gpu::HostRegisterOp>(loc, cast);

   return cast;

 }


 /// Unmaps the provided buffer, expecting the casted buffer.

 static void genHostUnregisterMemref(OpBuilder &builder, Location loc,

                                     Value cast) {

   builder.create<gpu::HostUnregisterOp>(loc, cast);

 }


 /// Generates first wait in an asynchronous chain.

 static Value genFirstWait(OpBuilder &builder, Location loc) {

   Type tokenType = builder.getType<gpu::AsyncTokenType>();

   return builder.create<gpu::WaitOp>(loc, tokenType, ValueRange())

       .getAsyncToken();

 }


 /// Generates last, blocking wait in an asynchronous chain.

 static void genBlockingWait(OpBuilder &builder, Location loc,

                             ValueRange operands) {

   builder.create<gpu::WaitOp>(loc, Type(), operands);

 }


 /// Allocates memory on the device.

 /// TODO: A `host_shared` attribute could be used to indicate that

 ///       the buffer is visible by both host and device, but lowering

 ///       that feature does not seem to be fully supported yet.

 static gpu::AllocOp genAllocMemRef(OpBuilder &builder, Location loc, Value mem,

                                    Value token) {

   auto tp = cast<ShapedType>(mem.getType());

   auto elemTp = tp.getElementType();

   auto shape = tp.getShape();

   auto memTp = MemRefType::get(shape, elemTp);

   SmallVector<Value> dynamicSizes;

   for (unsigned r = 0, rank = tp.getRank(); r < rank; r++) {

     if (shape[r] == ShapedType::kDynamic) {

       Value dimOp = linalg::createOrFoldDimOp(builder, loc, mem, r);

       dynamicSizes.push_back(dimOp);

     }

   }

   return builder.create<gpu::AllocOp>(loc, TypeRange({memTp, token.getType()}),

                                       token, dynamicSizes, ValueRange());

 }


 // Allocates a typed buffer on the host with given size.

 static Value genHostBuffer(OpBuilder &builder, Location loc, Type type,

                            Value size) {

   const auto memTp = MemRefType::get({ShapedType::kDynamic}, type);

   return builder.create<memref::AllocOp>(loc, memTp, size).getResult();

 }


 // Allocates a typed buffer on the device with given size.

 static gpu::AllocOp genAllocBuffer(OpBuilder &builder, Location loc, Type type,

                                    Value size, Value token) {

   const auto memTp = MemRefType::get({ShapedType::kDynamic}, type);

   return builder.create<gpu::AllocOp>(loc, TypeRange({memTp, token.getType()}),

                                       token, size, ValueRange());

 }


 // Allocates a void buffer on the device with given size.

 static gpu::AllocOp genAllocBuffer(OpBuilder &builder, Location loc, Value size,

                                    Value token) {

   return genAllocBuffer(builder, loc, builder.getI8Type(), size, token);

 }


 /// Deallocates memory from the device.

 static Value genDeallocMemRef(OpBuilder &builder, Location loc, Value mem,

                               Value token) {

   return builder.create<gpu::DeallocOp>(loc, token.getType(), token, mem)

       .getAsyncToken();

 }


 /// Copies memory between host and device (direction is implicit).

 static Value genCopyMemRef(OpBuilder &builder, Location loc, Value dst,

                            Value src, Value token) {

   return builder.create<gpu::MemcpyOp>(loc, token.getType(), token, dst, src)

       .getAsyncToken();

 }


 /// Generates an alloc/copy pair.

 static Value genAllocCopy(OpBuilder &builder, Location loc, Value b,

                           SmallVectorImpl<Value> &tokens) {

   Value firstToken = genFirstWait(builder, loc);

   auto alloc = genAllocMemRef(builder, loc, b, firstToken);

   Value devMem = alloc.getResult(0);

   Value depToken = alloc.getAsyncToken(); // copy-after-alloc

   tokens.push_back(genCopyMemRef(builder, loc, devMem, b, depToken));

   return devMem;

 }


 /// Generates a memref from tensor operation.

 static Value genTensorToMemref(PatternRewriter &rewriter, Location loc,

                                Value tensor) {

   auto tensorType = llvm::cast<ShapedType>(tensor.getType());

   auto memrefType =

       MemRefType::get(tensorType.getShape(), tensorType.getElementType());

   return rewriter.create<bufferization::ToBufferOp>(loc, memrefType, tensor);

 }


 /// Prepares the outlined arguments, passing scalars and buffers in. Here we

 /// assume that the first buffer is the one allocated for output. We create

 /// a set of properly chained asynchronous allocation/copy pairs to increase

 /// overlap before launching the kernel.

 static Value genParametersIn(OpBuilder &builder, Location loc,

                              SmallVectorImpl<Value> &scalars,

                              SmallVectorImpl<Value> &buffers,

                              SmallVectorImpl<Value> &args,

                              SmallVectorImpl<Value> &tokens,

                              bool useHostRegistrationForOut) {

   Value out;

   // Scalars are passed by value.

   for (Value s : scalars)

     args.push_back(s);

   // Buffers are need to be made visible on device.

   for (Value b : buffers) {

     if (useHostRegistrationForOut) {

       out = genHostRegisterMemref(builder, loc, b);

       args.push_back(b);

       useHostRegistrationForOut = false;

       continue;

     }

     args.push_back(genAllocCopy(builder, loc, b, tokens));

   }

   return out;

 }


 /// Finalizes the outlined arguments. The output buffer is copied depending

 /// on the kernel token and then deallocated. All other buffers are simply

 /// deallocated. Then we wait for all operations to complete.

 static void genParametersOut(OpBuilder &builder, Location loc, Value out,

                              Value kernelToken, SmallVectorImpl<Value> &scalars,

                              SmallVectorImpl<Value> &buffers,

                              SmallVectorImpl<Value> &args,

                              SmallVectorImpl<Value> &tokens) {

   unsigned base = scalars.size();

   for (unsigned i = base, e = args.size(); i < e; i++) {

     Value firstToken;

     if (i == base) {

       // Assumed output parameter: unregister or copy-out.

       if (out) {

         genHostUnregisterMemref(builder, loc, out);

         out = Value();

         continue;

       }

       firstToken =

           genCopyMemRef(builder, loc, buffers[0], args[i], kernelToken);

     } else {

       firstToken = genFirstWait(builder, loc);

     }

     tokens.push_back(genDeallocMemRef(builder, loc, args[i], firstToken));

   }

 }


 /// Constructs code for new GPU kernel.

 static void genGPUCode(PatternRewriter &rewriter, gpu::GPUFuncOp gpuFunc,

                        scf::ParallelOp forallOp,

                        SmallVectorImpl<Value> &constants,

                        SmallVectorImpl<Value> &scalars,

                        SmallVectorImpl<Value> &buffers) {

   Location loc = gpuFunc->getLoc();

   Block &block = gpuFunc.getBody().front();

   rewriter.setInsertionPointToStart(&block);


   // Re-generate the constants, recapture all arguments.

   unsigned arg = 0;

   IRMapping irMap;

   for (Value c : constants)

     irMap.map(c, rewriter.clone(*c.getDefiningOp())->getResult(0));

   for (Value s : scalars)

     irMap.map(s, block.getArgument(arg++));

   for (Value b : buffers)

     irMap.map(b, block.getArgument(arg++));


   // Assume 1-dimensional grid/block configuration (only x dimension),

   // so that:

   //   row = blockIdx.x * blockDim.x + threadIdx.x

   //   inc = blockDim.x * gridDim.x

   Value bid = rewriter.create<gpu::BlockIdOp>(loc, gpu::Dimension::x);

   Value bsz = rewriter.create<gpu::BlockDimOp>(loc, gpu::Dimension::x);

   Value tid = rewriter.create<gpu::ThreadIdOp>(loc, gpu::Dimension::x);

   Value gsz = rewriter.create<gpu::GridDimOp>(loc, gpu::Dimension::x);

   Value mul = rewriter.create<arith::MulIOp>(loc, bid, bsz);

   Value row = rewriter.create<arith::AddIOp>(loc, mul, tid);

   Value inc = rewriter.create<arith::MulIOp>(loc, bsz, gsz);


   // Construct the iteration over the computational space that

   // accounts for the fact that the total number of threads and

   // the amount of work to be done usually do not match precisely.

   //   for (r = row; r < N; r += inc) {

   //     <loop-body>

   //   }

   Value upper = irMap.lookup(forallOp.getUpperBound()[0]);

   scf::ForOp forOp = rewriter.create<scf::ForOp>(loc, row, upper, inc);

   // The scf.for builder creates an empty block. scf.for does not allow multiple

   // blocks in its region, so delete the block before `cloneRegionBefore` adds

   // an additional block.

   rewriter.eraseBlock(forOp.getBody());

   rewriter.cloneRegionBefore(forallOp.getRegion(), forOp.getRegion(),

                              forOp.getRegion().begin(), irMap);

   // Replace the scf.reduce terminator.

   rewriter.setInsertionPoint(forOp.getBody()->getTerminator());

   rewriter.replaceOpWithNewOp<scf::YieldOp>(forOp.getBody()->getTerminator());


   // Done.

   rewriter.setInsertionPointAfter(forOp);

   rewriter.create<gpu::ReturnOp>(gpuFunc->getLoc());

 }


 //===----------------------------------------------------------------------===//

 // Library helper methods.

 //===----------------------------------------------------------------------===//


 /// Helper to detect a + b with arguments taken from given block.

 static bool matchAddOfArgs(Block *block, Value val) {

   if (auto *def = val.getDefiningOp()) {

     if (isa<arith::AddFOp, arith::AddIOp>(def)) {

       Value a = block->getArguments()[0];

       Value b = block->getArguments()[1];

       return (def->getOperand(0) == a && def->getOperand(1) == b) ||

              (def->getOperand(0) == b && def->getOperand(1) == a);

     }

   }

   return false;

 }


 /// Helper to detect a * b with arguments taken from given block.

 static bool matchMulOfArgs(Block *block, Value val) {

   if (auto *def = val.getDefiningOp()) {

     if (isa<arith::MulFOp, arith::MulIOp>(def)) {

       Value a = block->getArguments()[0];

       Value b = block->getArguments()[1];

       return (def->getOperand(0) == a && def->getOperand(1) == b) ||

              (def->getOperand(0) == b && def->getOperand(1) == a);

     }

   }

   return false;

 }


 /// Helper to detect x = x + a * b

 static bool matchSumOfMultOfArgs(linalg::GenericOp op) {

   auto yieldOp = cast<linalg::YieldOp>(op.getRegion().front().getTerminator());

   if (auto *def = yieldOp.getOperand(0).getDefiningOp()) {

     if (isa<arith::AddFOp, arith::AddIOp>(def)) {

       Value x = op.getBlock()->getArguments()[2];

       return (def->getOperand(0) == x &&

               matchMulOfArgs(op.getBlock(), def->getOperand(1))) ||

              (def->getOperand(1) == x &&

               matchMulOfArgs(op.getBlock(), def->getOperand(0)));

     }

   }

   return false;

 }


 // Helper to detect c += spy(s) x (a * b)

 static bool matchSumReductionOfMulUnary(linalg::GenericOp op) {

   auto yieldOp = cast<linalg::YieldOp>(op.getRegion().front().getTerminator());

   // The linalg yields a custom reduce result.

   Value s_out = op.getBlock()->getArguments()[2];

   if (auto redOp =

           yieldOp.getOperand(0).getDefiningOp<sparse_tensor::ReduceOp>()) {

     // The reduce consumes the output.

     Value other;

     if (s_out == redOp->getOperand(0))

       other = redOp->getOperand(1);

     else if (s_out == redOp->getOperand(1))

       other = redOp->getOperand(0);

     else

       return false;

     // The reduce op also consumes an unary which also consumes the output

     // and does not define an absent value.

     if (auto unOp = other.getDefiningOp<sparse_tensor::UnaryOp>()) {

       if (s_out != unOp->getOperand(0) || !unOp.getAbsentRegion().empty())

         return false;

       // And the bodies are as expected.

       auto yieldUn = cast<sparse_tensor::YieldOp>(

           unOp.getRegion(0).front().getTerminator());

       auto yieldRed = cast<sparse_tensor::YieldOp>(

           redOp.getRegion().front().getTerminator());

       return matchMulOfArgs(op.getBlock(), yieldUn.getOperand(0)) &&

              matchAddOfArgs(&redOp.getRegion().front(), yieldRed.getOperand(0));

     }

   }

   return false;

 }


 /// Test for dense tensor.

 static bool isDenseTensor(Value v) {

   auto sTp = getSparseTensorType(v);

   return sTp.getDimRank() == sTp.getLvlRank() && sTp.isAllDense();

 }


 /// Test for suitable positions/coordinates width.

 static bool isAdmissibleMetaData(SparseTensorType &aTp) {

   return (aTp.getPosWidth() == 0 || aTp.getPosWidth() >= 16) &&

          (aTp.getCrdWidth() == 0 || aTp.getCrdWidth() >= 16);

 }


 /// Test for sorted COO matrix with suitable metadata.

 static bool isAdmissibleCOO(SparseTensorType &aTp) {

   return aTp.getDimRank() == 2 && aTp.getLvlRank() == 2 && aTp.isIdentity() &&

          aTp.isCompressedLvl(0) && aTp.isOrderedLvl(0) && !aTp.isUniqueLvl(0) &&

          aTp.isSingletonLvl(1) && aTp.isOrderedLvl(1) && aTp.isUniqueLvl(1) &&

          isAdmissibleMetaData(aTp);

 }


 /// Test for CSR matrix with suitable metadata.

 static bool isAdmissibleCSR(SparseTensorType &aTp) {

   return aTp.getDimRank() == 2 && aTp.getLvlRank() == 2 && aTp.isIdentity() &&

          aTp.isDenseLvl(0) && aTp.isCompressedLvl(1) && aTp.isOrderedLvl(1) &&

          aTp.isUniqueLvl(1) && isAdmissibleMetaData(aTp);

 }


 /// Test for CSC matrix with suitable metadata.

 static bool isAdmissibleCSC(SparseTensorType &aTp) {

   return aTp.getDimRank() == 2 && aTp.getLvlRank() == 2 && !aTp.isIdentity() &&

          aTp.isPermutation() && aTp.isDenseLvl(0) && aTp.isCompressedLvl(1) &&

          aTp.isOrderedLvl(1) && aTp.isUniqueLvl(1) && isAdmissibleMetaData(aTp);

 }


 /// Test for BSR matrix with suitable metadata.

 static bool isAdmissibleBSR(SparseTensorType &aTp) {

   if (aTp.getDimRank() == 2 && aTp.getLvlRank() == 4 && aTp.isDenseLvl(0) &&

       aTp.isCompressedLvl(1) && aTp.isOrderedLvl(1) && aTp.isUniqueLvl(1) &&

       aTp.isDenseLvl(2) && aTp.isDenseLvl(3) && isAdmissibleMetaData(aTp)) {

     // CuSparse only supports "square" blocks currently.

     SmallVector<unsigned> dims = getBlockSize(aTp.getDimToLvl());

     assert(dims.size() == 2);

     return dims[0] == dims[1] && dims[0] > 1;

   }

   return false;

 }


 /// Test for 2:4 matrix with suitable metadata.

 static bool isAdmissible24(SparseTensorType &aTp) {

   return aTp.getDimRank() == 2 && aTp.getLvlRank() == 3 && aTp.isDenseLvl(0) &&

          aTp.isDenseLvl(1) && aTp.isNOutOfMLvl(2) && isAdmissibleMetaData(aTp);

 }


 /// Test for conversion into 2:4 matrix.

 static bool isConversionInto24(Value v) {

   if (auto cnv = v.getDefiningOp<ConvertOp>()) {

     Value a = cnv.getResult();

     Value d = cnv.getSource();

     SparseTensorType aTp = getSparseTensorType(a);

     return isDenseTensor(d) && isAdmissible24(aTp);

   }

   return false;

 }


 /// Returns a suitable sparse format for the operation and given operand

 /// types with cuSparse, or kNone if none is available.

 static CuSparseFormat getCuSparseFormat(SparseTensorType aTp,

                                         SparseTensorType bTp,

                                         SparseTensorType cTp, bool enableRT,

                                         bool isMatVec) {

   // The other operands have a dense type.

   if (bTp.hasEncoding() || cTp.hasEncoding())

     return CuSparseFormat::kNone;

   // Now check for suitable operand type for the main operand.

   if (isAdmissibleCOO(aTp))

 #ifdef CUSPARSE_COO_AOS

     return isMatVec ? CuSparseFormat::kCOO : CuSparseFormat::kNone;

 #else

     return enableRT ? CuSparseFormat::kCOO : CuSparseFormat::kNone;

 #endif

   if (isAdmissibleCSR(aTp))

     return CuSparseFormat::kCSR;

   if (isAdmissibleCSC(aTp))

     return CuSparseFormat::kCSC;

   if (isAdmissibleBSR(aTp))

     return CuSparseFormat::kBSR;

   return CuSparseFormat::kNone;

 }


 /// Generates the first positions/coordinates of a sparse matrix.

 static Value genFirstPosOrCrds(OpBuilder &builder, Location loc, Value a,

                                CuSparseFormat format, bool enableRT) {

   if (format == CuSparseFormat::kCOO) {

     // Library uses SoA COO, direct IR uses AoS COO.

     if (enableRT)

       return builder.create<ToCoordinatesOp>(loc, a, 0);

     return builder.create<ToCoordinatesBufferOp>(loc, a);

   }

   // Formats CSR/CSC and BSR use positions at 1.

   return builder.create<ToPositionsOp>(loc, a, 1);

 }


 /// Generates the second coordinates of a sparse matrix.

 static Value genSecondCrds(OpBuilder &builder, Location loc, Value a,

                            CuSparseFormat format, bool enableRT) {

   bool isCOO = format == CuSparseFormat::kCOO;

   if (isCOO && !enableRT)

     return Value(); // nothing needed

   // Formats CSR/CSC and BSR use coordinates at 1.

   return builder.create<ToCoordinatesOp>(loc, a, 1);

 }


 /// Generates the sparse matrix handle.

 static Operation *genSpMat(OpBuilder &builder, Location loc,

                            SparseTensorType &aTp, Type handleTp, Type tokenTp,

                            Value token, Value sz1, Value sz2, Value nseA,

                            Value rowA, Value colA, Value valA,

                            CuSparseFormat format, bool enableRT) {

   if (format == CuSparseFormat::kCOO) {

     // Library uses SoA COO, direct IR uses AoS COO.

     if (enableRT) {

       assert(colA);

       return builder.create<gpu::CreateCooOp>(loc, handleTp, tokenTp, token,

                                               sz1, sz2, nseA, rowA, colA, valA);

     }

 #ifdef CUSPARSE_COO_AOS

     assert(!colA);

     return builder.create<gpu::CreateCooAoSOp>(loc, handleTp, tokenTp, token,

                                                sz1, sz2, nseA, rowA, valA);

 #else

     llvm_unreachable("gpu::CreateCooAoSOp is deprecated");

 #endif

   }

   assert(colA);

   if (format == CuSparseFormat::kCSR)

     return builder.create<gpu::CreateCsrOp>(loc, handleTp, tokenTp, token, sz1,

                                             sz2, nseA, rowA, colA, valA);

   if (format == CuSparseFormat::kCSC)

     return builder.create<gpu::CreateCscOp>(loc, handleTp, tokenTp, token, sz1,

                                             sz2, nseA, rowA, colA, valA);

   // BSR requires a bit more work since we need to pass in the block size

   // and all others sizes in terms of blocks (#block-rows, #block-cols,

   // #nonzero-blocks).

   assert(format == CuSparseFormat::kBSR);

   SmallVector<unsigned> dims = getBlockSize(aTp.getDimToLvl());

   assert(dims.size() == 2 && dims[0] == dims[1]);

   uint64_t b = dims[0];

   Value bSz = constantIndex(builder, loc, b);

   Value bRows = builder.create<arith::DivUIOp>(loc, sz1, bSz);

   Value bCols = builder.create<arith::DivUIOp>(loc, sz2, bSz);

   Value bNum = builder.create<arith::DivUIOp>(

       loc, nseA, constantIndex(builder, loc, b * b));

   return builder.create<gpu::CreateBsrOp>(loc, handleTp, tokenTp, token, bRows,

                                           bCols, bNum, bSz, bSz, rowA, colA,

                                           valA);

 }


 /// Match and rewrite SpMV kernel.

 static LogicalResult rewriteSpMV(PatternRewriter &rewriter,

                                  linalg::GenericOp op, bool enableRT) {

   Location loc = op.getLoc();

   Value a = op.getOperand(0);

   Value x = op.getOperand(1);

   Value y = op.getOperand(2); // we have y = Ax

   SmallVector<Value> tokens;


   // Only admissible sparse matrix format and dense vectors (no BSR).

   SparseTensorType aTp = getSparseTensorType(a);

   SparseTensorType xTp = getSparseTensorType(x);

   SparseTensorType yTp = getSparseTensorType(y);

   auto format = getCuSparseFormat(aTp, xTp, yTp, enableRT, /*isMatVec=*/true);

   if (format == CuSparseFormat::kNone || format == CuSparseFormat::kBSR)

     return failure();


   // Start sparse kernel and copy data from host to device.

   //   a : memR/memC/memV -> rowA,colA,valA

   //   x : memX           -> vecX

   //   y : memY           -> vecY

   Value nseA = rewriter.create<NumberOfEntriesOp>(loc, a);

   Value szY = linalg::createOrFoldDimOp(rewriter, loc, a, 0);

   Value szX = linalg::createOrFoldDimOp(rewriter, loc, a, 1);

   Value memR = genFirstPosOrCrds(rewriter, loc, a, format, enableRT);

   Value memC = genSecondCrds(rewriter, loc, a, format, enableRT); // or empty

   Value memV = rewriter.create<ToValuesOp>(loc, a);

   Value rowA = genAllocCopy(rewriter, loc, memR, tokens);

   Value colA = memC ? genAllocCopy(rewriter, loc, memC, tokens) : Value();

   Value valA = genAllocCopy(rewriter, loc, memV, tokens);

   Value memX = genTensorToMemref(rewriter, loc, x);

   Value vecX = genAllocCopy(rewriter, loc, memX, tokens);

   Value memY = genTensorToMemref(rewriter, loc, y);

   Value vecY = genAllocCopy(rewriter, loc, memY, tokens);

   genBlockingWait(rewriter, loc, tokens);

   tokens.clear();


   // Create sparse environment and sparse matrix/dense vector handles.

   Type indexTp = rewriter.getIndexType();

   Type dnTensorHandleTp = rewriter.getType<gpu::SparseDnTensorHandleType>();

   Type spmatHandleTp = rewriter.getType<gpu::SparseSpMatHandleType>();

   Type tokenTp = rewriter.getType<gpu::AsyncTokenType>();

   Value token = genFirstWait(rewriter, loc);

   Operation *spGenA =

       genSpMat(rewriter, loc, aTp, spmatHandleTp, tokenTp, token, szY, szX,

                nseA, rowA, colA, valA, format, enableRT);

   Value spMatA = spGenA->getResult(0);

   token = spGenA->getResult(1);

   auto dvecX = rewriter.create<gpu::CreateDnTensorOp>(

       loc, dnTensorHandleTp, tokenTp, token, vecX, szX);

   Value dnX = dvecX.getResult(0);

   token = dvecX.getAsyncToken();

   auto dvecY = rewriter.create<gpu::CreateDnTensorOp>(

       loc, dnTensorHandleTp, tokenTp, token, vecY, szY);

   Value dnY = dvecY.getResult(0);

   token = dvecY.getAsyncToken();

   auto dnYType = llvm::cast<ShapedType>(y.getType()).getElementType();


   // Precompute buffersize for SpMV.

   auto bufferComp = rewriter.create<gpu::SpMVBufferSizeOp>(

       loc, indexTp, tokenTp, token, spMatA, dnX, dnY,

       /*computeType=*/dnYType);

   Value bufferSz = bufferComp.getResult(0);

   token = bufferComp.getAsyncToken();

   auto buf = genAllocBuffer(rewriter, loc, bufferSz, token);

   Value buffer = buf.getResult(0);

   token = buf.getAsyncToken();


   // Perform the SpMV.

   auto spmvComp = rewriter.create<gpu::SpMVOp>(

       loc, tokenTp, token, spMatA, dnX, dnY, /*computeType=*/dnYType, buffer);

   token = spmvComp.getAsyncToken();


   // Copy data back to host and free all the resoures.

   token = rewriter.create<gpu::DestroySpMatOp>(loc, tokenTp, token, spMatA)

               .getAsyncToken();

   token = rewriter.create<gpu::DestroyDnTensorOp>(loc, tokenTp, token, dnX)

               .getAsyncToken();

   token = rewriter.create<gpu::DestroyDnTensorOp>(loc, tokenTp, token, dnY)

               .getAsyncToken();

   token = genDeallocMemRef(rewriter, loc, rowA, token);

   if (colA)

     token = genDeallocMemRef(rewriter, loc, colA, token);

   token = genDeallocMemRef(rewriter, loc, valA, token);

   token = genDeallocMemRef(rewriter, loc, buffer, token);

   token = genDeallocMemRef(rewriter, loc, vecX, token);

   token = genCopyMemRef(rewriter, loc, memY, vecY, token);

   token = genDeallocMemRef(rewriter, loc, vecY, token);

   tokens.push_back(token);

   genBlockingWait(rewriter, loc, tokens);

   tokens.clear();


   // Done.

   rewriter.replaceOpWithNewOp<bufferization::ToTensorOp>(op, memY);

   return success();

 }


 /// Match and rewrite SpMM kernel.

 static LogicalResult rewriteSpMM(PatternRewriter &rewriter,

                                  linalg::GenericOp op, bool enableRT) {

   Location loc = op.getLoc();

   Value a = op.getOperand(0);

   Value b = op.getOperand(1);

   Value c = op.getOperand(2); // we have C = AB

   SmallVector<Value> tokens;


   // Only admissible sparse matrix format and dense matrices (no BSR).

   SparseTensorType aTp = getSparseTensorType(a);

   SparseTensorType bTp = getSparseTensorType(b);

   SparseTensorType cTp = getSparseTensorType(c);

   auto format = getCuSparseFormat(aTp, bTp, cTp, enableRT, /*isMatVec=*/false);

   if (format == CuSparseFormat::kNone || format == CuSparseFormat::kBSR)

     return failure();


   // Start sparse kernel and copy data from host to device.

   //   a : memR/memC/memV -> rowA,colA,valA

   //   b : bufB           -> matB

   //   c : bufC           -> matC

   Value nseA = rewriter.create<NumberOfEntriesOp>(loc, a);

   Value szm = linalg::createOrFoldDimOp(rewriter, loc, a, 0);

   Value szk = linalg::createOrFoldDimOp(rewriter, loc, a, 1);

   Value szn = linalg::createOrFoldDimOp(rewriter, loc, b, 1);

   Value memR = genFirstPosOrCrds(rewriter, loc, a, format, enableRT);

   Value memC = genSecondCrds(rewriter, loc, a, format, enableRT); // or empty

   Value memV = rewriter.create<ToValuesOp>(loc, a);

   Value rowA = genAllocCopy(rewriter, loc, memR, tokens);

   Value colA = memC ? genAllocCopy(rewriter, loc, memC, tokens) : Value();

   Value valA = genAllocCopy(rewriter, loc, memV, tokens);

   Value bufB = genTensorToMemref(rewriter, loc, b);

   Value matB = genAllocCopy(rewriter, loc, bufB, tokens);

   Value bufC = genTensorToMemref(rewriter, loc, c);

   Value matC = genAllocCopy(rewriter, loc, bufC, tokens);

   genBlockingWait(rewriter, loc, tokens);

   tokens.clear();


   // Create sparse environment and sparse matrix/dense matrix handles.

   Type indexTp = rewriter.getIndexType();

   Type dnTensorHandleTp = rewriter.getType<gpu::SparseDnTensorHandleType>();

   Type spMatHandleTp = rewriter.getType<gpu::SparseSpMatHandleType>();

   Type tokenTp = rewriter.getType<gpu::AsyncTokenType>();

   Value token = genFirstWait(rewriter, loc);

   Operation *spGenA =

       genSpMat(rewriter, loc, aTp, spMatHandleTp, tokenTp, token, szm, szk,

                nseA, rowA, colA, valA, format, enableRT);

   Value spMatA = spGenA->getResult(0);

   token = spGenA->getResult(1);

   auto dmatB = rewriter.create<gpu::CreateDnTensorOp>(

       loc, dnTensorHandleTp, tokenTp, token, matB,

       SmallVector<Value>{szk, szn});

   Value dnB = dmatB.getResult(0);

   token = dmatB.getAsyncToken();

   auto dmatC = rewriter.create<gpu::CreateDnTensorOp>(

       loc, dnTensorHandleTp, tokenTp, token, matC,

       SmallVector<Value>{szm, szn});

   Value dnC = dmatC.getResult(0);

   token = dmatC.getAsyncToken();

   auto dmatCType = llvm::cast<ShapedType>(c.getType()).getElementType();


   // Precompute buffersize for SpMM.

   auto bufferComp = rewriter.create<gpu::SpMMBufferSizeOp>(

       loc, indexTp, tokenTp, token, spMatA, dnB, dnC,

       /*computeType=*/dmatCType);

   Value bufferSz = bufferComp.getResult(0);

   token = bufferComp.getAsyncToken();

   auto buf = genAllocBuffer(rewriter, loc, bufferSz, token);

   Value buffer = buf.getResult(0);

   token = buf.getAsyncToken();

   auto dnCType = llvm::cast<ShapedType>(c.getType()).getElementType();


   // Perform the SpMM.

   auto spmmComp = rewriter.create<gpu::SpMMOp>(

       loc, tokenTp, token, spMatA, dnB, dnC, /*computeType=*/dnCType, buffer);

   token = spmmComp.getAsyncToken();


   // Copy data back to host and free all the resoures.

   token = rewriter.create<gpu::DestroySpMatOp>(loc, tokenTp, token, spMatA)

               .getAsyncToken();

   token = rewriter.create<gpu::DestroyDnTensorOp>(loc, tokenTp, token, dnB)

               .getAsyncToken();

   token = rewriter.create<gpu::DestroyDnTensorOp>(loc, tokenTp, token, dnC)

               .getAsyncToken();

   token = genDeallocMemRef(rewriter, loc, rowA, token);

   if (colA)

     token = genDeallocMemRef(rewriter, loc, colA, token);

   token = genDeallocMemRef(rewriter, loc, valA, token);

   token = genDeallocMemRef(rewriter, loc, buffer, token);

   token = genDeallocMemRef(rewriter, loc, matB, token);

   token = genCopyMemRef(rewriter, loc, bufC, matC, token);

   token = genDeallocMemRef(rewriter, loc, matC, token);

   tokens.push_back(token);

   genBlockingWait(rewriter, loc, tokens);

   tokens.clear();


   // Done.

   rewriter.replaceOpWithNewOp<bufferization::ToTensorOp>(op, bufC);

   return success();

 }


 // Match and rewrite SpGEMM kernel.

 static LogicalResult rewriteSpGEMM(PatternRewriter &rewriter,

                                    linalg::GenericOp op, bool enableRT) {

   Location loc = op.getLoc();

   Value a = op.getOperand(0);

   Value b = op.getOperand(1);

   Value c = op.getOperand(2); // we have C = AB

   SmallVector<Value> tokens;


   // Only CSR <- CSR x CSR supported.

   auto format = CuSparseFormat::kCSR;

   SparseTensorType aTp = getSparseTensorType(a);

   SparseTensorType bTp = getSparseTensorType(b);

   SparseTensorType cTp = getSparseTensorType(c);

   if (!isAdmissibleCSR(aTp) || !isAdmissibleCSR(bTp) || !isAdmissibleCSR(cTp))

     return failure();


   // Start sparse kernel and copy data from host to device.

   //   a : amemR/amemC/amemV -> rowA,colA,valA

   //   b : bmemR/bmemC/bmemV -> rowB,colB,valB

   //   c : materializes

   auto dnCType = cTp.getElementType();

   Value nseA = rewriter.create<NumberOfEntriesOp>(loc, a);

   Value nseB = rewriter.create<NumberOfEntriesOp>(loc, b);

   Value szm = linalg::createOrFoldDimOp(rewriter, loc, a, 0);

   Value szk = linalg::createOrFoldDimOp(rewriter, loc, a, 1);

   Value szn = linalg::createOrFoldDimOp(rewriter, loc, b, 1);

   Value amemR = genFirstPosOrCrds(rewriter, loc, a, format, enableRT);

   Value amemC = genSecondCrds(rewriter, loc, a, format, enableRT); // not empty

   Value amemV = rewriter.create<ToValuesOp>(loc, a);

   Value bmemR = genFirstPosOrCrds(rewriter, loc, b, format, enableRT);

   Value bmemC = genSecondCrds(rewriter, loc, b, format, enableRT); // not empty

   Value bmemV = rewriter.create<ToValuesOp>(loc, b);

   Value rowA = genAllocCopy(rewriter, loc, amemR, tokens);

   Value colA = genAllocCopy(rewriter, loc, amemC, tokens);

   Value valA = genAllocCopy(rewriter, loc, amemV, tokens);

   Value rowB = genAllocCopy(rewriter, loc, bmemR, tokens);

   Value colB = genAllocCopy(rewriter, loc, bmemC, tokens);

   Value valB = genAllocCopy(rewriter, loc, bmemV, tokens);

   genBlockingWait(rewriter, loc, tokens);

   tokens.clear();


   // Create sparse environment and sparse matrix/dense vector handles.

   Type indexTp = rewriter.getIndexType();

   Type spmatHandleTp = rewriter.getType<gpu::SparseSpMatHandleType>();

   Type descTp = rewriter.getType<gpu::SparseSpGEMMOpHandleType>();

   Type tokenTp = rewriter.getType<gpu::AsyncTokenType>();

   Value token = genFirstWait(rewriter, loc);

   Operation *spGenA =

       genSpMat(rewriter, loc, aTp, spmatHandleTp, tokenTp, token, szm, szk,

                nseA, rowA, colA, valA, format, enableRT);

   Value spMatA = spGenA->getResult(0);

   token = spGenA->getResult(1);

   Operation *spGenB =

       genSpMat(rewriter, loc, bTp, spmatHandleTp, tokenTp, token, szk, szn,

                nseB, rowB, colB, valB, format, enableRT);

   Value spMatB = spGenB->getResult(0);

   token = spGenB->getResult(1);


   // Sparse matrix C materializes (also assumes beta == 0).

   Value zero = constantIndex(rewriter, loc, 0);

   Value one = constantIndex(rewriter, loc, 1);

   Value mplus1 = rewriter.create<arith::AddIOp>(loc, szm, one);

   auto e1 = genAllocBuffer(rewriter, loc, cTp.getPosType(), mplus1, token);

   Value rowC = e1.getResult(0);

   token = e1.getAsyncToken();

   auto e2 = genAllocBuffer(rewriter, loc, cTp.getCrdType(), zero, token);

   Value colC = e2.getResult(0); // no free needed

   token = e2.getAsyncToken();

   auto e3 = genAllocBuffer(rewriter, loc, dnCType, zero, token);

   Value valC = e3.getResult(0); // no free needed

   token = e3.getAsyncToken();

   Operation *spGenC =

       genSpMat(rewriter, loc, cTp, spmatHandleTp, tokenTp, token, szm, szn,

                zero, rowC, colC, valC, format, enableRT);

   Value spMatC = spGenC->getResult(0);

   token = spGenC->getResult(1);


   // Precompute buffersizes for SpGEMM.

   Operation *descOp =

       rewriter.create<gpu::SpGEMMCreateDescrOp>(loc, descTp, tokenTp, token);

   Value desc = descOp->getResult(0);

   token = descOp->getResult(1);

   Operation *work1 = rewriter.create<gpu::SpGEMMWorkEstimationOrComputeOp>(

       loc, indexTp, tokenTp, token, desc, gpu::TransposeMode::NON_TRANSPOSE,

       gpu::TransposeMode::NON_TRANSPOSE, spMatA, spMatB, spMatC, dnCType, zero,

       valC, gpu::SpGEMMWorkEstimationOrComputeKind::WORK_ESTIMATION);

   Value bufferSz1 = work1->getResult(0);

   token = work1->getResult(1);

   auto buf1 = genAllocBuffer(rewriter, loc, bufferSz1, token);

   Value buffer1 = buf1.getResult(0);

   token = buf1.getAsyncToken();

   Operation *work2 = rewriter.create<gpu::SpGEMMWorkEstimationOrComputeOp>(

       loc, indexTp, tokenTp, token, desc, gpu::TransposeMode::NON_TRANSPOSE,

       gpu::TransposeMode::NON_TRANSPOSE, spMatA, spMatB, spMatC, dnCType,

       bufferSz1, buffer1,

       gpu::SpGEMMWorkEstimationOrComputeKind::WORK_ESTIMATION);

   token = work2->getResult(1);


   // Compute step.

   Operation *compute1 = rewriter.create<gpu::SpGEMMWorkEstimationOrComputeOp>(

       loc, indexTp, tokenTp, token, desc, gpu::TransposeMode::NON_TRANSPOSE,

       gpu::TransposeMode::NON_TRANSPOSE, spMatA, spMatB, spMatC, dnCType, zero,

       valC, gpu::SpGEMMWorkEstimationOrComputeKind::COMPUTE);

   Value bufferSz2 = compute1->getResult(0);

   token = compute1->getResult(1);

   auto buf2 = genAllocBuffer(rewriter, loc, bufferSz2, token);

   Value buffer2 = buf2.getResult(0);

   token = buf2.getAsyncToken();

   Operation *compute2 = rewriter.create<gpu::SpGEMMWorkEstimationOrComputeOp>(

       loc, indexTp, tokenTp, token, desc, gpu::TransposeMode::NON_TRANSPOSE,

       gpu::TransposeMode::NON_TRANSPOSE, spMatA, spMatB, spMatC, dnCType,

       bufferSz2, buffer2, gpu::SpGEMMWorkEstimationOrComputeKind::COMPUTE);

   token = compute2->getResult(1);


   // Get sizes.

   Operation *sizes = rewriter.create<gpu::SpMatGetSizeOp>(

       loc, indexTp, indexTp, indexTp, tokenTp, token, spMatC);

   Value nnz = sizes->getResult(2);

   token = sizes->getResult(3);

   auto a2 = genAllocBuffer(rewriter, loc, cTp.getCrdType(), nnz, token);

   colC = a2.getResult(0);

   token = a2.getAsyncToken();

   auto a3 = genAllocBuffer(rewriter, loc, dnCType, nnz, token);

   valC = a3.getResult(0);

   token = a3.getAsyncToken();


   // Update C with new pointers and copy final product back into C.

   Operation *update = rewriter.create<gpu::SetCsrPointersOp>(

       loc, tokenTp, token, spMatC, rowC, colC, valC);

   token = update->getResult(0);

   Operation *copy = rewriter.create<gpu::SpGEMMCopyOp>(

       loc, tokenTp, token, desc, gpu::TransposeMode::NON_TRANSPOSE,

       gpu::TransposeMode::NON_TRANSPOSE, spMatA, spMatB, spMatC, dnCType);

   token = copy->getResult(0);


   // Allocate buffers on host.

   Value rowH = genHostBuffer(rewriter, loc, cTp.getPosType(), mplus1);

   Value colH = genHostBuffer(rewriter, loc, cTp.getCrdType(), nnz);

   Value valH = genHostBuffer(rewriter, loc, dnCType, nnz);


   // Copy data back to host and free all the resoures.

   token = rewriter.create<gpu::SpGEMMDestroyDescrOp>(loc, tokenTp, token, desc)

               .getAsyncToken();

   token = rewriter.create<gpu::DestroySpMatOp>(loc, tokenTp, token, spMatA)

               .getAsyncToken();

   token = rewriter.create<gpu::DestroySpMatOp>(loc, tokenTp, token, spMatB)

               .getAsyncToken();

   token = rewriter.create<gpu::DestroySpMatOp>(loc, tokenTp, token, spMatC)

               .getAsyncToken();

   token = genCopyMemRef(rewriter, loc, rowH, rowC, token);

   token = genCopyMemRef(rewriter, loc, colH, colC, token);

   token = genCopyMemRef(rewriter, loc, valH, valC, token);

   token = genDeallocMemRef(rewriter, loc, rowA, token);

   token = genDeallocMemRef(rewriter, loc, colA, token);

   token = genDeallocMemRef(rewriter, loc, valA, token);

   token = genDeallocMemRef(rewriter, loc, rowB, token);

   token = genDeallocMemRef(rewriter, loc, colB, token);

   token = genDeallocMemRef(rewriter, loc, valB, token);

   token = genDeallocMemRef(rewriter, loc, rowC, token);

   token = genDeallocMemRef(rewriter, loc, colC, token);

   token = genDeallocMemRef(rewriter, loc, valC, token);

   token = genDeallocMemRef(rewriter, loc, buffer1, token);

   token = genDeallocMemRef(rewriter, loc, buffer2, token);

   tokens.push_back(token);

   genBlockingWait(rewriter, loc, tokens);

   tokens.clear();


   // Done.

   Value vt = rewriter.create<bufferization::ToTensorOp>(loc, valH);

   Value rt = rewriter.create<bufferization::ToTensorOp>(loc, rowH);

   Value ct = rewriter.create<bufferization::ToTensorOp>(loc, colH);

   rewriter.replaceOpWithNewOp<AssembleOp>(op, c.getType(), ValueRange{rt, ct},

                                           vt);

   return success();

 }


 // Match and rewrite 2:4 SpMM kernel.

 static LogicalResult rewrite2To4SpMM(PatternRewriter &rewriter,

                                      linalg::GenericOp op) {

   Location loc = op.getLoc();

   Value A = op.getOperand(0);

   Value B = op.getOperand(1);

   Value C = op.getOperand(2); // we have C = AB

   SmallVector<Value> tokens;


   // The cuSparselt API currently only allows pruning and compression

   // to occur on the device. So we recognize the pattern

   //    A' = convert A  ; dense to 2:4

   //    C  = A'B        ; 2:4 matrix mult

   // and then perform compression and matrix multiplication on device.

   auto cnv = A.getDefiningOp<ConvertOp>();

   assert(cnv);

   A = cnv.getSource();


   // All input should be dense tensors.

   if (!isDenseTensor(A) || !isDenseTensor(B) || !isDenseTensor(C))

     return failure();


   // Start sparse kernel and copy data from host to device.

   //   a : bufA -> matA

   //   b : bufB -> matB

   //   c : bufC -> matC

   Value bufA = genTensorToMemref(rewriter, loc, A);

   Value matA = genAllocCopy(rewriter, loc, bufA, tokens);

   Value bufB = genTensorToMemref(rewriter, loc, B);

   Value matB = genAllocCopy(rewriter, loc, bufB, tokens);

   Value bufC = genTensorToMemref(rewriter, loc, C);

   Value matC = genAllocCopy(rewriter, loc, bufC, tokens);

   genBlockingWait(rewriter, loc, tokens);

   tokens.clear();


   // Create sparse environment and sparse matrix/dense vector handles.

   Value szm = linalg::createOrFoldDimOp(rewriter, loc, matA, 0);

   Value szk = linalg::createOrFoldDimOp(rewriter, loc, matB, 0);

   Value szn = linalg::createOrFoldDimOp(rewriter, loc, matC, 1);

   Type indexTp = rewriter.getIndexType();

   Type dnTensorHandleTp = rewriter.getType<gpu::SparseDnTensorHandleType>();

   Type spMatHandleTp = rewriter.getType<gpu::SparseSpMatHandleType>();

   Type tokenTp = rewriter.getType<gpu::AsyncTokenType>();

   Value token = genFirstWait(rewriter, loc);

   Operation *spGenA = rewriter.create<gpu::Create2To4SpMatOp>(

       loc, spMatHandleTp, tokenTp, token, szm, szk,

       gpu::Prune2To4SpMatFlag::PRUNE_AND_CHECK, matA);

   Value spMatA = spGenA->getResult(0);

   token = spGenA->getResult(1);

   auto dmatB = rewriter.create<gpu::CreateDnTensorOp>(

       loc, dnTensorHandleTp, tokenTp, token, matB,

       SmallVector<Value>{szk, szn});

   Value dnB = dmatB.getResult(0);

   token = dmatB.getAsyncToken();

   auto dmatC = rewriter.create<gpu::CreateDnTensorOp>(

       loc, dnTensorHandleTp, tokenTp, token, matC,

       SmallVector<Value>{szm, szn});

   Value dnC = dmatC.getResult(0);

   token = dmatC.getAsyncToken();

   auto dmatCType = llvm::cast<ShapedType>(matC.getType()).getElementType();


   // Precompute buffersize for SpMM.

   SmallVector<Type> bufferTypes_{indexTp, indexTp, indexTp};

   TypeRange bufferTypes(bufferTypes_);

   auto bufferComp = rewriter.create<gpu::SpMMBufferSizeOp>(

       loc, bufferTypes, tokenTp, token, gpu::TransposeMode::NON_TRANSPOSE,

       gpu::TransposeMode::NON_TRANSPOSE, spMatA, dnB, dnC,

       /*computeType=*/dmatCType);

   token = bufferComp.getAsyncToken();


   // Allocate buffers on host.

   Value bufferSz1 = bufferComp.getResult(0);

   auto buf1 = genAllocBuffer(rewriter, loc, bufferSz1, token);

   Value buffer1 = buf1.getResult(0);

   token = buf1.getAsyncToken();

   Value bufferSz2 = bufferComp.getResult(1);

   auto buf2 = genAllocBuffer(rewriter, loc, bufferSz2, token);

   Value buffer2 = buf2.getResult(0);

   token = buf2.getAsyncToken();

   Value bufferSz3 = bufferComp.getResult(2);

   auto buf3 = genAllocBuffer(rewriter, loc, bufferSz3, token);

   Value buffer3 = buf3.getResult(0);

   token = buf3.getAsyncToken();


   // Perform the SpMM.

   auto dnCType = llvm::cast<ShapedType>(matC.getType()).getElementType();

   auto spmmComp = rewriter.create<gpu::SpMMOp>(

       loc, tokenTp, token, spMatA, dnB, dnC, /*computeType=*/dnCType,

       SmallVector<Value>{buffer1, buffer2, buffer3});

   token = spmmComp.getAsyncToken();


   // Copy data back to host and free all the resources.

   token = rewriter.create<gpu::DestroySpMatOp>(loc, tokenTp, token, spMatA)

               .getAsyncToken();

   token = rewriter.create<gpu::DestroyDnTensorOp>(loc, tokenTp, token, dnB)

               .getAsyncToken();

   token = rewriter.create<gpu::DestroyDnTensorOp>(loc, tokenTp, token, dnC)

               .getAsyncToken();

   token = genDeallocMemRef(rewriter, loc, buffer1, token);

   token = genDeallocMemRef(rewriter, loc, buffer2, token);

   token = genDeallocMemRef(rewriter, loc, buffer3, token);

   token = genDeallocMemRef(rewriter, loc, matA, token);

   token = genDeallocMemRef(rewriter, loc, matB, token);

   token = genCopyMemRef(rewriter, loc, bufC, matC, token);

   token = genDeallocMemRef(rewriter, loc, matC, token);

   tokens.push_back(token);

   genBlockingWait(rewriter, loc, tokens);

   tokens.clear();


   // Done.

   rewriter.replaceOpWithNewOp<bufferization::ToTensorOp>(op, bufC);

   return success();

 }


 /// Match and rewrite SDDMM kernel.

 static LogicalResult rewriteSDDMM(PatternRewriter &rewriter,

                                   linalg::GenericOp op, bool enableRT) {

   Location loc = op.getLoc();

   Value a = op.getOperand(0);

   Value b = op.getOperand(1);

   Value c = op.getOperand(2);

   SmallVector<Value> tokens;


   // Only admissible sparse matrix format (no COO/CSC) and dense matrices.

   SparseTensorType aTp = getSparseTensorType(a);

   SparseTensorType bTp = getSparseTensorType(b);

   SparseTensorType cTp = getSparseTensorType(c);

   auto format = getCuSparseFormat(cTp, bTp, aTp, enableRT, /*isMatVec=*/false);

   if (format == CuSparseFormat::kNone || format == CuSparseFormat::kCOO ||

       format == CuSparseFormat::kCSC)

     return failure();


   // The SDDMM does the in-place operation.

   // Start sparse kernel and copy data from host to device.

   //   a : bufA           -> matA

   //   b : bufB           -> matB

   //   c : memR/memC/memV -> rowC,colC,valC

   Value nseC = rewriter.create<NumberOfEntriesOp>(loc, c);

   Value szm = linalg::createOrFoldDimOp(rewriter, loc, a, 0);

   Value szk = linalg::createOrFoldDimOp(rewriter, loc, a, 1);

   Value szn = linalg::createOrFoldDimOp(rewriter, loc, b, 1);

   Value bufA = genTensorToMemref(rewriter, loc, a);

   Value matA = genAllocCopy(rewriter, loc, bufA, tokens);

   Value bufB = genTensorToMemref(rewriter, loc, b);

   Value matB = genAllocCopy(rewriter, loc, bufB, tokens);

   Value memR = genFirstPosOrCrds(rewriter, loc, c, format, enableRT);

   Value memC = genSecondCrds(rewriter, loc, c, format, enableRT); // or empty

   Value memV = rewriter.create<ToValuesOp>(loc, c);

   Value rowC = genAllocCopy(rewriter, loc, memR, tokens);

   Value colC = memC ? genAllocCopy(rewriter, loc, memC, tokens) : Value();

   Value valC = genAllocCopy(rewriter, loc, memV, tokens);

   genBlockingWait(rewriter, loc, tokens);

   tokens.clear();


   // Create sparse environment and sparse matrix/dense matrix handles.

   Type indexTp = rewriter.getIndexType();

   Type dnMatHandleTp = rewriter.getType<gpu::SparseDnTensorHandleType>();

   Type spMatHandleTp = rewriter.getType<gpu::SparseSpMatHandleType>();

   Type tokenTp = rewriter.getType<gpu::AsyncTokenType>();

   Value token = genFirstWait(rewriter, loc);

   auto dmatA = rewriter.create<gpu::CreateDnTensorOp>(

       loc, dnMatHandleTp, tokenTp, token, matA, SmallVector<Value>{szm, szk});

   Value dnA = dmatA.getResult(0);

   token = dmatA.getAsyncToken();

   auto dmatB = rewriter.create<gpu::CreateDnTensorOp>(

       loc, dnMatHandleTp, tokenTp, token, matB, SmallVector<Value>{szk, szn});

   Value dnB = dmatB.getResult(0);

   token = dmatB.getAsyncToken();

   Operation *spGenC =

       genSpMat(rewriter, loc, cTp, spMatHandleTp, tokenTp, token, szm, szn,

                nseC, rowC, colC, valC, format, enableRT);

   Value spMatC = spGenC->getResult(0);

   token = spGenC->getResult(1);

   auto dnCType = llvm::cast<ShapedType>(c.getType()).getElementType();


   // Precompute buffersize for SDDMM.

   auto bufferComp = rewriter.create<gpu::SDDMMBufferSizeOp>(

       loc, indexTp, tokenTp, token, dnA, dnB, spMatC, dnCType);

   Value bufferSz = bufferComp.getResult(0);

   token = bufferComp.getAsyncToken();

   auto buf = genAllocBuffer(rewriter, loc, bufferSz, token);

   Value buffer = buf.getResult(0);

   token = buf.getAsyncToken();


   // Perform the SDDMM.

   auto sddmmComp = rewriter.create<gpu::SDDMMOp>(loc, tokenTp, token, dnA, dnB,

                                                  spMatC, dnCType, buffer);

   token = sddmmComp.getAsyncToken();


   // Copy data back to host and free all the resoures.

   token = rewriter.create<gpu::DestroyDnTensorOp>(loc, tokenTp, token, dnA)

               .getAsyncToken();

   token = rewriter.create<gpu::DestroyDnTensorOp>(loc, tokenTp, token, dnB)

               .getAsyncToken();

   token = rewriter.create<gpu::DestroySpMatOp>(loc, tokenTp, token, spMatC)

               .getAsyncToken();

   token = genDeallocMemRef(rewriter, loc, buffer, token);

   token = genDeallocMemRef(rewriter, loc, matA, token);

   token = genDeallocMemRef(rewriter, loc, matB, token);

   token = genDeallocMemRef(rewriter, loc, rowC, token);

   if (colC)

     token = genDeallocMemRef(rewriter, loc, colC, token);

   token = genCopyMemRef(rewriter, loc, memV, valC, token);

   token = genDeallocMemRef(rewriter, loc, valC, token);

   tokens.push_back(token);

   genBlockingWait(rewriter, loc, tokens);

   tokens.clear();


   // Done.

   rewriter.replaceOpWithNewOp<sparse_tensor::LoadOp>(op, c);

   return success();

 }


 //===----------------------------------------------------------------------===//

 // Rewriting rules for direct code generation.

 //===----------------------------------------------------------------------===//


 /// Proof-of-concept rewriter. This rule generates a GPU implementation

 /// for each outermost forall loop generated by the sparsifier.

 /// TODO: right now works with parallelization-strategy=dense-outer-loop

 ///       but give this its own flags in the future

 struct ForallRewriter : public OpRewritePattern<scf::ParallelOp> {

   using OpRewritePattern<scf::ParallelOp>::OpRewritePattern;


   ForallRewriter(MLIRContext *context, unsigned nT)

       : OpRewritePattern(context), numThreads(nT){};


   LogicalResult matchAndRewrite(scf::ParallelOp forallOp,

                                 PatternRewriter &rewriter) const override {

     // Reject inadmissible loop form.

     // Essentially only accept a loop, generated by the sparsifier,

     // of the form

     //   forall (i = 0; i < N; i++)

     // so that cyclic scheduling over the threads is easy.

     if (!forallOp->hasAttr(LoopEmitter::getLoopEmitterLoopAttrName()) ||

         forallOp.getNumReductions() != 0 || forallOp.getNumLoops() != 1 ||

         !matchPattern(forallOp.getLowerBound()[0], m_Zero()) ||

         !matchPattern(forallOp.getStep()[0], m_One()))

       return failure();

     // Collect every value that is computed outside the parallel loop.

     SetVector<Value> invariants; // stable iteration!

     forallOp->walk([&](Operation *op) {

       // Collect all values of admissible ops.

       for (OpOperand &o : op->getOpOperands()) {

         Value val = o.get();

         Block *block;

         if (auto arg = dyn_cast<BlockArgument>(val))

           block = arg.getOwner();

         else

           block = val.getDefiningOp()->getBlock();

         if (!forallOp.getRegion().findAncestorBlockInRegion(*block))

           invariants.insert(val);

       }

     });

     // Outline the outside values as proper parameters. Fail when sharing

     // value between host and device is not straightforward.

     SmallVector<Value> constants;

     SmallVector<Value> scalars;

     SmallVector<Value> buffers;

     for (Value val : invariants) {

       Type tp = val.getType();

       if (val.getDefiningOp<arith::ConstantOp>())

         constants.push_back(val);

       else if (isa<FloatType>(tp) || tp.isIntOrIndex())

         scalars.push_back(val);

       else if (isa<MemRefType>(tp))

         buffers.push_back(val);

       else

         return failure(); // don't know how to share

     }

     // Pass outlined non-constant values.

     // TODO: Experiment with `useHostRegistrationForOut` to see if we want to

     //       keep the feature at all (either through a heuristic or compiler

     //       option for gpu codegen).

     Location loc = forallOp->getLoc();

     SmallVector<Value> args;

     SmallVector<Value> tokens;

     Value out = genParametersIn(rewriter, loc, scalars, buffers, args, tokens,

                                 /*useHostRegistrationForOut=*/false);

     // Set up GPU module and construct GPU function.

     auto saveIp = rewriter.saveInsertionPoint();

     ModuleOp topModule = forallOp->getParentOfType<ModuleOp>();

     auto gpuModule = genGPUModule(rewriter, topModule);

     auto gpuFunc = genGPUFunc(rewriter, gpuModule, args);

     genGPUCode(rewriter, gpuFunc, forallOp, constants, scalars, buffers);

     // Generate code that launches the kernel asynchronously, blocking on all

     // opens tokens and yielding a new token for the output.

     // TODO: Passing in tokens to launch up does not seem to be properly lowered

     //       by cubin yet, hence the current blocking wait.

     rewriter.restoreInsertionPoint(saveIp);

     genBlockingWait(rewriter, loc, tokens);

     tokens.clear();

     Value kernelToken =

         genLaunchGPUFunc(rewriter, gpuFunc, args, tokens, numThreads);

     // Finalize the outlined arguments.

     genParametersOut(rewriter, loc, out, kernelToken, scalars, buffers, args,

                      tokens);

     genBlockingWait(rewriter, loc, tokens);

     rewriter.eraseOp(forallOp);

     return success();

   }


 private:

   unsigned numThreads;

 };


 //===----------------------------------------------------------------------===//

 // Rewriting rules for library recognition and code generation.

 //===----------------------------------------------------------------------===//


 /// Proof-of-concept rewriter. This rule recognizes certain math kernels

 /// and replaces these with corresponding calls into a sparse library.

 struct LinalgOpRewriter : public OpRewritePattern<linalg::GenericOp> {

   using OpRewritePattern<linalg::GenericOp>::OpRewritePattern;


   LinalgOpRewriter(MLIRContext *context, bool rt)

       : OpRewritePattern(context), enableRT(rt) {}


   LogicalResult matchAndRewrite(linalg::GenericOp op,

                                 PatternRewriter &rewriter) const override {

     if (op.getNumDpsInits() != 1)

       return failure(); // reject multi-output


     const unsigned numLoops = op.getNumLoops();

     const unsigned numTensors = op->getNumOperands();

     const auto iteratorTypes = op.getIteratorTypesArray();

     SmallVector<AffineMap, 4> maps = op.getIndexingMapsArray();


     using MapList = ArrayRef<ArrayRef<AffineExpr>>;

     auto infer = [&](MapList m) {

       return AffineMap::inferFromExprList(m, op.getContext());

     };

     AffineExpr i, j, k;

     bindDims(getContext(), i, j, k);


     // TODO: more robust patterns, transposed versions, more kernels,

     //       identify alpha and beta and pass them to the CUDA calls.


     // Recognize a SpMV kernel.

     if (numLoops == 2 && numTensors == 3 &&

         linalg::isParallelIterator(iteratorTypes[0]) &&

         linalg::isReductionIterator(iteratorTypes[1]) &&

         maps == infer({{i, j}, {j}, {i}}) && matchSumOfMultOfArgs(op)) {

       return rewriteSpMV(rewriter, op, enableRT);

     }


     // Recognize a SpGEMM, 2:4-SpMM, or SpMM kernel.

     if (numLoops == 3 && numTensors == 3 &&

         linalg::isParallelIterator(iteratorTypes[0]) &&

         linalg::isParallelIterator(iteratorTypes[1]) &&

         linalg::isReductionIterator(iteratorTypes[2]) &&

         maps == infer({{i, k}, {k, j}, {i, j}}) && matchSumOfMultOfArgs(op)) {

       if (!isDenseTensor(op.getOperand(0)) && !isDenseTensor(op.getOperand(1)))

         return rewriteSpGEMM(rewriter, op, enableRT);

       if (isConversionInto24(op.getOperand(0)))

         return rewrite2To4SpMM(rewriter, op);

       return rewriteSpMM(rewriter, op, enableRT);

     }


     // Recognize a SDDMM kernel.

     if (numLoops == 3 && numTensors == 3 &&

         linalg::isParallelIterator(iteratorTypes[0]) &&

         linalg::isParallelIterator(iteratorTypes[1]) &&

         linalg::isReductionIterator(iteratorTypes[2]) &&

         maps == infer({{i, k}, {k, j}, {i, j}}) &&

         matchSumReductionOfMulUnary(op)) {

       return rewriteSDDMM(rewriter, op, enableRT);

     }


     return failure();

   }


 private:

   bool enableRT;

 };


 } // namespace


 //===----------------------------------------------------------------------===//

 // Public method for populating GPU rewriting rules.

 //

 // Currently two set of rewriting rules are made available. The first set

 // implements direct code generation, currently by means of convering the

 // outermost paralell loop into GPU threads. The second set implements

 // libary recognition of a set of sparse operations. Eventually, the right

 // combination of these two approaches has to be found.

 //===----------------------------------------------------------------------===//


 void mlir::populateSparseGPUCodegenPatterns(RewritePatternSet &patterns,

                                             unsigned numThreads) {

   patterns.add<ForallRewriter>(patterns.getContext(), numThreads);

 }


 void mlir::populateSparseGPULibgenPatterns(RewritePatternSet &patterns,

                                            bool enableRT) {

   patterns.add<LinalgOpRewriter>(patterns.getContext(), enableRT);

 }

Bufferization.h

CodegenUtils.h

copy
static void copy(Location loc, Value dst, Value src, Value size, OpBuilder &builder)
Copies the given number of bytes from src to dst pointers.
Definition: ConvertLaunchFuncToLLVMCalls.cpp:71

Utils.h

Passes.h

GPUDialect.h

IRMapping.h

getContext
static MLIRContext * getContext(OpFoldResult val)
Definition: IndexingUtils.cpp:295

LoopEmitter.h

Matchers.h

SparseTensorType.h

llvm::ArrayRef
Definition: LLVM.h:48

llvm::SetVector
Definition: LLVM.h:66

llvm::SmallString
Definition: LLVM.h:41

llvm::SmallVectorImpl
Definition: LLVM.h:74

llvm::SmallVector
Definition: LLVM.h:72

mlir::AffineExpr
Base type for affine expression.
Definition: AffineExpr.h:68

mlir::AffineMap::inferFromExprList
static SmallVector< AffineMap, 4 > inferFromExprList(ArrayRef< ArrayRef< AffineExpr >> exprsList, MLIRContext *context)
Returns a vector of AffineMaps; each with as many results as exprs.size(), as many dims as the larges...
Definition: AffineMap.cpp:312

mlir::Block
Block represents an ordered list of Operations.
Definition: Block.h:33

mlir::Block::getArgument
BlockArgument getArgument(unsigned i)
Definition: Block.h:129

mlir::Block::getArguments
BlockArgListType getArguments()
Definition: Block.h:87

mlir::Block::front
Operation & front()
Definition: Block.h:153

mlir::Builder::getUnitAttr
UnitAttr getUnitAttr()
Definition: Builders.cpp:94

mlir::Builder::getType
Ty getType(Args &&...args)
Get or construct an instance of the type Ty with provided arguments.
Definition: Builders.h:88

mlir::Builder::getIndexType
IndexType getIndexType()
Definition: Builders.cpp:51

mlir::Builder::getI8Type
IntegerType getI8Type()
Definition: Builders.cpp:59

mlir::IRMapping
This is a utility class for mapping one set of IR entities to another.
Definition: IRMapping.h:26

mlir::IRMapping::lookup
auto lookup(T from) const
Lookup a mapped value within the map.
Definition: IRMapping.h:72

mlir::IRMapping::map
void map(Value from, Value to)
Inserts a new mapping for 'from' to 'to'.
Definition: IRMapping.h:30

mlir::Location
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Definition: Location.h:66

mlir::MLIRContext
MLIRContext is the top-level object for a collection of MLIR operations.
Definition: MLIRContext.h:60

mlir::OpBuilder
This class helps build Operations.
Definition: Builders.h:204

mlir::OpBuilder::saveInsertionPoint
InsertPoint saveInsertionPoint() const
Return a saved insertion point.
Definition: Builders.h:382

mlir::OpBuilder::clone
Operation * clone(Operation &op, IRMapping &mapper)
Creates a deep copy of the specified operation, remapping any operands that use values outside of the...
Definition: Builders.cpp:549

mlir::OpBuilder::setInsertionPointToStart
void setInsertionPointToStart(Block *block)
Sets the insertion point to the start of the specified block.
Definition: Builders.h:428

mlir::OpBuilder::setInsertionPoint
void setInsertionPoint(Block *block, Block::iterator insertPoint)
Set the insertion point to the specified location.
Definition: Builders.h:395

mlir::OpBuilder::restoreInsertionPoint
void restoreInsertionPoint(InsertPoint ip)
Restore the insert point to a previously saved point.
Definition: Builders.h:387

mlir::OpBuilder::cloneRegionBefore
void cloneRegionBefore(Region &region, Region &parent, Region::iterator before, IRMapping &mapping)
Clone the blocks that belong to "region" before the given position in another region "parent".
Definition: Builders.cpp:576

mlir::OpBuilder::create
Operation * create(const OperationState &state)
Creates an operation given the fields represented as an OperationState.
Definition: Builders.cpp:453

mlir::OpBuilder::setInsertionPointAfter
void setInsertionPointAfter(Operation *op)
Sets the insertion point to the node after the specified operation, which will cause subsequent inser...
Definition: Builders.h:409

mlir::OpOperand
This class represents an operand of an operation.
Definition: Value.h:257

mlir::Operation
Operation is the basic unit of execution within MLIR.
Definition: Operation.h:88

mlir::Operation::getResult
OpResult getResult(unsigned idx)
Get the 'idx'th result of this operation.
Definition: Operation.h:407

mlir::Operation::setAttr
void setAttr(StringAttr name, Attribute value)
If the an attribute exists with the specified name, change it to the new value.
Definition: Operation.h:582

mlir::Operation::getOpOperands
MutableArrayRef< OpOperand > getOpOperands()
Definition: Operation.h:383

mlir::PatternRewriter
A special type of RewriterBase that coordinates the application of a rewrite pattern on the current I...
Definition: PatternMatch.h:749

mlir::RewritePatternSet
Definition: PatternMatch.h:772

mlir::RewriterBase::eraseBlock
virtual void eraseBlock(Block *block)
This method erases all operations in a block.
Definition: PatternMatch.cpp:229

mlir::RewriterBase::eraseOp
virtual void eraseOp(Operation *op)
This method erases an operation that is known to have no uses.
Definition: PatternMatch.cpp:157

mlir::RewriterBase::replaceOpWithNewOp
OpTy replaceOpWithNewOp(Operation *op, Args &&...args)
Replace the results of the given (original) op with a new op that is created without verification (re...
Definition: PatternMatch.h:500

mlir::TypeRange
This class provides an abstraction over the various different ranges of value types.
Definition: TypeRange.h:37

mlir::Type
Instances of the Type class are uniqued, have an immutable identifier and an optional mutable compone...
Definition: Types.h:74

mlir::Type::isIntOrIndex
bool isIntOrIndex() const
Return true if this is an integer (of any signedness) or an index type.
Definition: Types.cpp:112

mlir::ValueRange
This class provides an abstraction over the different types of ranges over Values.
Definition: ValueRange.h:387

mlir::Value
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition: Value.h:96

mlir::Value::getType
Type getType() const
Return the type of this value.
Definition: Value.h:105

mlir::Value::getDefiningOp
Operation * getDefiningOp() const
If this value is the result of an operation, return the operation that defines it.
Definition: Value.cpp:20

mlir::gpu::AsyncTokenType
Definition: GPUDialect.h:46

mlir::gpu::SparseDnTensorHandleType
Definition: GPUDialect.h:179

mlir::gpu::SparseSpGEMMOpHandleType
Definition: GPUDialect.h:199

mlir::gpu::SparseSpMatHandleType
Definition: GPUDialect.h:189

mlir::sparse_tensor::LoopEmitter::getLoopEmitterLoopAttrName
constexpr static llvm::StringLiteral getLoopEmitterLoopAttrName()
Definition: LoopEmitter.h:243

mlir::sparse_tensor::SparseTensorType
A wrapper around RankedTensorType, which has three goals:
Definition: SparseTensorType.h:46

mlir::sparse_tensor::SparseTensorType::isSingletonLvl
bool isSingletonLvl(Level l) const
Definition: SparseTensorType.h:314

mlir::sparse_tensor::SparseTensorType::getElementType
Type getElementType() const
Definition: SparseTensorType.h:170

mlir::sparse_tensor::SparseTensorType::getCrdWidth
unsigned getCrdWidth() const
Returns the coordinate-overhead bitwidth, defaulting to zero.
Definition: SparseTensorType.h:322

mlir::sparse_tensor::SparseTensorType::hasEncoding
bool hasEncoding() const
Returns true for tensors which have an encoding, and false for those which do not.
Definition: SparseTensorType.h:181

mlir::sparse_tensor::SparseTensorType::getDimRank
Dimension getDimRank() const
Returns the dimension-rank.
Definition: SparseTensorType.h:239

mlir::sparse_tensor::SparseTensorType::isNOutOfMLvl
bool isNOutOfMLvl(Level l) const
Definition: SparseTensorType.h:315

mlir::sparse_tensor::SparseTensorType::getCrdType
Type getCrdType() const
Returns the coordinate-overhead MLIR type, defaulting to IndexType.
Definition: SparseTensorType.h:338

mlir::sparse_tensor::SparseTensorType::isIdentity
bool isIdentity() const
Returns true if the dimToLvl mapping is the identity.
Definition: SparseTensorType.h:203

mlir::sparse_tensor::SparseTensorType::isCompressedLvl
bool isCompressedLvl(Level l) const
Definition: SparseTensorType.h:310

mlir::sparse_tensor::SparseTensorType::getLvlRank
Level getLvlRank() const
Returns the level-rank.
Definition: SparseTensorType.h:242

mlir::sparse_tensor::SparseTensorType::getPosWidth
unsigned getPosWidth() const
Returns the position-overhead bitwidth, defaulting to zero.
Definition: SparseTensorType.h:325

mlir::sparse_tensor::SparseTensorType::isPermutation
bool isPermutation() const
Returns true if the dimToLvl mapping is a permutation.
Definition: SparseTensorType.h:199

mlir::sparse_tensor::SparseTensorType::isDenseLvl
bool isDenseLvl(Level l) const
Definition: SparseTensorType.h:309

mlir::sparse_tensor::SparseTensorType::getDimToLvl
AffineMap getDimToLvl() const
Returns the dimToLvl mapping (or the null-map for the identity).
Definition: SparseTensorType.h:212

mlir::sparse_tensor::SparseTensorType::getPosType
Type getPosType() const
Returns the position-overhead MLIR type, defaulting to IndexType.
Definition: SparseTensorType.h:341

mlir::sparse_tensor::SparseTensorType::isOrderedLvl
bool isOrderedLvl(Level l) const
Definition: SparseTensorType.h:316

mlir::sparse_tensor::SparseTensorType::isUniqueLvl
bool isUniqueLvl(Level l) const
Definition: SparseTensorType.h:317

Linalg.h

MemRef.h

SCF.h

SparseTensor.h

mlir::linalg::isParallelIterator
bool isParallelIterator(utils::IteratorType iteratorType)
Check if iterator type has "parallel" semantics.
Definition: Utils.cpp:238

mlir::linalg::isReductionIterator
bool isReductionIterator(utils::IteratorType iteratorType)
Check if iterator type has "reduction" semantics.
Definition: Utils.cpp:242

mlir::linalg::createOrFoldDimOp
Value createOrFoldDimOp(OpBuilder &b, Location loc, Value val, int64_t dim)
Create one memref::DimOp or tensor::DimOp depending on the type of val.
Definition: LinalgOps.cpp:99

mlir::nvgpu::MatMulOperandRole::C
@ C

mlir::nvgpu::MatMulOperandRole::A
@ A

mlir::nvgpu::MatMulOperandRole::B
@ B

mlir::sparse_tensor
Definition: Enums.h:41

mlir::sparse_tensor::constantIndex
Value constantIndex(OpBuilder &builder, Location loc, int64_t i)
Generates a constant of index type.
Definition: CodegenUtils.h:331

mlir::sparse_tensor::getSparseTensorType
SparseTensorType getSparseTensorType(Value val)
Convenience methods to obtain a SparseTensorType from a Value.
Definition: SparseTensorType.h:374

mlir::sparse_tensor::getBlockSize
SmallVector< unsigned > getBlockSize(AffineMap dimToLvl)
Given the dimToLvl map, returns the block sizes in a vector.
Definition: SparseTensorDialect.cpp:1112

mlir
Include the generated interface declarations.
Definition: LocalAliasAnalysis.h:20

mlir::matchPattern
bool matchPattern(Value value, const Pattern &pattern)
Entry point for matching a pattern over a Value.
Definition: Matchers.h:490

mlir::TypedValue
std::conditional_t< std::is_same_v< Ty, mlir::Type >, mlir::Value, detail::TypedValue< Ty > > TypedValue
If Ty is mlir::Type this will select Value instead of having a wrapper around it.
Definition: Value.h:488

mlir::bindDims
void bindDims(MLIRContext *ctx, AffineExprTy &...exprs)
Bind a list of AffineExpr references to DimExpr at positions: [0 .
Definition: AffineExpr.h:311

mlir::populateSparseGPULibgenPatterns
void populateSparseGPULibgenPatterns(RewritePatternSet &patterns, bool enableRT)
Definition: SparseGPUCodegen.cpp:1329

mlir::SparseParallelizationStrategy::kNone
@ kNone

mlir::m_Zero
detail::constant_int_predicate_matcher m_Zero()
Matches a constant scalar / vector splat / tensor splat integer zero.
Definition: Matchers.h:442

mlir::patterns
const FrozenRewritePatternSet & patterns
Definition: GreedyPatternRewriteDriver.h:283

mlir::m_One
detail::constant_int_predicate_matcher m_One()
Matches a constant scalar / vector splat / tensor splat integer one.
Definition: Matchers.h:478

mlir::get
auto get(MLIRContext *context, Ts &&...params)
Helper method that injects context only if needed, this helps unify some of the attribute constructio...
Definition: BytecodeImplementation.h:509

mlir::populateSparseGPUCodegenPatterns
void populateSparseGPUCodegenPatterns(RewritePatternSet &patterns, unsigned numThreads)
Definition: SparseGPUCodegen.cpp:1324

UnrankedMemRefType
Definition: CRunnerUtils.h:311

mlir::OpRewritePattern
OpRewritePattern is a wrapper around RewritePattern that allows for matching and rewriting against an...
Definition: PatternMatch.h:314

mlir::gpu::KernelDim3
Utility class for the GPU dialect to represent triples of Values accessible through ....
Definition: GPUDialect.h:39

j
Eliminates variable at the specified position using Fourier-Motzkin variable elimination.