mlir.dialects._nvgpu_ops_gen
============================

.. py:module:: mlir.dialects._nvgpu_ops_gen


Attributes
----------

.. autoapisummary::

   mlir.dialects._nvgpu_ops_gen._ods_ir


Classes
-------

.. autoapisummary::

   mlir.dialects._nvgpu_ops_gen._Dialect
   mlir.dialects._nvgpu_ops_gen.DeviceAsyncCopyOp
   mlir.dialects._nvgpu_ops_gen.DeviceAsyncCreateGroupOp
   mlir.dialects._nvgpu_ops_gen.DeviceAsyncWaitOp
   mlir.dialects._nvgpu_ops_gen.LdMatrixOp
   mlir.dialects._nvgpu_ops_gen.MBarrierArriveExpectTxOp
   mlir.dialects._nvgpu_ops_gen.MBarrierArriveNoCompleteOp
   mlir.dialects._nvgpu_ops_gen.MBarrierArriveOp
   mlir.dialects._nvgpu_ops_gen.MBarrierCreateOp
   mlir.dialects._nvgpu_ops_gen.MBarrierGetOp
   mlir.dialects._nvgpu_ops_gen.MBarrierInitOp
   mlir.dialects._nvgpu_ops_gen.MBarrierTestWaitOp
   mlir.dialects._nvgpu_ops_gen.MBarrierTryWaitParityOp
   mlir.dialects._nvgpu_ops_gen.MmaSparseSyncOp
   mlir.dialects._nvgpu_ops_gen.MmaSyncOp
   mlir.dialects._nvgpu_ops_gen.RcpOp
   mlir.dialects._nvgpu_ops_gen.TmaAsyncLoadOp
   mlir.dialects._nvgpu_ops_gen.TmaAsyncStoreOp
   mlir.dialects._nvgpu_ops_gen.TmaCreateDescriptorOp
   mlir.dialects._nvgpu_ops_gen.TmaFenceOp
   mlir.dialects._nvgpu_ops_gen.TmaPrefetchOp
   mlir.dialects._nvgpu_ops_gen.WarpgroupGenerateDescriptorOp
   mlir.dialects._nvgpu_ops_gen.WarpgroupMmaInitAccumulatorOp
   mlir.dialects._nvgpu_ops_gen.WarpgroupMmaOp
   mlir.dialects._nvgpu_ops_gen.WarpgroupMmaStoreOp


Functions
---------

.. autoapisummary::

   mlir.dialects._nvgpu_ops_gen.device_async_copy
   mlir.dialects._nvgpu_ops_gen.device_async_create_group
   mlir.dialects._nvgpu_ops_gen.device_async_wait
   mlir.dialects._nvgpu_ops_gen.ldmatrix
   mlir.dialects._nvgpu_ops_gen.mbarrier_arrive_expect_tx
   mlir.dialects._nvgpu_ops_gen.mbarrier_arrive_nocomplete
   mlir.dialects._nvgpu_ops_gen.mbarrier_arrive
   mlir.dialects._nvgpu_ops_gen.mbarrier_create
   mlir.dialects._nvgpu_ops_gen.mbarrier_get
   mlir.dialects._nvgpu_ops_gen.mbarrier_init
   mlir.dialects._nvgpu_ops_gen.mbarrier_test_wait
   mlir.dialects._nvgpu_ops_gen.mbarrier_try_wait_parity
   mlir.dialects._nvgpu_ops_gen.mma_sp_sync
   mlir.dialects._nvgpu_ops_gen.mma_sync
   mlir.dialects._nvgpu_ops_gen.rcp
   mlir.dialects._nvgpu_ops_gen.tma_async_load
   mlir.dialects._nvgpu_ops_gen.tma_async_store
   mlir.dialects._nvgpu_ops_gen.tma_create_descriptor
   mlir.dialects._nvgpu_ops_gen.tma_fence_descriptor
   mlir.dialects._nvgpu_ops_gen.tma_prefetch_descriptor
   mlir.dialects._nvgpu_ops_gen.warpgroup_generate_descriptor
   mlir.dialects._nvgpu_ops_gen.warpgroup_mma_init_accumulator
   mlir.dialects._nvgpu_ops_gen.warpgroup_mma
   mlir.dialects._nvgpu_ops_gen.warpgroup_mma_store


Module Contents
---------------

.. py:data:: _ods_ir

.. py:class:: _Dialect(descriptor: object)

   Bases: :py:obj:`_ods_ir`


   .. py:attribute:: DIALECT_NAMESPACE
      :value: 'nvgpu'


.. py:class:: DeviceAsyncCopyOp(dst, dstIndices, src, srcIndices, dstElements, *, srcElements=None, bypassL1=None, results=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`


   The ``nvgpu.device_async_copy`` op initiates an asynchronous copy operation of
   elements from source (global memory) to the destination (shared memory)
   without blocking the thread. The async copy is added to a group.

   This op is meant to be used with ``nvgpu.device_async_create_group`` and
   ``nvgpu.device_async_wait`` to synchronize copies as explained in those ops
   descriptions.

   ``bypassL1`` attribute is hint to the hardware to bypass the L1 cache during
   async copy, this hint may be ignored by the hardware.

   ``dstElements`` attribute is the total number of elements written to
   destination (shared memory).

   ``srcElements`` argument is the total number of elements read from
   source (global memory).

   ``srcElements`` is an optional argument and when present the op only reads
   ``srcElements`` number of elements from the source (global memory) and zero fills
   the rest of the elements in the destination (shared memory).

   In order to do a copy and wait for the result we need the following
   combination:

   .. code::

       // copy 1.
       %cp1 = nvgpu.device_async_copy %A[%c0], %B[%c0], 4 :memref<16xf32> to memref<16xf32, 3>
       // copy 2.
       %cp2 = nvgpu.device_async_copy %C[%c0], %D[%c0], 4 : memref<16xf32> to memref<16xf32, 3>
       // group 1 contains copy 1 and copy 2.
       %token1 = nvgpu.device_async_create_group %cp1, %cp2
       // copy 3.
       %cp3 = nvgpu.device_async_copy %E[%c0], %F[%c0], 4 : memref<16xf32> to memref<16xf32, 3>
       // group 2 contains copy 3.
       %token2 = nvgpu.device_async_create_group %cp3
       // after the wait copy 1 and copy 2 are complete.
       nvgpu.device_async_wait %token1
       // after the wait copy 3 is complete.
       nvgpu.device_async_wait %token2

   Example:

   .. code:: mlir

       %0 = nvgpu.device_async_copy %src[%c0, %c0], %dst[%c0, %c0, %c0], 4 :
         memref<4x5xf32> to memref<2x7x5xf32, 3>


   .. py:attribute:: OPERATION_NAME
      :value: 'nvgpu.device_async_copy'


   .. py:attribute:: _ODS_OPERAND_SEGMENTS


   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)


   .. py:method:: dst() -> _ods_ir


   .. py:method:: dstIndices() -> _ods_ir


   .. py:method:: src() -> _ods_ir


   .. py:method:: srcIndices() -> _ods_ir


   .. py:method:: srcElements() -> Optional[_ods_ir]


   .. py:method:: dstElements() -> _ods_ir


   .. py:method:: bypassL1() -> bool


   .. py:method:: asyncToken() -> _ods_ir


.. py:function:: device_async_copy(dst, dst_indices, src, src_indices, dst_elements, *, src_elements=None, bypass_l1=None, results=None, loc=None, ip=None) -> _ods_ir

.. py:class:: DeviceAsyncCreateGroupOp(inputTokens, *, results=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`


   The ``nvgpu.device_async_create_group`` op creates a group of memory accesses
   containing all the pending ``device_async_copy`` operations associated with
   argument tokens. Each token can only be part of one group.

   It returns a token that can be use to wait until the group fully completes.

   This is meant to be used with ``nvgpu.device_async_wait`` to synchronize copies
   as explained in those ops descriptions.

   Groups are executed in the order they are created.

   Example:

   .. code:: mlir

       %0 = nvgpu.device_async_create_group


   .. py:attribute:: OPERATION_NAME
      :value: 'nvgpu.device_async_create_group'


   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)


   .. py:method:: inputTokens() -> _ods_ir


   .. py:method:: asyncToken() -> _ods_ir


.. py:function:: device_async_create_group(input_tokens, *, results=None, loc=None, ip=None) -> _ods_ir

.. py:class:: DeviceAsyncWaitOp(asyncDependencies, *, numGroups=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`


   The ``nvgpu.device_async_wait`` op will block the execution thread until the group
   associated with the source token is fully completed.

   The optional ``$numGroups`` attribute gives an upper bound of the number of
   groups uncompleted when the wait can unblock the thread. For example,  if
   16 async groups are pushe and ``$numGroups`` is set to 12, then the thread
   will unblock when 12 groups or fewer are in flight (4 groups have
   completed).

   Example:

   .. code:: mlir

       nvgpu.device_async_wait %0


   .. py:attribute:: OPERATION_NAME
      :value: 'nvgpu.device_async_wait'


   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)


   .. py:method:: asyncDependencies() -> _ods_ir


   .. py:method:: numGroups() -> Optional[_ods_ir]


.. py:function:: device_async_wait(async_dependencies, *, num_groups=None, loc=None, ip=None) -> DeviceAsyncWaitOp

.. py:class:: LdMatrixOp(res, srcMemref, indices, transpose, numTiles, *, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`


   The ``nvgpu.ldmatrix`` op represents loading a matrix fragment from
   memory to registers. The source and result type must be compatible
   with lowering to the ``nvvm.ldmatrix`` instruction. This op represents
   the distributed version of a ``vector.transfer_read`` as an intermediate
   step between lowering from ``vector.transfer_read`` to ``nvvm.ldmatrix``.

   This operation is meant to follow the semantic of described here:
   https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-ldmatrix

   Example:

   .. code:: mlir

       %0 = nvgpu.ldmatrix %sm[%c0, %c0] {numTiles = 4 : i32, transpose = false} :
         memref<?x?xf16, 3> -> vector<4x2xf16>


   .. py:attribute:: OPERATION_NAME
      :value: 'nvgpu.ldmatrix'


   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)


   .. py:method:: srcMemref() -> _ods_ir


   .. py:method:: indices() -> _ods_ir


   .. py:method:: transpose() -> _ods_ir


   .. py:method:: numTiles() -> _ods_ir


   .. py:method:: res() -> _ods_ir


.. py:function:: ldmatrix(res, src_memref, indices, transpose, num_tiles, *, loc=None, ip=None) -> _ods_ir

.. py:class:: MBarrierArriveExpectTxOp(barriers, txcount, mbarId, *, predicate=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`


   A thread executing the Op performs an expect-tx operation on the mbarrier
   object at the location specified by the address operand $barrier. The
   expect-tx operation, with an $txcount argument, increases the tx-count of
   an mbarrier object by the value specified by $txcount. This makes the
   current phase of the mbarrier object to expect and track the completion of
   additional asynchronous transactions.

   The ``$txCount`` specifies the number of element to the expect-tx operation.

   Example:

   .. code:: mlir

         nvgpu.mbarrier.arrive.expect_tx %barrier, %ic0 : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>


   .. py:attribute:: OPERATION_NAME
      :value: 'nvgpu.mbarrier.arrive.expect_tx'


   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)


   .. py:method:: barriers() -> _ods_ir


   .. py:method:: txcount() -> _ods_ir


   .. py:method:: mbarId() -> _ods_ir


   .. py:method:: predicate() -> Optional[_ods_ir]


.. py:function:: mbarrier_arrive_expect_tx(barriers, txcount, mbar_id, *, predicate=None, loc=None, ip=None) -> MBarrierArriveExpectTxOp

.. py:class:: MBarrierArriveNoCompleteOp(barriers, mbarId, count, *, results=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`


   The Op performs arrive-on operation on the ``mbarrier`` object and returns a
   ``nvgpu.mbarrier.token``.

   The Op does not cause the ``nvgpu.mbarrier`` to complete its current phase.

   Example:

   .. code:: mlir

         %token = nvgpu.mbarrier.arrive.noComplete %barrier, %count : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>> -> !nvgpu.mbarrier.token


   .. py:attribute:: OPERATION_NAME
      :value: 'nvgpu.mbarrier.arrive.nocomplete'


   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)


   .. py:method:: barriers() -> _ods_ir


   .. py:method:: mbarId() -> _ods_ir


   .. py:method:: count() -> _ods_ir


   .. py:method:: token() -> _ods_ir


.. py:function:: mbarrier_arrive_nocomplete(barriers, mbar_id, count, *, results=None, loc=None, ip=None) -> _ods_ir

.. py:class:: MBarrierArriveOp(barriers, mbarId, *, results=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`


   The Op performs arrive-on operation on the ``mbarrier`` object and returns a
   ``nvgpu.mbarrier.token``.

   For more information, see
   https://docs.nvidia.com/cuda/parallel-thread-execution/#arrive-on-operation-on-mbarrier-object

   Example:

   .. code:: mlir

         %token = nvgpu.mbarrier.arrive %barrier : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>> -> !nvgpu.mbarrier.token


   .. py:attribute:: OPERATION_NAME
      :value: 'nvgpu.mbarrier.arrive'


   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)


   .. py:method:: barriers() -> _ods_ir


   .. py:method:: mbarId() -> _ods_ir


   .. py:method:: token() -> _ods_ir


.. py:function:: mbarrier_arrive(barriers, mbar_id, *, results=None, loc=None, ip=None) -> _ods_ir

.. py:class:: MBarrierCreateOp(barriers, *, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`


   The Op generates one or more ``mbarrier`` object, which is a barrier created in
   shared memory and supports various synchronization behaviors for threads.

   The ``mbarrier`` object has the following type and alignment requirements:
   Type: .b64, Alignment: 8, Memory space: .shared

   Example:

   .. code:: mlir

         %barrier = nvgpu.mbarrier.create -> !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>


   .. py:attribute:: OPERATION_NAME
      :value: 'nvgpu.mbarrier.create'


   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)


   .. py:method:: barriers() -> _ods_ir


.. py:function:: mbarrier_create(barriers, *, loc=None, ip=None) -> _ods_ir

.. py:class:: MBarrierGetOp(mbarrierPointer, barriers, mbarId, *, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`


   The ``nvgpu.mbarrier.get`` operation retrieves a pointer to a specific
   ``mbarrier`` object from a group of barriers created by the ``nvgpu.mbarrier.create`` operation.

   Example:

   .. code:: mlir

         %mbars = nvgpu.mbarrier.create -> !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>, num_barriers = 10>
         %mbar_pointer = nvgpu.mbarrier.get %mbars[%c2] : !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>


   .. py:attribute:: OPERATION_NAME
      :value: 'nvgpu.mbarrier.get'


   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)


   .. py:method:: barriers() -> _ods_ir


   .. py:method:: mbarId() -> _ods_ir


   .. py:method:: mbarrierPointer() -> _ods_ir


.. py:function:: mbarrier_get(mbarrier_pointer, barriers, mbar_id, *, loc=None, ip=None) -> _ods_ir

.. py:class:: MBarrierInitOp(barriers, count, mbarId, *, predicate=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`


   The Op initializes the ``mbarrier`` object with the given number of threads.

   Example:

   .. code:: mlir

         %num_threads = gpu.block_dim x
         %barrier = nvgpu.mbarrier.create -> !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
         nvgpu.mbarrier.init %barrier, %num_threads : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>


   .. py:attribute:: OPERATION_NAME
      :value: 'nvgpu.mbarrier.init'


   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)


   .. py:method:: barriers() -> _ods_ir


   .. py:method:: count() -> _ods_ir


   .. py:method:: mbarId() -> _ods_ir


   .. py:method:: predicate() -> Optional[_ods_ir]


.. py:function:: mbarrier_init(barriers, count, mbar_id, *, predicate=None, loc=None, ip=None) -> MBarrierInitOp

.. py:class:: MBarrierTestWaitOp(barriers, token, mbarId, *, results=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`


   Checks whether the mbarrier object has completed the phase. It is is a
   non-blocking instruction which tests for the completion of the phase.

   Example:

   .. code:: mlir

         %isComplete = nvgpu.mbarrier.test.wait %barrier, %token : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>, !nvgpu.mbarrier.token


   .. py:attribute:: OPERATION_NAME
      :value: 'nvgpu.mbarrier.test.wait'


   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)


   .. py:method:: barriers() -> _ods_ir


   .. py:method:: token() -> _ods_ir


   .. py:method:: mbarId() -> _ods_ir


   .. py:method:: waitComplete() -> _ods_ir


.. py:function:: mbarrier_test_wait(barriers, token, mbar_id, *, results=None, loc=None, ip=None) -> _ods_ir

.. py:class:: MBarrierTryWaitParityOp(barriers, phaseParity, ticks, mbarId, *, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`


   Checks whether the mbarrier object has completed the phase. It is is a
   potentially blocking instruction which tests for the completion of the
   phase. Suspended thread resumes execution when the specified phase completes
   OR before the phase completes following a system-dependent time limit.

   The ``$phaseParity`` specifies either even phase (0) or odd phase (1) to
   wait.

   Example:

   .. code:: mlir

         nvgpu.mbarrier.try_wait.parity %barrier, %phaseParity, %ticks : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>


   .. py:attribute:: OPERATION_NAME
      :value: 'nvgpu.mbarrier.try_wait.parity'


   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)


   .. py:method:: barriers() -> _ods_ir


   .. py:method:: phaseParity() -> _ods_ir


   .. py:method:: ticks() -> _ods_ir


   .. py:method:: mbarId() -> _ods_ir


.. py:function:: mbarrier_try_wait_parity(barriers, phase_parity, ticks, mbar_id, *, loc=None, ip=None) -> MBarrierTryWaitParityOp

.. py:class:: MmaSparseSyncOp(res, matrixA, matrixB, matrixC, sparseMetadata, mmaShape, *, sparsitySelector=None, tf32Enabled=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`


   The ``nvgu.mma.sp.sync`` operation performs a warp-distributed MMA operation
   where operand A is "structured sparse". In this case, the ``matrixA`` operand
   represents the (warp-distributed) non-zero values of operand A, and the
   ``sparse_metadata`` operand provides the indices.

   The full description of the sparsity storage format and distribution scheme is
   described in the PTX docs. This operation is meant to follow the semantic
   described in the PTX documentation here:
   https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-for-sparse-mma

   The way the indices are distributed among the threads in a warp is controlled
   by the optional ``sparsity_selector`` operand, which is ``0`` by default. For
   more information, please consult the PTX documentation linked above.

   Example (targetingthe f16 16x8x32 ``mma.sp`` PTX instruction):

   .. code:: mlir

       nvgpu.mma.sp.sync (%a, %b, %c) metadata (%meta) {mmaShape = [16, 8, 32]} :
         (vector<4x2xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16>


   .. py:attribute:: OPERATION_NAME
      :value: 'nvgpu.mma.sp.sync'


   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)


   .. py:method:: matrixA() -> _ods_ir


   .. py:method:: matrixB() -> _ods_ir


   .. py:method:: matrixC() -> _ods_ir


   .. py:method:: sparseMetadata() -> _ods_ir


   .. py:method:: mmaShape() -> _ods_ir


   .. py:method:: sparsitySelector() -> _ods_ir


   .. py:method:: tf32Enabled() -> bool


   .. py:method:: res() -> _ods_ir


.. py:function:: mma_sp_sync(res, matrix_a, matrix_b, matrix_c, sparse_metadata, mma_shape, *, sparsity_selector=None, tf32_enabled=None, loc=None, ip=None) -> _ods_ir

.. py:class:: MmaSyncOp(res, matrixA, matrixB, matrixC, mmaShape, *, tf32Enabled=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`


   The ``nvgpu.mma.sync`` op represents the warp-level matrix-multiply-and-
   accumulate (mma) operation that is compatible with ``nvvm.mma.sync``.
   The operands and results vector sizes are thread-level onwership to
   the warp-level mma operation shape. ``mmaShape`` attribute holds the
   warp-level matrix-multiply shape.

   The ``nvgpu.mma.sync`` op serves as an intermediate point between lowering from
   ``vector.contract`` to ``nvvm.mma.sync``.

   This operation is meant to follow the semantic of described here:
   https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-mma

   Example:

   .. code:: mlir

       %res = nvgpu.mma.sync (%matrixA, %matrixB, %matrixC) {mmaShape = [16, 8, 16]} :
           (vector<4x2xf16>, vector<2x2xf16>, vector<2x2xf32>) -> vector<2x2xf32>


   .. py:attribute:: OPERATION_NAME
      :value: 'nvgpu.mma.sync'


   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)


   .. py:method:: matrixA() -> _ods_ir


   .. py:method:: matrixB() -> _ods_ir


   .. py:method:: matrixC() -> _ods_ir


   .. py:method:: mmaShape() -> _ods_ir


   .. py:method:: tf32Enabled() -> bool


   .. py:method:: res() -> _ods_ir


.. py:function:: mma_sync(res, matrix_a, matrix_b, matrix_c, mma_shape, *, tf32_enabled=None, loc=None, ip=None) -> _ods_ir

.. py:class:: RcpOp(in_, *, rounding=None, ftz=None, results=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`


   Reciprocal calculation for ``vector`` types using ``nvvm.rcp`` OPs.

   Currently, only the ``approx`` rounding mode and ``ftz`` are supported, and only for the ``f32`` type.

   The input and output must be of the same vector type and shape.


   .. py:attribute:: OPERATION_NAME
      :value: 'nvgpu.rcp'


   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)


   .. py:method:: in_() -> _ods_ir


   .. py:method:: rounding() -> _ods_ir


   .. py:method:: ftz() -> bool


   .. py:method:: out() -> _ods_ir


.. py:function:: rcp(in_, *, rounding=None, ftz=None, results=None, loc=None, ip=None) -> _ods_ir

.. py:class:: TmaAsyncLoadOp(dst, barriers, tensorMapDescriptor, coordinates, mbarId, *, multicastMask=None, predicate=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`


   The Op loads a tile memory region from global memory to shared memory by
   Tensor Memory Access (TMA).

   ``$tensorMapDescriptor`` is tensor map descriptor which has information about
   tile shape. The descriptor is created by ``nvgpu.tma.create.descriptor``

   The Op uses ``$barrier`` mbarrier based completion mechanism.


   .. py:attribute:: OPERATION_NAME
      :value: 'nvgpu.tma.async.load'


   .. py:attribute:: _ODS_OPERAND_SEGMENTS


   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)


   .. py:method:: dst() -> _ods_ir


   .. py:method:: barriers() -> _ods_ir


   .. py:method:: tensorMapDescriptor() -> _ods_ir


   .. py:method:: coordinates() -> _ods_ir


   .. py:method:: mbarId() -> _ods_ir


   .. py:method:: multicastMask() -> Optional[_ods_ir]


   .. py:method:: predicate() -> Optional[_ods_ir]


.. py:function:: tma_async_load(dst, barriers, tensor_map_descriptor, coordinates, mbar_id, *, multicast_mask=None, predicate=None, loc=None, ip=None) -> TmaAsyncLoadOp

.. py:class:: TmaAsyncStoreOp(src, tensorMapDescriptor, coordinates, *, predicate=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`


   The Op store a tile memory region from global memory to shared memory by
   Tensor Memory Access (TMA).

   ``$tensorMapDescriptor`` is tensor map descriptor which has information about
   tile shape. The descriptor is created by ``nvgpu.tma.create.descriptor``


   .. py:attribute:: OPERATION_NAME
      :value: 'nvgpu.tma.async.store'


   .. py:attribute:: _ODS_OPERAND_SEGMENTS


   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)


   .. py:method:: src() -> _ods_ir


   .. py:method:: tensorMapDescriptor() -> _ods_ir


   .. py:method:: coordinates() -> _ods_ir


   .. py:method:: predicate() -> Optional[_ods_ir]


.. py:function:: tma_async_store(src, tensor_map_descriptor, coordinates, *, predicate=None, loc=None, ip=None) -> TmaAsyncStoreOp

.. py:class:: TmaCreateDescriptorOp(tensorMap, tensor, boxDimensions, *, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`


   The Op creates a tensor map descriptor object representing tiled memory
   region. To do that it calls CUDA Driver's ``cuTensorMapEncodeTiled``. The
   descriptor is used by Tensor Memory Access (TMA).

   The ``tensor`` is the source tensor to be tiled.

   The ``boxDimensions`` is the size of the tiled memory region in each dimension.

   For more information see below:
   https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TENSOR__MEMORY.html


   .. py:attribute:: OPERATION_NAME
      :value: 'nvgpu.tma.create.descriptor'


   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)


   .. py:method:: tensor() -> _ods_ir


   .. py:method:: boxDimensions() -> _ods_ir


   .. py:method:: tensorMap() -> _ods_ir


.. py:function:: tma_create_descriptor(tensor_map, tensor, box_dimensions, *, loc=None, ip=None) -> _ods_ir

.. py:class:: TmaFenceOp(tensorMapDescriptor, *, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`


   The Op fences the given ``$tmaDescriptor``. This is necessary if the tensor map
   descriptor was modified from the host using cudaMemcpy. In this case, the
   kernel needs a fence after which it is safe to use ``tensor.map``.


   .. py:attribute:: OPERATION_NAME
      :value: 'nvgpu.tma.fence.descriptor'


   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)


   .. py:method:: tensorMapDescriptor() -> _ods_ir


.. py:function:: tma_fence_descriptor(tensor_map_descriptor, *, loc=None, ip=None) -> TmaFenceOp

.. py:class:: TmaPrefetchOp(tensorMapDescriptor, *, predicate=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`


   The Op brings the cache line containing the given ``$tmaDescriptor`` for
   subsequent use by the ``tma.async.load`` instruction.


   .. py:attribute:: OPERATION_NAME
      :value: 'nvgpu.tma.prefetch.descriptor'


   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)


   .. py:method:: tensorMapDescriptor() -> _ods_ir


   .. py:method:: predicate() -> Optional[_ods_ir]


.. py:function:: tma_prefetch_descriptor(tensor_map_descriptor, *, predicate=None, loc=None, ip=None) -> TmaPrefetchOp

.. py:class:: WarpgroupGenerateDescriptorOp(descriptor, tensor, tensorMap, *, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`


   This Op builds a ``nvgpu.warpgroup.descriptor`` that is used by
   ``nvgpu.warpgroup.mma`` to perform warpgroup-level matrix multiply and
   accumulate.

   The descriptor specifies the properties of the matrix in shared memory that
   is a multiplicand in the matrix multiply and accumulate operation.


   .. py:attribute:: OPERATION_NAME
      :value: 'nvgpu.warpgroup.generate.descriptor'


   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)


   .. py:method:: tensor() -> _ods_ir


   .. py:method:: tensorMap() -> _ods_ir


   .. py:method:: descriptor() -> _ods_ir


.. py:function:: warpgroup_generate_descriptor(descriptor, tensor, tensor_map, *, loc=None, ip=None) -> _ods_ir

.. py:class:: WarpgroupMmaInitAccumulatorOp(matrixC, *, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`


   This Op generates and initializes the accumulator matrix for
   ``nvgpu.warpgroup.mma`` op to perform matrix-multiply-and-accumulate.


   .. py:attribute:: OPERATION_NAME
      :value: 'nvgpu.warpgroup.mma.init.accumulator'


   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)


   .. py:method:: matrixC() -> _ods_ir


.. py:function:: warpgroup_mma_init_accumulator(matrix_c, *, loc=None, ip=None) -> _ods_ir

.. py:class:: WarpgroupMmaOp(matrixD, descriptorA, descriptorB, matrixC, *, waitGroup=None, transposeA=None, transposeB=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`


   The ``nvgpu.warpgroup.mma`` op performs the warpgroup-level (4 warps)
   matrix-multiply-and-accumulate (mma) operation that results in
   ``nvvm.wgmma.mma_async``.

   The operands are ``descriptorA`` and ``descriptorB`` that are wgmma matrix
   descriptors that shows the properties of the matrix in shared memory. The
   results are thread-level ownership to the warpgroup-level mma operation
   shape. The shape is deduced from the descriptor types and output vector.

   The Op encapsulates multiple ``nvvm.wgmma.mma_async`` operations to complete
   the given shape. As ``nvvm.wgmma.async`` Op, or its corresponding PTX
   instruction, is asynchronous, this Op groups the ``nvvm.wgmma.async`` and
   surrounds them between ``wgmma.fence.aligned`` and
   ``wgmma.commit.group.sync.aligned``, ``wgmma.wait.group.sync.aligned`` Ops.

   Example:

   .. code:: mlir

         %r1,%r2 = nvgpu.warpgroup.mma %descA, %descB, %acc1, %acc2: 
                    !nvgpu.warpgroup.descriptor<tensor = memref<128x64xf16, 3>>, 
                    !nvgpu.warpgroup.descriptor<tensor = memref<64x128xf16, 3>>, 
                    !nvgpu.warpgroup.accumulator<fragmented = vector<64x128xf32>>,
                    !nvgpu.warpgroup.accumulator<fragmented = vector<64x128xf32>>
                    -> 
                    !nvgpu.warpgroup.accumulator<fragmented = vector<64x128xf32>>,
                    !nvgpu.warpgroup.accumulator<fragmented = vector<64x128xf32>>


   .. py:attribute:: OPERATION_NAME
      :value: 'nvgpu.warpgroup.mma'


   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)


   .. py:method:: descriptorA() -> _ods_ir


   .. py:method:: descriptorB() -> _ods_ir


   .. py:method:: matrixC() -> _ods_ir


   .. py:method:: waitGroup() -> Optional[_ods_ir]


   .. py:method:: transposeA() -> bool


   .. py:method:: transposeB() -> bool


   .. py:method:: matrixD() -> _ods_ir


.. py:function:: warpgroup_mma(matrix_d, descriptor_a, descriptor_b, matrix_c, *, wait_group=None, transpose_a=None, transpose_b=None, loc=None, ip=None) -> _ods_ir

.. py:class:: WarpgroupMmaStoreOp(matrixD, dstMemref, *, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`


   The ``nvgpu.warpgroup.mma.store`` op performs the store of fragmented result
   in $matrixD to given memref.

   [See the details of register fragment layout for accumulator matrix D]
   (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#wgmma-64n16-d)

   Note that, the op must be run with warp group.


   .. py:attribute:: OPERATION_NAME
      :value: 'nvgpu.warpgroup.mma.store'


   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)


   .. py:method:: matrixD() -> _ods_ir


   .. py:method:: dstMemref() -> _ods_ir


.. py:function:: warpgroup_mma_store(matrix_d, dst_memref, *, loc=None, ip=None) -> WarpgroupMmaStoreOp