mlir.dialects._nvvm_ops_gen =========================== .. py:module:: mlir.dialects._nvvm_ops_gen Attributes ---------- .. autoapisummary:: mlir.dialects._nvvm_ops_gen._ods_ir Classes ------- .. autoapisummary:: mlir.dialects._nvvm_ops_gen._Dialect mlir.dialects._nvvm_ops_gen.Barrier0Op mlir.dialects._nvvm_ops_gen.BarrierArriveOp mlir.dialects._nvvm_ops_gen.BarrierOp mlir.dialects._nvvm_ops_gen.BlockDimXOp mlir.dialects._nvvm_ops_gen.BlockDimYOp mlir.dialects._nvvm_ops_gen.BlockDimZOp mlir.dialects._nvvm_ops_gen.BlockIdXOp mlir.dialects._nvvm_ops_gen.BlockIdYOp mlir.dialects._nvvm_ops_gen.BlockIdZOp mlir.dialects._nvvm_ops_gen.BlockInClusterIdXOp mlir.dialects._nvvm_ops_gen.BlockInClusterIdYOp mlir.dialects._nvvm_ops_gen.BlockInClusterIdZOp mlir.dialects._nvvm_ops_gen.Breakpoint mlir.dialects._nvvm_ops_gen.BulkStoreOp mlir.dialects._nvvm_ops_gen.Clock64Op mlir.dialects._nvvm_ops_gen.ClockOp mlir.dialects._nvvm_ops_gen.ClusterArriveOp mlir.dialects._nvvm_ops_gen.ClusterArriveRelaxedOp mlir.dialects._nvvm_ops_gen.ClusterDim mlir.dialects._nvvm_ops_gen.ClusterDimBlocksXOp mlir.dialects._nvvm_ops_gen.ClusterDimBlocksYOp mlir.dialects._nvvm_ops_gen.ClusterDimBlocksZOp mlir.dialects._nvvm_ops_gen.ClusterDimXOp mlir.dialects._nvvm_ops_gen.ClusterDimYOp mlir.dialects._nvvm_ops_gen.ClusterDimZOp mlir.dialects._nvvm_ops_gen.ClusterId mlir.dialects._nvvm_ops_gen.ClusterIdXOp mlir.dialects._nvvm_ops_gen.ClusterIdYOp mlir.dialects._nvvm_ops_gen.ClusterIdZOp mlir.dialects._nvvm_ops_gen.ClusterLaunchControlQueryCancelOp mlir.dialects._nvvm_ops_gen.ClusterLaunchControlTryCancelOp mlir.dialects._nvvm_ops_gen.ClusterWaitOp mlir.dialects._nvvm_ops_gen.ConvertBF16x2ToF8x2Op mlir.dialects._nvvm_ops_gen.ConvertF4x2ToF16x2Op mlir.dialects._nvvm_ops_gen.ConvertF6x2ToF16x2Op mlir.dialects._nvvm_ops_gen.ConvertF8x2ToBF16x2Op mlir.dialects._nvvm_ops_gen.ConvertF8x2ToF16x2Op mlir.dialects._nvvm_ops_gen.ConvertF16x2ToF8x2Op mlir.dialects._nvvm_ops_gen.ConvertF32x2ToBF16x2Op mlir.dialects._nvvm_ops_gen.ConvertF32x2ToF4x2Op mlir.dialects._nvvm_ops_gen.ConvertF32x2ToF6x2Op mlir.dialects._nvvm_ops_gen.ConvertF32x2ToF8x2Op mlir.dialects._nvvm_ops_gen.ConvertF32x2ToF16x2Op mlir.dialects._nvvm_ops_gen.ConvertF32x4ToF4x4Op mlir.dialects._nvvm_ops_gen.ConvertF32x4ToF6x4Op mlir.dialects._nvvm_ops_gen.ConvertF32x4ToF8x4Op mlir.dialects._nvvm_ops_gen.ConvertFloatToTF32Op mlir.dialects._nvvm_ops_gen.CpAsyncBulkCommitGroupOp mlir.dialects._nvvm_ops_gen.CpAsyncBulkGlobalToSharedClusterOp mlir.dialects._nvvm_ops_gen.CpAsyncBulkPrefetchOp mlir.dialects._nvvm_ops_gen.CpAsyncBulkSharedCTAToGlobalOp mlir.dialects._nvvm_ops_gen.CpAsyncBulkSharedCTAToSharedClusterOp mlir.dialects._nvvm_ops_gen.CpAsyncBulkTensorGlobalToSharedClusterOp mlir.dialects._nvvm_ops_gen.CpAsyncBulkTensorPrefetchOp mlir.dialects._nvvm_ops_gen.CpAsyncBulkTensorReduceOp mlir.dialects._nvvm_ops_gen.CpAsyncBulkTensorSharedCTAToGlobalOp mlir.dialects._nvvm_ops_gen.CpAsyncBulkWaitGroupOp mlir.dialects._nvvm_ops_gen.CpAsyncCommitGroupOp mlir.dialects._nvvm_ops_gen.CpAsyncMBarrierArriveOp mlir.dialects._nvvm_ops_gen.CpAsyncOp mlir.dialects._nvvm_ops_gen.CpAsyncWaitGroupOp mlir.dialects._nvvm_ops_gen.DotAccumulate2WayOp mlir.dialects._nvvm_ops_gen.DotAccumulate4WayOp mlir.dialects._nvvm_ops_gen.ElectSyncOp mlir.dialects._nvvm_ops_gen.EnvReg0Op mlir.dialects._nvvm_ops_gen.EnvReg1Op mlir.dialects._nvvm_ops_gen.EnvReg2Op mlir.dialects._nvvm_ops_gen.EnvReg3Op mlir.dialects._nvvm_ops_gen.EnvReg4Op mlir.dialects._nvvm_ops_gen.EnvReg5Op mlir.dialects._nvvm_ops_gen.EnvReg6Op 
mlir.dialects._nvvm_ops_gen.EnvReg7Op mlir.dialects._nvvm_ops_gen.EnvReg8Op mlir.dialects._nvvm_ops_gen.EnvReg9Op mlir.dialects._nvvm_ops_gen.EnvReg10Op mlir.dialects._nvvm_ops_gen.EnvReg11Op mlir.dialects._nvvm_ops_gen.EnvReg12Op mlir.dialects._nvvm_ops_gen.EnvReg13Op mlir.dialects._nvvm_ops_gen.EnvReg14Op mlir.dialects._nvvm_ops_gen.EnvReg15Op mlir.dialects._nvvm_ops_gen.EnvReg16Op mlir.dialects._nvvm_ops_gen.EnvReg17Op mlir.dialects._nvvm_ops_gen.EnvReg18Op mlir.dialects._nvvm_ops_gen.EnvReg19Op mlir.dialects._nvvm_ops_gen.EnvReg20Op mlir.dialects._nvvm_ops_gen.EnvReg21Op mlir.dialects._nvvm_ops_gen.EnvReg22Op mlir.dialects._nvvm_ops_gen.EnvReg23Op mlir.dialects._nvvm_ops_gen.EnvReg24Op mlir.dialects._nvvm_ops_gen.EnvReg25Op mlir.dialects._nvvm_ops_gen.EnvReg26Op mlir.dialects._nvvm_ops_gen.EnvReg27Op mlir.dialects._nvvm_ops_gen.EnvReg28Op mlir.dialects._nvvm_ops_gen.EnvReg29Op mlir.dialects._nvvm_ops_gen.EnvReg30Op mlir.dialects._nvvm_ops_gen.EnvReg31Op mlir.dialects._nvvm_ops_gen.Exit mlir.dialects._nvvm_ops_gen.FenceMbarrierInitOp mlir.dialects._nvvm_ops_gen.FenceProxyAcquireOp mlir.dialects._nvvm_ops_gen.FenceProxyOp mlir.dialects._nvvm_ops_gen.FenceProxyReleaseOp mlir.dialects._nvvm_ops_gen.FenceScClusterOp mlir.dialects._nvvm_ops_gen.GlobalTimerLoOp mlir.dialects._nvvm_ops_gen.GlobalTimerOp mlir.dialects._nvvm_ops_gen.GridDimXOp mlir.dialects._nvvm_ops_gen.GridDimYOp mlir.dialects._nvvm_ops_gen.GridDimZOp mlir.dialects._nvvm_ops_gen.GridIdOp mlir.dialects._nvvm_ops_gen.GriddepcontrolOp mlir.dialects._nvvm_ops_gen.InlinePtxOp mlir.dialects._nvvm_ops_gen.LaneIdOp mlir.dialects._nvvm_ops_gen.LaneMaskEqOp mlir.dialects._nvvm_ops_gen.LaneMaskGeOp mlir.dialects._nvvm_ops_gen.LaneMaskGtOp mlir.dialects._nvvm_ops_gen.LaneMaskLeOp mlir.dialects._nvvm_ops_gen.LaneMaskLtOp mlir.dialects._nvvm_ops_gen.LdMatrixOp mlir.dialects._nvvm_ops_gen.MBarrierArriveExpectTxOp mlir.dialects._nvvm_ops_gen.MBarrierArriveNocompleteOp mlir.dialects._nvvm_ops_gen.MBarrierArriveOp mlir.dialects._nvvm_ops_gen.MBarrierInitOp mlir.dialects._nvvm_ops_gen.MBarrierInvalOp mlir.dialects._nvvm_ops_gen.MBarrierTestWaitOp mlir.dialects._nvvm_ops_gen.MBarrierTryWaitParityOp mlir.dialects._nvvm_ops_gen.MapaOp mlir.dialects._nvvm_ops_gen.MatchSyncOp mlir.dialects._nvvm_ops_gen.MembarOp mlir.dialects._nvvm_ops_gen.MmaOp mlir.dialects._nvvm_ops_gen.NanosleepOp mlir.dialects._nvvm_ops_gen.PMEventOp mlir.dialects._nvvm_ops_gen.PrefetchOp mlir.dialects._nvvm_ops_gen.RcpApproxFtzF32Op mlir.dialects._nvvm_ops_gen.ReduxOp mlir.dialects._nvvm_ops_gen.SetMaxRegisterOp mlir.dialects._nvvm_ops_gen.ShflOp mlir.dialects._nvvm_ops_gen.SmDimOp mlir.dialects._nvvm_ops_gen.SmIdOp mlir.dialects._nvvm_ops_gen.StMatrixOp mlir.dialects._nvvm_ops_gen.SyncWarpOp mlir.dialects._nvvm_ops_gen.Tcgen05AllocOp mlir.dialects._nvvm_ops_gen.Tcgen05CommitOp mlir.dialects._nvvm_ops_gen.Tcgen05CpOp mlir.dialects._nvvm_ops_gen.Tcgen05DeallocOp mlir.dialects._nvvm_ops_gen.Tcgen05FenceOp mlir.dialects._nvvm_ops_gen.Tcgen05LdOp mlir.dialects._nvvm_ops_gen.Tcgen05MmaSmemDescOp mlir.dialects._nvvm_ops_gen.Tcgen05RelinquishAllocPermitOp mlir.dialects._nvvm_ops_gen.Tcgen05ShiftOp mlir.dialects._nvvm_ops_gen.Tcgen05StOp mlir.dialects._nvvm_ops_gen.Tcgen05WaitOp mlir.dialects._nvvm_ops_gen.ThreadIdXOp mlir.dialects._nvvm_ops_gen.ThreadIdYOp mlir.dialects._nvvm_ops_gen.ThreadIdZOp mlir.dialects._nvvm_ops_gen.VoteSyncOp mlir.dialects._nvvm_ops_gen.WMMALoadOp mlir.dialects._nvvm_ops_gen.WMMAMmaOp mlir.dialects._nvvm_ops_gen.WMMAStoreOp 
mlir.dialects._nvvm_ops_gen.WarpDimOp mlir.dialects._nvvm_ops_gen.WarpIdOp mlir.dialects._nvvm_ops_gen.WarpSizeOp mlir.dialects._nvvm_ops_gen.WgmmaFenceAlignedOp mlir.dialects._nvvm_ops_gen.WgmmaGroupSyncAlignedOp mlir.dialects._nvvm_ops_gen.WgmmaMmaAsyncOp mlir.dialects._nvvm_ops_gen.WgmmaWaitGroupSyncOp Functions --------- .. autoapisummary:: mlir.dialects._nvvm_ops_gen.barrier0 mlir.dialects._nvvm_ops_gen.barrier_arrive mlir.dialects._nvvm_ops_gen.barrier mlir.dialects._nvvm_ops_gen.read_ptx_sreg_ntid_x mlir.dialects._nvvm_ops_gen.read_ptx_sreg_ntid_y mlir.dialects._nvvm_ops_gen.read_ptx_sreg_ntid_z mlir.dialects._nvvm_ops_gen.read_ptx_sreg_ctaid_x mlir.dialects._nvvm_ops_gen.read_ptx_sreg_ctaid_y mlir.dialects._nvvm_ops_gen.read_ptx_sreg_ctaid_z mlir.dialects._nvvm_ops_gen.read_ptx_sreg_cluster_ctaid_x mlir.dialects._nvvm_ops_gen.read_ptx_sreg_cluster_ctaid_y mlir.dialects._nvvm_ops_gen.read_ptx_sreg_cluster_ctaid_z mlir.dialects._nvvm_ops_gen.breakpoint mlir.dialects._nvvm_ops_gen.st_bulk mlir.dialects._nvvm_ops_gen.read_ptx_sreg_clock64 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_clock mlir.dialects._nvvm_ops_gen.cluster_arrive mlir.dialects._nvvm_ops_gen.cluster_arrive_relaxed mlir.dialects._nvvm_ops_gen.read_ptx_sreg_cluster_nctarank mlir.dialects._nvvm_ops_gen.read_ptx_sreg_cluster_nctaid_x mlir.dialects._nvvm_ops_gen.read_ptx_sreg_cluster_nctaid_y mlir.dialects._nvvm_ops_gen.read_ptx_sreg_cluster_nctaid_z mlir.dialects._nvvm_ops_gen.read_ptx_sreg_nclusterid_x mlir.dialects._nvvm_ops_gen.read_ptx_sreg_nclusterid_y mlir.dialects._nvvm_ops_gen.read_ptx_sreg_nclusterid_z mlir.dialects._nvvm_ops_gen.read_ptx_sreg_cluster_ctarank mlir.dialects._nvvm_ops_gen.read_ptx_sreg_clusterid_x mlir.dialects._nvvm_ops_gen.read_ptx_sreg_clusterid_y mlir.dialects._nvvm_ops_gen.read_ptx_sreg_clusterid_z mlir.dialects._nvvm_ops_gen.clusterlaunchcontrol_query_cancel mlir.dialects._nvvm_ops_gen.clusterlaunchcontrol_try_cancel mlir.dialects._nvvm_ops_gen.cluster_wait mlir.dialects._nvvm_ops_gen.convert_bf16x2_to_f8x2 mlir.dialects._nvvm_ops_gen.convert_f4x2_to_f16x2 mlir.dialects._nvvm_ops_gen.convert_f6x2_to_f16x2 mlir.dialects._nvvm_ops_gen.convert_f8x2_to_bf16x2 mlir.dialects._nvvm_ops_gen.convert_f8x2_to_f16x2 mlir.dialects._nvvm_ops_gen.convert_f16x2_to_f8x2 mlir.dialects._nvvm_ops_gen.convert_f32x2_to_bf16x2 mlir.dialects._nvvm_ops_gen.convert_f32x2_to_f4x2 mlir.dialects._nvvm_ops_gen.convert_f32x2_to_f6x2 mlir.dialects._nvvm_ops_gen.convert_f32x2_to_f8x2 mlir.dialects._nvvm_ops_gen.convert_f32x2_to_f16x2 mlir.dialects._nvvm_ops_gen.convert_f32x4_to_f4x4 mlir.dialects._nvvm_ops_gen.convert_f32x4_to_f6x4 mlir.dialects._nvvm_ops_gen.convert_f32x4_to_f8x4 mlir.dialects._nvvm_ops_gen.convert_float_to_tf32 mlir.dialects._nvvm_ops_gen.cp_async_bulk_commit_group mlir.dialects._nvvm_ops_gen.cp_async_bulk_shared_cluster_global mlir.dialects._nvvm_ops_gen.cp_async_bulk_prefetch mlir.dialects._nvvm_ops_gen.cp_async_bulk_global_shared_cta mlir.dialects._nvvm_ops_gen.cp_async_bulk_shared_cluster_shared_cta mlir.dialects._nvvm_ops_gen.cp_async_bulk_tensor_shared_cluster_global mlir.dialects._nvvm_ops_gen.cp_async_bulk_tensor_prefetch mlir.dialects._nvvm_ops_gen.cp_async_bulk_tensor_reduce mlir.dialects._nvvm_ops_gen.cp_async_bulk_tensor_global_shared_cta mlir.dialects._nvvm_ops_gen.cp_async_bulk_wait_group mlir.dialects._nvvm_ops_gen.cp_async_commit_group mlir.dialects._nvvm_ops_gen.cp_async_mbarrier_arrive mlir.dialects._nvvm_ops_gen.cp_async_shared_global mlir.dialects._nvvm_ops_gen.cp_async_wait_group 
mlir.dialects._nvvm_ops_gen.dot_accumulate_2way mlir.dialects._nvvm_ops_gen.dot_accumulate_4way mlir.dialects._nvvm_ops_gen.elect_sync mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg0 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg1 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg2 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg3 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg4 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg5 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg6 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg7 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg8 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg9 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg10 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg11 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg12 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg13 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg14 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg15 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg16 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg17 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg18 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg19 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg20 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg21 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg22 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg23 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg24 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg25 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg26 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg27 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg28 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg29 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg30 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg31 mlir.dialects._nvvm_ops_gen.exit mlir.dialects._nvvm_ops_gen.fence_mbarrier_init mlir.dialects._nvvm_ops_gen.fence_proxy_acquire mlir.dialects._nvvm_ops_gen.fence_proxy mlir.dialects._nvvm_ops_gen.fence_proxy_release mlir.dialects._nvvm_ops_gen.fence_sc_cluster mlir.dialects._nvvm_ops_gen.read_ptx_sreg_globaltimer_lo mlir.dialects._nvvm_ops_gen.read_ptx_sreg_globaltimer mlir.dialects._nvvm_ops_gen.read_ptx_sreg_nctaid_x mlir.dialects._nvvm_ops_gen.read_ptx_sreg_nctaid_y mlir.dialects._nvvm_ops_gen.read_ptx_sreg_nctaid_z mlir.dialects._nvvm_ops_gen.read_ptx_sreg_gridid mlir.dialects._nvvm_ops_gen.griddepcontrol mlir.dialects._nvvm_ops_gen.inline_ptx mlir.dialects._nvvm_ops_gen.read_ptx_sreg_laneid mlir.dialects._nvvm_ops_gen.read_ptx_sreg_lanemask_eq mlir.dialects._nvvm_ops_gen.read_ptx_sreg_lanemask_ge mlir.dialects._nvvm_ops_gen.read_ptx_sreg_lanemask_gt mlir.dialects._nvvm_ops_gen.read_ptx_sreg_lanemask_le mlir.dialects._nvvm_ops_gen.read_ptx_sreg_lanemask_lt mlir.dialects._nvvm_ops_gen.ldmatrix mlir.dialects._nvvm_ops_gen.mbarrier_arrive_expect_tx mlir.dialects._nvvm_ops_gen.mbarrier_arrive_nocomplete mlir.dialects._nvvm_ops_gen.mbarrier_arrive mlir.dialects._nvvm_ops_gen.mbarrier_init mlir.dialects._nvvm_ops_gen.mbarrier_inval mlir.dialects._nvvm_ops_gen.mbarrier_test_wait mlir.dialects._nvvm_ops_gen.mbarrier_try_wait_parity mlir.dialects._nvvm_ops_gen.mapa mlir.dialects._nvvm_ops_gen.match_sync mlir.dialects._nvvm_ops_gen.memory_barrier mlir.dialects._nvvm_ops_gen.mma_sync mlir.dialects._nvvm_ops_gen.nanosleep mlir.dialects._nvvm_ops_gen.pmevent mlir.dialects._nvvm_ops_gen.prefetch mlir.dialects._nvvm_ops_gen.rcp_approx_ftz_f mlir.dialects._nvvm_ops_gen.redux_sync mlir.dialects._nvvm_ops_gen.setmaxregister mlir.dialects._nvvm_ops_gen.shfl_sync 
mlir.dialects._nvvm_ops_gen.read_ptx_sreg_nsmid
mlir.dialects._nvvm_ops_gen.read_ptx_sreg_smid
mlir.dialects._nvvm_ops_gen.stmatrix
mlir.dialects._nvvm_ops_gen.bar_warp_sync
mlir.dialects._nvvm_ops_gen.tcgen05_alloc
mlir.dialects._nvvm_ops_gen.tcgen05_commit
mlir.dialects._nvvm_ops_gen.tcgen05_cp
mlir.dialects._nvvm_ops_gen.tcgen05_dealloc
mlir.dialects._nvvm_ops_gen.tcgen05_fence
mlir.dialects._nvvm_ops_gen.tcgen05_ld
mlir.dialects._nvvm_ops_gen.tcgen05_mma_smem_desc
mlir.dialects._nvvm_ops_gen.tcgen05_relinquish_alloc_permit
mlir.dialects._nvvm_ops_gen.tcgen05_shift
mlir.dialects._nvvm_ops_gen.tcgen05_st
mlir.dialects._nvvm_ops_gen.tcgen05_wait
mlir.dialects._nvvm_ops_gen.read_ptx_sreg_tid_x
mlir.dialects._nvvm_ops_gen.read_ptx_sreg_tid_y
mlir.dialects._nvvm_ops_gen.read_ptx_sreg_tid_z
mlir.dialects._nvvm_ops_gen.vote_sync
mlir.dialects._nvvm_ops_gen.wmma_load
mlir.dialects._nvvm_ops_gen.wmma_mma
mlir.dialects._nvvm_ops_gen.wmma_store
mlir.dialects._nvvm_ops_gen.read_ptx_sreg_nwarpid
mlir.dialects._nvvm_ops_gen.read_ptx_sreg_warpid
mlir.dialects._nvvm_ops_gen.read_ptx_sreg_warpsize
mlir.dialects._nvvm_ops_gen.wgmma_fence_aligned
mlir.dialects._nvvm_ops_gen.wgmma_commit_group_sync_aligned
mlir.dialects._nvvm_ops_gen.wgmma_mma_async
mlir.dialects._nvvm_ops_gen.wgmma_wait_group_sync_aligned

Module Contents
---------------

.. py:data:: _ods_ir

.. py:class:: _Dialect(descriptor: object)

   Bases: :py:obj:`_ods_ir`

   .. py:attribute:: DIALECT_NAMESPACE
      :value: 'nvvm'

.. py:class:: Barrier0Op(*, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   The ``nvvm.barrier0`` operation is a convenience operation that performs barrier synchronization and communication within a CTA (Cooperative Thread Array) using barrier ID 0. It is functionally equivalent to ``nvvm.barrier`` or ``nvvm.barrier id=0``.

   For more information, see the PTX ISA.

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.barrier0'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

.. py:function:: barrier0(*, loc=None, ip=None) -> Barrier0Op

.. py:class:: BarrierArriveOp(numberOfThreads, *, barrierId=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   Threads that execute this op announce their arrival at the barrier with the given id and continue their execution. When ``barrierId`` is not present, the default barrier id 0 is used, which matches the behavior of the ``nvvm.barrier`` Op.

   For more information, see the PTX ISA.

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.barrier.arrive'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: barrierId() -> Optional[_ods_ir]

   .. py:method:: numberOfThreads() -> _ods_ir

.. py:function:: barrier_arrive(number_of_threads, *, barrier_id=None, loc=None, ip=None) -> BarrierArriveOp

.. py:class:: BarrierOp(res, *, barrierId=None, numberOfThreads=None, reductionOp=None, reductionPredicate=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   The ``nvvm.barrier`` operation performs barrier synchronization and communication within a CTA (Cooperative Thread Array). It causes executing threads to wait for all non-exited threads participating in the barrier to arrive.

   The operation takes the following optional operands:

   * ``barrierId``: Specifies a logical barrier resource with value 0 through 15. Each CTA instance has sixteen barriers numbered 0..15. Defaults to 0 if not specified.
   * ``numberOfThreads``: Specifies the number of threads participating in the barrier. When specified, the value must be a multiple of the warp size. If not specified, all threads in the CTA participate in the barrier.
* ``reductionOp``: specifies the reduction operation (``popc``, ``and``, ``or``). * ``reductionPredicate``: specifies the predicate to be used with the ``reductionOp``. The barrier operation guarantees that when the barrier completes, prior memory accesses requested by participating threads are performed relative to all threads participating in the barrier. It also ensures that no new memory access is requested by participating threads before the barrier completes. When a barrier completes, the waiting threads are restarted without delay, and the barrier is reinitialized so that it can be immediately reused. This operation generates an aligned barrier, indicating that all threads in the CTA will execute the same barrier instruction. Behavior is undefined if all threads in the CTA do not reach this instruction. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.barrier' .. py:attribute:: _ODS_OPERAND_SEGMENTS :value: [0, 0, 0] .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: barrierId() -> Optional[_ods_ir] .. py:method:: numberOfThreads() -> Optional[_ods_ir] .. py:method:: reductionPredicate() -> Optional[_ods_ir] .. py:method:: reductionOp() -> Optional[_ods_ir] .. py:method:: res() -> Optional[_ods_ir] .. py:function:: barrier(res, *, barrier_id=None, number_of_threads=None, reduction_op=None, reduction_predicate=None, loc=None, ip=None) -> Union[_ods_ir, _ods_ir, BarrierOp] .. py:class:: BlockDimXOp(res, *, range=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.ntid.x' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: range() -> Optional[_ods_ir] .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_ntid_x(res, *, range=None, loc=None, ip=None) -> _ods_ir .. py:class:: BlockDimYOp(res, *, range=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.ntid.y' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: range() -> Optional[_ods_ir] .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_ntid_y(res, *, range=None, loc=None, ip=None) -> _ods_ir .. py:class:: BlockDimZOp(res, *, range=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.ntid.z' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: range() -> Optional[_ods_ir] .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_ntid_z(res, *, range=None, loc=None, ip=None) -> _ods_ir .. py:class:: BlockIdXOp(res, *, range=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.ctaid.x' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: range() -> Optional[_ods_ir] .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_ctaid_x(res, *, range=None, loc=None, ip=None) -> _ods_ir .. py:class:: BlockIdYOp(res, *, range=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.ctaid.y' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: range() -> Optional[_ods_ir] .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_ctaid_y(res, *, range=None, loc=None, ip=None) -> _ods_ir .. py:class:: BlockIdZOp(res, *, range=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.ctaid.z' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. 
py:method:: range() -> Optional[_ods_ir] .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_ctaid_z(res, *, range=None, loc=None, ip=None) -> _ods_ir .. py:class:: BlockInClusterIdXOp(res, *, range=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.cluster.ctaid.x' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: range() -> Optional[_ods_ir] .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_cluster_ctaid_x(res, *, range=None, loc=None, ip=None) -> _ods_ir .. py:class:: BlockInClusterIdYOp(res, *, range=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.cluster.ctaid.y' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: range() -> Optional[_ods_ir] .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_cluster_ctaid_y(res, *, range=None, loc=None, ip=None) -> _ods_ir .. py:class:: BlockInClusterIdZOp(res, *, range=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.cluster.ctaid.z' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: range() -> Optional[_ods_ir] .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_cluster_ctaid_z(res, *, range=None, loc=None, ip=None) -> _ods_ir .. py:class:: Breakpoint(*, loc=None, ip=None) Bases: :py:obj:`_ods_ir` Breakpoint suspends execution of the program for debugging. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.breakpoint' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:function:: breakpoint(*, loc=None, ip=None) -> Breakpoint .. py:class:: BulkStoreOp(addr, size, *, initVal=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` Initializes a region of shared memory at the address given by ``addr``. The ``size`` operand specifies the number of bytes to initialize and must be a multiple of 8. The ``initVal`` operand specifies the value to initialize the memory to. The only supported value is 0. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.st.bulk' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: addr() -> _ods_ir .. py:method:: size() -> _ods_ir .. py:method:: initVal() -> _ods_ir .. py:function:: st_bulk(addr, size, *, init_val=None, loc=None, ip=None) -> BulkStoreOp .. py:class:: Clock64Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.clock64' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_clock64(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: ClockOp(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.clock' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_clock(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: ClusterArriveOp(*, aligned=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` The ``cluster.arrive`` can be used by the threads within the cluster for synchronization and communication. The ``cluster.arrive`` instruction marks the warps' arrival at the barrier without causing the executing thread to wait for other participating threads. The ``aligned`` attribute, when provided, generates the .aligned version of the PTX instruction. `For more information, see PTX ISA `_ .. 
py:attribute:: OPERATION_NAME :value: 'nvvm.cluster.arrive' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: aligned() -> bool .. py:function:: cluster_arrive(*, aligned=None, loc=None, ip=None) -> ClusterArriveOp .. py:class:: ClusterArriveRelaxedOp(*, aligned=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` The ``cluster.arrive`` can be used by the threads within the cluster for synchronization and communication. The ``cluster.arrive`` instruction marks the warps' arrival at the barrier without causing the executing thread to wait for other participating threads. The ``aligned`` attribute, when provided, generates the .aligned version of the PTX instruction. The .relaxed qualifier on ``cluster.arrive`` specifies that there are no memory ordering and visibility guarantees provided for the memory accesses performed prior to ``cluster.arrive``. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.cluster.arrive.relaxed' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: aligned() -> bool .. py:function:: cluster_arrive_relaxed(*, aligned=None, loc=None, ip=None) -> ClusterArriveRelaxedOp .. py:class:: ClusterDim(res, *, range=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.cluster.nctarank' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: range() -> Optional[_ods_ir] .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_cluster_nctarank(res, *, range=None, loc=None, ip=None) -> _ods_ir .. py:class:: ClusterDimBlocksXOp(res, *, range=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.cluster.nctaid.x' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: range() -> Optional[_ods_ir] .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_cluster_nctaid_x(res, *, range=None, loc=None, ip=None) -> _ods_ir .. py:class:: ClusterDimBlocksYOp(res, *, range=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.cluster.nctaid.y' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: range() -> Optional[_ods_ir] .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_cluster_nctaid_y(res, *, range=None, loc=None, ip=None) -> _ods_ir .. py:class:: ClusterDimBlocksZOp(res, *, range=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.cluster.nctaid.z' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: range() -> Optional[_ods_ir] .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_cluster_nctaid_z(res, *, range=None, loc=None, ip=None) -> _ods_ir .. py:class:: ClusterDimXOp(res, *, range=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.nclusterid.x' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: range() -> Optional[_ods_ir] .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_nclusterid_x(res, *, range=None, loc=None, ip=None) -> _ods_ir .. py:class:: ClusterDimYOp(res, *, range=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.nclusterid.y' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: range() -> Optional[_ods_ir] .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_nclusterid_y(res, *, range=None, loc=None, ip=None) -> _ods_ir .. 
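The special-register classes above and below all share one builder pattern: the generated module-level function takes the result type as its first argument (``res``) and returns the produced value. The following is a minimal, hedged sketch of that pattern, not part of the generated API itself. The ``func.func`` container named ``geometry`` is an illustrative assumption (real kernels typically live in ``gpu.func`` or ``llvm.func``), and it assumes an MLIR Python install whose default ``Context`` registers the ``nvvm`` and ``func`` dialects.

.. code:: python

   from mlir.ir import (Context, FunctionType, InsertionPoint, IntegerType,
                        Location, Module)
   from mlir.dialects import func, nvvm

   with Context(), Location.unknown():
       module = Module.create()
       with InsertionPoint(module.body):
           i32 = IntegerType.get_signless(32)
           # Hypothetical container function, for illustration only.
           fn = func.FuncOp("geometry", FunctionType.get([], []))
           with InsertionPoint(fn.add_entry_block()):
               tid = nvvm.read_ptx_sreg_tid_x(i32)               # ThreadIdXOp
               ntid = nvvm.read_ptx_sreg_ntid_x(i32)             # BlockDimXOp
               nclusters = nvvm.read_ptx_sreg_nclusterid_x(i32)  # ClusterDimXOp
               nvvm.barrier0()                                   # CTA-wide barrier, id 0
               func.ReturnOp([])
       print(module)

..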
py:class:: ClusterDimZOp(res, *, range=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.nclusterid.z' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: range() -> Optional[_ods_ir] .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_nclusterid_z(res, *, range=None, loc=None, ip=None) -> _ods_ir .. py:class:: ClusterId(res, *, range=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.cluster.ctarank' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: range() -> Optional[_ods_ir] .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_cluster_ctarank(res, *, range=None, loc=None, ip=None) -> _ods_ir .. py:class:: ClusterIdXOp(res, *, range=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.clusterid.x' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: range() -> Optional[_ods_ir] .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_clusterid_x(res, *, range=None, loc=None, ip=None) -> _ods_ir .. py:class:: ClusterIdYOp(res, *, range=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.clusterid.y' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: range() -> Optional[_ods_ir] .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_clusterid_y(res, *, range=None, loc=None, ip=None) -> _ods_ir .. py:class:: ClusterIdZOp(res, *, range=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.clusterid.z' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: range() -> Optional[_ods_ir] .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_clusterid_z(res, *, range=None, loc=None, ip=None) -> _ods_ir .. py:class:: ClusterLaunchControlQueryCancelOp(res, query_type, try_cancel_response, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` ``clusterlaunchcontrol.query.cancel`` queries the response of a ``clusterlaunchcontrol.try.cancel`` operation specified by operand ``try_cancel_response``. Operand ``query_type`` specifies the type of query to perform and can be one of the following: * ``is_canceled`` : Returns true if the try cancel request succeeded, and false otherwise. * ``get_first_cta_id_{x/y/z}`` : Returns the x, y, or z coordinate of the first CTA in the canceled cluster. Behaviour is defined only if the try cancel request succeeded. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.clusterlaunchcontrol.query.cancel' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: try_cancel_response() -> _ods_ir .. py:method:: query_type() -> _ods_ir .. py:method:: res() -> _ods_ir .. py:function:: clusterlaunchcontrol_query_cancel(res, query_type, try_cancel_response, *, loc=None, ip=None) -> _ods_ir .. py:class:: ClusterLaunchControlTryCancelOp(smemAddress, mbarrier, *, multicast=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` ``clusterlaunchcontrol.try.cancel`` requests atomically canceling the launch of a cluster that has not started running yet. It asynchronously writes an opaque response to shared memory indicating whether the operation succeeded or failed. Operand ``smemAddress`` specifies the naturally aligned address of the 16-byte wide shared memory location where the request's response is written. 
Operand ``mbarrier`` specifies the mbarrier object used to track the completion of the asynchronous operation.

If ``multicast`` is specified, the response is asynchronously written to the corresponding local shared memory location (specified by ``addr``) of each CTA in the requesting cluster.

For more information, see the PTX ISA.

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.clusterlaunchcontrol.try.cancel'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: smemAddress() -> _ods_ir

   .. py:method:: mbarrier() -> _ods_ir

   .. py:method:: multicast() -> bool

.. py:function:: clusterlaunchcontrol_try_cancel(smem_address, mbarrier, *, multicast=None, loc=None, ip=None) -> ClusterLaunchControlTryCancelOp

.. py:class:: ClusterWaitOp(*, aligned=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   The ``cluster.wait`` causes the executing thread to wait for all non-exited threads of the cluster to perform ``cluster.arrive``. The ``aligned`` attribute, when provided, generates the .aligned version of the PTX instruction.

   For more information, see the PTX ISA.

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.cluster.wait'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: aligned() -> bool

.. py:function:: cluster_wait(*, aligned=None, loc=None, ip=None) -> ClusterWaitOp

.. py:class:: ConvertBF16x2ToF8x2Op(dst, a, dstTy, *, rnd=None, sat=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   This Op converts the given bf16 inputs in a bf16x2 vector to the specified f8 type. The result ``dst`` is represented as an i16 type or as a vector of two i8 types.

   If ``dst`` is returned as an i16 type, the converted values from ``a`` are packed such that the value converted from the first element of ``a`` is stored in the upper 8 bits of ``dst`` and the value converted from the second element of ``a`` is stored in the lower 8 bits of ``dst``. If ``dst`` is returned as a vector type, each converted value is stored as an i8 element in the vector.

   The ``rnd`` and ``sat`` attributes specify the rounding and saturation modes respectively.

   For more information, see the PTX ISA.

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.convert.bf16x2.to.f8x2'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: a() -> _ods_ir

   .. py:method:: rnd() -> _ods_ir

   .. py:method:: sat() -> _ods_ir

   .. py:method:: dstTy() -> _ods_ir

   .. py:method:: dst() -> _ods_ir

.. py:function:: convert_bf16x2_to_f8x2(dst, a, dst_ty, *, rnd=None, sat=None, loc=None, ip=None) -> _ods_ir

.. py:class:: ConvertF4x2ToF16x2Op(dst, src, srcType, *, relu=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   This Op converts the given f4 inputs in a packed i8 to f16. The result ``dst`` is represented as a vector of f16 elements. The ``relu`` attribute, when set, lowers to the '.relu' variant of the cvt instruction.

   For more information, see the PTX ISA.

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.convert.f4x2.to.f16x2'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: src() -> _ods_ir

   .. py:method:: relu() -> _ods_ir

   .. py:method:: srcType() -> _ods_ir

   .. py:method:: dst() -> _ods_ir

.. py:function:: convert_f4x2_to_f16x2(dst, src, src_type, *, relu=None, loc=None, ip=None) -> _ods_ir

.. py:class:: ConvertF6x2ToF16x2Op(dst, src, srcType, *, relu=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   This Op converts the given f6 inputs in an i8x2 vector to f16. The result ``dst`` is represented as a vector of f16 elements. The ``relu`` attribute, when set, lowers to the '.relu' variant of the cvt instruction.

   For more information, see the PTX ISA.

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.convert.f6x2.to.f16x2'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: src() -> _ods_ir

   .. py:method:: relu() -> _ods_ir

   .. py:method:: srcType() -> _ods_ir

   .. py:method:: dst() -> _ods_ir

.. py:function:: convert_f6x2_to_f16x2(dst, src, src_type, *, relu=None, loc=None, ip=None) -> _ods_ir

.. py:class:: ConvertF8x2ToBF16x2Op(dst, src, srcType, *, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   This Op converts the given f8 inputs in an i8x2 vector to bf16. The result ``dst`` is represented as a vector of bf16 elements.

   For more information, see the PTX ISA.

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.convert.f8x2.to.bf16x2'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: src() -> _ods_ir

   .. py:method:: srcType() -> _ods_ir

   .. py:method:: dst() -> _ods_ir

.. py:function:: convert_f8x2_to_bf16x2(dst, src, src_type, *, loc=None, ip=None) -> _ods_ir

.. py:class:: ConvertF8x2ToF16x2Op(dst, src, srcType, *, relu=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   This Op converts the given f8 inputs in an i8x2 vector to f16. The result ``dst`` is represented as a vector of f16 elements. The ``relu`` attribute, when set, lowers to the '.relu' variant of the cvt instruction.

   For more information, see the PTX ISA.

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.convert.f8x2.to.f16x2'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: src() -> _ods_ir

   .. py:method:: relu() -> _ods_ir

   .. py:method:: srcType() -> _ods_ir

   .. py:method:: dst() -> _ods_ir

.. py:function:: convert_f8x2_to_f16x2(dst, src, src_type, *, relu=None, loc=None, ip=None) -> _ods_ir

.. py:class:: ConvertF16x2ToF8x2Op(dst, a, dstTy, *, relu=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   This Op converts the given f16 inputs in an f16x2 vector to the specified f8 type. The result ``dst`` is represented as an i16 type or as a vector of two i8 types.

   If ``dst`` is returned as an i16 type, the converted values from ``a`` are packed such that the value converted from the first element of ``a`` is stored in the upper 8 bits of ``dst`` and the value converted from the second element of ``a`` is stored in the lower 8 bits of ``dst``. If ``dst`` is returned as a vector type, each converted value is stored as an i8 element in the vector.

   The ``relu`` attribute, when set, lowers to the '.relu' variant of the cvt instruction.

   For more information, see the PTX ISA.

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.convert.f16x2.to.f8x2'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: a() -> _ods_ir

   .. py:method:: relu() -> _ods_ir

   .. py:method:: dstTy() -> _ods_ir

   .. py:method:: dst() -> _ods_ir

.. py:function:: convert_f16x2_to_f8x2(dst, a, dst_ty, *, relu=None, loc=None, ip=None) -> _ods_ir

.. py:class:: ConvertF32x2ToBF16x2Op(dst, src_hi, src_lo, rbits, *, rnd=None, sat=None, relu=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   Converts two F32 values to packed bf16x2 format using stochastic rounding (.rs) mode with randomness provided by the ``rbits`` parameter. The ``relu`` attribute clamps negative results to 0. The ``sat`` attribute determines saturation behavior. The ``src_hi`` and ``src_lo`` parameters correspond to operands ``a`` and ``b`` in the PTX ISA, respectively.

   For more information, see the PTX ISA.

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.convert.f32x2.to.bf16x2'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: src_hi() -> _ods_ir

   .. py:method:: src_lo() -> _ods_ir

   ..
py:method:: rbits() -> _ods_ir .. py:method:: rnd() -> _ods_ir .. py:method:: sat() -> _ods_ir .. py:method:: relu() -> _ods_ir .. py:method:: dst() -> _ods_ir .. py:function:: convert_f32x2_to_bf16x2(dst, src_hi, src_lo, rbits, *, rnd=None, sat=None, relu=None, loc=None, ip=None) -> _ods_ir .. py:class:: ConvertF32x2ToF4x2Op(dst, a, b, dstTy, *, relu=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` This Op converts each of the given float inputs to the specified fp4 type. The result ``dst`` is returned as an i8 type where the converted values are packed such that the value converted from ``a`` is stored in the upper 4 bits of ``dst`` and the value converted from ``b`` is stored in the lower 4 bits of ``dst``. The ``relu`` attribute, when set, lowers to the '.relu' variant of the cvt instruction. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.convert.f32x2.to.f4x2' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: a() -> _ods_ir .. py:method:: b() -> _ods_ir .. py:method:: relu() -> _ods_ir .. py:method:: dstTy() -> _ods_ir .. py:method:: dst() -> _ods_ir .. py:function:: convert_f32x2_to_f4x2(dst, a, b, dst_ty, *, relu=None, loc=None, ip=None) -> _ods_ir .. py:class:: ConvertF32x2ToF6x2Op(dst, a, b, dstTy, *, relu=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` This Op converts each of the given float inputs to the specified fp6 type. The result ``dst`` is represented either as an i16 type or as a vector of two i8 types. If ``dst`` is returned as an i16 type, the converted values are packed such that the value converted from ``a`` is stored in the upper 8 bits of ``dst`` with 2 MSB bits padded with zeros and the value converted from ``b`` is stored in the lower 8 bits of ``dst`` with 2 MSB bits padded with zeros. If ``dst`` is returned as a vector type, each converted value is stored as an i8 element in the vector. The ``relu`` attribute, when set, lowers to the '.relu' variant of the cvt instruction. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.convert.f32x2.to.f6x2' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: a() -> _ods_ir .. py:method:: b() -> _ods_ir .. py:method:: relu() -> _ods_ir .. py:method:: dstTy() -> _ods_ir .. py:method:: dst() -> _ods_ir .. py:function:: convert_f32x2_to_f6x2(dst, a, b, dst_ty, *, relu=None, loc=None, ip=None) -> _ods_ir .. py:class:: ConvertF32x2ToF8x2Op(dst, a, b, dstTy, *, rnd=None, sat=None, relu=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` This Op converts each of the given float inputs to the specified fp8 type. The result ``dst`` is represented as an i16 type or as a vector of two i8 types. If ``dst`` is returned as an i16 type, the converted values are packed such that the value converted from ``a`` is stored in the upper 8 bits of ``dst`` and the value converted from ``b`` is stored in the lower 8 bits of ``dst``. If ``dst`` is returned as a vector type, each converted value is stored as an i8 element in the vector. The ``rnd`` and ``sat`` attributes specify the rounding and saturation modes respectively. The ``relu`` attribute, when set, lowers to the '.relu' variant of the cvt instruction. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.convert.f32x2.to.f8x2' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: a() -> _ods_ir .. py:method:: b() -> _ods_ir .. py:method:: rnd() -> _ods_ir .. py:method:: sat() -> _ods_ir .. py:method:: relu() -> _ods_ir .. 
py:method:: dstTy() -> _ods_ir .. py:method:: dst() -> _ods_ir .. py:function:: convert_f32x2_to_f8x2(dst, a, b, dst_ty, *, rnd=None, sat=None, relu=None, loc=None, ip=None) -> _ods_ir .. py:class:: ConvertF32x2ToF16x2Op(dst, src_hi, src_lo, rbits, *, rnd=None, sat=None, relu=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` Converts two F32 values to packed f16x2 format using stochastic rounding (.rs) mode with randomness provided by the ``rbits`` parameter. The ``relu`` attribute clamps negative results to 0. The ``sat`` attribute determines saturation behavior. The ``src_hi`` and ``src_lo`` parameters correspond to operands ``a`` and ``b`` in the PTX ISA, respectively. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.convert.f32x2.to.f16x2' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: src_hi() -> _ods_ir .. py:method:: src_lo() -> _ods_ir .. py:method:: rbits() -> _ods_ir .. py:method:: rnd() -> _ods_ir .. py:method:: sat() -> _ods_ir .. py:method:: relu() -> _ods_ir .. py:method:: dst() -> _ods_ir .. py:function:: convert_f32x2_to_f16x2(dst, src_hi, src_lo, rbits, *, rnd=None, sat=None, relu=None, loc=None, ip=None) -> _ods_ir .. py:class:: ConvertF32x4ToF4x4Op(dst, src, rbits, dstTy, *, relu=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` Converts a vector<4xf32> to packed f4x4 format using stochastic rounding (.rs) mode with SATFINITE saturation. Randomness is provided by the ``rbits`` parameter. The ``dstTy`` attribute specifies the target floating-point format. The ``relu`` attribute clamps negative results to 0. Note: These operations always use RS rounding mode and SATFINITE saturation mode. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.convert.f32x4.to.f4x4' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: src() -> _ods_ir .. py:method:: rbits() -> _ods_ir .. py:method:: relu() -> _ods_ir .. py:method:: dstTy() -> _ods_ir .. py:method:: dst() -> _ods_ir .. py:function:: convert_f32x4_to_f4x4(dst, src, rbits, dst_ty, *, relu=None, loc=None, ip=None) -> _ods_ir .. py:class:: ConvertF32x4ToF6x4Op(dst, src, rbits, dstTy, *, relu=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` Converts a vector<4xf32> to packed f6x4 format using stochastic rounding (.rs) mode with SATFINITE saturation. Randomness is provided by the ``rbits`` parameter. The ``dstTy`` attribute specifies the target floating-point format. The ``relu`` attribute clamps negative results to 0. Note: These operations always use RS rounding mode and SATFINITE saturation mode. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.convert.f32x4.to.f6x4' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: src() -> _ods_ir .. py:method:: rbits() -> _ods_ir .. py:method:: relu() -> _ods_ir .. py:method:: dstTy() -> _ods_ir .. py:method:: dst() -> _ods_ir .. py:function:: convert_f32x4_to_f6x4(dst, src, rbits, dst_ty, *, relu=None, loc=None, ip=None) -> _ods_ir .. py:class:: ConvertF32x4ToF8x4Op(dst, src, rbits, dstTy, *, relu=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` Converts a vector<4xf32> to packed f8x4 format using stochastic rounding (.rs) mode with SATFINITE saturation. Randomness is provided by the ``rbits`` parameter. The ``dstTy`` attribute specifies the target floating-point format. The ``relu`` attribute clamps negative results to 0. Note: These operations always use RS rounding mode and SATFINITE saturation mode. `For more information, see PTX ISA `_ .. 
py:attribute:: OPERATION_NAME
   :value: 'nvvm.convert.f32x4.to.f8x4'

.. py:attribute:: _ODS_REGIONS
   :value: (0, True)

.. py:method:: src() -> _ods_ir

.. py:method:: rbits() -> _ods_ir

.. py:method:: relu() -> _ods_ir

.. py:method:: dstTy() -> _ods_ir

.. py:method:: dst() -> _ods_ir

.. py:function:: convert_f32x4_to_f8x4(dst, src, rbits, dst_ty, *, relu=None, loc=None, ip=None) -> _ods_ir

.. py:class:: ConvertFloatToTF32Op(res, src, *, rnd=None, sat=None, relu=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   This Op converts the given f32 input to tf32. The result ``res`` is represented as an i32 type. The ``relu`` attribute, when set, lowers to the '.relu' variant of the cvt instruction. The ``rnd`` and ``sat`` attributes specify the rounding and saturation modes respectively.

   For more information, see the PTX ISA.

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.convert.float.to.tf32'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: src() -> _ods_ir

   .. py:method:: rnd() -> _ods_ir

   .. py:method:: sat() -> _ods_ir

   .. py:method:: relu() -> _ods_ir

   .. py:method:: res() -> _ods_ir

.. py:function:: convert_float_to_tf32(res, src, *, rnd=None, sat=None, relu=None, loc=None, ip=None) -> _ods_ir

.. py:class:: CpAsyncBulkCommitGroupOp(*, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   This Op commits all prior initiated but uncommitted cp.async.bulk instructions into a cp.async.bulk-group.

   For more information, see the PTX ISA.

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.cp.async.bulk.commit.group'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

.. py:function:: cp_async_bulk_commit_group(*, loc=None, ip=None) -> CpAsyncBulkCommitGroupOp

.. py:class:: CpAsyncBulkGlobalToSharedClusterOp(dstMem, srcMem, mbar, size, *, multicastMask=None, l2CacheHint=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   Initiates an asynchronous copy operation from global memory to cluster's shared memory.

   The ``multicastMask`` operand is optional. When it is present, the Op copies data from global memory to shared memory of multiple CTAs in the cluster. Operand ``multicastMask`` specifies the destination CTAs in the cluster such that each bit position in the 16-bit ``multicastMask`` operand corresponds to the ``nvvm.read.ptx.sreg.ctaid`` of the destination CTA.

   The ``l2CacheHint`` operand is optional, and it is used to specify cache eviction policy that may be used during the memory access.

   For more information, see the PTX ISA.

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.cp.async.bulk.shared.cluster.global'

   .. py:attribute:: _ODS_OPERAND_SEGMENTS
      :value: [1, 1, 1, 1, 0, 0]

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: dstMem() -> _ods_ir

   .. py:method:: srcMem() -> _ods_ir

   .. py:method:: mbar() -> _ods_ir

   .. py:method:: size() -> _ods_ir

   .. py:method:: multicastMask() -> Optional[_ods_ir]

   .. py:method:: l2CacheHint() -> Optional[_ods_ir]

.. py:function:: cp_async_bulk_shared_cluster_global(dst_mem, src_mem, mbar, size, *, multicast_mask=None, l2_cache_hint=None, loc=None, ip=None) -> CpAsyncBulkGlobalToSharedClusterOp

.. py:class:: CpAsyncBulkPrefetchOp(srcMem, size, *, l2CacheHint=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   Initiates an asynchronous prefetch of data from the location specified by ``srcMem`` to the L2 cache.

   The ``l2CacheHint`` operand is optional, and it is used to specify cache eviction policy that may be used during the memory access.

   Example:

   .. code:: mlir

      nvvm.cp.async.bulk.prefetch %src, %size : !llvm.ptr<1>

      // with l2_cache_hint
      nvvm.cp.async.bulk.prefetch %src, %size l2_cache_hint = %ch : !llvm.ptr<1>

   For more information, see the PTX ISA.

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.cp.async.bulk.prefetch'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: srcMem() -> _ods_ir

   .. py:method:: size() -> _ods_ir

   .. py:method:: l2CacheHint() -> Optional[_ods_ir]

.. py:function:: cp_async_bulk_prefetch(src_mem, size, *, l2_cache_hint=None, loc=None, ip=None) -> CpAsyncBulkPrefetchOp

.. py:class:: CpAsyncBulkSharedCTAToGlobalOp(dstMem, srcMem, size, *, l2CacheHint=None, byteMask=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   Initiates an asynchronous copy operation from Shared CTA memory to global memory. The 32-bit operand ``size`` specifies the amount of memory to be copied, in terms of number of bytes. ``size`` must be a multiple of 16.

   The ``l2CacheHint`` operand is optional, and it is used to specify cache eviction policy that may be used during the memory access.

   The ``byteMask`` operand is optional. The i-th bit in the 16-bit wide ``byteMask`` specifies whether the i-th byte of each 16-byte wide chunk of source data is copied to the destination. If the bit is set, the byte is copied.

   Example:

   .. code:: mlir

      nvvm.cp.async.bulk.global.shared.cta %dst, %src, %size : !llvm.ptr<1>, !llvm.ptr<3>

      // with l2_cache_hint
      nvvm.cp.async.bulk.global.shared.cta %dst, %src, %size l2_cache_hint = %ch : !llvm.ptr<1>, !llvm.ptr<3>

      // with byte_mask
      nvvm.cp.async.bulk.global.shared.cta %dst, %src, %size byte_mask = %mask : !llvm.ptr<1>, !llvm.ptr<3>

      // with both l2_cache_hint and byte_mask
      nvvm.cp.async.bulk.global.shared.cta %dst, %src, %size l2_cache_hint = %ch byte_mask = %mask : !llvm.ptr<1>, !llvm.ptr<3>

   For more information, see the PTX ISA.

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.cp.async.bulk.global.shared.cta'

   .. py:attribute:: _ODS_OPERAND_SEGMENTS
      :value: [1, 1, 1, 0, 0]

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: dstMem() -> _ods_ir

   .. py:method:: srcMem() -> _ods_ir

   .. py:method:: size() -> _ods_ir

   .. py:method:: l2CacheHint() -> Optional[_ods_ir]

   .. py:method:: byteMask() -> Optional[_ods_ir]

.. py:function:: cp_async_bulk_global_shared_cta(dst_mem, src_mem, size, *, l2_cache_hint=None, byte_mask=None, loc=None, ip=None) -> CpAsyncBulkSharedCTAToGlobalOp

.. py:class:: CpAsyncBulkSharedCTAToSharedClusterOp(dstMem, srcMem, mbar, size, *, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   Initiates an asynchronous copy operation from Shared CTA memory to Shared cluster memory.

   For more information, see the PTX ISA.

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.cp.async.bulk.shared.cluster.shared.cta'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: dstMem() -> _ods_ir

   .. py:method:: srcMem() -> _ods_ir

   .. py:method:: mbar() -> _ods_ir

   .. py:method:: size() -> _ods_ir

.. py:function:: cp_async_bulk_shared_cluster_shared_cta(dst_mem, src_mem, mbar, size, *, loc=None, ip=None) -> CpAsyncBulkSharedCTAToSharedClusterOp

.. py:class:: CpAsyncBulkTensorGlobalToSharedClusterOp(dstMem, tmaDescriptor, coordinates, mbar, im2colOffsets, *, multicastMask=None, l2CacheHint=None, mode=None, isCTAOnly=None, group=None, predicate=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   Initiates an asynchronous copy operation on the tensor data from global memory to shared::cluster (or) shared::cta memory. This Op supports all the load modes specified in ``TMALoadMode``.
The ``multicastMask`` operand is optional. When it is present, the Op copies data from global memory to shared memory of multiple CTAs in the cluster. Operand ``multicastMask`` specifies the destination CTAs in the cluster such that each bit position in the 16-bit ``multicastMask`` operand corresponds to the ``nvvm.read.ptx.sreg.ctaid`` of the destination CTA. The ``l2CacheHint`` operand is optional, and it is used to specify cache eviction policy that may be used during the memory access. When the ``isCTAOnly`` attribute is set to true, the destination is shared::cta only. Hence, ``multicastMask`` and ``CTAGroup`` are not applicable when ``isCTAOnly`` is true. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.cp.async.bulk.tensor.shared.cluster.global' .. py:attribute:: _ODS_OPERAND_SEGMENTS .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: dstMem() -> _ods_ir .. py:method:: tmaDescriptor() -> _ods_ir .. py:method:: coordinates() -> _ods_ir .. py:method:: mbar() -> _ods_ir .. py:method:: im2colOffsets() -> _ods_ir .. py:method:: multicastMask() -> Optional[_ods_ir] .. py:method:: l2CacheHint() -> Optional[_ods_ir] .. py:method:: predicate() -> Optional[_ods_ir] .. py:method:: mode() -> _ods_ir .. py:method:: isCTAOnly() -> _ods_ir .. py:method:: group() -> Optional[_ods_ir] .. py:function:: cp_async_bulk_tensor_shared_cluster_global(dst_mem, tma_descriptor, coordinates, mbar, im2col_offsets, *, multicast_mask=None, l2_cache_hint=None, mode=None, is_cta_only=None, group=None, predicate=None, loc=None, ip=None) -> CpAsyncBulkTensorGlobalToSharedClusterOp .. py:class:: CpAsyncBulkTensorPrefetchOp(tmaDescriptor, coordinates, im2colOffsets, *, mode=None, l2CacheHint=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` Initiates an asynchronous prefetch operation on the tensor data from global memory to L2 cache. This Op supports all the load modes specified in ``TMALoadMode``. The ``l2CacheHint`` operand is optional, and it is used to specify cache eviction policy that may be used during the memory access. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.cp.async.bulk.tensor.prefetch' .. py:attribute:: _ODS_OPERAND_SEGMENTS .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: tmaDescriptor() -> _ods_ir .. py:method:: coordinates() -> _ods_ir .. py:method:: im2colOffsets() -> _ods_ir .. py:method:: l2CacheHint() -> Optional[_ods_ir] .. py:method:: mode() -> _ods_ir .. py:function:: cp_async_bulk_tensor_prefetch(tma_descriptor, coordinates, im2col_offsets, *, mode=None, l2_cache_hint=None, loc=None, ip=None) -> CpAsyncBulkTensorPrefetchOp .. py:class:: CpAsyncBulkTensorReduceOp(tmaDescriptor, srcMem, redKind, coordinates, *, mode=None, l2CacheHint=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` Initiates an asynchronous reduction operation of tensor data in global memory with tensor data in shared memory. The ``mode`` attribute indicates whether the copy mode is tile or im2col. The ``redOp`` attribute specifies the reduction operations applied. The supported reduction operations are: {add, min, max, inc, dec, and, or, xor} The ``l2CacheHint`` operand is optional, and it is used to specify cache eviction policy that may be used during the memory access. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.cp.async.bulk.tensor.reduce' .. py:attribute:: _ODS_OPERAND_SEGMENTS .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: tmaDescriptor() -> _ods_ir .. 
py:method:: srcMem() -> _ods_ir

.. py:method:: coordinates() -> _ods_ir

.. py:method:: l2CacheHint() -> Optional[_ods_ir]

.. py:method:: redKind() -> _ods_ir

.. py:method:: mode() -> _ods_ir

.. py:function:: cp_async_bulk_tensor_reduce(tma_descriptor, src_mem, red_kind, coordinates, *, mode=None, l2_cache_hint=None, loc=None, ip=None) -> CpAsyncBulkTensorReduceOp

.. py:class:: CpAsyncBulkTensorSharedCTAToGlobalOp(tmaDescriptor, srcMem, coordinates, *, l2CacheHint=None, mode=None, predicate=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   Initiates an asynchronous copy of the tensor data from shared::cta memory to global memory. This Op supports all the store modes specified in ``TMAStoreMode``.

   The ``l2CacheHint`` operand is optional, and it is used to specify cache eviction policy that may be used during the memory access.

   For more information, see the PTX ISA.

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.cp.async.bulk.tensor.global.shared.cta'

   .. py:attribute:: _ODS_OPERAND_SEGMENTS

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: tmaDescriptor() -> _ods_ir

   .. py:method:: srcMem() -> _ods_ir

   .. py:method:: coordinates() -> _ods_ir

   .. py:method:: l2CacheHint() -> Optional[_ods_ir]

   .. py:method:: predicate() -> Optional[_ods_ir]

   .. py:method:: mode() -> _ods_ir

.. py:function:: cp_async_bulk_tensor_global_shared_cta(tma_descriptor, src_mem, coordinates, *, l2_cache_hint=None, mode=None, predicate=None, loc=None, ip=None) -> CpAsyncBulkTensorSharedCTAToGlobalOp

.. py:class:: CpAsyncBulkWaitGroupOp(group, *, read=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   This Op waits for the completion of the most recent bulk async-groups. The ``$group`` operand specifies that the executing thread waits until only ``$group`` or fewer of the most recent bulk async-groups are pending. If ``$group`` is 0, the op waits until all of the most recent bulk async-groups have completed.

   The ``$read`` attribute indicates that the wait only has to last until all bulk async operations in the specified bulk async-group have completed reading from their source locations.

   For more information, see the PTX ISA.

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.cp.async.bulk.wait_group'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: group() -> _ods_ir

   .. py:method:: read() -> bool

.. py:function:: cp_async_bulk_wait_group(group, *, read=None, loc=None, ip=None) -> CpAsyncBulkWaitGroupOp

.. py:class:: CpAsyncCommitGroupOp(*, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.cp.async.commit.group'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

.. py:function:: cp_async_commit_group(*, loc=None, ip=None) -> CpAsyncCommitGroupOp

.. py:class:: CpAsyncMBarrierArriveOp(addr, *, noinc=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   The ``cp.async.mbarrier.arrive`` Op makes the *mbarrier object* track all prior cp.async operations initiated by the executing thread. The ``addr`` operand specifies the address of the *mbarrier object* in generic or shared::cta address space. When it is generic, the underlying memory should fall within the shared::cta space; otherwise the behavior is undefined. The ``noinc`` attribute impacts how the mbarrier's state is updated.

   For more information, see the PTX ISA.

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.cp.async.mbarrier.arrive'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: addr() -> _ods_ir

   .. py:method:: noinc() -> _ods_ir

.. py:function:: cp_async_mbarrier_arrive(addr, *, noinc=None, loc=None, ip=None) -> CpAsyncMBarrierArriveOp
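As a rough illustration of how the asynchronous-copy group ops documented here compose, the hedged sketch below commits the outstanding ``cp.async`` operations into a group and then waits for every committed group to complete. The setup mirrors the earlier sketch: the container function is again a stand-in, the copies themselves are elided, and the group count is passed as a plain Python integer on the assumption that the generated builders accept ints for integer attributes.

.. code:: python

   from mlir.ir import Context, FunctionType, InsertionPoint, Location, Module
   from mlir.dialects import func, nvvm

   with Context(), Location.unknown():
       module = Module.create()
       with InsertionPoint(module.body):
           # Hypothetical container function, for illustration only.
           fn = func.FuncOp("drain_cp_async", FunctionType.get([], []))
           with InsertionPoint(fn.add_entry_block()):
               # ... cp.async copies (e.g. nvvm.cp_async_shared_global) go here ...
               nvvm.cp_async_commit_group()  # close the current cp.async group
               nvvm.cp_async_wait_group(0)   # 0: wait for all committed groups
               func.ReturnOp([])

..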
py:class:: CpAsyncOp(dst, src, size, modifier, *, cpSize=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.cp.async.shared.global' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: dst() -> _ods_ir .. py:method:: src() -> _ods_ir .. py:method:: cpSize() -> Optional[_ods_ir] .. py:method:: size() -> _ods_ir .. py:method:: modifier() -> _ods_ir .. py:function:: cp_async_shared_global(dst, src, size, modifier, *, cp_size=None, loc=None, ip=None) -> CpAsyncOp .. py:class:: CpAsyncWaitGroupOp(n, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.cp.async.wait.group' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: n() -> _ods_ir .. py:function:: cp_async_wait_group(n, *, loc=None, ip=None) -> CpAsyncWaitGroupOp .. py:class:: DotAccumulate2WayOp(res, a, a_type, b, b_type, c, b_hi, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` Performs a two-way 16-bit to 8-bit dot-product which is accumulated in a 32-bit result. Operand ``a`` is a vector of two 16-bit elements and operand ``b`` a vector of four 8-bit elements between which the dot product is computed. The ``a_type`` and ``b_type`` attributes specify the type of the elements in ``a`` and ``b`` respectively. If ``a_type`` or ``b_type`` is ``s``, then the elements in the corresponding vector are sign-extended to 32-bit before the dot product is computed. If ``a_type`` or ``b_type`` is ``u``, then the elements in the corresponding vector are zero-extended to 32-bit instead. The ``b_hi`` boolean attribute specifies which two bytes of ``b`` are used for the dot product. If ``b_hi`` is true, then the dot product is computed between ``a`` and elements at indices 2 and 3 of ``b``. If ``b_hi`` is false, then the dot product is computed between ``a`` and elements at indices 0 and 1 of ``b``. Operand ``c`` is a 32-bit integer to which the result is accumulated. It is treated as holding a signed integer if any of ``a_type`` or ``b_type`` is signed. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.dot.accumulate.2way' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: a() -> _ods_ir .. py:method:: b() -> _ods_ir .. py:method:: c() -> _ods_ir .. py:method:: a_type() -> _ods_ir .. py:method:: b_type() -> _ods_ir .. py:method:: b_hi() -> _ods_ir .. py:method:: res() -> _ods_ir .. py:function:: dot_accumulate_2way(res, a, a_type, b, b_type, c, b_hi, *, loc=None, ip=None) -> _ods_ir .. py:class:: DotAccumulate4WayOp(res, a, a_type, b, b_type, c, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` Performs a four-way byte dot-product which is accumulated in a 32-bit result. Operand ``a`` and ``b`` are vectors of 4 bytes between which the dot product is computed. The ``a_type`` and ``b_type`` attributes specify the type of the elements in ``a`` and ``b`` respectively. If ``a_type`` or ``b_type`` is ``signed``, then the elements in the corresponding vector are sign-extended to 32-bit before the dot product is computed. If ``a_type`` or ``b_type`` is ``unsigned``, then the elements in the corresponding vector are zero-extended to 32-bit instead. Operand ``c`` is a 32-bit integer to which the result is accumulated. It is treated as holding a signed integer if any of ``a_type`` or ``b_type`` is ``s8``. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.dot.accumulate.4way' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: a() -> _ods_ir .. 
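The extension and accumulation semantics of the two ``dot.accumulate`` variants above can be modelled in plain Python. This hypothetical reference helper (not part of the bindings) mirrors the four-way case:

.. code:: python

   def dot_accumulate_4way_ref(a_bytes, b_bytes, c, a_signed, b_signed):
       """Four-way byte dot product accumulated into a 32-bit integer."""
       def ext(v, signed):
           v &= 0xFF  # keep a single byte
           return v - 0x100 if signed and v >= 0x80 else v

       acc = c
       for x, y in zip(a_bytes, b_bytes):
           acc += ext(x, a_signed) * ext(y, b_signed)
       return acc & 0xFFFFFFFF  # the accumulator is 32 bits wide

   # Signed a, unsigned b: 1*5 + 2*6 + 3*7 + 4*8 + 10 == 80.
   assert dot_accumulate_4way_ref([1, 2, 3, 4], [5, 6, 7, 8], 10, True, False) == 80

..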
py:method:: b() -> _ods_ir .. py:method:: c() -> _ods_ir .. py:method:: a_type() -> _ods_ir .. py:method:: b_type() -> _ods_ir .. py:method:: res() -> _ods_ir .. py:function:: dot_accumulate_4way(res, a, a_type, b, b_type, c, *, loc=None, ip=None) -> _ods_ir .. py:class:: ElectSyncOp(pred, *, membermask=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` The ``elect.sync`` instruction elects one predicated active leader thread from among a set of threads specified in the ``membermask``. When the ``membermask`` is not provided explicitly, a default value of ``0xFFFFFFFF`` is used. The predicate result is set to ``True`` for the leader thread, and ``False`` for all other threads. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.elect.sync' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: membermask() -> Optional[_ods_ir] .. py:method:: pred() -> _ods_ir .. py:function:: elect_sync(pred, *, membermask=None, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg0Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg0' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_envreg0(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg1Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg1' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_envreg1(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg2Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg2' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_envreg2(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg3Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg3' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_envreg3(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg4Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg4' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_envreg4(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg5Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg5' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_envreg5(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg6Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg6' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_envreg6(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg7Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg7' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_envreg7(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg8Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. 
py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg8' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_envreg8(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg9Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg9' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_envreg9(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg10Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg10' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_envreg10(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg11Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg11' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_envreg11(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg12Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg12' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_envreg12(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg13Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg13' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_envreg13(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg14Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg14' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_envreg14(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg15Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg15' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_envreg15(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg16Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg16' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_envreg16(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg17Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg17' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_envreg17(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg18Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg18' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_envreg18(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg19Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg19' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. 
py:function:: read_ptx_sreg_envreg19(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg20Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg20' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_envreg20(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg21Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg21' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_envreg21(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg22Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg22' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_envreg22(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg23Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg23' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_envreg23(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg24Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg24' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_envreg24(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg25Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg25' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_envreg25(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg26Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg26' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_envreg26(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg27Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg27' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_envreg27(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg28Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg28' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_envreg28(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg29Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg29' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_envreg29(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg30Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg30' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_envreg30(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg31Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. 
py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg31' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_envreg31(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: Exit(*, loc=None, ip=None) Bases: :py:obj:`_ods_ir` Ends execution of a thread. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.exit' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:function:: exit(*, loc=None, ip=None) -> Exit .. py:class:: FenceMbarrierInitOp(*, loc=None, ip=None) Bases: :py:obj:`_ods_ir` Fence operation that applies to the prior ``nvvm.mbarrier.init``. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.fence.mbarrier.init' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:function:: fence_mbarrier_init(*, loc=None, ip=None) -> FenceMbarrierInitOp .. py:class:: FenceProxyAcquireOp(scope, addr, size, *, fromProxy=None, toProxy=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` ``fence.proxy.acquire`` is a uni-directional fence used to establish ordering between a prior memory access performed via the generic proxy and a subsequent memory access performed via the tensormap proxy. The address operand ``addr`` and the operand ``size`` together specify the memory range ``[addr, addr+size)`` on which the ordering guarantees on the memory accesses across the proxies are to be provided. The only supported value for the ``size`` operand is 128, and it must be an immediate. Generic Addressing is used unconditionally, and the address specified by the operand ``addr`` must fall within the ``.global`` state space. Otherwise, the behavior is undefined. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.fence.proxy.acquire' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: addr() -> _ods_ir .. py:method:: size() -> _ods_ir .. py:method:: scope() -> _ods_ir .. py:method:: fromProxy() -> _ods_ir .. py:method:: toProxy() -> _ods_ir .. py:function:: fence_proxy_acquire(scope, addr, size, *, from_proxy=None, to_proxy=None, loc=None, ip=None) -> FenceProxyAcquireOp .. py:class:: FenceProxyOp(kind, *, space=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` Fence operation with proxy to establish an ordering between memory accesses that may happen through different proxies. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.fence.proxy' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: kind() -> _ods_ir .. py:method:: space() -> Optional[_ods_ir] .. py:function:: fence_proxy(kind, *, space=None, loc=None, ip=None) -> FenceProxyOp .. py:class:: FenceProxyReleaseOp(scope, *, fromProxy=None, toProxy=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` ``fence.proxy.release`` is a uni-directional fence used to establish ordering between a prior memory access performed via the generic proxy and a subsequent memory access performed via the tensormap proxy. The ``fence.proxy.release`` operation can form a release sequence that synchronizes with an acquire sequence that contains the ``fence.proxy.acquire`` proxy fence operation, as sketched below. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.fence.proxy.release' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: scope() -> _ods_ir .. py:method:: fromProxy() -> _ods_ir .. py:method:: toProxy() -> _ods_ir .. py:function:: fence_proxy_release(scope, *, from_proxy=None, to_proxy=None, loc=None, ip=None) -> FenceProxyReleaseOp ..
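The release/acquire pairing can be made concrete with a hedged sketch (``nvvm`` is the assumed re-export of this generated module; ``scope``, ``addr``, and ``size128`` are values and attributes built by the caller, and in a real program the release and the acquire would execute in different threads):

.. code:: python

   from mlir.dialects import nvvm  # assumed re-export of this generated module

   def publish_to_tensormap_proxy(scope, addr, size128):
       """Hypothetical helper pairing the two uni-directional fences.
       size128 must be the immediate 128, the only supported size."""
       # Producer side: order prior generic-proxy writes...
       nvvm.fence_proxy_release(scope)
       # ...consumer side, after synchronizing with the release: make them
       # visible to tensormap-proxy accesses on [addr, addr+128).
       nvvm.fence_proxy_acquire(scope, addr, size128)

..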
py:class:: FenceScClusterOp(*, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.fence.sc.cluster' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:function:: fence_sc_cluster(*, loc=None, ip=None) -> FenceScClusterOp .. py:class:: GlobalTimerLoOp(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.globaltimer.lo' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_globaltimer_lo(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: GlobalTimerOp(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.globaltimer' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_globaltimer(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: GridDimXOp(res, *, range=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.nctaid.x' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: range() -> Optional[_ods_ir] .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_nctaid_x(res, *, range=None, loc=None, ip=None) -> _ods_ir .. py:class:: GridDimYOp(res, *, range=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.nctaid.y' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: range() -> Optional[_ods_ir] .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_nctaid_y(res, *, range=None, loc=None, ip=None) -> _ods_ir .. py:class:: GridDimZOp(res, *, range=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.nctaid.z' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: range() -> Optional[_ods_ir] .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_nctaid_z(res, *, range=None, loc=None, ip=None) -> _ods_ir .. py:class:: GridIdOp(res, *, range=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.gridid' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: range() -> Optional[_ods_ir] .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_gridid(res, *, range=None, loc=None, ip=None) -> _ods_ir .. py:class:: GriddepcontrolOp(kind, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` If the $kind attribute is set to ``wait``, it causes the executing thread to wait until all prerequisite grids in flight have completed and all the memory operations from the prerequisite grids are performed and made visible to the current grid. When the $kind is ``launch_dependents``, it signals that the dependent grids that the runtime system has designated to react to this instruction can be scheduled as soon as all other CTAs in the grid have issued the same instruction or have completed. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.griddepcontrol' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: kind() -> _ods_ir .. py:function:: griddepcontrol(kind, *, loc=None, ip=None) -> GriddepcontrolOp .. py:class:: InlinePtxOp(writeOnlyArgs, readOnlyArgs, readWriteArgs, ptxCode, *, predicate=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` This op allows using PTX directly within the NVVM dialect, while greatly simplifying llvm.inline_asm generation.
It automatically handles register size selection and sets the correct read/write access for each operand. The operation leverages the ``BasicPtxBuilderInterface`` to abstract away low-level details of PTX assembly formatting. The ``predicate`` attribute is used to specify a predicate for the PTX instruction.

Example 1: Read-only Parameters

```mlir
nvvm.inline_ptx "mbarrier.init.b64 [$0], $1;" (%barrier_gen, %count) : !llvm.ptr, i32

// Lowers to:
llvm.inline_asm has_side_effects asm_dialect = att
  "mbarrier.init.b64 [$0], $1;", "l,r" %arg0, %arg2 : (!llvm.ptr, i32) -> ()
```

Example 2: Read-only and Write-only Parameters

```mlir
%0 = nvvm.inline_ptx "ex2.approx.ftz.f32 $0, $1;" (%input) : f32 -> f32

// Lowers to:
%0 = llvm.inline_asm has_side_effects asm_dialect = att
  "ex2.approx.ftz.f32 $0, $1;", "=f,f" %arg0 : (f32) -> f32
```

Example 3: Predicate Usage

```mlir
nvvm.inline_ptx "mbarrier.init.b64 [$0], $1;" (%barrier_gen, %count), predicate = %pred : !llvm.ptr, i32, i1

// Lowers to:
llvm.inline_asm has_side_effects asm_dialect = att
  "@$2 mbarrier.init.b64 [$0], $1;", "l,r,b" %arg0, %arg2, %arg3 : (!llvm.ptr, i32, i1) -> ()
```

.. py:attribute:: OPERATION_NAME :value: 'nvvm.inline_ptx' .. py:attribute:: _ODS_OPERAND_SEGMENTS .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: readOnlyArgs() -> _ods_ir .. py:method:: readWriteArgs() -> _ods_ir .. py:method:: predicate() -> Optional[_ods_ir] .. py:method:: ptxCode() -> _ods_ir .. py:method:: writeOnlyArgs() -> _ods_ir .. py:function:: inline_ptx(write_only_args, read_only_args, read_write_args, ptx_code, *, predicate=None, loc=None, ip=None) -> Union[_ods_ir, _ods_ir, InlinePtxOp] .. py:class:: LaneIdOp(res, *, range=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.laneid' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: range() -> Optional[_ods_ir] .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_laneid(res, *, range=None, loc=None, ip=None) -> _ods_ir .. py:class:: LaneMaskEqOp(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.lanemask.eq' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_lanemask_eq(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: LaneMaskGeOp(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.lanemask.ge' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_lanemask_ge(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: LaneMaskGtOp(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.lanemask.gt' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_lanemask_gt(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: LaneMaskLeOp(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.lanemask.le' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_lanemask_le(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: LaneMaskLtOp(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.lanemask.lt' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir ..
py:function:: read_ptx_sreg_lanemask_lt(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: LdMatrixOp(res, ptr, num, layout, shape, eltType, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.ldmatrix' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: ptr() -> _ods_ir .. py:method:: num() -> _ods_ir .. py:method:: layout() -> _ods_ir .. py:method:: shape() -> _ods_ir .. py:method:: eltType() -> _ods_ir .. py:method:: res() -> _ods_ir .. py:function:: ldmatrix(res, ptr, num, layout, shape, elt_type, *, loc=None, ip=None) -> _ods_ir .. py:class:: MBarrierArriveExpectTxOp(addr, txcount, *, predicate=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` The ``nvvm.mbarrier.arrive.expect_tx`` operation performs an expect-tx operation followed by an arrive-on operation on the *mbarrier object*. Uses the default ``.release.cta`` semantics. This release pattern establishes memory ordering for operations occurring in program order before this arrive instruction by making operations from the current thread visible to subsequent operations in other threads within the CTA. When other threads perform corresponding acquire operations (like 'mbarrier.test.wait'), they synchronize with this release pattern. This operation first performs an expect-tx operation with the specified transaction count, then performs an arrive-on operation with an implicit count of 1. The expect-tx operation increases the tx-count of the *mbarrier object* by the specified expectCount value, setting the current phase to expect and tracks the completion of additional asynchronous transactions. The operation takes the following operands: * ``addr``: A pointer to the memory location of the *mbarrier object*. Uses generic addressing, but the address must still be in the shared memory space. * ``txcount``: An unsigned integer specifying the expected transaction count for the expect-tx operation. This represents the number of asynchronous transactions expected to complete before the barrier phase completes. * ``predicate``: Optional predicate for conditional execution. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.mbarrier.arrive.expect_tx' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: addr() -> _ods_ir .. py:method:: txcount() -> _ods_ir .. py:method:: predicate() -> Optional[_ods_ir] .. py:function:: mbarrier_arrive_expect_tx(addr, txcount, *, predicate=None, loc=None, ip=None) -> MBarrierArriveExpectTxOp .. py:class:: MBarrierArriveNocompleteOp(res, addr, count, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` The ``nvvm.mbarrier.arrive.nocomplete`` operation performs an arrive-on operation on the *mbarrier object* with the guarantee that it will not cause the barrier to complete its current phase. Uses the default ``.release.cta`` semantics. This release pattern establishes memory ordering for operations occurring in program order before this arrive instruction by making operations from the current thread visible to subsequent operations in other threads within the CTA. When other threads perform corresponding acquire operations (like 'mbarrier.test.wait'), they synchronize with this release pattern. This operation causes the executing thread to signal its arrival at the barrier with a specified count, but ensures that the barrier phase will not complete as a result of this operation. The operation returns an opaque value that captures the phase of the *mbarrier object* prior to the arrive-on operation. 
The operation takes the following operands: * ``addr``: A pointer to the memory location of the *mbarrier object*. The ``addr`` must be a pointer to generic or shared::cta memory. When it is generic, the underlying address must be within the shared::cta memory space; otherwise the behavior is undefined. * ``count``: Integer specifying the count argument to the arrive-on operation. Must be in the valid range as specified in the *mbarrier object* contents. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.mbarrier.arrive.nocomplete' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: addr() -> _ods_ir .. py:method:: count() -> _ods_ir .. py:method:: res() -> _ods_ir .. py:function:: mbarrier_arrive_nocomplete(res, addr, count, *, loc=None, ip=None) -> _ods_ir .. py:class:: MBarrierArriveOp(res, addr, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` The ``nvvm.mbarrier.arrive`` operation performs an arrive-on operation on the *mbarrier object* at the specified address. Uses the default ``.release.cta`` semantics. This release pattern establishes memory ordering for operations occurring in program order before this arrive instruction by making operations from the current thread visible to subsequent operations in other threads within the CTA. When other threads perform corresponding acquire operations (like 'mbarrier.test.wait'), they synchronize with this release pattern. This operation causes the executing thread to signal its arrival at the barrier. The operation returns an opaque value that captures the phase of the *mbarrier object* prior to the arrive-on operation. The contents of this state value are implementation-specific. The operation takes the following operand: * ``addr``: A pointer to the memory location of the *mbarrier object*. The ``addr`` must be a pointer to generic or shared::cta memory. When it is generic, the underlying address must be within the shared::cta memory space; otherwise the behavior is undefined. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.mbarrier.arrive' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: addr() -> _ods_ir .. py:method:: res() -> _ods_ir .. py:function:: mbarrier_arrive(res, addr, *, loc=None, ip=None) -> _ods_ir .. py:class:: MBarrierInitOp(addr, count, *, predicate=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` The ``nvvm.mbarrier.init`` operation initializes an *mbarrier object* at the specified memory location. This operation initializes the *mbarrier object* with the following state: * Current phase: 0 * Expected arrival count: ``count`` * Pending arrival count: ``count`` * Transaction count (tx-count): 0 The operation takes the following operands: * ``addr``: A pointer to the memory location of the *mbarrier object*. The ``addr`` must be a pointer to generic or shared::cta memory. When it is generic, the underlying address must be within the shared::cta memory space; otherwise the behavior is undefined. * ``count``: Integer specifying the number of threads that will participate in barrier synchronization. Must be in the range [1, 2²⁰ - 1]. * ``predicate``: Optional predicate for conditional execution. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.mbarrier.init' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: addr() -> _ods_ir .. py:method:: count() -> _ods_ir .. py:method:: predicate() -> Optional[_ods_ir] .. 
py:function:: mbarrier_init(addr, count, *, predicate=None, loc=None, ip=None) -> MBarrierInitOp .. py:class:: MBarrierInvalOp(addr, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` The ``nvvm.mbarrier.inval`` operation invalidates an *mbarrier object* at the specified memory location. This operation marks the *mbarrier object* as invalid, making it safe to repurpose the memory location for other uses or to reinitialize it as a new *mbarrier object*. It is undefined behavior if the *mbarrier object* is already invalid. The operation takes the following operand: * ``addr``: A pointer to the memory location of the *mbarrier object*. The ``addr`` must be a pointer to generic or shared::cta memory. When it is generic, the underlying address must be within the shared::cta memory space; otherwise the behavior is undefined. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.mbarrier.inval' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: addr() -> _ods_ir .. py:function:: mbarrier_inval(addr, *, loc=None, ip=None) -> MBarrierInvalOp .. py:class:: MBarrierTestWaitOp(res, addr, state, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` The ``nvvm.mbarrier.test.wait`` operation performs a non-blocking test for the completion of a specific phase of an *mbarrier object*. It uses the default ``.acquire.cta`` semantics. This acquire pattern establishes memory ordering for operations occurring in program order after this wait instruction by making operations from other threads in the CTA visible to subsequent operations in the current thread. When this wait completes, it synchronizes with the corresponding release pattern from the ``mbarrier.arrive`` operation, establishing memory ordering within the CTA. This operation tests whether the mbarrier phase specified by the state operand has completed. It is a non-blocking instruction that immediately returns the completion status without suspending the executing thread. The operation takes the following operands: * ``addr``: A pointer to the memory location of the *mbarrier object*. Uses generic addressing, but the address must still be in the shared memory space. * ``state``: An opaque value returned by a previous ``mbarrier.arrive`` operation on the same *mbarrier object* during the current or immediately preceding phase. The operation returns a boolean value indicating whether the specified phase has completed: * ``true``: The immediately preceding phase has completed * ``false``: The phase is still incomplete (current phase) **Memory ordering guarantees**: When this wait returns true, the following ordering guarantees hold: #. All memory accesses (except async operations) requested prior to ``mbarrier.arrive`` having release semantics by participating CTA threads are visible to the executing thread. #. All ``cp.async`` operations requested prior to ``cp.async.mbarrier.arrive`` by participating CTA threads are visible to the executing thread. #. All ``cp.async.bulk`` operations using the same *mbarrier object* requested prior to ``mbarrier.arrive`` having release semantics by participating CTA threads are visible to the executing thread. #. Memory accesses requested after this wait are not visible to memory accesses performed prior to ``mbarrier.arrive`` by other participating threads. #. No ordering guarantee exists for memory accesses by the same thread between ``mbarrier.arrive`` and this wait. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.mbarrier.test.wait' .. 
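Putting the mbarrier pieces together, here is a hedged sketch of the typical lifecycle through the builders documented above (``nvvm`` is the assumed re-export of this generated module; ``barrier_ptr`` and ``count`` are values built by the caller, and ``i64``/``i1`` are assumed result types for the opaque state and the wait predicate):

.. code:: python

   from mlir.dialects import nvvm  # assumed re-export of this generated module

   def mbarrier_lifecycle(barrier_ptr, count, i64, i1):
       """Hypothetical helper: init, arrive, poll, then invalidate."""
       nvvm.mbarrier_init(barrier_ptr, count)
       # Each participant signals arrival; the opaque state captures the
       # phase prior to the arrive-on operation.
       state = nvvm.mbarrier_arrive(i64, barrier_ptr)
       # Non-blocking poll: true once the phase captured in `state` completes.
       done = nvvm.mbarrier_test_wait(i1, barrier_ptr, state)
       # Once the barrier memory is to be repurposed:
       nvvm.mbarrier_inval(barrier_ptr)
       return done

..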
py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: addr() -> _ods_ir .. py:method:: state() -> _ods_ir .. py:method:: res() -> _ods_ir .. py:function:: mbarrier_test_wait(res, addr, state, *, loc=None, ip=None) -> _ods_ir .. py:class:: MBarrierTryWaitParityOp(addr, phase, ticks, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` The ``nvvm.mbarrier.try_wait.parity`` operation performs a potentially-blocking test for the completion of a specific phase of an *mbarrier object* using phase parity. It uses the default ``.acquire.cta`` semantics. This acquire pattern establishes memory ordering for operations occurring in program order after this wait instruction by making operations from other threads in the CTA visible to subsequent operations in the current thread. When this wait completes, it synchronizes with the corresponding release pattern from the ``mbarrier.arrive`` operation, establishing memory ordering within the CTA. This operation waits for the completion of the mbarrier phase indicated by the phase parity. While it uses the underlying PTX ``mbarrier.try_wait.parity`` instruction, this MLIR operation generates a loop that enforces the test to complete before continuing execution, ensuring the barrier phase is actually completed rather than potentially timing out. The operation takes the following operands: * ``addr``: A pointer to the memory location of the *mbarrier object*. Uses generic addressing, but the address must still be in the shared memory space. * ``phase``: An integer specifying the phase parity (0 or 1). Even phases have parity 0, odd phases have parity 1. * ``ticks``: An unsigned integer specifying the suspend time hint in nanoseconds. This may be used instead of the system-dependent time limit. **Memory ordering guarantees**: When this wait returns true, the following ordering guarantees hold: #. All memory accesses (except async operations) requested prior to ``mbarrier.arrive`` having release semantics by participating CTA threads are visible to the executing thread. #. All ``cp.async`` operations requested prior to ``cp.async.mbarrier.arrive`` by participating CTA threads are visible to the executing thread. #. All ``cp.async.bulk`` operations using the same *mbarrier object* requested prior to ``mbarrier.arrive`` having release semantics by participating CTA threads are visible to the executing thread. #. Memory accesses requested after this wait are not visible to memory accesses performed prior to ``mbarrier.arrive`` by other participating threads. #. No ordering guarantee exists for memory accesses by the same thread between ``mbarrier.arrive`` and this wait. **Implementation behavior**: This operation generates a PTX loop that repeatedly calls the underlying ``mbarrier.try_wait.parity`` instruction until the barrier phase completes. Unlike the raw PTX instruction which may return without completion after a timeout, this MLIR operation guarantees completion by continuing to loop until the specified phase is reached. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.mbarrier.try_wait.parity' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: addr() -> _ods_ir .. py:method:: phase() -> _ods_ir .. py:method:: ticks() -> _ods_ir .. py:function:: mbarrier_try_wait_parity(addr, phase, ticks, *, loc=None, ip=None) -> MBarrierTryWaitParityOp .. py:class:: MapaOp(res, a, b, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.mapa' .. 
py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: a() -> _ods_ir .. py:method:: b() -> _ods_ir .. py:method:: res() -> _ods_ir .. py:function:: mapa(res, a, b, *, loc=None, ip=None) -> _ods_ir .. py:class:: MatchSyncOp(res, thread_mask, val, kind, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` The ``match.sync`` op performs broadcast and compare of operand ``val`` across all non-exited threads in ``thread_mask`` and returns a mask depending on the kind and an optional predicate. The matching operation kinds are: * ``any``: Returns a mask corresponding to the non-exited threads in the ``thread_mask`` that have the same value of operand ``val``. * ``all``: Returns a mask and a predicate. If all non-exited threads in the ``thread_mask`` have the same value of operand ``val``, the predicate is set to true and the mask corresponds to the non-exited threads in the ``thread_mask``. Otherwise, the predicate is set to false and the mask is 0. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.match.sync' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: thread_mask() -> _ods_ir .. py:method:: val() -> _ods_ir .. py:method:: kind() -> _ods_ir .. py:method:: res() -> _ods_ir .. py:function:: match_sync(res, thread_mask, val, kind, *, loc=None, ip=None) -> _ods_ir .. py:class:: MembarOp(scope, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` The ``membar`` operation guarantees that prior memory accesses requested by this thread are performed at the specified ``scope``, before later memory operations requested by this thread following the membar instruction. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.memory.barrier' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: scope() -> _ods_ir .. py:function:: memory_barrier(scope, *, loc=None, ip=None) -> MembarOp .. py:class:: MmaOp(res, shape, layoutA, layoutB, operandA, operandB, operandC, *, b1Op=None, intOverflowBehavior=None, multiplicandAPtxType=None, multiplicandBPtxType=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` The ``nvvm.mma.sync`` operation collectively performs the operation ``D = matmul(A, B) + C`` using all threads in a warp. All the threads in the warp must execute the same ``mma.sync`` operation. For each possible multiplicand PTX data type, there are one or more possible instruction shapes given as "mMnNkK". The table below describes the possibilities as well as the types required for the operands. Note that the data type for C (the accumulator) and D (the result) can vary independently when there are multiple possibilities in the "C/D Type" column. When an optional attribute cannot be immediately inferred from the types of the operands and the result during parsing or validation, an error will be raised. ``b1Op`` is only relevant when the binary (b1) type is given to ``multiplicandDataType``. It specifies how the multiply-and-accumulate is performed and is either ``xor_popc`` or ``and_popc``. The default is ``xor_popc``. ``intOverflowBehavior`` is only relevant when the ``multiplicandType`` attribute is one of ``u8, s8, u4, s4``; it describes how overflow is handled in the accumulator. When the attribute is ``satfinite``, the accumulator values are clamped in the int32 range on overflow. This is the default behavior. Alternatively, accumulator behavior ``wrapped`` can also be specified, in which case overflow wraps from one end of the range to the other.
``layoutA`` and ``layoutB`` are required and should generally be set to ``#nvvm.mma_layout<row>`` and ``#nvvm.mma_layout<col>`` respectively, but other combinations are possible for certain layouts according to the table below.

.. code::

   | A/B Type | Shape     | ALayout | BLayout | A Type   | B Type   | C/D Type          |
   |----------|-----------|---------|---------|----------|----------|-------------------|
   | f64      | .m8n8k4   | row     | col     | 1x f64   | 1x f64   | 2x f64            |
   | f16      | .m8n8k4   | row/col | row/col | 2x f16x2 | 2x f16x2 | 4x f16x2 or 8xf32 |
   |          | .m16n8k8  | row     | col     | 2x f16x2 | 1x f16x2 | 2x f16x2 or 4 f32 |
   |          | .m16n8k16 | row     | col     | 4x f16x2 | 2x f16x2 | 2x f16x2 or 4 f32 |
   | bf16     | .m16n8k8  | row     | col     | 2x i32   | 1x i32   | 4x f32            |
   |          | .m16n8k16 | row     | col     | 4x i32   | 2x i32   | 4x f32            |
   | tf32     | .m16n8k4  | row     | col     | 2x i32   | 1x i32   | 4x f32            |
   |          | .m16n8k8  | row     | col     | 4x i32   | 2x i32   | 2x f16x2 or 4 f32 |
   | u8/s8    | .m8n8k16  | row     | col     | 1x i32   | 1x i32   | 2x i32            |
   |          | .m16n8k16 | row     | col     | 2x i32   | 1x i32   | 4x i32            |
   |          | .m16n8k32 | row     | col     | 4x i32   | 2x i32   | 4x i32            |
   | u4/s4    | .m8n8k32  | row     | col     | 1x i32   | 1x i32   | 2x i32            |
   |          | m16n8k32  | row     | col     | 2x i32   | 1x i32   | 4x i32            |
   |          | m16n8k64  | row     | col     | 4x i32   | 2x i32   | 4x i32            |
   | b1       | m8n8k128  | row     | col     | 1x i32   | 1x i32   | 2x i32            |
   |          | m16n8k128 | row     | col     | 2x i32   | 1x i32   | 4x i32            |

Example:

.. code:: mlir

   %128 = nvvm.mma.sync A[%120, %121, %122, %123]
                        B[%124, %125]
                        C[%126, %127]
                        {layoutA = #nvvm.mma_layout<row>,
                         layoutB = #nvvm.mma_layout<col>,
                         shape = {k = 16 : i32, m = 16 : i32, n = 8 : i32}}
       : (vector<2xf16>, vector<2xf16>, vector<2xf16>)
         -> !llvm.struct<(vector<2xf16>, vector<2xf16>)>

.. py:attribute:: OPERATION_NAME :value: 'nvvm.mma.sync' .. py:attribute:: _ODS_OPERAND_SEGMENTS .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: operandA() -> _ods_ir .. py:method:: operandB() -> _ods_ir .. py:method:: operandC() -> _ods_ir .. py:method:: shape() -> _ods_ir .. py:method:: b1Op() -> Optional[_ods_ir] .. py:method:: intOverflowBehavior() -> Optional[_ods_ir] .. py:method:: layoutA() -> _ods_ir .. py:method:: layoutB() -> _ods_ir .. py:method:: multiplicandAPtxType() -> Optional[_ods_ir] .. py:method:: multiplicandBPtxType() -> Optional[_ods_ir] .. py:method:: res() -> _ods_ir .. py:function:: mma_sync(res, shape, layout_a, layout_b, operand_a, operand_b, operand_c, *, b1_op=None, int_overflow_behavior=None, multiplicand_a_ptx_type=None, multiplicand_b_ptx_type=None, loc=None, ip=None) -> _ods_ir .. py:class:: NanosleepOp(duration, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` The op suspends the thread for a sleep duration close to the delay ``$duration``, specified in nanoseconds. The sleep duration is approximated, but guaranteed to be in the interval [0, 2*t], where t is the requested duration. The maximum sleep duration is 1 millisecond. The implementation may reduce the sleep duration for individual threads within a warp such that all sleeping threads in the warp wake up together. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.nanosleep' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: duration() -> _ods_ir .. py:function:: nanosleep(duration, *, loc=None, ip=None) -> NanosleepOp .. py:class:: PMEventOp(*, maskedEventId=None, eventId=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` Triggers one or more of a fixed number of performance monitor events, with the event index or mask specified by an immediate operand.
Without ``mask`` it triggers a single performance monitor event indexed by immediate operand a, in the range 0..15. With ``mask`` it triggers one or more of the performance monitor events. Each bit in the 16-bit immediate operand controls an event. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.pmevent' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: maskedEventId() -> Optional[_ods_ir] .. py:method:: eventId() -> Optional[_ods_ir] .. py:function:: pmevent(*, masked_event_id=None, event_id=None, loc=None, ip=None) -> PMEventOp .. py:class:: PrefetchOp(addr, *, cacheLevel=None, evictPriority=None, predicate=None, tensormap=None, uniform=None, in_param_space=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` Prefetches the cache line containing the address given by ``addr``. The operand may be a global, local, or generic pointer. When ``tensormap`` is specified, the operand may instead be a constant or generic pointer. If the address maps to shared memory, the operation has no effect. At most one of ``cacheLevel`` or ``tensormap`` may be present. The ``cacheLevel`` attribute selects the target cache level. When combined with ``uniform``, the prefetch is performed to the uniform cache, in which case ``addr`` must be a generic pointer. When ``tensormap`` is used, the line containing ``addr`` is brought from the constant or parameter state space for later use by ``cp.async.bulk.tensor``. If ``in_param_space`` is specified, the generic pointer is interpreted as referring to the parameter state space. ``uniform`` can be specified after the ``cacheLevel`` to indicate that the prefetch is performed to the specified uniform cache level. If ``uniform`` is specified, ``addr`` must be a generic address pointer and no operation is performed if ``addr`` maps to a ``const``, ``local``, or ``shared`` memory location. The ``evictPriority`` attribute is optional and specifies the cache eviction priority when ``cacheLevel`` is L2. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.prefetch' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: addr() -> _ods_ir .. py:method:: predicate() -> Optional[_ods_ir] .. py:method:: cacheLevel() -> Optional[_ods_ir] .. py:method:: evictPriority() -> Optional[_ods_ir] .. py:method:: tensormap() -> bool .. py:method:: uniform() -> bool .. py:method:: in_param_space() -> bool .. py:function:: prefetch(addr, *, cache_level=None, evict_priority=None, predicate=None, tensormap=None, uniform=None, in_param_space=None, loc=None, ip=None) -> PrefetchOp .. py:class:: RcpApproxFtzF32Op(res, arg, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.rcp.approx.ftz.f' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: arg() -> _ods_ir .. py:method:: res() -> _ods_ir .. py:function:: rcp_approx_ftz_f(res, arg, *, loc=None, ip=None) -> _ods_ir .. py:class:: ReduxOp(res, val, kind, mask_and_clamp, *, abs=None, nan=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` ``redux.sync`` performs a reduction operation ``kind`` of the 32 bit source register across all non-exited threads in the membermask. The ``abs`` and ``nan`` attributes can be used in the case of f32 input type, where the ``abs`` attribute causes the absolute value of the input to be used in the reduction operation, and the ``nan`` attribute causes the reduction operation to return NaN if any of the inputs to participating threads are NaN. 
`For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.redux.sync' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: val() -> _ods_ir .. py:method:: mask_and_clamp() -> _ods_ir .. py:method:: kind() -> _ods_ir .. py:method:: abs() -> _ods_ir .. py:method:: nan() -> _ods_ir .. py:method:: res() -> _ods_ir .. py:function:: redux_sync(res, val, kind, mask_and_clamp, *, abs=None, nan=None, loc=None, ip=None) -> _ods_ir .. py:class:: SetMaxRegisterOp(regCount, action, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.setmaxregister' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: regCount() -> _ods_ir .. py:method:: action() -> _ods_ir .. py:function:: setmaxregister(reg_count, action, *, loc=None, ip=None) -> SetMaxRegisterOp .. py:class:: ShflOp(res, thread_mask, val, offset, mask_and_clamp, kind, *, return_value_and_is_valid=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` The ``shfl.sync`` Op implements data shuffle within threads of a warp. The ``thread_mask`` denotes the threads participating in the Op where the bit position corresponds to a particular thread's laneid. The ``offset`` specifies a source lane or source lane offset (depending on ``kind``). The ``val`` is the input value to be copied from the source. The ``mask_and_clamp`` contains two packed values specifying a mask for logically splitting warps into sub-segments and an upper bound for clamping the source lane index. The ``return_value_and_is_valid`` unit attribute can be specified to indicate that the return value is a two-element struct, where the first element is the result value and the second element is a predicate indicating if the computed source lane index is valid. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.shfl.sync' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: thread_mask() -> _ods_ir .. py:method:: val() -> _ods_ir .. py:method:: offset() -> _ods_ir .. py:method:: mask_and_clamp() -> _ods_ir .. py:method:: kind() -> _ods_ir .. py:method:: return_value_and_is_valid() -> bool .. py:method:: res() -> _ods_ir .. py:function:: shfl_sync(res, thread_mask, val, offset, mask_and_clamp, kind, *, return_value_and_is_valid=None, loc=None, ip=None) -> _ods_ir .. py:class:: SmDimOp(res, *, range=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.nsmid' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: range() -> Optional[_ods_ir] .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_nsmid(res, *, range=None, loc=None, ip=None) -> _ods_ir .. py:class:: SmIdOp(res, *, range=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.smid' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: range() -> Optional[_ods_ir] .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_smid(res, *, range=None, loc=None, ip=None) -> _ods_ir .. py:class:: StMatrixOp(ptr, sources, layout, shape, eltType, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` Collectively store one or more matrices across all threads in a warp to the location indicated by the address operand $ptr in shared memory. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.stmatrix' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: ptr() -> _ods_ir .. py:method:: sources() -> _ods_ir .. 
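For the ``shfl.sync`` op above, the two values packed into ``mask_and_clamp`` follow the layout of the PTX ``c`` operand (source-lane clamp in bits 0-4, warp-segment mask in bits 8-12); a small hypothetical helper makes the packing explicit:

.. code:: python

   def pack_mask_and_clamp(segmask, clamp):
       """Pack the warp-segment mask (bits 8-12) and the source-lane
       clamp (bits 0-4) into the single mask_and_clamp operand."""
       assert 0 <= segmask < 32 and 0 <= clamp < 32
       return (segmask << 8) | clamp

   # Whole-warp shuffle with the usual clamp of 31 (0x1f).
   assert pack_mask_and_clamp(0, 31) == 0x1F

..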
py:method:: layout() -> _ods_ir .. py:method:: shape() -> _ods_ir .. py:method:: eltType() -> _ods_ir .. py:function:: stmatrix(ptr, sources, layout, shape, elt_type, *, loc=None, ip=None) -> StMatrixOp .. py:class:: SyncWarpOp(mask, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` The ``nvvm.bar.warp.sync`` operation performs barrier synchronization for threads within a warp. This operation causes the executing thread to wait until all threads corresponding to the ``mask`` operand have executed a ``bar.warp.sync`` with the same mask value before resuming execution. The ``mask`` operand specifies the threads participating in the barrier, where each bit position corresponds to the thread's lane ID within the warp. Only threads with their corresponding bit set in the mask participate in the barrier synchronization. **Important constraints**: * The behavior is undefined if the executing thread is not included in the mask (i.e., the bit corresponding to the thread's lane ID is not set) * For compute capability sm_6x or below, all threads in the mask must execute the same ``bar.warp.sync`` instruction in convergence This operation also guarantees memory ordering among participating threads. Threads within the warp that wish to communicate via memory can store to memory, execute ``bar.warp.sync``, and then safely read values stored by other threads in the warp. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.bar.warp.sync' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: mask() -> _ods_ir .. py:function:: bar_warp_sync(mask, *, loc=None, ip=None) -> SyncWarpOp .. py:class:: Tcgen05AllocOp(addr, nCols, *, group=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` The ``tcgen05.alloc`` Op allocates tensor core memory for the amount specified by ``nCols`` and writes the destination address to the ``addr`` argument. The ``nCols`` operand specifies the number of columns to be allocated and it must be a power-of-two. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.tcgen05.alloc' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: addr() -> _ods_ir .. py:method:: nCols() -> _ods_ir .. py:method:: group() -> _ods_ir .. py:function:: tcgen05_alloc(addr, n_cols, *, group=None, loc=None, ip=None) -> Tcgen05AllocOp .. py:class:: Tcgen05CommitOp(addr, *, multicastMask=None, group=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` The ``tcgen05.commit`` makes the *mbarrier object*, specified by the operand ``addr``, track the completion of all the prior async-tcgen05 operations initiated by the executing thread. The multicast variants allow signaling on the *mbarrier objects* of multiple CTAs within the cluster. Operand ``multicastMask``, when present, specifies the destination CTAs in the cluster such that each bit position in the 16-bit ``multicastMask`` operand corresponds to the ``nvvm.read.ptx.sreg.ctaid`` of the destination CTA. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.tcgen05.commit' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: addr() -> _ods_ir .. py:method:: multicastMask() -> Optional[_ods_ir] .. py:method:: group() -> _ods_ir .. py:function:: tcgen05_commit(addr, *, multicast_mask=None, group=None, loc=None, ip=None) -> Tcgen05CommitOp .. 
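A hedged sketch of the allocate/commit/release flow through these builders (``nvvm`` is the assumed re-export of this generated module; all arguments are values built by the caller, ``mask`` uses the bit-per-ctaid encoding described above, and ``tcgen05_dealloc`` is documented just below):

.. code:: python

   from mlir.dialects import nvvm  # assumed re-export of this generated module

   def tcgen05_alloc_commit_dealloc(addr_ptr, taddr, ncols, mbar, mask):
       """Hypothetical helper: allocate tensor memory, let an mbarrier
       track the async tcgen05 ops, then release the allocation."""
       nvvm.tcgen05_alloc(addr_ptr, ncols)  # ncols must be a power of two
       # ... async tcgen05 operations would be issued here ...
       nvvm.tcgen05_commit(mbar, multicast_mask=mask)
       nvvm.tcgen05_dealloc(taddr, ncols)

..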
py:class:: Tcgen05CpOp(shape, taddr, smem_desc, *, group=None, multicast=None, srcFormat=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` Instruction ``tcgen05.cp`` initiates an asynchronous copy operation from shared memory to the location specified by the address operand ``taddr`` in the Tensor Memory. The 64-bit register operand ``smem_desc`` specifies the matrix descriptor representing the source matrix in the shared memory that needs to be copied. Example:

.. code:: mlir

   nvvm.tcgen05.cp %taddr, %smem_desc {
     group = #nvvm.tcgen05_group,
     shape = #nvvm.tcgen05_cp_shape,
     multicast = #nvvm.tcgen05_cp_multicast,
     srcFormat = #nvvm.tcgen05_cp_src_fmt
   }

`For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.tcgen05.cp' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: taddr() -> _ods_ir .. py:method:: smem_desc() -> _ods_ir .. py:method:: shape() -> _ods_ir .. py:method:: group() -> _ods_ir .. py:method:: multicast() -> _ods_ir .. py:method:: srcFormat() -> Optional[_ods_ir] .. py:function:: tcgen05_cp(shape, taddr, smem_desc, *, group=None, multicast=None, src_format=None, loc=None, ip=None) -> Tcgen05CpOp .. py:class:: Tcgen05DeallocOp(taddr, nCols, *, group=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` The ``tcgen05.dealloc`` Op de-allocates the tensor core memory specified by ``tmemAddr``, which must be from a previous tensor memory allocation. The ``nCols`` operand specifies the number of columns to be de-allocated, and it must be a power-of-two. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.tcgen05.dealloc' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: taddr() -> _ods_ir .. py:method:: nCols() -> _ods_ir .. py:method:: group() -> _ods_ir .. py:function:: tcgen05_dealloc(taddr, n_cols, *, group=None, loc=None, ip=None) -> Tcgen05DeallocOp .. py:class:: Tcgen05FenceOp(kind, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` The ``tcgen05.fence<before>`` orders all prior async tcgen05 operations with respect to the subsequent tcgen05 and execution ordering operations. The ``tcgen05.fence<after>`` orders all subsequent async tcgen05 operations with respect to the prior tcgen05 and execution ordering operations. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.tcgen05.fence' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: kind() -> _ods_ir .. py:function:: tcgen05_fence(kind, *, loc=None, ip=None) -> Tcgen05FenceOp .. py:class:: Tcgen05LdOp(res, shape, tmemAddr, *, pack=None, offset=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` Instruction ``tcgen05.ld`` asynchronously loads data from the Tensor Memory at the location specified by the 32-bit address operand ``tmemAddr`` into the destination register ``res``, collectively across all threads of the warps. The ``shape`` and the ``num`` attributes together determine the total dimension of the data which is loaded from the Tensor Memory. The ``shape`` attribute indicates the base dimension of data to be accessed as described in the Data Movement Shape. The ``num`` attribute indicates the repeat factor on the base dimension resulting in the total dimension of the data that is accessed. The shape ``16x32bx2`` performs two accesses into Tensor Memory of the shape ``16x32b``. The base address of the first access is specified by ``tmemAddr`` and the base address of the second access is specified by ``tmemAddr + offset``, where ``offset`` is an immediate argument.
.. py:class:: Tcgen05LdOp(res, shape, tmemAddr, *, pack=None, offset=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   Instruction ``tcgen05.ld`` asynchronously loads data from the Tensor Memory
   at the location specified by the 32-bit address operand ``tmemAddr`` into
   the destination register ``res``, collectively across all threads of the
   warps.

   The ``shape`` and the ``num`` attribute together determine the total
   dimension of the data which is loaded from the Tensor Memory. The ``shape``
   attribute indicates the base dimension of data to be accessed as described
   in the Data Movement Shape. The ``num`` attribute indicates the repeat
   factor on the base dimension, resulting in the total dimension of the data
   that is accessed.

   The shape ``16x32bx2`` performs two accesses into Tensor Memory of the shape
   ``16x32b``. The base address of the first access is specified by
   ``tmemAddr`` and the base address of the second access is specified by
   ``tmemAddr + offset``, where ``offset`` is an immediate argument.

   The unit attribute ``pack`` can be used to pack two 16-bit elements from
   adjacent columns into a single 32-bit element during the load.

   The following table describes the size of the vector for various
   combinations of ``num`` and ``shape`` attributes:

   .. code::

      |=====================================================================|
      | num/shape | 16x32bx2/16x64b/32x32b |      16x128b     |   16x256b   |
      |=====================================================================|
      | x1        |           1            |         2        |      4      |
      | x2        |           2            |         4        |      8      |
      | x4        |           4            |         8        |     16      |
      | x8        |           8            |        16        |     32      |
      | x16       |          16            |        32        |     64      |
      | x32       |          32            |        64        |    128      |
      | x64       |          64            |       128        |     NA      |
      | x128      |         128            |        NA        |     NA      |
      |=====================================================================|

   Example:

   .. code:: mlir

      nvvm.tcgen05.ld %tmemAddr, %offset pack {
        shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>,
      } : <2xi32>

   For more information, see the PTX ISA.

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.tcgen05.ld'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: tmemAddr() -> _ods_ir

   .. py:method:: offset() -> Optional[_ods_ir]

   .. py:method:: pack() -> bool

   .. py:method:: shape() -> _ods_ir

   .. py:method:: res() -> _ods_ir

.. py:function:: tcgen05_ld(res, shape, tmem_addr, *, pack=None, offset=None, loc=None, ip=None) -> _ods_ir

.. py:class:: Tcgen05MmaSmemDescOp(res, startAddr, leadingDimOffset, strideDimOffset, baseOffset, leadingDimMode, swizzleMode, *, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   The ``nvvm.tcgen05_mma_smem_desc`` constructs a Shared Memory descriptor for
   ``tcgen05.mma``. This descriptor is a 64-bit value which describes the
   properties of the multiplicand matrix in shared memory, including its
   location in the shared memory of the current CTA.

   .. code::

      +-----------+------+------------------------------------------------------+
      | Bit-field | Size | Description                                          |
      +-----------+------+------------------------------------------------------+
      | 0-13      | 14   | Matrix start address                                 |
      | 14-15     | 2    | Reserved                                             |
      | 16-29     | 14   | Leading dim relative-offset (or) absolute-address    |
      | 30-31     | 2    | Reserved                                             |
      | 32-45     | 14   | Stride dimension byte offset                         |
      | 46-48     | 3    | Fixed constant value of 0b001                        |
      | 49-51     | 3    | Matrix base offset                                   |
      | 52        | 1    | Leading dimension stride mode:                       |
      |           |      |   0: byte offset relative                            |
      |           |      |   1: byte address absolute                           |
      | 53-60     | 8    | Fixed constant value of 0b00000000                   |
      | 61-63     | 3    | Swizzling mode:                                      |
      |           |      |   0: No swizzling                                    |
      |           |      |   1: 128-Byte with 32B atomic swizzling              |
      |           |      |   2: 128-Byte swizzling                              |
      |           |      |   4: 64-Byte swizzling                               |
      |           |      |   6: 32-Byte swizzling                               |
      |           |      |   (Values 3, 5 and 7 are invalid)                    |
      +-----------+------+------------------------------------------------------+

   Example:

   .. code:: mlir

      %desc = nvvm.tcgen05.mma_smem_desc (%startAddr, %leadingDimOffset,
                %strideDimOffset, %baseOffset, %leadingDimMode, %swizzleMode)
                : (i32, i32, i32, i8, i1, i8) -> i64

   For more information, see the PTX ISA.

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.tcgen05.mma_smem_desc'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: startAddr() -> _ods_ir

   .. py:method:: leadingDimOffset() -> _ods_ir

   .. py:method:: strideDimOffset() -> _ods_ir

   .. py:method:: baseOffset() -> _ods_ir

   .. py:method:: leadingDimMode() -> _ods_ir

   .. py:method:: swizzleMode() -> _ods_ir

   .. py:method:: res() -> _ods_ir

.. py:function:: tcgen05_mma_smem_desc(res, start_addr, leading_dim_offset, stride_dim_offset, base_offset, leading_dim_mode, swizzle_mode, *, loc=None, ip=None) -> _ods_ir
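The bit layout above maps directly onto integer shifts and masks. The
following host-side sketch packs the fields of the 64-bit descriptor exactly
as the table describes; the helper name ``pack_smem_desc`` is hypothetical and
independent of the ``tcgen05_mma_smem_desc`` builder, which emits the
equivalent computation as IR.

.. code:: python

   def pack_smem_desc(start_addr, lead_off, stride_off, base_off,
                      lead_mode, swizzle_mode):
       """Pack the tcgen05.mma shared-memory descriptor bit-fields."""
       assert swizzle_mode in (0, 1, 2, 4, 6), "values 3, 5 and 7 are invalid"
       desc = start_addr & 0x3FFF            # bits 0-13: matrix start address
       desc |= (lead_off & 0x3FFF) << 16     # bits 16-29: leading dim offset
       desc |= (stride_off & 0x3FFF) << 32   # bits 32-45: stride dim offset
       desc |= 0b001 << 46                   # bits 46-48: fixed constant
       desc |= (base_off & 0x7) << 49        # bits 49-51: matrix base offset
       desc |= (lead_mode & 0x1) << 52       # bit 52: leading dim stride mode
       desc |= (swizzle_mode & 0x7) << 61    # bits 61-63: swizzling mode
       return desc

   # 128-Byte swizzling, byte-offset-relative leading dimension.
   print(hex(pack_smem_desc(0x100, 0x10, 0x20, 1, 0, 2)))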
.. py:class:: Tcgen05RelinquishAllocPermitOp(*, group=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   The ``tcgen05.relinquish_alloc_permit`` Op specifies that the CTA of the
   executing thread is relinquishing the right to allocate Tensor Memory. It is
   therefore illegal for a CTA to perform ``tcgen05.alloc`` after any of its
   constituent threads execute ``tcgen05.relinquish_alloc_permit``.

   For more information, see the PTX ISA.

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.tcgen05.relinquish_alloc_permit'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: group() -> _ods_ir

.. py:function:: tcgen05_relinquish_alloc_permit(*, group=None, loc=None, ip=None) -> Tcgen05RelinquishAllocPermitOp

.. py:class:: Tcgen05ShiftOp(taddr, *, group=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   The ``tcgen05.shift`` is an asynchronous instruction which initiates the
   shifting of 32-byte elements downwards across all the rows, except the last,
   by one row. The operand ``taddr`` specifies the base address of the matrix
   in Tensor Memory whose rows must be down-shifted.

   For more information, see the PTX ISA.

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.tcgen05.shift'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: taddr() -> _ods_ir

   .. py:method:: group() -> _ods_ir

.. py:function:: tcgen05_shift(taddr, *, group=None, loc=None, ip=None) -> Tcgen05ShiftOp

.. py:class:: Tcgen05StOp(shape, tmemAddr, val, *, unpack=None, offset=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   Instruction ``tcgen05.st`` asynchronously stores data from the source
   register ``val`` into the Tensor Memory at the location specified by the
   32-bit address operand ``tmemAddr``, collectively across all threads of the
   warps.

   The ``shape`` and the ``num`` attribute together determine the total
   dimension of the data which is stored to the Tensor Memory. The ``shape``
   indicates the base dimension of data to be accessed. The ``num`` attribute
   indicates the repeat factor on the base dimension, resulting in the total
   dimension of the data that is accessed.

   The shape ``16x32bx2`` performs two accesses into Tensor Memory of the shape
   ``16x32b``. The base address of the first access is specified by
   ``tmemAddr`` and the base address of the second access is specified by
   ``tmemAddr + offset``, where ``offset`` is an immediate argument.

   The unit attribute ``unpack`` can be used to unpack a 32-bit element in the
   register into two 16-bit elements and store them in adjacent columns.

   The following table describes the size of the vector for various
   combinations of ``num`` and ``shape`` attributes:

   .. code::

      |=====================================================================|
      | num/shape | 16x32bx2/16x64b/32x32b |      16x128b     |   16x256b   |
      |=====================================================================|
      | x1        |           1            |         2        |      4      |
      | x2        |           2            |         4        |      8      |
      | x4        |           4            |         8        |     16      |
      | x8        |           8            |        16        |     32      |
      | x16       |          16            |        32        |     64      |
      | x32       |          32            |        64        |    128      |
      | x64       |          64            |       128        |     NA      |
      | x128      |         128            |        NA        |     NA      |
      |=====================================================================|

   Example:

   .. code:: mlir

      nvvm.tcgen05.st %tmemAddr, %val, %offset unpack {
        shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>,
      } : <2xi32>

   For more information, see the PTX ISA.

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.tcgen05.st'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: tmemAddr() -> _ods_ir

   .. py:method:: val() -> _ods_ir

   .. py:method:: offset() -> Optional[_ods_ir]

   .. py:method:: unpack() -> bool

   .. py:method:: shape() -> _ods_ir

.. py:function:: tcgen05_st(shape, tmem_addr, val, *, unpack=None, offset=None, loc=None, ip=None) -> Tcgen05StOp
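The num/shape table above is the same for ``tcgen05.ld`` and ``tcgen05.st``.
The sketch below encodes it as a lookup, useful for computing the expected
register-vector length on the host; the helper name ``tcgen05_vector_length``
is hypothetical.

.. code:: python

   _BASE = {"16x32bx2": 1, "16x64b": 1, "32x32b": 1, "16x128b": 2, "16x256b": 4}

   def tcgen05_vector_length(shape, num):
       """Vector length of a tcgen05.ld/st access, per the table above."""
       length = _BASE[shape] * num
       if length > 128:
           raise ValueError("unsupported combination (NA in the table)")
       return length

   assert tcgen05_vector_length("16x256b", 4) == 16
   assert tcgen05_vector_length("32x32b", 128) == 128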
.. py:class:: Tcgen05WaitOp(kind, *, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   The ``tcgen05.wait<ld>`` causes the executing thread to block until all
   prior ``tcgen05.ld`` operations issued by the executing thread have
   completed. Similarly, the ``tcgen05.wait<st>`` causes the executing thread
   to block until all prior ``tcgen05.st`` operations issued by the executing
   thread have completed.

   For more information, see the PTX ISA.

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.tcgen05.wait'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: kind() -> _ods_ir

.. py:function:: tcgen05_wait(kind, *, loc=None, ip=None) -> Tcgen05WaitOp

.. py:class:: ThreadIdXOp(res, *, range=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.read.ptx.sreg.tid.x'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: range() -> Optional[_ods_ir]

   .. py:method:: res() -> _ods_ir

.. py:function:: read_ptx_sreg_tid_x(res, *, range=None, loc=None, ip=None) -> _ods_ir

.. py:class:: ThreadIdYOp(res, *, range=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.read.ptx.sreg.tid.y'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: range() -> Optional[_ods_ir]

   .. py:method:: res() -> _ods_ir

.. py:function:: read_ptx_sreg_tid_y(res, *, range=None, loc=None, ip=None) -> _ods_ir

.. py:class:: ThreadIdZOp(res, *, range=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.read.ptx.sreg.tid.z'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: range() -> Optional[_ods_ir]

   .. py:method:: res() -> _ods_ir

.. py:function:: read_ptx_sreg_tid_z(res, *, range=None, loc=None, ip=None) -> _ods_ir

.. py:class:: VoteSyncOp(res, mask, pred, kind, *, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   The ``vote.sync`` op causes the executing thread to wait until all
   non-exited threads corresponding to ``membermask`` have executed
   ``vote.sync`` with the same qualifiers and the same ``membermask`` value
   before resuming execution.

   The vote operation kinds are:

   * ``any``: True if the source predicate is True for some thread in
     ``membermask``.
   * ``all``: True if the source predicate is True for all non-exited threads
     in ``membermask``.
   * ``uni``: True if the source predicate has the same value in all non-exited
     threads in ``membermask``.
   * ``ballot``: in the ballot form, the destination result is a 32-bit
     integer. The predicate from each thread in ``membermask`` is copied into
     the corresponding bit position of the result, where the bit position
     corresponds to the thread's lane ID.

   For more information, see the PTX ISA.

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.vote.sync'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: mask() -> _ods_ir

   .. py:method:: pred() -> _ods_ir

   .. py:method:: kind() -> _ods_ir

   .. py:method:: res() -> _ods_ir

.. py:function:: vote_sync(res, mask, pred, kind, *, loc=None, ip=None) -> _ods_ir
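The four vote kinds reduce a per-lane predicate in different ways. The sketch
below simulates them in plain Python over a mapping of lane ID to predicate
for the participating lanes; it is illustrative only and unrelated to the IR
builders.

.. code:: python

   def vote_sync_sim(kind, lane_preds):
       """Simulate nvvm.vote.sync over {lane_id: bool} for active lanes."""
       values = list(lane_preds.values())
       if kind == "any":
           return any(values)
       if kind == "all":
           return all(values)
       if kind == "uni":
           return len(set(values)) == 1
       if kind == "ballot":
           # Bit i of the 32-bit result holds lane i's predicate.
           return sum(1 << lane for lane, p in lane_preds.items() if p)
       raise ValueError(kind)

   lanes = {0: True, 1: False, 5: True}
   assert vote_sync_sim("any", lanes) is True
   assert vote_sync_sim("ballot", lanes) == 0b100001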
.. py:class:: WMMALoadOp(res, ptr, stride, m, n, k, layout, eltype, frag, *, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.wmma.load'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: ptr() -> _ods_ir

   .. py:method:: stride() -> _ods_ir

   .. py:method:: m() -> _ods_ir

   .. py:method:: n() -> _ods_ir

   .. py:method:: k() -> _ods_ir

   .. py:method:: layout() -> _ods_ir

   .. py:method:: eltype() -> _ods_ir

   .. py:method:: frag() -> _ods_ir

   .. py:method:: res() -> _ods_ir

.. py:function:: wmma_load(res, ptr, stride, m, n, k, layout, eltype, frag, *, loc=None, ip=None) -> _ods_ir

.. py:class:: WMMAMmaOp(res, m, n, k, layoutA, layoutB, eltypeA, eltypeB, args, *, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.wmma.mma'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: args() -> _ods_ir

   .. py:method:: m() -> _ods_ir

   .. py:method:: n() -> _ods_ir

   .. py:method:: k() -> _ods_ir

   .. py:method:: layoutA() -> _ods_ir

   .. py:method:: layoutB() -> _ods_ir

   .. py:method:: eltypeA() -> _ods_ir

   .. py:method:: eltypeB() -> _ods_ir

   .. py:method:: res() -> _ods_ir

.. py:function:: wmma_mma(res, m, n, k, layout_a, layout_b, eltype_a, eltype_b, args, *, loc=None, ip=None) -> _ods_ir

.. py:class:: WMMAStoreOp(ptr, m, n, k, layout, eltype, args, stride, *, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.wmma.store'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: ptr() -> _ods_ir

   .. py:method:: args() -> _ods_ir

   .. py:method:: stride() -> _ods_ir

   .. py:method:: m() -> _ods_ir

   .. py:method:: n() -> _ods_ir

   .. py:method:: k() -> _ods_ir

   .. py:method:: layout() -> _ods_ir

   .. py:method:: eltype() -> _ods_ir

.. py:function:: wmma_store(ptr, m, n, k, layout, eltype, args, stride, *, loc=None, ip=None) -> WMMAStoreOp

.. py:class:: WarpDimOp(res, *, range=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.read.ptx.sreg.nwarpid'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: range() -> Optional[_ods_ir]

   .. py:method:: res() -> _ods_ir

.. py:function:: read_ptx_sreg_nwarpid(res, *, range=None, loc=None, ip=None) -> _ods_ir

.. py:class:: WarpIdOp(res, *, range=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.read.ptx.sreg.warpid'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: range() -> Optional[_ods_ir]

   .. py:method:: res() -> _ods_ir

.. py:function:: read_ptx_sreg_warpid(res, *, range=None, loc=None, ip=None) -> _ods_ir

.. py:class:: WarpSizeOp(res, *, range=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.read.ptx.sreg.warpsize'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: range() -> Optional[_ods_ir]

   .. py:method:: res() -> _ods_ir

.. py:function:: read_ptx_sreg_warpsize(res, *, range=None, loc=None, ip=None) -> _ods_ir

.. py:class:: WgmmaFenceAlignedOp(*, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   Enforce an ordering of register accesses between warpgroup level matrix
   multiplication and other operations.

   For more information, see the PTX ISA.

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.wgmma.fence.aligned'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

.. py:function:: wgmma_fence_aligned(*, loc=None, ip=None) -> WgmmaFenceAlignedOp

.. py:class:: WgmmaGroupSyncAlignedOp(*, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   Commits all prior uncommitted warpgroup level matrix multiplication
   operations.

   For more information, see the PTX ISA.

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.wgmma.commit.group.sync.aligned'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

.. py:function:: wgmma_commit_group_sync_aligned(*, loc=None, ip=None) -> WgmmaGroupSyncAlignedOp
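A minimal IR-building sketch using the special-register builders above,
assuming the upstream MLIR Python bindings; the function name and the choice
of an ``i32`` result type are illustrative.

.. code:: python

   from mlir import ir
   from mlir.dialects import func, nvvm

   with ir.Context(), ir.Location.unknown():
       module = ir.Module.create()
       with ir.InsertionPoint(module.body):
           i32 = ir.IntegerType.get_signless(32)

           @func.FuncOp.from_py_func()
           def warp_info():
               tid = nvvm.read_ptx_sreg_tid_x(i32)    # thread id within block
               warp = nvvm.read_ptx_sreg_warpid(i32)  # warp id within the CTA
               nvvm.read_ptx_sreg_warpsize(i32)       # warp size, typically 32
               return tid

       print(module)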
.. py:class:: WgmmaMmaAsyncOp(results_, inouts, descriptorA, descriptorB, shape, typeA, typeB, typeD, scaleD, scaleA, scaleB, layoutA, layoutB, *, satfinite=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   The warpgroup (128 threads) level matrix multiply and accumulate operation
   has either of the following forms, where matrix D is called the accumulator:

   * ``D = A * B + D``
   * ``D = A * B``, where the input from the accumulator D is disabled.

   Supported shapes:

   .. code::

      |--------------|--------------|------------|--------------|---------------|
      |              |              |            |              |f16+=e4m3*e4m3 |
      |              |              |            |              |f16+=e5m2*e5m2 |
      |f32+=tf32*tf32|f16+=f16 *f16 | s32+=s8*s8 |s32 += b1 * b1|f16+=e5m2*e4m3 |
      |              |f32+=f16 *f16 | s32+=u8*u8 |              |f16+=e4m3*e5m2 |
      |              |f32+=bf16*bf16| s32+=s8*u8 |              |f32+=e4m3*e4m3 |
      |              |              | s32+=u8*s8 |              |f32+=e5m2*e5m2 |
      |              |              |            |              |f32+=e5m2*e4m3 |
      |              |              |            |              |f32+=e4m3*e5m2 |
      |--------------|--------------|------------|--------------|---------------|
      | .m64n8k8     | .m64n8k16    | .m64n8k32  | .m64n8k256   | .m64n8k32     |
      | .m64n16k8    | .m64n16k16   | .m64n16k32 | .m64n16k256  | .m64n16k32    |
      | .m64n24k8    | .m64n24k16   | .m64n24k32 | .m64n24k256  | .m64n24k32    |
      | .m64n32k8    | .m64n32k16   | .m64n32k32 | .m64n32k256  | .m64n32k32    |
      | .m64n40k8    | .m64n40k16   | .m64n48k32 | .m64n48k256  | .m64n40k32    |
      | .m64n48k8    | .m64n48k16   | .m64n64k32 | .m64n64k256  | .m64n48k32    |
      | .m64n56k8    | .m64n56k16   | .m64n80k32 | .m64n80k256  | .m64n56k32    |
      | .m64n64k8    | .m64n64k16   | .m64n96k32 | .m64n96k256  | .m64n64k32    |
      | .m64n72k8    | .m64n72k16   | .m64n112k32| .m64n112k256 | .m64n72k32    |
      | .m64n80k8    | .m64n80k16   | .m64n128k32| .m64n128k256 | .m64n80k32    |
      | .m64n88k8    | .m64n88k16   | .m64n144k32| .m64n144k256 | .m64n88k32    |
      | .m64n96k8    | .m64n96k16   | .m64n160k32| .m64n160k256 | .m64n96k32    |
      | .m64n104k8   | .m64n104k16  | .m64n176k32| .m64n176k256 | .m64n104k32   |
      | .m64n112k8   | .m64n112k16  | .m64n192k32| .m64n192k256 | .m64n112k32   |
      | .m64n120k8   | .m64n120k16  | .m64n208k32| .m64n208k256 | .m64n120k32   |
      | .m64n128k8   | .m64n128k16  | .m64n224k32| .m64n224k256 | .m64n128k32   |
      | .m64n136k8   | .m64n136k16  | .m64n240k32| .m64n240k256 | .m64n136k32   |
      | .m64n144k8   | .m64n144k16  | .m64n256k32| .m64n256k256 | .m64n144k32   |
      | .m64n152k8   | .m64n152k16  |            |              | .m64n152k32   |
      | .m64n160k8   | .m64n160k16  |            |              | .m64n160k32   |
      | .m64n168k8   | .m64n168k16  |            |              | .m64n168k32   |
      | .m64n176k8   | .m64n176k16  |            |              | .m64n176k32   |
      | .m64n184k8   | .m64n184k16  |            |              | .m64n184k32   |
      | .m64n192k8   | .m64n192k16  |            |              | .m64n192k32   |
      | .m64n200k8   | .m64n200k16  |            |              | .m64n200k32   |
      | .m64n208k8   | .m64n208k16  |            |              | .m64n208k32   |
      | .m64n216k8   | .m64n216k16  |            |              | .m64n216k32   |
      | .m64n224k8   | .m64n224k16  |            |              | .m64n224k32   |
      | .m64n232k8   | .m64n232k16  |            |              | .m64n232k32   |
      | .m64n240k8   | .m64n240k16  |            |              | .m64n240k32   |
      | .m64n248k8   | .m64n248k16  |            |              | .m64n248k32   |
      | .m64n256k8   | .m64n256k16  |            |              | .m64n256k32   |
      |--------------|--------------|------------|--------------|---------------|

   For more information, see the PTX ISA.

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.wgmma.mma_async'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: inouts() -> _ods_ir

   .. py:method:: descriptorA() -> _ods_ir

   .. py:method:: descriptorB() -> _ods_ir

   .. py:method:: shape() -> _ods_ir

   .. py:method:: typeA() -> _ods_ir

   .. py:method:: typeB() -> _ods_ir

   .. py:method:: typeD() -> _ods_ir

   .. py:method:: scaleD() -> _ods_ir

   .. py:method:: scaleA() -> _ods_ir

   .. py:method:: scaleB() -> _ods_ir

   .. py:method:: layoutA() -> _ods_ir

   .. py:method:: layoutB() -> _ods_ir

   .. py:method:: satfinite() -> Optional[_ods_ir]

   .. py:method:: results_() -> _ods_ir

.. py:function:: wgmma_mma_async(results_, inouts, descriptor_a, descriptor_b, shape, type_a, type_b, type_d, scale_d, scale_a, scale_b, layout_a, layout_b, *, satfinite=None, loc=None, ip=None) -> _ods_ir
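The shape table compresses a simple rule set: M is always 64, K is fixed by
the element type, and the allowed N values differ between the integer/``b1``
columns and the floating-point columns. A hedged sketch of that rule set in
plain Python; the helper name ``is_supported_wgmma_shape`` is hypothetical.

.. code:: python

   _K_FOR_TYPE = {"tf32": 8, "f16": 16, "bf16": 16,
                  "s8": 32, "u8": 32, "b1": 256,
                  "e4m3": 32, "e5m2": 32}
   # Integer and b1 shapes only allow these N values (see the table above).
   _INT_N = {8, 16, 24, 32} | set(range(48, 257, 16))

   def is_supported_wgmma_shape(m, n, k, elt_type):
       """Check an .mMnNkK shape against the supported-shapes table."""
       if m != 64 or k != _K_FOR_TYPE[elt_type]:
           return False
       if elt_type in ("s8", "u8", "b1"):
           return n in _INT_N
       return 8 <= n <= 256 and n % 8 == 0

   assert is_supported_wgmma_shape(64, 40, 16, "f16")
   assert not is_supported_wgmma_shape(64, 40, 32, "s8")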
.. py:class:: WgmmaWaitGroupSyncOp(group, *, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   Signal the completion of a preceding warpgroup operation.

   For more information, see the PTX ISA.

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.wgmma.wait.group.sync.aligned'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: group() -> _ods_ir

.. py:function:: wgmma_wait_group_sync_aligned(group, *, loc=None, ip=None) -> WgmmaWaitGroupSyncOp
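The three aligned wgmma ops are typically emitted as a fence / commit / wait
bracket around the ``wgmma.mma_async`` ops themselves. A minimal sketch of
that sequencing with the Python builders, assuming an insertion point and
location are active; the ``mma_async`` emission is elided because its operand
list is extensive, and passing the pending-group count as a plain int is
assumed to be accepted by the generated builder.

.. code:: python

   from mlir.dialects import nvvm

   def emit_wgmma_bracket():
       # Order register accesses before the warpgroup MMA.
       nvvm.wgmma_fence_aligned()
       # ... emit nvvm.wgmma_mma_async ops here ...
       # Commit all prior uncommitted wgmma ops into a group.
       nvvm.wgmma_commit_group_sync_aligned()
       # Block until at most 0 committed groups remain pending.
       nvvm.wgmma_wait_group_sync_aligned(0)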