mlir.dialects._nvvm_ops_gen =========================== .. py:module:: mlir.dialects._nvvm_ops_gen Attributes ---------- .. autoapisummary:: mlir.dialects._nvvm_ops_gen._ods_ir Classes ------- .. autoapisummary:: mlir.dialects._nvvm_ops_gen._Dialect mlir.dialects._nvvm_ops_gen.Barrier0Op mlir.dialects._nvvm_ops_gen.BarrierArriveOp mlir.dialects._nvvm_ops_gen.BarrierOp mlir.dialects._nvvm_ops_gen.BlockDimXOp mlir.dialects._nvvm_ops_gen.BlockDimYOp mlir.dialects._nvvm_ops_gen.BlockDimZOp mlir.dialects._nvvm_ops_gen.BlockIdXOp mlir.dialects._nvvm_ops_gen.BlockIdYOp mlir.dialects._nvvm_ops_gen.BlockIdZOp mlir.dialects._nvvm_ops_gen.BlockInClusterIdXOp mlir.dialects._nvvm_ops_gen.BlockInClusterIdYOp mlir.dialects._nvvm_ops_gen.BlockInClusterIdZOp mlir.dialects._nvvm_ops_gen.Breakpoint mlir.dialects._nvvm_ops_gen.BulkStoreOp mlir.dialects._nvvm_ops_gen.Clock64Op mlir.dialects._nvvm_ops_gen.ClockOp mlir.dialects._nvvm_ops_gen.ClusterArriveOp mlir.dialects._nvvm_ops_gen.ClusterArriveRelaxedOp mlir.dialects._nvvm_ops_gen.ClusterDim mlir.dialects._nvvm_ops_gen.ClusterDimBlocksXOp mlir.dialects._nvvm_ops_gen.ClusterDimBlocksYOp mlir.dialects._nvvm_ops_gen.ClusterDimBlocksZOp mlir.dialects._nvvm_ops_gen.ClusterDimXOp mlir.dialects._nvvm_ops_gen.ClusterDimYOp mlir.dialects._nvvm_ops_gen.ClusterDimZOp mlir.dialects._nvvm_ops_gen.ClusterId mlir.dialects._nvvm_ops_gen.ClusterIdXOp mlir.dialects._nvvm_ops_gen.ClusterIdYOp mlir.dialects._nvvm_ops_gen.ClusterIdZOp mlir.dialects._nvvm_ops_gen.ClusterLaunchControlQueryCancelOp mlir.dialects._nvvm_ops_gen.ClusterLaunchControlTryCancelOp mlir.dialects._nvvm_ops_gen.ClusterWaitOp mlir.dialects._nvvm_ops_gen.ConvertBF16x2ToF8x2Op mlir.dialects._nvvm_ops_gen.ConvertF4x2ToF16x2Op mlir.dialects._nvvm_ops_gen.ConvertF6x2ToF16x2Op mlir.dialects._nvvm_ops_gen.ConvertF8x2ToBF16x2Op mlir.dialects._nvvm_ops_gen.ConvertF8x2ToF16x2Op mlir.dialects._nvvm_ops_gen.ConvertF16x2ToF8x2Op mlir.dialects._nvvm_ops_gen.ConvertF32x2ToBF16x2Op mlir.dialects._nvvm_ops_gen.ConvertF32x2ToF4x2Op mlir.dialects._nvvm_ops_gen.ConvertF32x2ToF6x2Op mlir.dialects._nvvm_ops_gen.ConvertF32x2ToF8x2Op mlir.dialects._nvvm_ops_gen.ConvertF32x2ToF16x2Op mlir.dialects._nvvm_ops_gen.ConvertF32x4ToF4x4Op mlir.dialects._nvvm_ops_gen.ConvertF32x4ToF6x4Op mlir.dialects._nvvm_ops_gen.ConvertF32x4ToF8x4Op mlir.dialects._nvvm_ops_gen.ConvertFloatToTF32Op mlir.dialects._nvvm_ops_gen.CpAsyncBulkCommitGroupOp mlir.dialects._nvvm_ops_gen.CpAsyncBulkGlobalToSharedClusterOp mlir.dialects._nvvm_ops_gen.CpAsyncBulkPrefetchOp mlir.dialects._nvvm_ops_gen.CpAsyncBulkSharedCTAToGlobalOp mlir.dialects._nvvm_ops_gen.CpAsyncBulkSharedCTAToSharedClusterOp mlir.dialects._nvvm_ops_gen.CpAsyncBulkTensorGlobalToSharedClusterOp mlir.dialects._nvvm_ops_gen.CpAsyncBulkTensorPrefetchOp mlir.dialects._nvvm_ops_gen.CpAsyncBulkTensorReduceOp mlir.dialects._nvvm_ops_gen.CpAsyncBulkTensorSharedCTAToGlobalOp mlir.dialects._nvvm_ops_gen.CpAsyncBulkWaitGroupOp mlir.dialects._nvvm_ops_gen.CpAsyncCommitGroupOp mlir.dialects._nvvm_ops_gen.CpAsyncMBarrierArriveOp mlir.dialects._nvvm_ops_gen.CpAsyncOp mlir.dialects._nvvm_ops_gen.CpAsyncWaitGroupOp mlir.dialects._nvvm_ops_gen.DotAccumulate2WayOp mlir.dialects._nvvm_ops_gen.DotAccumulate4WayOp mlir.dialects._nvvm_ops_gen.ElectSyncOp mlir.dialects._nvvm_ops_gen.EnvReg0Op mlir.dialects._nvvm_ops_gen.EnvReg1Op mlir.dialects._nvvm_ops_gen.EnvReg2Op mlir.dialects._nvvm_ops_gen.EnvReg3Op mlir.dialects._nvvm_ops_gen.EnvReg4Op mlir.dialects._nvvm_ops_gen.EnvReg5Op mlir.dialects._nvvm_ops_gen.EnvReg6Op 
mlir.dialects._nvvm_ops_gen.EnvReg7Op mlir.dialects._nvvm_ops_gen.EnvReg8Op mlir.dialects._nvvm_ops_gen.EnvReg9Op mlir.dialects._nvvm_ops_gen.EnvReg10Op mlir.dialects._nvvm_ops_gen.EnvReg11Op mlir.dialects._nvvm_ops_gen.EnvReg12Op mlir.dialects._nvvm_ops_gen.EnvReg13Op mlir.dialects._nvvm_ops_gen.EnvReg14Op mlir.dialects._nvvm_ops_gen.EnvReg15Op mlir.dialects._nvvm_ops_gen.EnvReg16Op mlir.dialects._nvvm_ops_gen.EnvReg17Op mlir.dialects._nvvm_ops_gen.EnvReg18Op mlir.dialects._nvvm_ops_gen.EnvReg19Op mlir.dialects._nvvm_ops_gen.EnvReg20Op mlir.dialects._nvvm_ops_gen.EnvReg21Op mlir.dialects._nvvm_ops_gen.EnvReg22Op mlir.dialects._nvvm_ops_gen.EnvReg23Op mlir.dialects._nvvm_ops_gen.EnvReg24Op mlir.dialects._nvvm_ops_gen.EnvReg25Op mlir.dialects._nvvm_ops_gen.EnvReg26Op mlir.dialects._nvvm_ops_gen.EnvReg27Op mlir.dialects._nvvm_ops_gen.EnvReg28Op mlir.dialects._nvvm_ops_gen.EnvReg29Op mlir.dialects._nvvm_ops_gen.EnvReg30Op mlir.dialects._nvvm_ops_gen.EnvReg31Op mlir.dialects._nvvm_ops_gen.Exit mlir.dialects._nvvm_ops_gen.FenceMbarrierInitOp mlir.dialects._nvvm_ops_gen.FenceProxyAcquireOp mlir.dialects._nvvm_ops_gen.FenceProxyOp mlir.dialects._nvvm_ops_gen.FenceProxyReleaseOp mlir.dialects._nvvm_ops_gen.FenceScClusterOp mlir.dialects._nvvm_ops_gen.GlobalTimerLoOp mlir.dialects._nvvm_ops_gen.GlobalTimerOp mlir.dialects._nvvm_ops_gen.GridDimXOp mlir.dialects._nvvm_ops_gen.GridDimYOp mlir.dialects._nvvm_ops_gen.GridDimZOp mlir.dialects._nvvm_ops_gen.GridIdOp mlir.dialects._nvvm_ops_gen.GriddepcontrolOp mlir.dialects._nvvm_ops_gen.InlinePtxOp mlir.dialects._nvvm_ops_gen.LaneIdOp mlir.dialects._nvvm_ops_gen.LaneMaskEqOp mlir.dialects._nvvm_ops_gen.LaneMaskGeOp mlir.dialects._nvvm_ops_gen.LaneMaskGtOp mlir.dialects._nvvm_ops_gen.LaneMaskLeOp mlir.dialects._nvvm_ops_gen.LaneMaskLtOp mlir.dialects._nvvm_ops_gen.LdMatrixOp mlir.dialects._nvvm_ops_gen.MBarrierArriveExpectTxOp mlir.dialects._nvvm_ops_gen.MBarrierArriveNocompleteOp mlir.dialects._nvvm_ops_gen.MBarrierArriveOp mlir.dialects._nvvm_ops_gen.MBarrierInitOp mlir.dialects._nvvm_ops_gen.MBarrierInvalOp mlir.dialects._nvvm_ops_gen.MBarrierTestWaitOp mlir.dialects._nvvm_ops_gen.MBarrierTryWaitParityOp mlir.dialects._nvvm_ops_gen.MapaOp mlir.dialects._nvvm_ops_gen.MatchSyncOp mlir.dialects._nvvm_ops_gen.MembarOp mlir.dialects._nvvm_ops_gen.MmaOp mlir.dialects._nvvm_ops_gen.NanosleepOp mlir.dialects._nvvm_ops_gen.PMEventOp mlir.dialects._nvvm_ops_gen.PrefetchOp mlir.dialects._nvvm_ops_gen.RcpApproxFtzF32Op mlir.dialects._nvvm_ops_gen.ReduxOp mlir.dialects._nvvm_ops_gen.SetMaxRegisterOp mlir.dialects._nvvm_ops_gen.ShflOp mlir.dialects._nvvm_ops_gen.SmDimOp mlir.dialects._nvvm_ops_gen.SmIdOp mlir.dialects._nvvm_ops_gen.StMatrixOp mlir.dialects._nvvm_ops_gen.SyncWarpOp mlir.dialects._nvvm_ops_gen.Tcgen05AllocOp mlir.dialects._nvvm_ops_gen.Tcgen05CommitOp mlir.dialects._nvvm_ops_gen.Tcgen05CpOp mlir.dialects._nvvm_ops_gen.Tcgen05DeallocOp mlir.dialects._nvvm_ops_gen.Tcgen05FenceOp mlir.dialects._nvvm_ops_gen.Tcgen05LdOp mlir.dialects._nvvm_ops_gen.Tcgen05MmaSmemDescOp mlir.dialects._nvvm_ops_gen.Tcgen05RelinquishAllocPermitOp mlir.dialects._nvvm_ops_gen.Tcgen05ShiftOp mlir.dialects._nvvm_ops_gen.Tcgen05StOp mlir.dialects._nvvm_ops_gen.Tcgen05WaitOp mlir.dialects._nvvm_ops_gen.ThreadIdXOp mlir.dialects._nvvm_ops_gen.ThreadIdYOp mlir.dialects._nvvm_ops_gen.ThreadIdZOp mlir.dialects._nvvm_ops_gen.VoteSyncOp mlir.dialects._nvvm_ops_gen.WMMALoadOp mlir.dialects._nvvm_ops_gen.WMMAMmaOp mlir.dialects._nvvm_ops_gen.WMMAStoreOp 
mlir.dialects._nvvm_ops_gen.WarpDimOp mlir.dialects._nvvm_ops_gen.WarpIdOp mlir.dialects._nvvm_ops_gen.WarpSizeOp mlir.dialects._nvvm_ops_gen.WgmmaFenceAlignedOp mlir.dialects._nvvm_ops_gen.WgmmaGroupSyncAlignedOp mlir.dialects._nvvm_ops_gen.WgmmaMmaAsyncOp mlir.dialects._nvvm_ops_gen.WgmmaWaitGroupSyncOp Functions --------- .. autoapisummary:: mlir.dialects._nvvm_ops_gen.barrier0 mlir.dialects._nvvm_ops_gen.barrier_arrive mlir.dialects._nvvm_ops_gen.barrier mlir.dialects._nvvm_ops_gen.read_ptx_sreg_ntid_x mlir.dialects._nvvm_ops_gen.read_ptx_sreg_ntid_y mlir.dialects._nvvm_ops_gen.read_ptx_sreg_ntid_z mlir.dialects._nvvm_ops_gen.read_ptx_sreg_ctaid_x mlir.dialects._nvvm_ops_gen.read_ptx_sreg_ctaid_y mlir.dialects._nvvm_ops_gen.read_ptx_sreg_ctaid_z mlir.dialects._nvvm_ops_gen.read_ptx_sreg_cluster_ctaid_x mlir.dialects._nvvm_ops_gen.read_ptx_sreg_cluster_ctaid_y mlir.dialects._nvvm_ops_gen.read_ptx_sreg_cluster_ctaid_z mlir.dialects._nvvm_ops_gen.breakpoint mlir.dialects._nvvm_ops_gen.st_bulk mlir.dialects._nvvm_ops_gen.read_ptx_sreg_clock64 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_clock mlir.dialects._nvvm_ops_gen.cluster_arrive mlir.dialects._nvvm_ops_gen.cluster_arrive_relaxed mlir.dialects._nvvm_ops_gen.read_ptx_sreg_cluster_nctarank mlir.dialects._nvvm_ops_gen.read_ptx_sreg_cluster_nctaid_x mlir.dialects._nvvm_ops_gen.read_ptx_sreg_cluster_nctaid_y mlir.dialects._nvvm_ops_gen.read_ptx_sreg_cluster_nctaid_z mlir.dialects._nvvm_ops_gen.read_ptx_sreg_nclusterid_x mlir.dialects._nvvm_ops_gen.read_ptx_sreg_nclusterid_y mlir.dialects._nvvm_ops_gen.read_ptx_sreg_nclusterid_z mlir.dialects._nvvm_ops_gen.read_ptx_sreg_cluster_ctarank mlir.dialects._nvvm_ops_gen.read_ptx_sreg_clusterid_x mlir.dialects._nvvm_ops_gen.read_ptx_sreg_clusterid_y mlir.dialects._nvvm_ops_gen.read_ptx_sreg_clusterid_z mlir.dialects._nvvm_ops_gen.clusterlaunchcontrol_query_cancel mlir.dialects._nvvm_ops_gen.clusterlaunchcontrol_try_cancel mlir.dialects._nvvm_ops_gen.cluster_wait mlir.dialects._nvvm_ops_gen.convert_bf16x2_to_f8x2 mlir.dialects._nvvm_ops_gen.convert_f4x2_to_f16x2 mlir.dialects._nvvm_ops_gen.convert_f6x2_to_f16x2 mlir.dialects._nvvm_ops_gen.convert_f8x2_to_bf16x2 mlir.dialects._nvvm_ops_gen.convert_f8x2_to_f16x2 mlir.dialects._nvvm_ops_gen.convert_f16x2_to_f8x2 mlir.dialects._nvvm_ops_gen.convert_f32x2_to_bf16x2 mlir.dialects._nvvm_ops_gen.convert_f32x2_to_f4x2 mlir.dialects._nvvm_ops_gen.convert_f32x2_to_f6x2 mlir.dialects._nvvm_ops_gen.convert_f32x2_to_f8x2 mlir.dialects._nvvm_ops_gen.convert_f32x2_to_f16x2 mlir.dialects._nvvm_ops_gen.convert_f32x4_to_f4x4 mlir.dialects._nvvm_ops_gen.convert_f32x4_to_f6x4 mlir.dialects._nvvm_ops_gen.convert_f32x4_to_f8x4 mlir.dialects._nvvm_ops_gen.convert_float_to_tf32 mlir.dialects._nvvm_ops_gen.cp_async_bulk_commit_group mlir.dialects._nvvm_ops_gen.cp_async_bulk_shared_cluster_global mlir.dialects._nvvm_ops_gen.cp_async_bulk_prefetch mlir.dialects._nvvm_ops_gen.cp_async_bulk_global_shared_cta mlir.dialects._nvvm_ops_gen.cp_async_bulk_shared_cluster_shared_cta mlir.dialects._nvvm_ops_gen.cp_async_bulk_tensor_shared_cluster_global mlir.dialects._nvvm_ops_gen.cp_async_bulk_tensor_prefetch mlir.dialects._nvvm_ops_gen.cp_async_bulk_tensor_reduce mlir.dialects._nvvm_ops_gen.cp_async_bulk_tensor_global_shared_cta mlir.dialects._nvvm_ops_gen.cp_async_bulk_wait_group mlir.dialects._nvvm_ops_gen.cp_async_commit_group mlir.dialects._nvvm_ops_gen.cp_async_mbarrier_arrive mlir.dialects._nvvm_ops_gen.cp_async_shared_global mlir.dialects._nvvm_ops_gen.cp_async_wait_group 
mlir.dialects._nvvm_ops_gen.dot_accumulate_2way mlir.dialects._nvvm_ops_gen.dot_accumulate_4way mlir.dialects._nvvm_ops_gen.elect_sync mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg0 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg1 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg2 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg3 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg4 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg5 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg6 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg7 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg8 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg9 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg10 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg11 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg12 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg13 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg14 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg15 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg16 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg17 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg18 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg19 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg20 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg21 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg22 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg23 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg24 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg25 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg26 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg27 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg28 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg29 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg30 mlir.dialects._nvvm_ops_gen.read_ptx_sreg_envreg31 mlir.dialects._nvvm_ops_gen.exit mlir.dialects._nvvm_ops_gen.fence_mbarrier_init mlir.dialects._nvvm_ops_gen.fence_proxy_acquire mlir.dialects._nvvm_ops_gen.fence_proxy mlir.dialects._nvvm_ops_gen.fence_proxy_release mlir.dialects._nvvm_ops_gen.fence_sc_cluster mlir.dialects._nvvm_ops_gen.read_ptx_sreg_globaltimer_lo mlir.dialects._nvvm_ops_gen.read_ptx_sreg_globaltimer mlir.dialects._nvvm_ops_gen.read_ptx_sreg_nctaid_x mlir.dialects._nvvm_ops_gen.read_ptx_sreg_nctaid_y mlir.dialects._nvvm_ops_gen.read_ptx_sreg_nctaid_z mlir.dialects._nvvm_ops_gen.read_ptx_sreg_gridid mlir.dialects._nvvm_ops_gen.griddepcontrol mlir.dialects._nvvm_ops_gen.inline_ptx mlir.dialects._nvvm_ops_gen.read_ptx_sreg_laneid mlir.dialects._nvvm_ops_gen.read_ptx_sreg_lanemask_eq mlir.dialects._nvvm_ops_gen.read_ptx_sreg_lanemask_ge mlir.dialects._nvvm_ops_gen.read_ptx_sreg_lanemask_gt mlir.dialects._nvvm_ops_gen.read_ptx_sreg_lanemask_le mlir.dialects._nvvm_ops_gen.read_ptx_sreg_lanemask_lt mlir.dialects._nvvm_ops_gen.ldmatrix mlir.dialects._nvvm_ops_gen.mbarrier_arrive_expect_tx mlir.dialects._nvvm_ops_gen.mbarrier_arrive_nocomplete mlir.dialects._nvvm_ops_gen.mbarrier_arrive mlir.dialects._nvvm_ops_gen.mbarrier_init mlir.dialects._nvvm_ops_gen.mbarrier_inval mlir.dialects._nvvm_ops_gen.mbarrier_test_wait mlir.dialects._nvvm_ops_gen.mbarrier_try_wait_parity mlir.dialects._nvvm_ops_gen.mapa mlir.dialects._nvvm_ops_gen.match_sync mlir.dialects._nvvm_ops_gen.memory_barrier mlir.dialects._nvvm_ops_gen.mma_sync mlir.dialects._nvvm_ops_gen.nanosleep mlir.dialects._nvvm_ops_gen.pmevent mlir.dialects._nvvm_ops_gen.prefetch mlir.dialects._nvvm_ops_gen.rcp_approx_ftz_f mlir.dialects._nvvm_ops_gen.redux_sync mlir.dialects._nvvm_ops_gen.setmaxregister mlir.dialects._nvvm_ops_gen.shfl_sync 
mlir.dialects._nvvm_ops_gen.read_ptx_sreg_nsmid
mlir.dialects._nvvm_ops_gen.read_ptx_sreg_smid
mlir.dialects._nvvm_ops_gen.stmatrix
mlir.dialects._nvvm_ops_gen.bar_warp_sync
mlir.dialects._nvvm_ops_gen.tcgen05_alloc
mlir.dialects._nvvm_ops_gen.tcgen05_commit
mlir.dialects._nvvm_ops_gen.tcgen05_cp
mlir.dialects._nvvm_ops_gen.tcgen05_dealloc
mlir.dialects._nvvm_ops_gen.tcgen05_fence
mlir.dialects._nvvm_ops_gen.tcgen05_ld
mlir.dialects._nvvm_ops_gen.tcgen05_mma_smem_desc
mlir.dialects._nvvm_ops_gen.tcgen05_relinquish_alloc_permit
mlir.dialects._nvvm_ops_gen.tcgen05_shift
mlir.dialects._nvvm_ops_gen.tcgen05_st
mlir.dialects._nvvm_ops_gen.tcgen05_wait
mlir.dialects._nvvm_ops_gen.read_ptx_sreg_tid_x
mlir.dialects._nvvm_ops_gen.read_ptx_sreg_tid_y
mlir.dialects._nvvm_ops_gen.read_ptx_sreg_tid_z
mlir.dialects._nvvm_ops_gen.vote_sync
mlir.dialects._nvvm_ops_gen.wmma_load
mlir.dialects._nvvm_ops_gen.wmma_mma
mlir.dialects._nvvm_ops_gen.wmma_store
mlir.dialects._nvvm_ops_gen.read_ptx_sreg_nwarpid
mlir.dialects._nvvm_ops_gen.read_ptx_sreg_warpid
mlir.dialects._nvvm_ops_gen.read_ptx_sreg_warpsize
mlir.dialects._nvvm_ops_gen.wgmma_fence_aligned
mlir.dialects._nvvm_ops_gen.wgmma_commit_group_sync_aligned
mlir.dialects._nvvm_ops_gen.wgmma_mma_async
mlir.dialects._nvvm_ops_gen.wgmma_wait_group_sync_aligned

Module Contents
---------------

.. py:data:: _ods_ir

.. py:class:: _Dialect(descriptor: object)

   Bases: :py:obj:`_ods_ir`

   .. py:attribute:: DIALECT_NAMESPACE
      :value: 'nvvm'

.. py:class:: Barrier0Op(*, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   The ``nvvm.barrier0`` operation is a convenience operation that performs barrier synchronization and communication within a CTA (Cooperative Thread Array) using barrier ID 0. It is functionally equivalent to ``nvvm.barrier`` or ``nvvm.barrier id=0``.

   For more information, see the PTX ISA.

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.barrier0'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

.. py:function:: barrier0(*, loc=None, ip=None) -> Barrier0Op

.. py:class:: BarrierArriveOp(numberOfThreads, *, barrierId=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   Threads that execute this op announce their arrival at the barrier with the given id and continue their execution. When ``barrierId`` is not present, the default barrier id 0 is used, which matches the behavior of the ``nvvm.barrier`` Op.

   For more information, see the PTX ISA.

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.barrier.arrive'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: barrierId() -> Optional[_ods_ir]

   .. py:method:: numberOfThreads() -> _ods_ir

.. py:function:: barrier_arrive(number_of_threads, *, barrier_id=None, loc=None, ip=None) -> BarrierArriveOp

.. py:class:: BarrierOp(res, *, barrierId=None, numberOfThreads=None, reductionOp=None, reductionPredicate=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   The ``nvvm.barrier`` operation performs barrier synchronization and communication within a CTA (Cooperative Thread Array). It causes executing threads to wait for all non-exited threads participating in the barrier to arrive.

   The operation takes the following optional operands:

   * ``barrierId``: Specifies a logical barrier resource with value 0 through 15. Each CTA instance has sixteen barriers numbered 0..15. Defaults to 0 if not specified.
   * ``numberOfThreads``: Specifies the number of threads participating in the barrier. When specified, the value must be a multiple of the warp size. If not specified, all threads in the CTA participate in the barrier.
* ``reductionOp``: specifies the reduction operation (``popc``, ``and``, ``or``). * ``reductionPredicate``: specifies the predicate to be used with the ``reductionOp``. The barrier operation guarantees that when the barrier completes, prior memory accesses requested by participating threads are performed relative to all threads participating in the barrier. It also ensures that no new memory access is requested by participating threads before the barrier completes. When a barrier completes, the waiting threads are restarted without delay, and the barrier is reinitialized so that it can be immediately reused. This operation generates an aligned barrier, indicating that all threads in the CTA will execute the same barrier instruction. Behavior is undefined if all threads in the CTA do not reach this instruction. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.barrier' .. py:attribute:: _ODS_OPERAND_SEGMENTS :value: [0, 0, 0] .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: barrierId() -> Optional[_ods_ir] .. py:method:: numberOfThreads() -> Optional[_ods_ir] .. py:method:: reductionPredicate() -> Optional[_ods_ir] .. py:method:: reductionOp() -> Optional[_ods_ir] .. py:method:: res() -> Optional[_ods_ir] .. py:function:: barrier(res, *, barrier_id=None, number_of_threads=None, reduction_op=None, reduction_predicate=None, loc=None, ip=None) -> Union[_ods_ir, _ods_ir, BarrierOp] .. py:class:: BlockDimXOp(res, *, range=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.ntid.x' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: range() -> Optional[_ods_ir] .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_ntid_x(res, *, range=None, loc=None, ip=None) -> _ods_ir .. py:class:: BlockDimYOp(res, *, range=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.ntid.y' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: range() -> Optional[_ods_ir] .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_ntid_y(res, *, range=None, loc=None, ip=None) -> _ods_ir .. py:class:: BlockDimZOp(res, *, range=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.ntid.z' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: range() -> Optional[_ods_ir] .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_ntid_z(res, *, range=None, loc=None, ip=None) -> _ods_ir .. py:class:: BlockIdXOp(res, *, range=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.ctaid.x' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: range() -> Optional[_ods_ir] .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_ctaid_x(res, *, range=None, loc=None, ip=None) -> _ods_ir .. py:class:: BlockIdYOp(res, *, range=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.ctaid.y' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: range() -> Optional[_ods_ir] .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_ctaid_y(res, *, range=None, loc=None, ip=None) -> _ods_ir .. py:class:: BlockIdZOp(res, *, range=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.ctaid.z' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. 
py:method:: range() -> Optional[_ods_ir] .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_ctaid_z(res, *, range=None, loc=None, ip=None) -> _ods_ir .. py:class:: BlockInClusterIdXOp(res, *, range=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.cluster.ctaid.x' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: range() -> Optional[_ods_ir] .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_cluster_ctaid_x(res, *, range=None, loc=None, ip=None) -> _ods_ir .. py:class:: BlockInClusterIdYOp(res, *, range=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.cluster.ctaid.y' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: range() -> Optional[_ods_ir] .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_cluster_ctaid_y(res, *, range=None, loc=None, ip=None) -> _ods_ir .. py:class:: BlockInClusterIdZOp(res, *, range=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.cluster.ctaid.z' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: range() -> Optional[_ods_ir] .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_cluster_ctaid_z(res, *, range=None, loc=None, ip=None) -> _ods_ir .. py:class:: Breakpoint(*, loc=None, ip=None) Bases: :py:obj:`_ods_ir` Breakpoint suspends execution of the program for debugging. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.breakpoint' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:function:: breakpoint(*, loc=None, ip=None) -> Breakpoint .. py:class:: BulkStoreOp(addr, size, *, initVal=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` Initializes a region of shared memory at the address given by ``addr``. The ``size`` operand specifies the number of bytes to initialize and must be a multiple of 8. The ``initVal`` operand specifies the value to initialize the memory to. The only supported value is 0. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.st.bulk' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: addr() -> _ods_ir .. py:method:: size() -> _ods_ir .. py:method:: initVal() -> _ods_ir .. py:function:: st_bulk(addr, size, *, init_val=None, loc=None, ip=None) -> BulkStoreOp .. py:class:: Clock64Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.clock64' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_clock64(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: ClockOp(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.clock' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_clock(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: ClusterArriveOp(*, aligned=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` The ``cluster.arrive`` can be used by the threads within the cluster for synchronization and communication. The ``cluster.arrive`` instruction marks the warps' arrival at the barrier without causing the executing thread to wait for other participating threads. The ``aligned`` attribute, when provided, generates the .aligned version of the PTX instruction. `For more information, see PTX ISA `_ .. 
py:attribute:: OPERATION_NAME :value: 'nvvm.cluster.arrive' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: aligned() -> bool .. py:function:: cluster_arrive(*, aligned=None, loc=None, ip=None) -> ClusterArriveOp .. py:class:: ClusterArriveRelaxedOp(*, aligned=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` The ``cluster.arrive`` can be used by the threads within the cluster for synchronization and communication. The ``cluster.arrive`` instruction marks the warps' arrival at the barrier without causing the executing thread to wait for other participating threads. The ``aligned`` attribute, when provided, generates the .aligned version of the PTX instruction. The .relaxed qualifier on ``cluster.arrive`` specifies that there are no memory ordering and visibility guarantees provided for the memory accesses performed prior to ``cluster.arrive``. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.cluster.arrive.relaxed' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: aligned() -> bool .. py:function:: cluster_arrive_relaxed(*, aligned=None, loc=None, ip=None) -> ClusterArriveRelaxedOp .. py:class:: ClusterDim(res, *, range=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.cluster.nctarank' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: range() -> Optional[_ods_ir] .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_cluster_nctarank(res, *, range=None, loc=None, ip=None) -> _ods_ir .. py:class:: ClusterDimBlocksXOp(res, *, range=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.cluster.nctaid.x' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: range() -> Optional[_ods_ir] .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_cluster_nctaid_x(res, *, range=None, loc=None, ip=None) -> _ods_ir .. py:class:: ClusterDimBlocksYOp(res, *, range=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.cluster.nctaid.y' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: range() -> Optional[_ods_ir] .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_cluster_nctaid_y(res, *, range=None, loc=None, ip=None) -> _ods_ir .. py:class:: ClusterDimBlocksZOp(res, *, range=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.cluster.nctaid.z' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: range() -> Optional[_ods_ir] .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_cluster_nctaid_z(res, *, range=None, loc=None, ip=None) -> _ods_ir .. py:class:: ClusterDimXOp(res, *, range=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.nclusterid.x' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: range() -> Optional[_ods_ir] .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_nclusterid_x(res, *, range=None, loc=None, ip=None) -> _ods_ir .. py:class:: ClusterDimYOp(res, *, range=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.nclusterid.y' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: range() -> Optional[_ods_ir] .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_nclusterid_y(res, *, range=None, loc=None, ip=None) -> _ods_ir .. 
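The special-register classes above and below all share one builder pattern: the generated module-level function takes the result type as its first argument (``res``) and returns the produced value. The following is a minimal, hedged sketch of that pattern, not part of the generated API itself. The ``func.func`` container named ``geometry`` is an illustrative assumption (real kernels typically live in ``gpu.func`` or ``llvm.func``), and it assumes an MLIR Python install whose default ``Context`` registers the ``nvvm`` and ``func`` dialects.

.. code:: python

   from mlir.ir import (Context, FunctionType, InsertionPoint, IntegerType,
                        Location, Module)
   from mlir.dialects import func, nvvm

   with Context(), Location.unknown():
       module = Module.create()
       with InsertionPoint(module.body):
           i32 = IntegerType.get_signless(32)
           # Hypothetical container function, for illustration only.
           fn = func.FuncOp("geometry", FunctionType.get([], []))
           with InsertionPoint(fn.add_entry_block()):
               tid = nvvm.read_ptx_sreg_tid_x(i32)               # ThreadIdXOp
               ntid = nvvm.read_ptx_sreg_ntid_x(i32)             # BlockDimXOp
               nclusters = nvvm.read_ptx_sreg_nclusterid_x(i32)  # ClusterDimXOp
               nvvm.barrier0()                                   # CTA-wide barrier, id 0
               func.ReturnOp([])
       print(module)

..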
py:class:: ClusterDimZOp(res, *, range=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.nclusterid.z' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: range() -> Optional[_ods_ir] .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_nclusterid_z(res, *, range=None, loc=None, ip=None) -> _ods_ir .. py:class:: ClusterId(res, *, range=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.cluster.ctarank' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: range() -> Optional[_ods_ir] .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_cluster_ctarank(res, *, range=None, loc=None, ip=None) -> _ods_ir .. py:class:: ClusterIdXOp(res, *, range=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.clusterid.x' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: range() -> Optional[_ods_ir] .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_clusterid_x(res, *, range=None, loc=None, ip=None) -> _ods_ir .. py:class:: ClusterIdYOp(res, *, range=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.clusterid.y' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: range() -> Optional[_ods_ir] .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_clusterid_y(res, *, range=None, loc=None, ip=None) -> _ods_ir .. py:class:: ClusterIdZOp(res, *, range=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.clusterid.z' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: range() -> Optional[_ods_ir] .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_clusterid_z(res, *, range=None, loc=None, ip=None) -> _ods_ir .. py:class:: ClusterLaunchControlQueryCancelOp(res, query_type, try_cancel_response, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` ``clusterlaunchcontrol.query.cancel`` queries the response of a ``clusterlaunchcontrol.try.cancel`` operation specified by operand ``try_cancel_response``. Operand ``query_type`` specifies the type of query to perform and can be one of the following: * ``is_canceled`` : Returns true if the try cancel request succeeded, and false otherwise. * ``get_first_cta_id_{x/y/z}`` : Returns the x, y, or z coordinate of the first CTA in the canceled cluster. Behaviour is defined only if the try cancel request succeeded. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.clusterlaunchcontrol.query.cancel' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: try_cancel_response() -> _ods_ir .. py:method:: query_type() -> _ods_ir .. py:method:: res() -> _ods_ir .. py:function:: clusterlaunchcontrol_query_cancel(res, query_type, try_cancel_response, *, loc=None, ip=None) -> _ods_ir .. py:class:: ClusterLaunchControlTryCancelOp(smemAddress, mbarrier, *, multicast=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` ``clusterlaunchcontrol.try.cancel`` requests atomically canceling the launch of a cluster that has not started running yet. It asynchronously writes an opaque response to shared memory indicating whether the operation succeeded or failed. Operand ``smemAddress`` specifies the naturally aligned address of the 16-byte wide shared memory location where the request's response is written. 
Operand ``mbarrier`` specifies the mbarrier object used to track the completion of the asynchronous operation.

If ``multicast`` is specified, the response is asynchronously written to the corresponding local shared memory location (specified by ``addr``) of each CTA in the requesting cluster.

For more information, see the PTX ISA.

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.clusterlaunchcontrol.try.cancel'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: smemAddress() -> _ods_ir

   .. py:method:: mbarrier() -> _ods_ir

   .. py:method:: multicast() -> bool

.. py:function:: clusterlaunchcontrol_try_cancel(smem_address, mbarrier, *, multicast=None, loc=None, ip=None) -> ClusterLaunchControlTryCancelOp

.. py:class:: ClusterWaitOp(*, aligned=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   The ``cluster.wait`` causes the executing thread to wait for all non-exited threads of the cluster to perform ``cluster.arrive``. The ``aligned`` attribute, when provided, generates the .aligned version of the PTX instruction.

   For more information, see the PTX ISA.

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.cluster.wait'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: aligned() -> bool

.. py:function:: cluster_wait(*, aligned=None, loc=None, ip=None) -> ClusterWaitOp

.. py:class:: ConvertBF16x2ToF8x2Op(dst, a, dstTy, *, rnd=None, sat=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   This Op converts the given bf16 inputs in a bf16x2 vector to the specified f8 type. The result ``dst`` is represented as an i16 type or as a vector of two i8 types.

   If ``dst`` is returned as an i16 type, the converted values from ``a`` are packed such that the value converted from the first element of ``a`` is stored in the upper 8 bits of ``dst`` and the value converted from the second element of ``a`` is stored in the lower 8 bits of ``dst``. If ``dst`` is returned as a vector type, each converted value is stored as an i8 element in the vector.

   The ``rnd`` and ``sat`` attributes specify the rounding and saturation modes respectively.

   For more information, see the PTX ISA.

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.convert.bf16x2.to.f8x2'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: a() -> _ods_ir

   .. py:method:: rnd() -> _ods_ir

   .. py:method:: sat() -> _ods_ir

   .. py:method:: dstTy() -> _ods_ir

   .. py:method:: dst() -> _ods_ir

.. py:function:: convert_bf16x2_to_f8x2(dst, a, dst_ty, *, rnd=None, sat=None, loc=None, ip=None) -> _ods_ir

.. py:class:: ConvertF4x2ToF16x2Op(dst, src, srcType, *, relu=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   This Op converts the given f4 inputs in a packed i8 to f16. The result ``dst`` is represented as a vector of f16 elements. The ``relu`` attribute, when set, lowers to the '.relu' variant of the cvt instruction.

   For more information, see the PTX ISA.

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.convert.f4x2.to.f16x2'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: src() -> _ods_ir

   .. py:method:: relu() -> _ods_ir

   .. py:method:: srcType() -> _ods_ir

   .. py:method:: dst() -> _ods_ir

.. py:function:: convert_f4x2_to_f16x2(dst, src, src_type, *, relu=None, loc=None, ip=None) -> _ods_ir

.. py:class:: ConvertF6x2ToF16x2Op(dst, src, srcType, *, relu=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   This Op converts the given f6 inputs in an i8x2 vector to f16. The result ``dst`` is represented as a vector of f16 elements. The ``relu`` attribute, when set, lowers to the '.relu' variant of the cvt instruction.

   For more information, see the PTX ISA.

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.convert.f6x2.to.f16x2'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: src() -> _ods_ir

   .. py:method:: relu() -> _ods_ir

   .. py:method:: srcType() -> _ods_ir

   .. py:method:: dst() -> _ods_ir

.. py:function:: convert_f6x2_to_f16x2(dst, src, src_type, *, relu=None, loc=None, ip=None) -> _ods_ir

.. py:class:: ConvertF8x2ToBF16x2Op(dst, src, srcType, *, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   This Op converts the given f8 inputs in an i8x2 vector to bf16. The result ``dst`` is represented as a vector of bf16 elements.

   For more information, see the PTX ISA.

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.convert.f8x2.to.bf16x2'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: src() -> _ods_ir

   .. py:method:: srcType() -> _ods_ir

   .. py:method:: dst() -> _ods_ir

.. py:function:: convert_f8x2_to_bf16x2(dst, src, src_type, *, loc=None, ip=None) -> _ods_ir

.. py:class:: ConvertF8x2ToF16x2Op(dst, src, srcType, *, relu=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   This Op converts the given f8 inputs in an i8x2 vector to f16. The result ``dst`` is represented as a vector of f16 elements. The ``relu`` attribute, when set, lowers to the '.relu' variant of the cvt instruction.

   For more information, see the PTX ISA.

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.convert.f8x2.to.f16x2'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: src() -> _ods_ir

   .. py:method:: relu() -> _ods_ir

   .. py:method:: srcType() -> _ods_ir

   .. py:method:: dst() -> _ods_ir

.. py:function:: convert_f8x2_to_f16x2(dst, src, src_type, *, relu=None, loc=None, ip=None) -> _ods_ir

.. py:class:: ConvertF16x2ToF8x2Op(dst, a, dstTy, *, relu=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   This Op converts the given f16 inputs in an f16x2 vector to the specified f8 type. The result ``dst`` is represented as an i16 type or as a vector of two i8 types.

   If ``dst`` is returned as an i16 type, the converted values from ``a`` are packed such that the value converted from the first element of ``a`` is stored in the upper 8 bits of ``dst`` and the value converted from the second element of ``a`` is stored in the lower 8 bits of ``dst``. If ``dst`` is returned as a vector type, each converted value is stored as an i8 element in the vector.

   The ``relu`` attribute, when set, lowers to the '.relu' variant of the cvt instruction.

   For more information, see the PTX ISA.

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.convert.f16x2.to.f8x2'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: a() -> _ods_ir

   .. py:method:: relu() -> _ods_ir

   .. py:method:: dstTy() -> _ods_ir

   .. py:method:: dst() -> _ods_ir

.. py:function:: convert_f16x2_to_f8x2(dst, a, dst_ty, *, relu=None, loc=None, ip=None) -> _ods_ir

.. py:class:: ConvertF32x2ToBF16x2Op(dst, src_hi, src_lo, rbits, *, rnd=None, sat=None, relu=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   Converts two F32 values to packed bf16x2 format using stochastic rounding (.rs) mode with randomness provided by the ``rbits`` parameter. The ``relu`` attribute clamps negative results to 0. The ``sat`` attribute determines saturation behavior. The ``src_hi`` and ``src_lo`` parameters correspond to operands ``a`` and ``b`` in the PTX ISA, respectively.

   For more information, see the PTX ISA.

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.convert.f32x2.to.bf16x2'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: src_hi() -> _ods_ir

   .. py:method:: src_lo() -> _ods_ir

   ..
py:method:: rbits() -> _ods_ir .. py:method:: rnd() -> _ods_ir .. py:method:: sat() -> _ods_ir .. py:method:: relu() -> _ods_ir .. py:method:: dst() -> _ods_ir .. py:function:: convert_f32x2_to_bf16x2(dst, src_hi, src_lo, rbits, *, rnd=None, sat=None, relu=None, loc=None, ip=None) -> _ods_ir .. py:class:: ConvertF32x2ToF4x2Op(dst, a, b, dstTy, *, relu=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` This Op converts each of the given float inputs to the specified fp4 type. The result ``dst`` is returned as an i8 type where the converted values are packed such that the value converted from ``a`` is stored in the upper 4 bits of ``dst`` and the value converted from ``b`` is stored in the lower 4 bits of ``dst``. The ``relu`` attribute, when set, lowers to the '.relu' variant of the cvt instruction. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.convert.f32x2.to.f4x2' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: a() -> _ods_ir .. py:method:: b() -> _ods_ir .. py:method:: relu() -> _ods_ir .. py:method:: dstTy() -> _ods_ir .. py:method:: dst() -> _ods_ir .. py:function:: convert_f32x2_to_f4x2(dst, a, b, dst_ty, *, relu=None, loc=None, ip=None) -> _ods_ir .. py:class:: ConvertF32x2ToF6x2Op(dst, a, b, dstTy, *, relu=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` This Op converts each of the given float inputs to the specified fp6 type. The result ``dst`` is represented either as an i16 type or as a vector of two i8 types. If ``dst`` is returned as an i16 type, the converted values are packed such that the value converted from ``a`` is stored in the upper 8 bits of ``dst`` with 2 MSB bits padded with zeros and the value converted from ``b`` is stored in the lower 8 bits of ``dst`` with 2 MSB bits padded with zeros. If ``dst`` is returned as a vector type, each converted value is stored as an i8 element in the vector. The ``relu`` attribute, when set, lowers to the '.relu' variant of the cvt instruction. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.convert.f32x2.to.f6x2' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: a() -> _ods_ir .. py:method:: b() -> _ods_ir .. py:method:: relu() -> _ods_ir .. py:method:: dstTy() -> _ods_ir .. py:method:: dst() -> _ods_ir .. py:function:: convert_f32x2_to_f6x2(dst, a, b, dst_ty, *, relu=None, loc=None, ip=None) -> _ods_ir .. py:class:: ConvertF32x2ToF8x2Op(dst, a, b, dstTy, *, rnd=None, sat=None, relu=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` This Op converts each of the given float inputs to the specified fp8 type. The result ``dst`` is represented as an i16 type or as a vector of two i8 types. If ``dst`` is returned as an i16 type, the converted values are packed such that the value converted from ``a`` is stored in the upper 8 bits of ``dst`` and the value converted from ``b`` is stored in the lower 8 bits of ``dst``. If ``dst`` is returned as a vector type, each converted value is stored as an i8 element in the vector. The ``rnd`` and ``sat`` attributes specify the rounding and saturation modes respectively. The ``relu`` attribute, when set, lowers to the '.relu' variant of the cvt instruction. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.convert.f32x2.to.f8x2' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: a() -> _ods_ir .. py:method:: b() -> _ods_ir .. py:method:: rnd() -> _ods_ir .. py:method:: sat() -> _ods_ir .. py:method:: relu() -> _ods_ir .. 
py:method:: dstTy() -> _ods_ir .. py:method:: dst() -> _ods_ir .. py:function:: convert_f32x2_to_f8x2(dst, a, b, dst_ty, *, rnd=None, sat=None, relu=None, loc=None, ip=None) -> _ods_ir .. py:class:: ConvertF32x2ToF16x2Op(dst, src_hi, src_lo, rbits, *, rnd=None, sat=None, relu=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` Converts two F32 values to packed f16x2 format using stochastic rounding (.rs) mode with randomness provided by the ``rbits`` parameter. The ``relu`` attribute clamps negative results to 0. The ``sat`` attribute determines saturation behavior. The ``src_hi`` and ``src_lo`` parameters correspond to operands ``a`` and ``b`` in the PTX ISA, respectively. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.convert.f32x2.to.f16x2' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: src_hi() -> _ods_ir .. py:method:: src_lo() -> _ods_ir .. py:method:: rbits() -> _ods_ir .. py:method:: rnd() -> _ods_ir .. py:method:: sat() -> _ods_ir .. py:method:: relu() -> _ods_ir .. py:method:: dst() -> _ods_ir .. py:function:: convert_f32x2_to_f16x2(dst, src_hi, src_lo, rbits, *, rnd=None, sat=None, relu=None, loc=None, ip=None) -> _ods_ir .. py:class:: ConvertF32x4ToF4x4Op(dst, src, rbits, dstTy, *, relu=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` Converts a vector<4xf32> to packed f4x4 format using stochastic rounding (.rs) mode with SATFINITE saturation. Randomness is provided by the ``rbits`` parameter. The ``dstTy`` attribute specifies the target floating-point format. The ``relu`` attribute clamps negative results to 0. Note: These operations always use RS rounding mode and SATFINITE saturation mode. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.convert.f32x4.to.f4x4' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: src() -> _ods_ir .. py:method:: rbits() -> _ods_ir .. py:method:: relu() -> _ods_ir .. py:method:: dstTy() -> _ods_ir .. py:method:: dst() -> _ods_ir .. py:function:: convert_f32x4_to_f4x4(dst, src, rbits, dst_ty, *, relu=None, loc=None, ip=None) -> _ods_ir .. py:class:: ConvertF32x4ToF6x4Op(dst, src, rbits, dstTy, *, relu=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` Converts a vector<4xf32> to packed f6x4 format using stochastic rounding (.rs) mode with SATFINITE saturation. Randomness is provided by the ``rbits`` parameter. The ``dstTy`` attribute specifies the target floating-point format. The ``relu`` attribute clamps negative results to 0. Note: These operations always use RS rounding mode and SATFINITE saturation mode. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.convert.f32x4.to.f6x4' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: src() -> _ods_ir .. py:method:: rbits() -> _ods_ir .. py:method:: relu() -> _ods_ir .. py:method:: dstTy() -> _ods_ir .. py:method:: dst() -> _ods_ir .. py:function:: convert_f32x4_to_f6x4(dst, src, rbits, dst_ty, *, relu=None, loc=None, ip=None) -> _ods_ir .. py:class:: ConvertF32x4ToF8x4Op(dst, src, rbits, dstTy, *, relu=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` Converts a vector<4xf32> to packed f8x4 format using stochastic rounding (.rs) mode with SATFINITE saturation. Randomness is provided by the ``rbits`` parameter. The ``dstTy`` attribute specifies the target floating-point format. The ``relu`` attribute clamps negative results to 0. Note: These operations always use RS rounding mode and SATFINITE saturation mode. `For more information, see PTX ISA `_ .. 
py:attribute:: OPERATION_NAME
   :value: 'nvvm.convert.f32x4.to.f8x4'

.. py:attribute:: _ODS_REGIONS
   :value: (0, True)

.. py:method:: src() -> _ods_ir

.. py:method:: rbits() -> _ods_ir

.. py:method:: relu() -> _ods_ir

.. py:method:: dstTy() -> _ods_ir

.. py:method:: dst() -> _ods_ir

.. py:function:: convert_f32x4_to_f8x4(dst, src, rbits, dst_ty, *, relu=None, loc=None, ip=None) -> _ods_ir

.. py:class:: ConvertFloatToTF32Op(res, src, *, rnd=None, sat=None, relu=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   This Op converts the given f32 input to tf32. The result ``res`` is represented as an i32 type. The ``relu`` attribute, when set, lowers to the '.relu' variant of the cvt instruction. The ``rnd`` and ``sat`` attributes specify the rounding and saturation modes respectively.

   For more information, see the PTX ISA.

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.convert.float.to.tf32'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: src() -> _ods_ir

   .. py:method:: rnd() -> _ods_ir

   .. py:method:: sat() -> _ods_ir

   .. py:method:: relu() -> _ods_ir

   .. py:method:: res() -> _ods_ir

.. py:function:: convert_float_to_tf32(res, src, *, rnd=None, sat=None, relu=None, loc=None, ip=None) -> _ods_ir

.. py:class:: CpAsyncBulkCommitGroupOp(*, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   This Op commits all prior initiated but uncommitted cp.async.bulk instructions into a cp.async.bulk-group.

   For more information, see the PTX ISA.

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.cp.async.bulk.commit.group'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

.. py:function:: cp_async_bulk_commit_group(*, loc=None, ip=None) -> CpAsyncBulkCommitGroupOp

.. py:class:: CpAsyncBulkGlobalToSharedClusterOp(dstMem, srcMem, mbar, size, *, multicastMask=None, l2CacheHint=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   Initiates an asynchronous copy operation from global memory to cluster's shared memory.

   The ``multicastMask`` operand is optional. When it is present, the Op copies data from global memory to shared memory of multiple CTAs in the cluster. Operand ``multicastMask`` specifies the destination CTAs in the cluster such that each bit position in the 16-bit ``multicastMask`` operand corresponds to the ``nvvm.read.ptx.sreg.ctaid`` of the destination CTA.

   The ``l2CacheHint`` operand is optional, and it is used to specify cache eviction policy that may be used during the memory access.

   For more information, see the PTX ISA.

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.cp.async.bulk.shared.cluster.global'

   .. py:attribute:: _ODS_OPERAND_SEGMENTS
      :value: [1, 1, 1, 1, 0, 0]

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: dstMem() -> _ods_ir

   .. py:method:: srcMem() -> _ods_ir

   .. py:method:: mbar() -> _ods_ir

   .. py:method:: size() -> _ods_ir

   .. py:method:: multicastMask() -> Optional[_ods_ir]

   .. py:method:: l2CacheHint() -> Optional[_ods_ir]

.. py:function:: cp_async_bulk_shared_cluster_global(dst_mem, src_mem, mbar, size, *, multicast_mask=None, l2_cache_hint=None, loc=None, ip=None) -> CpAsyncBulkGlobalToSharedClusterOp

.. py:class:: CpAsyncBulkPrefetchOp(srcMem, size, *, l2CacheHint=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   Initiates an asynchronous prefetch of data from the location specified by ``srcMem`` to the L2 cache.

   The ``l2CacheHint`` operand is optional, and it is used to specify cache eviction policy that may be used during the memory access.

   Example:

   .. code:: mlir

      nvvm.cp.async.bulk.prefetch %src, %size : !llvm.ptr<1>

      // with l2_cache_hint
      nvvm.cp.async.bulk.prefetch %src, %size l2_cache_hint = %ch : !llvm.ptr<1>

   For more information, see the PTX ISA.

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.cp.async.bulk.prefetch'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: srcMem() -> _ods_ir

   .. py:method:: size() -> _ods_ir

   .. py:method:: l2CacheHint() -> Optional[_ods_ir]

.. py:function:: cp_async_bulk_prefetch(src_mem, size, *, l2_cache_hint=None, loc=None, ip=None) -> CpAsyncBulkPrefetchOp

.. py:class:: CpAsyncBulkSharedCTAToGlobalOp(dstMem, srcMem, size, *, l2CacheHint=None, byteMask=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   Initiates an asynchronous copy operation from Shared CTA memory to global memory. The 32-bit operand ``size`` specifies the amount of memory to be copied, in terms of number of bytes. ``size`` must be a multiple of 16.

   The ``l2CacheHint`` operand is optional, and it is used to specify cache eviction policy that may be used during the memory access.

   The ``byteMask`` operand is optional. The i-th bit in the 16-bit wide ``byteMask`` specifies whether the i-th byte of each 16-byte wide chunk of source data is copied to the destination. If the bit is set, the byte is copied.

   Example:

   .. code:: mlir

      nvvm.cp.async.bulk.global.shared.cta %dst, %src, %size : !llvm.ptr<1>, !llvm.ptr<3>

      // with l2_cache_hint
      nvvm.cp.async.bulk.global.shared.cta %dst, %src, %size l2_cache_hint = %ch : !llvm.ptr<1>, !llvm.ptr<3>

      // with byte_mask
      nvvm.cp.async.bulk.global.shared.cta %dst, %src, %size byte_mask = %mask : !llvm.ptr<1>, !llvm.ptr<3>

      // with both l2_cache_hint and byte_mask
      nvvm.cp.async.bulk.global.shared.cta %dst, %src, %size l2_cache_hint = %ch byte_mask = %mask : !llvm.ptr<1>, !llvm.ptr<3>

   For more information, see the PTX ISA.

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.cp.async.bulk.global.shared.cta'

   .. py:attribute:: _ODS_OPERAND_SEGMENTS
      :value: [1, 1, 1, 0, 0]

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: dstMem() -> _ods_ir

   .. py:method:: srcMem() -> _ods_ir

   .. py:method:: size() -> _ods_ir

   .. py:method:: l2CacheHint() -> Optional[_ods_ir]

   .. py:method:: byteMask() -> Optional[_ods_ir]

.. py:function:: cp_async_bulk_global_shared_cta(dst_mem, src_mem, size, *, l2_cache_hint=None, byte_mask=None, loc=None, ip=None) -> CpAsyncBulkSharedCTAToGlobalOp

.. py:class:: CpAsyncBulkSharedCTAToSharedClusterOp(dstMem, srcMem, mbar, size, *, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   Initiates an asynchronous copy operation from Shared CTA memory to Shared cluster memory.

   For more information, see the PTX ISA.

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.cp.async.bulk.shared.cluster.shared.cta'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: dstMem() -> _ods_ir

   .. py:method:: srcMem() -> _ods_ir

   .. py:method:: mbar() -> _ods_ir

   .. py:method:: size() -> _ods_ir

.. py:function:: cp_async_bulk_shared_cluster_shared_cta(dst_mem, src_mem, mbar, size, *, loc=None, ip=None) -> CpAsyncBulkSharedCTAToSharedClusterOp

.. py:class:: CpAsyncBulkTensorGlobalToSharedClusterOp(dstMem, tmaDescriptor, coordinates, mbar, im2colOffsets, *, multicastMask=None, l2CacheHint=None, mode=None, isCTAOnly=None, group=None, predicate=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   Initiates an asynchronous copy operation on the tensor data from global memory to shared::cluster (or) shared::cta memory. This Op supports all the load modes specified in ``TMALoadMode``.
The ``multicastMask`` operand is optional. When it is present, the Op copies data from global memory to shared memory of multiple CTAs in the cluster. Operand ``multicastMask`` specifies the destination CTAs in the cluster such that each bit position in the 16-bit ``multicastMask`` operand corresponds to the ``nvvm.read.ptx.sreg.ctaid`` of the destination CTA. The ``l2CacheHint`` operand is optional, and it is used to specify cache eviction policy that may be used during the memory access. When the ``isCTAOnly`` attribute is set to true, the destination is shared::cta only. Hence, ``multicastMask`` and ``CTAGroup`` are not applicable when ``isCTAOnly`` is true. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.cp.async.bulk.tensor.shared.cluster.global' .. py:attribute:: _ODS_OPERAND_SEGMENTS .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: dstMem() -> _ods_ir .. py:method:: tmaDescriptor() -> _ods_ir .. py:method:: coordinates() -> _ods_ir .. py:method:: mbar() -> _ods_ir .. py:method:: im2colOffsets() -> _ods_ir .. py:method:: multicastMask() -> Optional[_ods_ir] .. py:method:: l2CacheHint() -> Optional[_ods_ir] .. py:method:: predicate() -> Optional[_ods_ir] .. py:method:: mode() -> _ods_ir .. py:method:: isCTAOnly() -> _ods_ir .. py:method:: group() -> Optional[_ods_ir] .. py:function:: cp_async_bulk_tensor_shared_cluster_global(dst_mem, tma_descriptor, coordinates, mbar, im2col_offsets, *, multicast_mask=None, l2_cache_hint=None, mode=None, is_cta_only=None, group=None, predicate=None, loc=None, ip=None) -> CpAsyncBulkTensorGlobalToSharedClusterOp .. py:class:: CpAsyncBulkTensorPrefetchOp(tmaDescriptor, coordinates, im2colOffsets, *, mode=None, l2CacheHint=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` Initiates an asynchronous prefetch operation on the tensor data from global memory to L2 cache. This Op supports all the load modes specified in ``TMALoadMode``. The ``l2CacheHint`` operand is optional, and it is used to specify cache eviction policy that may be used during the memory access. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.cp.async.bulk.tensor.prefetch' .. py:attribute:: _ODS_OPERAND_SEGMENTS .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: tmaDescriptor() -> _ods_ir .. py:method:: coordinates() -> _ods_ir .. py:method:: im2colOffsets() -> _ods_ir .. py:method:: l2CacheHint() -> Optional[_ods_ir] .. py:method:: mode() -> _ods_ir .. py:function:: cp_async_bulk_tensor_prefetch(tma_descriptor, coordinates, im2col_offsets, *, mode=None, l2_cache_hint=None, loc=None, ip=None) -> CpAsyncBulkTensorPrefetchOp .. py:class:: CpAsyncBulkTensorReduceOp(tmaDescriptor, srcMem, redKind, coordinates, *, mode=None, l2CacheHint=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` Initiates an asynchronous reduction operation of tensor data in global memory with tensor data in shared memory. The ``mode`` attribute indicates whether the copy mode is tile or im2col. The ``redOp`` attribute specifies the reduction operations applied. The supported reduction operations are: {add, min, max, inc, dec, and, or, xor} The ``l2CacheHint`` operand is optional, and it is used to specify cache eviction policy that may be used during the memory access. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.cp.async.bulk.tensor.reduce' .. py:attribute:: _ODS_OPERAND_SEGMENTS .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: tmaDescriptor() -> _ods_ir .. 
py:method:: srcMem() -> _ods_ir

.. py:method:: coordinates() -> _ods_ir

.. py:method:: l2CacheHint() -> Optional[_ods_ir]

.. py:method:: redKind() -> _ods_ir

.. py:method:: mode() -> _ods_ir

.. py:function:: cp_async_bulk_tensor_reduce(tma_descriptor, src_mem, red_kind, coordinates, *, mode=None, l2_cache_hint=None, loc=None, ip=None) -> CpAsyncBulkTensorReduceOp

.. py:class:: CpAsyncBulkTensorSharedCTAToGlobalOp(tmaDescriptor, srcMem, coordinates, *, l2CacheHint=None, mode=None, predicate=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   Initiates an asynchronous copy of the tensor data from shared::cta memory to global memory. This Op supports all the store modes specified in ``TMAStoreMode``.

   The ``l2CacheHint`` operand is optional, and it is used to specify cache eviction policy that may be used during the memory access.

   For more information, see the PTX ISA.

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.cp.async.bulk.tensor.global.shared.cta'

   .. py:attribute:: _ODS_OPERAND_SEGMENTS

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: tmaDescriptor() -> _ods_ir

   .. py:method:: srcMem() -> _ods_ir

   .. py:method:: coordinates() -> _ods_ir

   .. py:method:: l2CacheHint() -> Optional[_ods_ir]

   .. py:method:: predicate() -> Optional[_ods_ir]

   .. py:method:: mode() -> _ods_ir

.. py:function:: cp_async_bulk_tensor_global_shared_cta(tma_descriptor, src_mem, coordinates, *, l2_cache_hint=None, mode=None, predicate=None, loc=None, ip=None) -> CpAsyncBulkTensorSharedCTAToGlobalOp

.. py:class:: CpAsyncBulkWaitGroupOp(group, *, read=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   This Op waits for the completion of the most recent bulk async-groups. The ``$group`` operand specifies that the executing thread waits until only ``$group`` or fewer of the most recent bulk async-groups are pending. If ``$group`` is 0, the op waits until all of the most recent bulk async-groups have completed.

   The ``$read`` attribute indicates that the wait only has to last until all bulk async operations in the specified bulk async-group have completed reading from their source locations.

   For more information, see the PTX ISA.

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.cp.async.bulk.wait_group'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: group() -> _ods_ir

   .. py:method:: read() -> bool

.. py:function:: cp_async_bulk_wait_group(group, *, read=None, loc=None, ip=None) -> CpAsyncBulkWaitGroupOp

.. py:class:: CpAsyncCommitGroupOp(*, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.cp.async.commit.group'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

.. py:function:: cp_async_commit_group(*, loc=None, ip=None) -> CpAsyncCommitGroupOp

.. py:class:: CpAsyncMBarrierArriveOp(addr, *, noinc=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   The ``cp.async.mbarrier.arrive`` Op makes the *mbarrier object* track all prior cp.async operations initiated by the executing thread. The ``addr`` operand specifies the address of the *mbarrier object* in generic or shared::cta address space. When it is generic, the underlying memory should fall within the shared::cta space; otherwise the behavior is undefined. The ``noinc`` attribute impacts how the mbarrier's state is updated.

   For more information, see the PTX ISA.

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.cp.async.mbarrier.arrive'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: addr() -> _ods_ir

   .. py:method:: noinc() -> _ods_ir

.. py:function:: cp_async_mbarrier_arrive(addr, *, noinc=None, loc=None, ip=None) -> CpAsyncMBarrierArriveOp
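As a rough illustration of how the asynchronous-copy group ops documented here compose, the hedged sketch below commits the outstanding ``cp.async`` operations into a group and then waits for every committed group to complete. The setup mirrors the earlier sketch: the container function is again a stand-in, the copies themselves are elided, and the group count is passed as a plain Python integer on the assumption that the generated builders accept ints for integer attributes.

.. code:: python

   from mlir.ir import Context, FunctionType, InsertionPoint, Location, Module
   from mlir.dialects import func, nvvm

   with Context(), Location.unknown():
       module = Module.create()
       with InsertionPoint(module.body):
           # Hypothetical container function, for illustration only.
           fn = func.FuncOp("drain_cp_async", FunctionType.get([], []))
           with InsertionPoint(fn.add_entry_block()):
               # ... cp.async copies (e.g. nvvm.cp_async_shared_global) go here ...
               nvvm.cp_async_commit_group()  # close the current cp.async group
               nvvm.cp_async_wait_group(0)   # 0: wait for all committed groups
               func.ReturnOp([])

..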
py:class:: CpAsyncOp(dst, src, size, modifier, *, cpSize=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.cp.async.shared.global' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: dst() -> _ods_ir .. py:method:: src() -> _ods_ir .. py:method:: cpSize() -> Optional[_ods_ir] .. py:method:: size() -> _ods_ir .. py:method:: modifier() -> _ods_ir .. py:function:: cp_async_shared_global(dst, src, size, modifier, *, cp_size=None, loc=None, ip=None) -> CpAsyncOp .. py:class:: CpAsyncWaitGroupOp(n, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.cp.async.wait.group' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: n() -> _ods_ir .. py:function:: cp_async_wait_group(n, *, loc=None, ip=None) -> CpAsyncWaitGroupOp .. py:class:: DotAccumulate2WayOp(res, a, a_type, b, b_type, c, b_hi, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` Performs a two-way 16-bit to 8-bit dot-product which is accumulated in a 32-bit result. Operand ``a`` is a vector of two 16-bit elements and operand ``b`` a vector of four 8-bit elements between which the dot product is computed. The ``a_type`` and ``b_type`` attributes specify the type of the elements in ``a`` and ``b`` respectively. If ``a_type`` or ``b_type`` is ``s``, then the elements in the corresponding vector are sign-extended to 32-bit before the dot product is computed. If ``a_type`` or ``b_type`` is ``u``, then the elements in the corresponding vector are zero-extended to 32-bit instead. The ``b_hi`` boolean attribute specifies which two bytes of ``b`` are used for the dot product. If ``b_hi`` is true, then the dot product is computed between ``a`` and elements at indices 2 and 3 of ``b``. If ``b_hi`` is false, then the dot product is computed between ``a`` and elements at indices 0 and 1 of ``b``. Operand ``c`` is a 32-bit integer to which the result is accumulated. It is treated as holding a signed integer if any of ``a_type`` or ``b_type`` is signed. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.dot.accumulate.2way' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: a() -> _ods_ir .. py:method:: b() -> _ods_ir .. py:method:: c() -> _ods_ir .. py:method:: a_type() -> _ods_ir .. py:method:: b_type() -> _ods_ir .. py:method:: b_hi() -> _ods_ir .. py:method:: res() -> _ods_ir .. py:function:: dot_accumulate_2way(res, a, a_type, b, b_type, c, b_hi, *, loc=None, ip=None) -> _ods_ir .. py:class:: DotAccumulate4WayOp(res, a, a_type, b, b_type, c, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` Performs a four-way byte dot-product which is accumulated in a 32-bit result. Operand ``a`` and ``b`` are vectors of 4 bytes between which the dot product is computed. The ``a_type`` and ``b_type`` attributes specify the type of the elements in ``a`` and ``b`` respectively. If ``a_type`` or ``b_type`` is ``signed``, then the elements in the corresponding vector are sign-extended to 32-bit before the dot product is computed. If ``a_type`` or ``b_type`` is ``unsigned``, then the elements in the corresponding vector are zero-extended to 32-bit instead. Operand ``c`` is a 32-bit integer to which the result is accumulated. It is treated as holding a signed integer if any of ``a_type`` or ``b_type`` is ``s8``. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.dot.accumulate.4way' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: a() -> _ods_ir .. 
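The extension and accumulation semantics of the two ``dot.accumulate`` variants above can be modelled in plain Python. This hypothetical reference helper (not part of the bindings) mirrors the four-way case:

.. code:: python

   def dot_accumulate_4way_ref(a_bytes, b_bytes, c, a_signed, b_signed):
       """Four-way byte dot product accumulated into a 32-bit integer."""
       def ext(v, signed):
           v &= 0xFF  # keep a single byte
           return v - 0x100 if signed and v >= 0x80 else v

       acc = c
       for x, y in zip(a_bytes, b_bytes):
           acc += ext(x, a_signed) * ext(y, b_signed)
       return acc & 0xFFFFFFFF  # the accumulator is 32 bits wide

   # Signed a, unsigned b: 1*5 + 2*6 + 3*7 + 4*8 + 10 == 80.
   assert dot_accumulate_4way_ref([1, 2, 3, 4], [5, 6, 7, 8], 10, True, False) == 80

..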
py:method:: b() -> _ods_ir .. py:method:: c() -> _ods_ir .. py:method:: a_type() -> _ods_ir .. py:method:: b_type() -> _ods_ir .. py:method:: res() -> _ods_ir .. py:function:: dot_accumulate_4way(res, a, a_type, b, b_type, c, *, loc=None, ip=None) -> _ods_ir .. py:class:: ElectSyncOp(pred, *, membermask=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` The ``elect.sync`` instruction elects one predicated active leader thread from among a set of threads specified in the ``membermask``. When the ``membermask`` is not provided explicitly, a default value of ``0xFFFFFFFF`` is used. The predicate result is set to ``True`` for the leader thread, and ``False`` for all other threads. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.elect.sync' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: membermask() -> Optional[_ods_ir] .. py:method:: pred() -> _ods_ir .. py:function:: elect_sync(pred, *, membermask=None, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg0Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg0' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_envreg0(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg1Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg1' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_envreg1(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg2Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg2' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_envreg2(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg3Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg3' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_envreg3(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg4Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg4' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_envreg4(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg5Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg5' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_envreg5(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg6Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg6' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_envreg6(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg7Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg7' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_envreg7(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg8Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. 
py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg8' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_envreg8(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg9Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg9' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_envreg9(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg10Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg10' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_envreg10(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg11Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg11' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_envreg11(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg12Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg12' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_envreg12(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg13Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg13' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_envreg13(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg14Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg14' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_envreg14(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg15Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg15' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_envreg15(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg16Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg16' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_envreg16(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg17Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg17' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_envreg17(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg18Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg18' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_envreg18(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg19Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg19' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. 
py:function:: read_ptx_sreg_envreg19(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg20Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg20' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_envreg20(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg21Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg21' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_envreg21(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg22Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg22' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_envreg22(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg23Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg23' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_envreg23(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg24Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg24' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_envreg24(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg25Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg25' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_envreg25(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg26Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg26' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_envreg26(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg27Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg27' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_envreg27(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg28Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg28' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_envreg28(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg29Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg29' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_envreg29(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg30Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg30' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_envreg30(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: EnvReg31Op(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. 
py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.envreg31' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_envreg31(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: Exit(*, loc=None, ip=None) Bases: :py:obj:`_ods_ir` Ends execution of a thread. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.exit' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:function:: exit(*, loc=None, ip=None) -> Exit .. py:class:: FenceMbarrierInitOp(*, loc=None, ip=None) Bases: :py:obj:`_ods_ir` Fence operation that applies to the prior ``nvvm.mbarrier.init``. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.fence.mbarrier.init' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:function:: fence_mbarrier_init(*, loc=None, ip=None) -> FenceMbarrierInitOp .. py:class:: FenceProxyAcquireOp(scope, addr, size, *, fromProxy=None, toProxy=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` ``fence.proxy.acquire`` is a uni-directional fence used to establish ordering between a prior memory access performed via the generic proxy and a subsequent memory access performed via the tensormap proxy. The address operand ``addr`` and the operand ``size`` together specify the memory range ``[addr, addr+size)`` on which the ordering guarantees on the memory accesses across the proxies are to be provided. The only supported value for the ``size`` operand is 128, and it must be an immediate. Generic Addressing is used unconditionally, and the address specified by the operand ``addr`` must fall within the ``.global`` state space. Otherwise, the behavior is undefined. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.fence.proxy.acquire' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: addr() -> _ods_ir .. py:method:: size() -> _ods_ir .. py:method:: scope() -> _ods_ir .. py:method:: fromProxy() -> _ods_ir .. py:method:: toProxy() -> _ods_ir .. py:function:: fence_proxy_acquire(scope, addr, size, *, from_proxy=None, to_proxy=None, loc=None, ip=None) -> FenceProxyAcquireOp .. py:class:: FenceProxyOp(kind, *, space=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` Fence operation with proxy to establish an ordering between memory accesses that may happen through different proxies. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.fence.proxy' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: kind() -> _ods_ir .. py:method:: space() -> Optional[_ods_ir] .. py:function:: fence_proxy(kind, *, space=None, loc=None, ip=None) -> FenceProxyOp .. py:class:: FenceProxyReleaseOp(scope, *, fromProxy=None, toProxy=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` ``fence.proxy.release`` is a uni-directional fence used to establish ordering between a prior memory access performed via the generic proxy and a subsequent memory access performed via the tensormap proxy. The ``fence.proxy.release`` operation can form a release sequence that synchronizes with an acquire sequence that contains the ``fence.proxy.acquire`` proxy fence operation, as sketched below. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.fence.proxy.release' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: scope() -> _ods_ir .. py:method:: fromProxy() -> _ods_ir .. py:method:: toProxy() -> _ods_ir .. py:function:: fence_proxy_release(scope, *, from_proxy=None, to_proxy=None, loc=None, ip=None) -> FenceProxyReleaseOp ..
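The release/acquire pairing can be made concrete with a hedged sketch (``nvvm`` is the assumed re-export of this generated module; ``scope``, ``addr``, and ``size128`` are values and attributes built by the caller, and in a real program the release and the acquire would execute in different threads):

.. code:: python

   from mlir.dialects import nvvm  # assumed re-export of this generated module

   def publish_to_tensormap_proxy(scope, addr, size128):
       """Hypothetical helper pairing the two uni-directional fences.
       size128 must be the immediate 128, the only supported size."""
       # Producer side: order prior generic-proxy writes...
       nvvm.fence_proxy_release(scope)
       # ...consumer side, after synchronizing with the release: make them
       # visible to tensormap-proxy accesses on [addr, addr+128).
       nvvm.fence_proxy_acquire(scope, addr, size128)

..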
py:class:: FenceScClusterOp(*, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.fence.sc.cluster' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:function:: fence_sc_cluster(*, loc=None, ip=None) -> FenceScClusterOp .. py:class:: GlobalTimerLoOp(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.globaltimer.lo' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_globaltimer_lo(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: GlobalTimerOp(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.globaltimer' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_globaltimer(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: GridDimXOp(res, *, range=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.nctaid.x' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: range() -> Optional[_ods_ir] .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_nctaid_x(res, *, range=None, loc=None, ip=None) -> _ods_ir .. py:class:: GridDimYOp(res, *, range=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.nctaid.y' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: range() -> Optional[_ods_ir] .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_nctaid_y(res, *, range=None, loc=None, ip=None) -> _ods_ir .. py:class:: GridDimZOp(res, *, range=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.nctaid.z' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: range() -> Optional[_ods_ir] .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_nctaid_z(res, *, range=None, loc=None, ip=None) -> _ods_ir .. py:class:: GridIdOp(res, *, range=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.gridid' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: range() -> Optional[_ods_ir] .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_gridid(res, *, range=None, loc=None, ip=None) -> _ods_ir .. py:class:: GriddepcontrolOp(kind, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` If the $kind attribute is set to ``wait``, it causes the executing thread to wait until all prerequisite grids in flight have completed and all the memory operations from the prerequisite grids are performed and made visible to the current grid. When the $kind is ``launch_dependents``, it signals that the dependent grids that the runtime system has designated to react to this instruction can be scheduled as soon as all other CTAs in the grid have issued the same instruction or have completed. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.griddepcontrol' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: kind() -> _ods_ir .. py:function:: griddepcontrol(kind, *, loc=None, ip=None) -> GriddepcontrolOp .. py:class:: InlinePtxOp(writeOnlyArgs, readOnlyArgs, readWriteArgs, ptxCode, *, predicate=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` This op allows using PTX directly within the NVVM dialect, while greatly simplifying llvm.inline_asm generation.
It automatically handles register size selection and sets the correct read/write access for each operand. The operation leverages the ``BasicPtxBuilderInterface`` to abstract away low-level details of PTX assembly formatting. The ``predicate`` attribute is used to specify a predicate for the PTX instruction.

Example 1: Read-only Parameters

```mlir
nvvm.inline_ptx "mbarrier.init.b64 [$0], $1;" (%barrier_gen, %count) : !llvm.ptr, i32

// Lowers to:
llvm.inline_asm has_side_effects asm_dialect = att
  "mbarrier.init.b64 [$0], $1;", "l,r" %arg0, %arg2 : (!llvm.ptr, i32) -> ()
```

Example 2: Read-only and Write-only Parameters

```mlir
%0 = nvvm.inline_ptx "ex2.approx.ftz.f32 $0, $1;" (%input) : f32 -> f32

// Lowers to:
%0 = llvm.inline_asm has_side_effects asm_dialect = att
  "ex2.approx.ftz.f32 $0, $1;", "=f,f" %arg0 : (f32) -> f32
```

Example 3: Predicate Usage

```mlir
nvvm.inline_ptx "mbarrier.init.b64 [$0], $1;" (%barrier_gen, %count), predicate = %pred : !llvm.ptr, i32, i1

// Lowers to:
llvm.inline_asm has_side_effects asm_dialect = att
  "@$2 mbarrier.init.b64 [$0], $1;", "l,r,b" %arg0, %arg2, %arg3 : (!llvm.ptr, i32, i1) -> ()
```

.. py:attribute:: OPERATION_NAME :value: 'nvvm.inline_ptx' .. py:attribute:: _ODS_OPERAND_SEGMENTS .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: readOnlyArgs() -> _ods_ir .. py:method:: readWriteArgs() -> _ods_ir .. py:method:: predicate() -> Optional[_ods_ir] .. py:method:: ptxCode() -> _ods_ir .. py:method:: writeOnlyArgs() -> _ods_ir .. py:function:: inline_ptx(write_only_args, read_only_args, read_write_args, ptx_code, *, predicate=None, loc=None, ip=None) -> Union[_ods_ir, _ods_ir, InlinePtxOp] .. py:class:: LaneIdOp(res, *, range=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.laneid' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: range() -> Optional[_ods_ir] .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_laneid(res, *, range=None, loc=None, ip=None) -> _ods_ir .. py:class:: LaneMaskEqOp(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.lanemask.eq' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_lanemask_eq(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: LaneMaskGeOp(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.lanemask.ge' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_lanemask_ge(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: LaneMaskGtOp(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.lanemask.gt' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_lanemask_gt(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: LaneMaskLeOp(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.lanemask.le' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_lanemask_le(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: LaneMaskLtOp(res, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.lanemask.lt' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: res() -> _ods_ir ..
py:function:: read_ptx_sreg_lanemask_lt(res, *, loc=None, ip=None) -> _ods_ir .. py:class:: LdMatrixOp(res, ptr, num, layout, shape, eltType, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.ldmatrix' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: ptr() -> _ods_ir .. py:method:: num() -> _ods_ir .. py:method:: layout() -> _ods_ir .. py:method:: shape() -> _ods_ir .. py:method:: eltType() -> _ods_ir .. py:method:: res() -> _ods_ir .. py:function:: ldmatrix(res, ptr, num, layout, shape, elt_type, *, loc=None, ip=None) -> _ods_ir .. py:class:: MBarrierArriveExpectTxOp(addr, txcount, *, predicate=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` The ``nvvm.mbarrier.arrive.expect_tx`` operation performs an expect-tx operation followed by an arrive-on operation on the *mbarrier object*. Uses the default ``.release.cta`` semantics. This release pattern establishes memory ordering for operations occurring in program order before this arrive instruction by making operations from the current thread visible to subsequent operations in other threads within the CTA. When other threads perform corresponding acquire operations (like 'mbarrier.test.wait'), they synchronize with this release pattern. This operation first performs an expect-tx operation with the specified transaction count, then performs an arrive-on operation with an implicit count of 1. The expect-tx operation increases the tx-count of the *mbarrier object* by the specified expectCount value, setting the current phase to expect and tracks the completion of additional asynchronous transactions. The operation takes the following operands: * ``addr``: A pointer to the memory location of the *mbarrier object*. Uses generic addressing, but the address must still be in the shared memory space. * ``txcount``: An unsigned integer specifying the expected transaction count for the expect-tx operation. This represents the number of asynchronous transactions expected to complete before the barrier phase completes. * ``predicate``: Optional predicate for conditional execution. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.mbarrier.arrive.expect_tx' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: addr() -> _ods_ir .. py:method:: txcount() -> _ods_ir .. py:method:: predicate() -> Optional[_ods_ir] .. py:function:: mbarrier_arrive_expect_tx(addr, txcount, *, predicate=None, loc=None, ip=None) -> MBarrierArriveExpectTxOp .. py:class:: MBarrierArriveNocompleteOp(res, addr, count, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` The ``nvvm.mbarrier.arrive.nocomplete`` operation performs an arrive-on operation on the *mbarrier object* with the guarantee that it will not cause the barrier to complete its current phase. Uses the default ``.release.cta`` semantics. This release pattern establishes memory ordering for operations occurring in program order before this arrive instruction by making operations from the current thread visible to subsequent operations in other threads within the CTA. When other threads perform corresponding acquire operations (like 'mbarrier.test.wait'), they synchronize with this release pattern. This operation causes the executing thread to signal its arrival at the barrier with a specified count, but ensures that the barrier phase will not complete as a result of this operation. The operation returns an opaque value that captures the phase of the *mbarrier object* prior to the arrive-on operation. 
The operation takes the following operands: * ``addr``: A pointer to the memory location of the *mbarrier object*. The ``addr`` must be a pointer to generic or shared::cta memory. When it is generic, the underlying address must be within the shared::cta memory space; otherwise the behavior is undefined. * ``count``: Integer specifying the count argument to the arrive-on operation. Must be in the valid range as specified in the *mbarrier object* contents. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.mbarrier.arrive.nocomplete' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: addr() -> _ods_ir .. py:method:: count() -> _ods_ir .. py:method:: res() -> _ods_ir .. py:function:: mbarrier_arrive_nocomplete(res, addr, count, *, loc=None, ip=None) -> _ods_ir .. py:class:: MBarrierArriveOp(res, addr, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` The ``nvvm.mbarrier.arrive`` operation performs an arrive-on operation on the *mbarrier object* at the specified address. Uses the default ``.release.cta`` semantics. This release pattern establishes memory ordering for operations occurring in program order before this arrive instruction by making operations from the current thread visible to subsequent operations in other threads within the CTA. When other threads perform corresponding acquire operations (like 'mbarrier.test.wait'), they synchronize with this release pattern. This operation causes the executing thread to signal its arrival at the barrier. The operation returns an opaque value that captures the phase of the *mbarrier object* prior to the arrive-on operation. The contents of this state value are implementation-specific. The operation takes the following operand: * ``addr``: A pointer to the memory location of the *mbarrier object*. The ``addr`` must be a pointer to generic or shared::cta memory. When it is generic, the underlying address must be within the shared::cta memory space; otherwise the behavior is undefined. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.mbarrier.arrive' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: addr() -> _ods_ir .. py:method:: res() -> _ods_ir .. py:function:: mbarrier_arrive(res, addr, *, loc=None, ip=None) -> _ods_ir .. py:class:: MBarrierInitOp(addr, count, *, predicate=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` The ``nvvm.mbarrier.init`` operation initializes an *mbarrier object* at the specified memory location. This operation initializes the *mbarrier object* with the following state: * Current phase: 0 * Expected arrival count: ``count`` * Pending arrival count: ``count`` * Transaction count (tx-count): 0 The operation takes the following operands: * ``addr``: A pointer to the memory location of the *mbarrier object*. The ``addr`` must be a pointer to generic or shared::cta memory. When it is generic, the underlying address must be within the shared::cta memory space; otherwise the behavior is undefined. * ``count``: Integer specifying the number of threads that will participate in barrier synchronization. Must be in the range [1, 2²⁰ - 1]. * ``predicate``: Optional predicate for conditional execution. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.mbarrier.init' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: addr() -> _ods_ir .. py:method:: count() -> _ods_ir .. py:method:: predicate() -> Optional[_ods_ir] .. 
py:function:: mbarrier_init(addr, count, *, predicate=None, loc=None, ip=None) -> MBarrierInitOp .. py:class:: MBarrierInvalOp(addr, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` The ``nvvm.mbarrier.inval`` operation invalidates an *mbarrier object* at the specified memory location. This operation marks the *mbarrier object* as invalid, making it safe to repurpose the memory location for other uses or to reinitialize it as a new *mbarrier object*. It is undefined behavior if the *mbarrier object* is already invalid. The operation takes the following operand: * ``addr``: A pointer to the memory location of the *mbarrier object*. The ``addr`` must be a pointer to generic or shared::cta memory. When it is generic, the underlying address must be within the shared::cta memory space; otherwise the behavior is undefined. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.mbarrier.inval' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: addr() -> _ods_ir .. py:function:: mbarrier_inval(addr, *, loc=None, ip=None) -> MBarrierInvalOp .. py:class:: MBarrierTestWaitOp(res, addr, state, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` The ``nvvm.mbarrier.test.wait`` operation performs a non-blocking test for the completion of a specific phase of an *mbarrier object*. It uses the default ``.acquire.cta`` semantics. This acquire pattern establishes memory ordering for operations occurring in program order after this wait instruction by making operations from other threads in the CTA visible to subsequent operations in the current thread. When this wait completes, it synchronizes with the corresponding release pattern from the ``mbarrier.arrive`` operation, establishing memory ordering within the CTA. This operation tests whether the mbarrier phase specified by the state operand has completed. It is a non-blocking instruction that immediately returns the completion status without suspending the executing thread. The operation takes the following operands: * ``addr``: A pointer to the memory location of the *mbarrier object*. Uses generic addressing, but the address must still be in the shared memory space. * ``state``: An opaque value returned by a previous ``mbarrier.arrive`` operation on the same *mbarrier object* during the current or immediately preceding phase. The operation returns a boolean value indicating whether the specified phase has completed: * ``true``: The immediately preceding phase has completed * ``false``: The phase is still incomplete (current phase) **Memory ordering guarantees**: When this wait returns true, the following ordering guarantees hold: #. All memory accesses (except async operations) requested prior to ``mbarrier.arrive`` having release semantics by participating CTA threads are visible to the executing thread. #. All ``cp.async`` operations requested prior to ``cp.async.mbarrier.arrive`` by participating CTA threads are visible to the executing thread. #. All ``cp.async.bulk`` operations using the same *mbarrier object* requested prior to ``mbarrier.arrive`` having release semantics by participating CTA threads are visible to the executing thread. #. Memory accesses requested after this wait are not visible to memory accesses performed prior to ``mbarrier.arrive`` by other participating threads. #. No ordering guarantee exists for memory accesses by the same thread between ``mbarrier.arrive`` and this wait. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.mbarrier.test.wait' .. 
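Putting the mbarrier pieces together, here is a hedged sketch of the typical lifecycle through the builders documented above (``nvvm`` is the assumed re-export of this generated module; ``barrier_ptr`` and ``count`` are values built by the caller, and ``i64``/``i1`` are assumed result types for the opaque state and the wait predicate):

.. code:: python

   from mlir.dialects import nvvm  # assumed re-export of this generated module

   def mbarrier_lifecycle(barrier_ptr, count, i64, i1):
       """Hypothetical helper: init, arrive, poll, then invalidate."""
       nvvm.mbarrier_init(barrier_ptr, count)
       # Each participant signals arrival; the opaque state captures the
       # phase prior to the arrive-on operation.
       state = nvvm.mbarrier_arrive(i64, barrier_ptr)
       # Non-blocking poll: true once the phase captured in `state` completes.
       done = nvvm.mbarrier_test_wait(i1, barrier_ptr, state)
       # Once the barrier memory is to be repurposed:
       nvvm.mbarrier_inval(barrier_ptr)
       return done

..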
py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: addr() -> _ods_ir .. py:method:: state() -> _ods_ir .. py:method:: res() -> _ods_ir .. py:function:: mbarrier_test_wait(res, addr, state, *, loc=None, ip=None) -> _ods_ir .. py:class:: MBarrierTryWaitParityOp(addr, phase, ticks, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` The ``nvvm.mbarrier.try_wait.parity`` operation performs a potentially-blocking test for the completion of a specific phase of an *mbarrier object* using phase parity. It uses the default ``.acquire.cta`` semantics. This acquire pattern establishes memory ordering for operations occurring in program order after this wait instruction by making operations from other threads in the CTA visible to subsequent operations in the current thread. When this wait completes, it synchronizes with the corresponding release pattern from the ``mbarrier.arrive`` operation, establishing memory ordering within the CTA. This operation waits for the completion of the mbarrier phase indicated by the phase parity. While it uses the underlying PTX ``mbarrier.try_wait.parity`` instruction, this MLIR operation generates a loop that enforces the test to complete before continuing execution, ensuring the barrier phase is actually completed rather than potentially timing out. The operation takes the following operands: * ``addr``: A pointer to the memory location of the *mbarrier object*. Uses generic addressing, but the address must still be in the shared memory space. * ``phase``: An integer specifying the phase parity (0 or 1). Even phases have parity 0, odd phases have parity 1. * ``ticks``: An unsigned integer specifying the suspend time hint in nanoseconds. This may be used instead of the system-dependent time limit. **Memory ordering guarantees**: When this wait returns true, the following ordering guarantees hold: #. All memory accesses (except async operations) requested prior to ``mbarrier.arrive`` having release semantics by participating CTA threads are visible to the executing thread. #. All ``cp.async`` operations requested prior to ``cp.async.mbarrier.arrive`` by participating CTA threads are visible to the executing thread. #. All ``cp.async.bulk`` operations using the same *mbarrier object* requested prior to ``mbarrier.arrive`` having release semantics by participating CTA threads are visible to the executing thread. #. Memory accesses requested after this wait are not visible to memory accesses performed prior to ``mbarrier.arrive`` by other participating threads. #. No ordering guarantee exists for memory accesses by the same thread between ``mbarrier.arrive`` and this wait. **Implementation behavior**: This operation generates a PTX loop that repeatedly calls the underlying ``mbarrier.try_wait.parity`` instruction until the barrier phase completes. Unlike the raw PTX instruction which may return without completion after a timeout, this MLIR operation guarantees completion by continuing to loop until the specified phase is reached. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.mbarrier.try_wait.parity' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: addr() -> _ods_ir .. py:method:: phase() -> _ods_ir .. py:method:: ticks() -> _ods_ir .. py:function:: mbarrier_try_wait_parity(addr, phase, ticks, *, loc=None, ip=None) -> MBarrierTryWaitParityOp .. py:class:: MapaOp(res, a, b, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.mapa' .. 
py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: a() -> _ods_ir .. py:method:: b() -> _ods_ir .. py:method:: res() -> _ods_ir .. py:function:: mapa(res, a, b, *, loc=None, ip=None) -> _ods_ir .. py:class:: MatchSyncOp(res, thread_mask, val, kind, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` The ``match.sync`` op performs broadcast and compare of operand ``val`` across all non-exited threads in ``thread_mask`` and returns a mask depending on the kind and an optional predicate. The matching operation kinds are: * ``any``: Returns a mask corresponding to the non-exited threads in the ``thread_mask`` that have the same value of operand ``val``. * ``all``: Returns a mask and a predicate. If all non-exited threads in the ``thread_mask`` have the same value of operand ``val``, the predicate is set to true and the mask corresponds to the non-exited threads in the ``thread_mask``. Otherwise, the predicate is set to false and the mask is 0. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.match.sync' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: thread_mask() -> _ods_ir .. py:method:: val() -> _ods_ir .. py:method:: kind() -> _ods_ir .. py:method:: res() -> _ods_ir .. py:function:: match_sync(res, thread_mask, val, kind, *, loc=None, ip=None) -> _ods_ir .. py:class:: MembarOp(scope, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` The ``membar`` operation guarantees that prior memory accesses requested by this thread are performed at the specified ``scope``, before later memory operations requested by this thread following the membar instruction. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.memory.barrier' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: scope() -> _ods_ir .. py:function:: memory_barrier(scope, *, loc=None, ip=None) -> MembarOp .. py:class:: MmaOp(res, shape, layoutA, layoutB, operandA, operandB, operandC, *, b1Op=None, intOverflowBehavior=None, multiplicandAPtxType=None, multiplicandBPtxType=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` The ``nvvm.mma.sync`` operation collectively performs the operation ``D = matmul(A, B) + C`` using all threads in a warp. All the threads in the warp must execute the same ``mma.sync`` operation. For each possible multiplicand PTX data type, there are one or more possible instruction shapes given as "mMnNkK". The table below describes the possibilities as well as the types required for the operands. Note that the data type for C (the accumulator) and D (the result) can vary independently when there are multiple possibilities in the "C/D Type" column. When an optional attribute cannot be immediately inferred from the types of the operands and the result during parsing or validation, an error will be raised. ``b1Op`` is only relevant when the binary (b1) type is given to ``multiplicandDataType``. It specifies how the multiply-and-accumulate is performed and is either ``xor_popc`` or ``and_popc``. The default is ``xor_popc``. ``intOverflowBehavior`` is only relevant when the ``multiplicandType`` attribute is one of ``u8, s8, u4, s4``; it describes how overflow is handled in the accumulator. When the attribute is ``satfinite``, the accumulator values are clamped in the int32 range on overflow. This is the default behavior. Alternatively, accumulator behavior ``wrapped`` can also be specified, in which case overflow wraps from one end of the range to the other.
``layoutA`` and ``layoutB`` are required and should generally be set to ``#nvvm.mma_layout<row>`` and ``#nvvm.mma_layout<col>`` respectively, but other combinations are possible for certain layouts according to the table below.

.. code::

   | A/B Type | Shape     | ALayout | BLayout | A Type   | B Type   | C/D Type          |
   |----------|-----------|---------|---------|----------|----------|-------------------|
   | f64      | .m8n8k4   | row     | col     | 1x f64   | 1x f64   | 2x f64            |
   | f16      | .m8n8k4   | row/col | row/col | 2x f16x2 | 2x f16x2 | 4x f16x2 or 8xf32 |
   |          | .m16n8k8  | row     | col     | 2x f16x2 | 1x f16x2 | 2x f16x2 or 4 f32 |
   |          | .m16n8k16 | row     | col     | 4x f16x2 | 2x f16x2 | 2x f16x2 or 4 f32 |
   | bf16     | .m16n8k8  | row     | col     | 2x i32   | 1x i32   | 4x f32            |
   |          | .m16n8k16 | row     | col     | 4x i32   | 2x i32   | 4x f32            |
   | tf32     | .m16n8k4  | row     | col     | 2x i32   | 1x i32   | 4x f32            |
   |          | .m16n8k8  | row     | col     | 4x i32   | 2x i32   | 2x f16x2 or 4 f32 |
   | u8/s8    | .m8n8k16  | row     | col     | 1x i32   | 1x i32   | 2x i32            |
   |          | .m16n8k16 | row     | col     | 2x i32   | 1x i32   | 4x i32            |
   |          | .m16n8k32 | row     | col     | 4x i32   | 2x i32   | 4x i32            |
   | u4/s4    | .m8n8k32  | row     | col     | 1x i32   | 1x i32   | 2x i32            |
   |          | m16n8k32  | row     | col     | 2x i32   | 1x i32   | 4x i32            |
   |          | m16n8k64  | row     | col     | 4x i32   | 2x i32   | 4x i32            |
   | b1       | m8n8k128  | row     | col     | 1x i32   | 1x i32   | 2x i32            |
   |          | m16n8k128 | row     | col     | 2x i32   | 1x i32   | 4x i32            |

Example:

.. code:: mlir

   %128 = nvvm.mma.sync A[%120, %121, %122, %123]
                        B[%124, %125]
                        C[%126, %127]
                        {layoutA = #nvvm.mma_layout<row>,
                         layoutB = #nvvm.mma_layout<col>,
                         shape = {k = 16 : i32, m = 16 : i32, n = 8 : i32}}
       : (vector<2xf16>, vector<2xf16>, vector<2xf16>)
         -> !llvm.struct<(vector<2xf16>, vector<2xf16>)>

.. py:attribute:: OPERATION_NAME :value: 'nvvm.mma.sync' .. py:attribute:: _ODS_OPERAND_SEGMENTS .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: operandA() -> _ods_ir .. py:method:: operandB() -> _ods_ir .. py:method:: operandC() -> _ods_ir .. py:method:: shape() -> _ods_ir .. py:method:: b1Op() -> Optional[_ods_ir] .. py:method:: intOverflowBehavior() -> Optional[_ods_ir] .. py:method:: layoutA() -> _ods_ir .. py:method:: layoutB() -> _ods_ir .. py:method:: multiplicandAPtxType() -> Optional[_ods_ir] .. py:method:: multiplicandBPtxType() -> Optional[_ods_ir] .. py:method:: res() -> _ods_ir .. py:function:: mma_sync(res, shape, layout_a, layout_b, operand_a, operand_b, operand_c, *, b1_op=None, int_overflow_behavior=None, multiplicand_a_ptx_type=None, multiplicand_b_ptx_type=None, loc=None, ip=None) -> _ods_ir .. py:class:: NanosleepOp(duration, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` The op suspends the thread for a sleep duration close to the delay ``$duration``, specified in nanoseconds. The sleep duration is approximated, but guaranteed to be in the interval [0, 2*t], where t is the requested duration. The maximum sleep duration is 1 millisecond. The implementation may reduce the sleep duration for individual threads within a warp such that all sleeping threads in the warp wake up together. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.nanosleep' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: duration() -> _ods_ir .. py:function:: nanosleep(duration, *, loc=None, ip=None) -> NanosleepOp .. py:class:: PMEventOp(*, maskedEventId=None, eventId=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` Triggers one or more of a fixed number of performance monitor events, with the event index or mask specified by an immediate operand.
Without ``mask`` it triggers a single performance monitor event indexed by immediate operand a, in the range 0..15. With ``mask`` it triggers one or more of the performance monitor events. Each bit in the 16-bit immediate operand controls an event. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.pmevent' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: maskedEventId() -> Optional[_ods_ir] .. py:method:: eventId() -> Optional[_ods_ir] .. py:function:: pmevent(*, masked_event_id=None, event_id=None, loc=None, ip=None) -> PMEventOp .. py:class:: PrefetchOp(addr, *, cacheLevel=None, evictPriority=None, predicate=None, tensormap=None, uniform=None, in_param_space=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` Prefetches the cache line containing the address given by ``addr``. The operand may be a global, local, or generic pointer. When ``tensormap`` is specified, the operand may instead be a constant or generic pointer. If the address maps to shared memory, the operation has no effect. At most one of ``cacheLevel`` or ``tensormap`` may be present. The ``cacheLevel`` attribute selects the target cache level. When combined with ``uniform``, the prefetch is performed to the uniform cache, in which case ``addr`` must be a generic pointer. When ``tensormap`` is used, the line containing ``addr`` is brought from the constant or parameter state space for later use by ``cp.async.bulk.tensor``. If ``in_param_space`` is specified, the generic pointer is interpreted as referring to the parameter state space. ``uniform`` can be specified after the ``cacheLevel`` to indicate that the prefetch is performed to the specified uniform cache level. If ``uniform`` is specified, ``addr`` must be a generic address pointer and no operation is performed if ``addr`` maps to a ``const``, ``local``, or ``shared`` memory location. The ``evictPriority`` attribute is optional and specifies the cache eviction priority when ``cacheLevel`` is L2. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.prefetch' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: addr() -> _ods_ir .. py:method:: predicate() -> Optional[_ods_ir] .. py:method:: cacheLevel() -> Optional[_ods_ir] .. py:method:: evictPriority() -> Optional[_ods_ir] .. py:method:: tensormap() -> bool .. py:method:: uniform() -> bool .. py:method:: in_param_space() -> bool .. py:function:: prefetch(addr, *, cache_level=None, evict_priority=None, predicate=None, tensormap=None, uniform=None, in_param_space=None, loc=None, ip=None) -> PrefetchOp .. py:class:: RcpApproxFtzF32Op(res, arg, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.rcp.approx.ftz.f' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: arg() -> _ods_ir .. py:method:: res() -> _ods_ir .. py:function:: rcp_approx_ftz_f(res, arg, *, loc=None, ip=None) -> _ods_ir .. py:class:: ReduxOp(res, val, kind, mask_and_clamp, *, abs=None, nan=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` ``redux.sync`` performs a reduction operation ``kind`` of the 32 bit source register across all non-exited threads in the membermask. The ``abs`` and ``nan`` attributes can be used in the case of f32 input type, where the ``abs`` attribute causes the absolute value of the input to be used in the reduction operation, and the ``nan`` attribute causes the reduction operation to return NaN if any of the inputs to participating threads are NaN. 
`For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.redux.sync' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: val() -> _ods_ir .. py:method:: mask_and_clamp() -> _ods_ir .. py:method:: kind() -> _ods_ir .. py:method:: abs() -> _ods_ir .. py:method:: nan() -> _ods_ir .. py:method:: res() -> _ods_ir .. py:function:: redux_sync(res, val, kind, mask_and_clamp, *, abs=None, nan=None, loc=None, ip=None) -> _ods_ir .. py:class:: SetMaxRegisterOp(regCount, action, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.setmaxregister' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: regCount() -> _ods_ir .. py:method:: action() -> _ods_ir .. py:function:: setmaxregister(reg_count, action, *, loc=None, ip=None) -> SetMaxRegisterOp .. py:class:: ShflOp(res, thread_mask, val, offset, mask_and_clamp, kind, *, return_value_and_is_valid=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` The ``shfl.sync`` Op implements data shuffle within threads of a warp. The ``thread_mask`` denotes the threads participating in the Op where the bit position corresponds to a particular thread's laneid. The ``offset`` specifies a source lane or source lane offset (depending on ``kind``). The ``val`` is the input value to be copied from the source. The ``mask_and_clamp`` contains two packed values specifying a mask for logically splitting warps into sub-segments and an upper bound for clamping the source lane index. The ``return_value_and_is_valid`` unit attribute can be specified to indicate that the return value is a two-element struct, where the first element is the result value and the second element is a predicate indicating if the computed source lane index is valid. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.shfl.sync' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: thread_mask() -> _ods_ir .. py:method:: val() -> _ods_ir .. py:method:: offset() -> _ods_ir .. py:method:: mask_and_clamp() -> _ods_ir .. py:method:: kind() -> _ods_ir .. py:method:: return_value_and_is_valid() -> bool .. py:method:: res() -> _ods_ir .. py:function:: shfl_sync(res, thread_mask, val, offset, mask_and_clamp, kind, *, return_value_and_is_valid=None, loc=None, ip=None) -> _ods_ir .. py:class:: SmDimOp(res, *, range=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.nsmid' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: range() -> Optional[_ods_ir] .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_nsmid(res, *, range=None, loc=None, ip=None) -> _ods_ir .. py:class:: SmIdOp(res, *, range=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` .. py:attribute:: OPERATION_NAME :value: 'nvvm.read.ptx.sreg.smid' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: range() -> Optional[_ods_ir] .. py:method:: res() -> _ods_ir .. py:function:: read_ptx_sreg_smid(res, *, range=None, loc=None, ip=None) -> _ods_ir .. py:class:: StMatrixOp(ptr, sources, layout, shape, eltType, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` Collectively store one or more matrices across all threads in a warp to the location indicated by the address operand $ptr in shared memory. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.stmatrix' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: ptr() -> _ods_ir .. py:method:: sources() -> _ods_ir .. 
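For the ``shfl.sync`` op above, the two values packed into ``mask_and_clamp`` follow the layout of the PTX ``c`` operand (source-lane clamp in bits 0-4, warp-segment mask in bits 8-12); a small hypothetical helper makes the packing explicit:

.. code:: python

   def pack_mask_and_clamp(segmask, clamp):
       """Pack the warp-segment mask (bits 8-12) and the source-lane
       clamp (bits 0-4) into the single mask_and_clamp operand."""
       assert 0 <= segmask < 32 and 0 <= clamp < 32
       return (segmask << 8) | clamp

   # Whole-warp shuffle with the usual clamp of 31 (0x1f).
   assert pack_mask_and_clamp(0, 31) == 0x1F

..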
py:method:: layout() -> _ods_ir .. py:method:: shape() -> _ods_ir .. py:method:: eltType() -> _ods_ir .. py:function:: stmatrix(ptr, sources, layout, shape, elt_type, *, loc=None, ip=None) -> StMatrixOp .. py:class:: SyncWarpOp(mask, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` The ``nvvm.bar.warp.sync`` operation performs barrier synchronization for threads within a warp. This operation causes the executing thread to wait until all threads corresponding to the ``mask`` operand have executed a ``bar.warp.sync`` with the same mask value before resuming execution. The ``mask`` operand specifies the threads participating in the barrier, where each bit position corresponds to the thread's lane ID within the warp. Only threads with their corresponding bit set in the mask participate in the barrier synchronization. **Important constraints**: * The behavior is undefined if the executing thread is not included in the mask (i.e., the bit corresponding to the thread's lane ID is not set) * For compute capability sm_6x or below, all threads in the mask must execute the same ``bar.warp.sync`` instruction in convergence This operation also guarantees memory ordering among participating threads. Threads within the warp that wish to communicate via memory can store to memory, execute ``bar.warp.sync``, and then safely read values stored by other threads in the warp. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.bar.warp.sync' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: mask() -> _ods_ir .. py:function:: bar_warp_sync(mask, *, loc=None, ip=None) -> SyncWarpOp .. py:class:: Tcgen05AllocOp(addr, nCols, *, group=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` The ``tcgen05.alloc`` Op allocates tensor core memory for the amount specified by ``nCols`` and writes the destination address to the ``addr`` argument. The ``nCols`` operand specifies the number of columns to be allocated and it must be a power-of-two. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.tcgen05.alloc' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: addr() -> _ods_ir .. py:method:: nCols() -> _ods_ir .. py:method:: group() -> _ods_ir .. py:function:: tcgen05_alloc(addr, n_cols, *, group=None, loc=None, ip=None) -> Tcgen05AllocOp .. py:class:: Tcgen05CommitOp(addr, *, multicastMask=None, group=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` The ``tcgen05.commit`` makes the *mbarrier object*, specified by the operand ``addr``, track the completion of all the prior async-tcgen05 operations initiated by the executing thread. The multicast variants allow signaling on the *mbarrier objects* of multiple CTAs within the cluster. Operand ``multicastMask``, when present, specifies the destination CTAs in the cluster such that each bit position in the 16-bit ``multicastMask`` operand corresponds to the ``nvvm.read.ptx.sreg.ctaid`` of the destination CTA. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.tcgen05.commit' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: addr() -> _ods_ir .. py:method:: multicastMask() -> Optional[_ods_ir] .. py:method:: group() -> _ods_ir .. py:function:: tcgen05_commit(addr, *, multicast_mask=None, group=None, loc=None, ip=None) -> Tcgen05CommitOp .. 
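A hedged sketch of the allocate/commit/release flow through these builders (``nvvm`` is the assumed re-export of this generated module; all arguments are values built by the caller, ``mask`` uses the bit-per-ctaid encoding described above, and ``tcgen05_dealloc`` is documented just below):

.. code:: python

   from mlir.dialects import nvvm  # assumed re-export of this generated module

   def tcgen05_alloc_commit_dealloc(addr_ptr, taddr, ncols, mbar, mask):
       """Hypothetical helper: allocate tensor memory, let an mbarrier
       track the async tcgen05 ops, then release the allocation."""
       nvvm.tcgen05_alloc(addr_ptr, ncols)  # ncols must be a power of two
       # ... async tcgen05 operations would be issued here ...
       nvvm.tcgen05_commit(mbar, multicast_mask=mask)
       nvvm.tcgen05_dealloc(taddr, ncols)

..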
py:class:: Tcgen05CpOp(shape, taddr, smem_desc, *, group=None, multicast=None, srcFormat=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` Instruction ``tcgen05.cp`` initiates an asynchronous copy operation from shared memory to the location specified by the address operand ``taddr`` in the Tensor Memory. The 64-bit register operand ``smem_desc`` specifies the matrix descriptor representing the source matrix in the shared memory that needs to be copied. Example:

.. code:: mlir

   nvvm.tcgen05.cp %taddr, %smem_desc {
     group = #nvvm.tcgen05_group,
     shape = #nvvm.tcgen05_cp_shape,
     multicast = #nvvm.tcgen05_cp_multicast,
     srcFormat = #nvvm.tcgen05_cp_src_fmt
   }

`For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.tcgen05.cp' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: taddr() -> _ods_ir .. py:method:: smem_desc() -> _ods_ir .. py:method:: shape() -> _ods_ir .. py:method:: group() -> _ods_ir .. py:method:: multicast() -> _ods_ir .. py:method:: srcFormat() -> Optional[_ods_ir] .. py:function:: tcgen05_cp(shape, taddr, smem_desc, *, group=None, multicast=None, src_format=None, loc=None, ip=None) -> Tcgen05CpOp .. py:class:: Tcgen05DeallocOp(taddr, nCols, *, group=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` The ``tcgen05.dealloc`` Op de-allocates the tensor core memory specified by ``tmemAddr``, which must be from a previous tensor memory allocation. The ``nCols`` operand specifies the number of columns to be de-allocated, and it must be a power-of-two. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.tcgen05.dealloc' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: taddr() -> _ods_ir .. py:method:: nCols() -> _ods_ir .. py:method:: group() -> _ods_ir .. py:function:: tcgen05_dealloc(taddr, n_cols, *, group=None, loc=None, ip=None) -> Tcgen05DeallocOp .. py:class:: Tcgen05FenceOp(kind, *, loc=None, ip=None) Bases: :py:obj:`_ods_ir` The ``tcgen05.fence<before>`` orders all prior async tcgen05 operations with respect to the subsequent tcgen05 and execution ordering operations. The ``tcgen05.fence<after>`` orders all subsequent async tcgen05 operations with respect to the prior tcgen05 and execution ordering operations. `For more information, see PTX ISA `_ .. py:attribute:: OPERATION_NAME :value: 'nvvm.tcgen05.fence' .. py:attribute:: _ODS_REGIONS :value: (0, True) .. py:method:: kind() -> _ods_ir .. py:function:: tcgen05_fence(kind, *, loc=None, ip=None) -> Tcgen05FenceOp .. py:class:: Tcgen05LdOp(res, shape, tmemAddr, *, pack=None, offset=None, loc=None, ip=None) Bases: :py:obj:`_ods_ir` Instruction ``tcgen05.ld`` asynchronously loads data from the Tensor Memory at the location specified by the 32-bit address operand ``tmemAddr`` into the destination register ``res``, collectively across all threads of the warps. The ``shape`` and the ``num`` attributes together determine the total dimension of the data which is loaded from the Tensor Memory. The ``shape`` attribute indicates the base dimension of data to be accessed as described in the Data Movement Shape. The ``num`` attribute indicates the repeat factor on the base dimension resulting in the total dimension of the data that is accessed. The shape ``16x32bx2`` performs two accesses into Tensor Memory of the shape ``16x32b``. The base address of the first access is specified by ``tmemAddr`` and the base address of the second access is specified by ``tmemAddr + offset``, where ``offset`` is an immediate argument.
.. py:class:: Tcgen05LdOp(res, shape, tmemAddr, *, pack=None, offset=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   Instruction ``tcgen05.ld`` asynchronously loads data from the Tensor Memory
   at the location specified by the 32-bit address operand ``tmemAddr`` into
   the destination register ``res``, collectively across all threads of the
   warps.

   The ``shape`` and the ``num`` attribute together determine the total
   dimension of the data which is loaded from the Tensor Memory. The ``shape``
   attribute indicates the base dimension of data to be accessed as described
   in the Data Movement Shape. The ``num`` attribute indicates the repeat
   factor on the base dimension, resulting in the total dimension of the data
   that is accessed.

   The shape ``16x32bx2`` performs two accesses into Tensor Memory of the shape
   ``16x32b``. The base address of the first access is specified by
   ``tmemAddr`` and the base address of the second access is specified by
   ``tmemAddr + offset``, where ``offset`` is an immediate argument.

   The unit attribute ``pack`` can be used to pack two 16-bit elements from
   adjacent columns into a single 32-bit element during the load.

   The following table describes the size of the vector for various
   combinations of ``num`` and ``shape`` attributes:

   .. code::

      |=====================================================================|
      | num/shape | 16x32bx2/16x64b/32x32b |      16x128b     |   16x256b   |
      |=====================================================================|
      | x1        |           1            |         2        |      4      |
      | x2        |           2            |         4        |      8      |
      | x4        |           4            |         8        |     16      |
      | x8        |           8            |        16        |     32      |
      | x16       |          16            |        32        |     64      |
      | x32       |          32            |        64        |    128      |
      | x64       |          64            |       128        |     NA      |
      | x128      |         128            |        NA        |     NA      |
      |=====================================================================|

   Example:

   .. code:: mlir

      nvvm.tcgen05.ld %tmemAddr, %offset pack {
        shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>,
      } : <2xi32>

   For more information, see the PTX ISA.

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.tcgen05.ld'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: tmemAddr() -> _ods_ir

   .. py:method:: offset() -> Optional[_ods_ir]

   .. py:method:: pack() -> bool

   .. py:method:: shape() -> _ods_ir

   .. py:method:: res() -> _ods_ir

.. py:function:: tcgen05_ld(res, shape, tmem_addr, *, pack=None, offset=None, loc=None, ip=None) -> _ods_ir

.. py:class:: Tcgen05MmaSmemDescOp(res, startAddr, leadingDimOffset, strideDimOffset, baseOffset, leadingDimMode, swizzleMode, *, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   The ``nvvm.tcgen05_mma_smem_desc`` constructs a Shared Memory descriptor for
   ``tcgen05.mma``. This descriptor is a 64-bit value which describes the
   properties of the multiplicand matrix in shared memory, including its
   location in the shared memory of the current CTA.

   .. code::

      +-----------+------+------------------------------------------------------+
      | Bit-field | Size | Description                                          |
      +-----------+------+------------------------------------------------------+
      | 0-13      | 14   | Matrix start address                                 |
      | 14-15     | 2    | Reserved                                             |
      | 16-29     | 14   | Leading dim relative-offset (or) absolute-address    |
      | 30-31     | 2    | Reserved                                             |
      | 32-45     | 14   | Stride dimension byte offset                         |
      | 46-48     | 3    | Fixed constant value of 0b001                        |
      | 49-51     | 3    | Matrix base offset                                   |
      | 52        | 1    | Leading dimension stride mode:                       |
      |           |      |   0: byte offset relative                            |
      |           |      |   1: byte address absolute                           |
      | 53-60     | 8    | Fixed constant value of 0b00000000                   |
      | 61-63     | 3    | Swizzling mode:                                      |
      |           |      |   0: No swizzling                                    |
      |           |      |   1: 128-Byte with 32B atomic swizzling              |
      |           |      |   2: 128-Byte swizzling                              |
      |           |      |   4: 64-Byte swizzling                               |
      |           |      |   6: 32-Byte swizzling                               |
      |           |      |   (Values 3, 5 and 7 are invalid)                    |
      +-----------+------+------------------------------------------------------+

   Example:

   .. code:: mlir

      %desc = nvvm.tcgen05.mma_smem_desc (%startAddr, %leadingDimOffset,
                %strideDimOffset, %baseOffset, %leadingDimMode, %swizzleMode)
                : (i32, i32, i32, i8, i1, i8) -> i64

   For more information, see the PTX ISA.

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.tcgen05.mma_smem_desc'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: startAddr() -> _ods_ir

   .. py:method:: leadingDimOffset() -> _ods_ir

   .. py:method:: strideDimOffset() -> _ods_ir

   .. py:method:: baseOffset() -> _ods_ir

   .. py:method:: leadingDimMode() -> _ods_ir

   .. py:method:: swizzleMode() -> _ods_ir

   .. py:method:: res() -> _ods_ir

.. py:function:: tcgen05_mma_smem_desc(res, start_addr, leading_dim_offset, stride_dim_offset, base_offset, leading_dim_mode, swizzle_mode, *, loc=None, ip=None) -> _ods_ir
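The bit layout above maps directly onto integer shifts and masks. The
following host-side sketch packs the fields of the 64-bit descriptor exactly
as the table describes; the helper name ``pack_smem_desc`` is hypothetical and
independent of the ``tcgen05_mma_smem_desc`` builder, which emits the
equivalent computation as IR.

.. code:: python

   def pack_smem_desc(start_addr, lead_off, stride_off, base_off,
                      lead_mode, swizzle_mode):
       """Pack the tcgen05.mma shared-memory descriptor bit-fields."""
       assert swizzle_mode in (0, 1, 2, 4, 6), "values 3, 5 and 7 are invalid"
       desc = start_addr & 0x3FFF            # bits 0-13: matrix start address
       desc |= (lead_off & 0x3FFF) << 16     # bits 16-29: leading dim offset
       desc |= (stride_off & 0x3FFF) << 32   # bits 32-45: stride dim offset
       desc |= 0b001 << 46                   # bits 46-48: fixed constant
       desc |= (base_off & 0x7) << 49        # bits 49-51: matrix base offset
       desc |= (lead_mode & 0x1) << 52       # bit 52: leading dim stride mode
       desc |= (swizzle_mode & 0x7) << 61    # bits 61-63: swizzling mode
       return desc

   # 128-Byte swizzling, byte-offset-relative leading dimension.
   print(hex(pack_smem_desc(0x100, 0x10, 0x20, 1, 0, 2)))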
.. py:class:: Tcgen05RelinquishAllocPermitOp(*, group=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   The ``tcgen05.relinquish_alloc_permit`` Op specifies that the CTA of the
   executing thread is relinquishing the right to allocate Tensor Memory. It is
   therefore illegal for a CTA to perform ``tcgen05.alloc`` after any of its
   constituent threads execute ``tcgen05.relinquish_alloc_permit``.

   For more information, see the PTX ISA.

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.tcgen05.relinquish_alloc_permit'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: group() -> _ods_ir

.. py:function:: tcgen05_relinquish_alloc_permit(*, group=None, loc=None, ip=None) -> Tcgen05RelinquishAllocPermitOp

.. py:class:: Tcgen05ShiftOp(taddr, *, group=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   The ``tcgen05.shift`` is an asynchronous instruction which initiates the
   shifting of 32-byte elements downwards across all the rows, except the last,
   by one row. The operand ``taddr`` specifies the base address of the matrix
   in Tensor Memory whose rows must be down-shifted.

   For more information, see the PTX ISA.

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.tcgen05.shift'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: taddr() -> _ods_ir

   .. py:method:: group() -> _ods_ir

.. py:function:: tcgen05_shift(taddr, *, group=None, loc=None, ip=None) -> Tcgen05ShiftOp

.. py:class:: Tcgen05StOp(shape, tmemAddr, val, *, unpack=None, offset=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   Instruction ``tcgen05.st`` asynchronously stores data from the source
   register ``val`` into the Tensor Memory at the location specified by the
   32-bit address operand ``tmemAddr``, collectively across all threads of the
   warps.

   The ``shape`` and the ``num`` attribute together determine the total
   dimension of the data which is stored to the Tensor Memory. The ``shape``
   indicates the base dimension of data to be accessed. The ``num`` attribute
   indicates the repeat factor on the base dimension, resulting in the total
   dimension of the data that is accessed.

   The shape ``16x32bx2`` performs two accesses into Tensor Memory of the shape
   ``16x32b``. The base address of the first access is specified by
   ``tmemAddr`` and the base address of the second access is specified by
   ``tmemAddr + offset``, where ``offset`` is an immediate argument.

   The unit attribute ``unpack`` can be used to unpack a 32-bit element in the
   register into two 16-bit elements and store them in adjacent columns.

   The following table describes the size of the vector for various
   combinations of ``num`` and ``shape`` attributes:

   .. code::

      |=====================================================================|
      | num/shape | 16x32bx2/16x64b/32x32b |      16x128b     |   16x256b   |
      |=====================================================================|
      | x1        |           1            |         2        |      4      |
      | x2        |           2            |         4        |      8      |
      | x4        |           4            |         8        |     16      |
      | x8        |           8            |        16        |     32      |
      | x16       |          16            |        32        |     64      |
      | x32       |          32            |        64        |    128      |
      | x64       |          64            |       128        |     NA      |
      | x128      |         128            |        NA        |     NA      |
      |=====================================================================|

   Example:

   .. code:: mlir

      nvvm.tcgen05.st %tmemAddr, %val, %offset unpack {
        shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>,
      } : <2xi32>

   For more information, see the PTX ISA.

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.tcgen05.st'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: tmemAddr() -> _ods_ir

   .. py:method:: val() -> _ods_ir

   .. py:method:: offset() -> Optional[_ods_ir]

   .. py:method:: unpack() -> bool

   .. py:method:: shape() -> _ods_ir

.. py:function:: tcgen05_st(shape, tmem_addr, val, *, unpack=None, offset=None, loc=None, ip=None) -> Tcgen05StOp
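The num/shape table above is the same for ``tcgen05.ld`` and ``tcgen05.st``.
The sketch below encodes it as a lookup, useful for computing the expected
register-vector length on the host; the helper name ``tcgen05_vector_length``
is hypothetical.

.. code:: python

   _BASE = {"16x32bx2": 1, "16x64b": 1, "32x32b": 1, "16x128b": 2, "16x256b": 4}

   def tcgen05_vector_length(shape, num):
       """Vector length of a tcgen05.ld/st access, per the table above."""
       length = _BASE[shape] * num
       if length > 128:
           raise ValueError("unsupported combination (NA in the table)")
       return length

   assert tcgen05_vector_length("16x256b", 4) == 16
   assert tcgen05_vector_length("32x32b", 128) == 128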
.. py:class:: Tcgen05WaitOp(kind, *, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   The ``tcgen05.wait<ld>`` causes the executing thread to block until all
   prior ``tcgen05.ld`` operations issued by the executing thread have
   completed. Similarly, the ``tcgen05.wait<st>`` causes the executing thread
   to block until all prior ``tcgen05.st`` operations issued by the executing
   thread have completed.

   For more information, see the PTX ISA.

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.tcgen05.wait'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: kind() -> _ods_ir

.. py:function:: tcgen05_wait(kind, *, loc=None, ip=None) -> Tcgen05WaitOp

.. py:class:: ThreadIdXOp(res, *, range=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.read.ptx.sreg.tid.x'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: range() -> Optional[_ods_ir]

   .. py:method:: res() -> _ods_ir

.. py:function:: read_ptx_sreg_tid_x(res, *, range=None, loc=None, ip=None) -> _ods_ir

.. py:class:: ThreadIdYOp(res, *, range=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.read.ptx.sreg.tid.y'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: range() -> Optional[_ods_ir]

   .. py:method:: res() -> _ods_ir

.. py:function:: read_ptx_sreg_tid_y(res, *, range=None, loc=None, ip=None) -> _ods_ir

.. py:class:: ThreadIdZOp(res, *, range=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.read.ptx.sreg.tid.z'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: range() -> Optional[_ods_ir]

   .. py:method:: res() -> _ods_ir

.. py:function:: read_ptx_sreg_tid_z(res, *, range=None, loc=None, ip=None) -> _ods_ir

.. py:class:: VoteSyncOp(res, mask, pred, kind, *, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   The ``vote.sync`` op causes the executing thread to wait until all
   non-exited threads corresponding to ``membermask`` have executed
   ``vote.sync`` with the same qualifiers and the same ``membermask`` value
   before resuming execution.

   The vote operation kinds are:

   * ``any``: True if the source predicate is True for some thread in
     ``membermask``.
   * ``all``: True if the source predicate is True for all non-exited threads
     in ``membermask``.
   * ``uni``: True if the source predicate has the same value in all non-exited
     threads in ``membermask``.
   * ``ballot``: in the ballot form, the destination result is a 32-bit
     integer. The predicate from each thread in ``membermask`` is copied into
     the corresponding bit position of the result, where the bit position
     corresponds to the thread's lane ID.

   For more information, see the PTX ISA.

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.vote.sync'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: mask() -> _ods_ir

   .. py:method:: pred() -> _ods_ir

   .. py:method:: kind() -> _ods_ir

   .. py:method:: res() -> _ods_ir

.. py:function:: vote_sync(res, mask, pred, kind, *, loc=None, ip=None) -> _ods_ir
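The four vote kinds reduce a per-lane predicate in different ways. The sketch
below simulates them in plain Python over a mapping of lane ID to predicate
for the participating lanes; it is illustrative only and unrelated to the IR
builders.

.. code:: python

   def vote_sync_sim(kind, lane_preds):
       """Simulate nvvm.vote.sync over {lane_id: bool} for active lanes."""
       values = list(lane_preds.values())
       if kind == "any":
           return any(values)
       if kind == "all":
           return all(values)
       if kind == "uni":
           return len(set(values)) == 1
       if kind == "ballot":
           # Bit i of the 32-bit result holds lane i's predicate.
           return sum(1 << lane for lane, p in lane_preds.items() if p)
       raise ValueError(kind)

   lanes = {0: True, 1: False, 5: True}
   assert vote_sync_sim("any", lanes) is True
   assert vote_sync_sim("ballot", lanes) == 0b100001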
.. py:class:: WMMALoadOp(res, ptr, stride, m, n, k, layout, eltype, frag, *, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.wmma.load'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: ptr() -> _ods_ir

   .. py:method:: stride() -> _ods_ir

   .. py:method:: m() -> _ods_ir

   .. py:method:: n() -> _ods_ir

   .. py:method:: k() -> _ods_ir

   .. py:method:: layout() -> _ods_ir

   .. py:method:: eltype() -> _ods_ir

   .. py:method:: frag() -> _ods_ir

   .. py:method:: res() -> _ods_ir

.. py:function:: wmma_load(res, ptr, stride, m, n, k, layout, eltype, frag, *, loc=None, ip=None) -> _ods_ir

.. py:class:: WMMAMmaOp(res, m, n, k, layoutA, layoutB, eltypeA, eltypeB, args, *, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.wmma.mma'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: args() -> _ods_ir

   .. py:method:: m() -> _ods_ir

   .. py:method:: n() -> _ods_ir

   .. py:method:: k() -> _ods_ir

   .. py:method:: layoutA() -> _ods_ir

   .. py:method:: layoutB() -> _ods_ir

   .. py:method:: eltypeA() -> _ods_ir

   .. py:method:: eltypeB() -> _ods_ir

   .. py:method:: res() -> _ods_ir

.. py:function:: wmma_mma(res, m, n, k, layout_a, layout_b, eltype_a, eltype_b, args, *, loc=None, ip=None) -> _ods_ir

.. py:class:: WMMAStoreOp(ptr, m, n, k, layout, eltype, args, stride, *, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.wmma.store'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: ptr() -> _ods_ir

   .. py:method:: args() -> _ods_ir

   .. py:method:: stride() -> _ods_ir

   .. py:method:: m() -> _ods_ir

   .. py:method:: n() -> _ods_ir

   .. py:method:: k() -> _ods_ir

   .. py:method:: layout() -> _ods_ir

   .. py:method:: eltype() -> _ods_ir

.. py:function:: wmma_store(ptr, m, n, k, layout, eltype, args, stride, *, loc=None, ip=None) -> WMMAStoreOp

.. py:class:: WarpDimOp(res, *, range=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.read.ptx.sreg.nwarpid'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: range() -> Optional[_ods_ir]

   .. py:method:: res() -> _ods_ir

.. py:function:: read_ptx_sreg_nwarpid(res, *, range=None, loc=None, ip=None) -> _ods_ir

.. py:class:: WarpIdOp(res, *, range=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.read.ptx.sreg.warpid'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: range() -> Optional[_ods_ir]

   .. py:method:: res() -> _ods_ir

.. py:function:: read_ptx_sreg_warpid(res, *, range=None, loc=None, ip=None) -> _ods_ir

.. py:class:: WarpSizeOp(res, *, range=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.read.ptx.sreg.warpsize'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: range() -> Optional[_ods_ir]

   .. py:method:: res() -> _ods_ir

.. py:function:: read_ptx_sreg_warpsize(res, *, range=None, loc=None, ip=None) -> _ods_ir

.. py:class:: WgmmaFenceAlignedOp(*, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   Enforce an ordering of register accesses between warpgroup level matrix
   multiplication and other operations.

   For more information, see the PTX ISA.

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.wgmma.fence.aligned'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

.. py:function:: wgmma_fence_aligned(*, loc=None, ip=None) -> WgmmaFenceAlignedOp

.. py:class:: WgmmaGroupSyncAlignedOp(*, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   Commits all prior uncommitted warpgroup level matrix multiplication
   operations.

   For more information, see the PTX ISA.

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.wgmma.commit.group.sync.aligned'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

.. py:function:: wgmma_commit_group_sync_aligned(*, loc=None, ip=None) -> WgmmaGroupSyncAlignedOp
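A minimal IR-building sketch using the special-register builders above,
assuming the upstream MLIR Python bindings; the function name and the choice
of an ``i32`` result type are illustrative.

.. code:: python

   from mlir import ir
   from mlir.dialects import func, nvvm

   with ir.Context(), ir.Location.unknown():
       module = ir.Module.create()
       with ir.InsertionPoint(module.body):
           i32 = ir.IntegerType.get_signless(32)

           @func.FuncOp.from_py_func()
           def warp_info():
               tid = nvvm.read_ptx_sreg_tid_x(i32)    # thread id within block
               warp = nvvm.read_ptx_sreg_warpid(i32)  # warp id within the CTA
               nvvm.read_ptx_sreg_warpsize(i32)       # warp size, typically 32
               return tid

       print(module)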
.. py:class:: WgmmaMmaAsyncOp(results_, inouts, descriptorA, descriptorB, shape, typeA, typeB, typeD, scaleD, scaleA, scaleB, layoutA, layoutB, *, satfinite=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   The warpgroup (128 threads) level matrix multiply and accumulate operation
   has either of the following forms, where matrix D is called the accumulator:

   * ``D = A * B + D``
   * ``D = A * B``, where the input from the accumulator D is disabled.

   Supported shapes:

   .. code::

      |--------------|--------------|------------|--------------|---------------|
      |              |              |            |              |f16+=e4m3*e4m3 |
      |              |              |            |              |f16+=e5m2*e5m2 |
      |f32+=tf32*tf32|f16+=f16 *f16 | s32+=s8*s8 |s32 += b1 * b1|f16+=e5m2*e4m3 |
      |              |f32+=f16 *f16 | s32+=u8*u8 |              |f16+=e4m3*e5m2 |
      |              |f32+=bf16*bf16| s32+=s8*u8 |              |f32+=e4m3*e4m3 |
      |              |              | s32+=u8*s8 |              |f32+=e5m2*e5m2 |
      |              |              |            |              |f32+=e5m2*e4m3 |
      |              |              |            |              |f32+=e4m3*e5m2 |
      |--------------|--------------|------------|--------------|---------------|
      | .m64n8k8     | .m64n8k16    | .m64n8k32  | .m64n8k256   | .m64n8k32     |
      | .m64n16k8    | .m64n16k16   | .m64n16k32 | .m64n16k256  | .m64n16k32    |
      | .m64n24k8    | .m64n24k16   | .m64n24k32 | .m64n24k256  | .m64n24k32    |
      | .m64n32k8    | .m64n32k16   | .m64n32k32 | .m64n32k256  | .m64n32k32    |
      | .m64n40k8    | .m64n40k16   | .m64n48k32 | .m64n48k256  | .m64n40k32    |
      | .m64n48k8    | .m64n48k16   | .m64n64k32 | .m64n64k256  | .m64n48k32    |
      | .m64n56k8    | .m64n56k16   | .m64n80k32 | .m64n80k256  | .m64n56k32    |
      | .m64n64k8    | .m64n64k16   | .m64n96k32 | .m64n96k256  | .m64n64k32    |
      | .m64n72k8    | .m64n72k16   | .m64n112k32| .m64n112k256 | .m64n72k32    |
      | .m64n80k8    | .m64n80k16   | .m64n128k32| .m64n128k256 | .m64n80k32    |
      | .m64n88k8    | .m64n88k16   | .m64n144k32| .m64n144k256 | .m64n88k32    |
      | .m64n96k8    | .m64n96k16   | .m64n160k32| .m64n160k256 | .m64n96k32    |
      | .m64n104k8   | .m64n104k16  | .m64n176k32| .m64n176k256 | .m64n104k32   |
      | .m64n112k8   | .m64n112k16  | .m64n192k32| .m64n192k256 | .m64n112k32   |
      | .m64n120k8   | .m64n120k16  | .m64n208k32| .m64n208k256 | .m64n120k32   |
      | .m64n128k8   | .m64n128k16  | .m64n224k32| .m64n224k256 | .m64n128k32   |
      | .m64n136k8   | .m64n136k16  | .m64n240k32| .m64n240k256 | .m64n136k32   |
      | .m64n144k8   | .m64n144k16  | .m64n256k32| .m64n256k256 | .m64n144k32   |
      | .m64n152k8   | .m64n152k16  |            |              | .m64n152k32   |
      | .m64n160k8   | .m64n160k16  |            |              | .m64n160k32   |
      | .m64n168k8   | .m64n168k16  |            |              | .m64n168k32   |
      | .m64n176k8   | .m64n176k16  |            |              | .m64n176k32   |
      | .m64n184k8   | .m64n184k16  |            |              | .m64n184k32   |
      | .m64n192k8   | .m64n192k16  |            |              | .m64n192k32   |
      | .m64n200k8   | .m64n200k16  |            |              | .m64n200k32   |
      | .m64n208k8   | .m64n208k16  |            |              | .m64n208k32   |
      | .m64n216k8   | .m64n216k16  |            |              | .m64n216k32   |
      | .m64n224k8   | .m64n224k16  |            |              | .m64n224k32   |
      | .m64n232k8   | .m64n232k16  |            |              | .m64n232k32   |
      | .m64n240k8   | .m64n240k16  |            |              | .m64n240k32   |
      | .m64n248k8   | .m64n248k16  |            |              | .m64n248k32   |
      | .m64n256k8   | .m64n256k16  |            |              | .m64n256k32   |
      |--------------|--------------|------------|--------------|---------------|

   For more information, see the PTX ISA.

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.wgmma.mma_async'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: inouts() -> _ods_ir

   .. py:method:: descriptorA() -> _ods_ir

   .. py:method:: descriptorB() -> _ods_ir

   .. py:method:: shape() -> _ods_ir

   .. py:method:: typeA() -> _ods_ir

   .. py:method:: typeB() -> _ods_ir

   .. py:method:: typeD() -> _ods_ir

   .. py:method:: scaleD() -> _ods_ir

   .. py:method:: scaleA() -> _ods_ir

   .. py:method:: scaleB() -> _ods_ir

   .. py:method:: layoutA() -> _ods_ir

   .. py:method:: layoutB() -> _ods_ir

   .. py:method:: satfinite() -> Optional[_ods_ir]

   .. py:method:: results_() -> _ods_ir

.. py:function:: wgmma_mma_async(results_, inouts, descriptor_a, descriptor_b, shape, type_a, type_b, type_d, scale_d, scale_a, scale_b, layout_a, layout_b, *, satfinite=None, loc=None, ip=None) -> _ods_ir
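The shape table compresses a simple rule set: M is always 64, K is fixed by
the element type, and the allowed N values differ between the integer/``b1``
columns and the floating-point columns. A hedged sketch of that rule set in
plain Python; the helper name ``is_supported_wgmma_shape`` is hypothetical.

.. code:: python

   _K_FOR_TYPE = {"tf32": 8, "f16": 16, "bf16": 16,
                  "s8": 32, "u8": 32, "b1": 256,
                  "e4m3": 32, "e5m2": 32}
   # Integer and b1 shapes only allow these N values (see the table above).
   _INT_N = {8, 16, 24, 32} | set(range(48, 257, 16))

   def is_supported_wgmma_shape(m, n, k, elt_type):
       """Check an .mMnNkK shape against the supported-shapes table."""
       if m != 64 or k != _K_FOR_TYPE[elt_type]:
           return False
       if elt_type in ("s8", "u8", "b1"):
           return n in _INT_N
       return 8 <= n <= 256 and n % 8 == 0

   assert is_supported_wgmma_shape(64, 40, 16, "f16")
   assert not is_supported_wgmma_shape(64, 40, 32, "s8")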
.. py:class:: WgmmaWaitGroupSyncOp(group, *, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   Signal the completion of a preceding warpgroup operation.

   For more information, see the PTX ISA.

   .. py:attribute:: OPERATION_NAME
      :value: 'nvvm.wgmma.wait.group.sync.aligned'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: group() -> _ods_ir

.. py:function:: wgmma_wait_group_sync_aligned(group, *, loc=None, ip=None) -> WgmmaWaitGroupSyncOp
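The three aligned wgmma ops are typically emitted as a fence / commit / wait
bracket around the ``wgmma.mma_async`` ops themselves. A minimal sketch of
that sequencing with the Python builders, assuming an insertion point and
location are active; the ``mma_async`` emission is elided because its operand
list is extensive, and passing the pending-group count as a plain int is
assumed to be accepted by the generated builder.

.. code:: python

   from mlir.dialects import nvvm

   def emit_wgmma_bracket():
       # Order register accesses before the warpgroup MMA.
       nvvm.wgmma_fence_aligned()
       # ... emit nvvm.wgmma_mma_async ops here ...
       # Commit all prior uncommitted wgmma ops into a group.
       nvvm.wgmma_commit_group_sync_aligned()
       # Block until at most 0 committed groups remain pending.
       nvvm.wgmma_wait_group_sync_aligned(0)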