mlir.dialects._gpu_transform_ops_gen
====================================

.. py:module:: mlir.dialects._gpu_transform_ops_gen


Attributes
----------

.. autoapisummary::

   mlir.dialects._gpu_transform_ops_gen._ods_ir


Classes
-------

.. autoapisummary::

   mlir.dialects._gpu_transform_ops_gen.ApplyGPUPromoteShuffleToAMDGPUPatternsOp
   mlir.dialects._gpu_transform_ops_gen.ApplyGPURewritePatternsOp
   mlir.dialects._gpu_transform_ops_gen.ApplyGPUSubgroupReduceToNVVMConversionPatternsOp
   mlir.dialects._gpu_transform_ops_gen.ApplyGPUToNVVMConversionPatternsOp
   mlir.dialects._gpu_transform_ops_gen.ApplyGPUToROCDLConversionPatternsOp
   mlir.dialects._gpu_transform_ops_gen.ApplyGPUWwmaToNVVMConversionPatternsOp
   mlir.dialects._gpu_transform_ops_gen.ApplyUnrollVectorsSubgroupMmaOp
   mlir.dialects._gpu_transform_ops_gen.EliminateBarriersOp
   mlir.dialects._gpu_transform_ops_gen.MapForallToBlocks
   mlir.dialects._gpu_transform_ops_gen.MapNestedForallToThreads


Functions
---------

.. autoapisummary::

   mlir.dialects._gpu_transform_ops_gen.apply_patterns_gpu_gpu_shuffle_to_amdgpu
   mlir.dialects._gpu_transform_ops_gen.apply_patterns_gpu_gpu_rewrite_patterns
   mlir.dialects._gpu_transform_ops_gen.apply_conversion_patterns_gpu_gpu_subgroup_reduce_to_nvvm
   mlir.dialects._gpu_transform_ops_gen.apply_conversion_patterns_gpu_gpu_to_nvvm
   mlir.dialects._gpu_transform_ops_gen.apply_conversion_patterns_gpu_gpu_to_rocdl
   mlir.dialects._gpu_transform_ops_gen.apply_conversion_patterns_gpu_gpu_wmma_to_nvvm
   mlir.dialects._gpu_transform_ops_gen.apply_patterns_gpu_unroll_vectors_subgroup_mma
   mlir.dialects._gpu_transform_ops_gen.apply_patterns_gpu_eliminate_barriers
   mlir.dialects._gpu_transform_ops_gen.gpu_map_forall_to_blocks
   mlir.dialects._gpu_transform_ops_gen.gpu_map_nested_forall_to_threads


Module Contents
---------------

.. py:data:: _ods_ir

.. py:class:: ApplyGPUPromoteShuffleToAMDGPUPatternsOp(*, chipset=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   Collects patterns that try to promote ``gpu.shuffle`` ops to specialized
   AMDGPU intrinsics.

   .. py:attribute:: OPERATION_NAME
      :value: 'transform.apply_patterns.gpu.gpu_shuffle_to_amdgpu'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: chipset() -> Optional[_ods_ir]


.. py:function:: apply_patterns_gpu_gpu_shuffle_to_amdgpu(*, chipset=None, loc=None, ip=None) -> ApplyGPUPromoteShuffleToAMDGPUPatternsOp


.. py:class:: ApplyGPURewritePatternsOp(*, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   Collects GPU rewrite patterns comprising:

   #. GpuAllReduceRewrite patterns
   #. GpuGlobalIdRewriter patterns
   #. GpuShuffleRewriter patterns

   .. py:attribute:: OPERATION_NAME
      :value: 'transform.apply_patterns.gpu.gpu_rewrite_patterns'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)


.. py:function:: apply_patterns_gpu_gpu_rewrite_patterns(*, loc=None, ip=None) -> ApplyGPURewritePatternsOp


.. py:class:: ApplyGPUSubgroupReduceToNVVMConversionPatternsOp(*, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   Collects patterns that convert GPU dialect ops related to subgroup reduce ops
   to NVVM dialect ops. These patterns require an "LLVMTypeConverter".

   .. py:attribute:: OPERATION_NAME
      :value: 'transform.apply_conversion_patterns.gpu.gpu_subgroup_reduce_to_nvvm'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)


.. py:function:: apply_conversion_patterns_gpu_gpu_subgroup_reduce_to_nvvm(*, loc=None, ip=None) -> ApplyGPUSubgroupReduceToNVVMConversionPatternsOp
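The ``apply_patterns.gpu.*`` ops above are pattern descriptors that are meant to be
nested inside a ``transform.apply_patterns`` region of a transform script. A minimal
sketch in transform-dialect IR, matching the ``OPERATION_NAME`` values documented
above; the named sequence wrapper and the ``transform.structured.match`` root
selection are illustrative assumptions, not part of this module:

.. code:: mlir

   module attributes {transform.with_named_sequence} {
     transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
       // Select the payload function to rewrite; the exact matcher is an assumption.
       %func = transform.structured.match ops{["func.func"]} in %root
         : (!transform.any_op) -> !transform.any_op
       // Greedily apply the GPU rewrite patterns collected by
       // transform.apply_patterns.gpu.gpu_rewrite_patterns.
       transform.apply_patterns to %func {
         transform.apply_patterns.gpu.gpu_rewrite_patterns
       } : !transform.any_op
       transform.yield
     }
   }

From Python, the corresponding pattern-descriptor op is created with
``apply_patterns_gpu_gpu_rewrite_patterns()`` (or the ``ApplyGPURewritePatternsOp``
constructor), using the ``ip`` argument to place it inside the ``apply_patterns``
region.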
.. py:class:: ApplyGPUToNVVMConversionPatternsOp(*, benefit=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   Collects patterns that convert GPU dialect ops to NVVM dialect ops. These
   patterns require an "LLVMTypeConverter".

   .. py:attribute:: OPERATION_NAME
      :value: 'transform.apply_conversion_patterns.gpu.gpu_to_nvvm'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: benefit() -> _ods_ir


.. py:function:: apply_conversion_patterns_gpu_gpu_to_nvvm(*, benefit=None, loc=None, ip=None) -> ApplyGPUToNVVMConversionPatternsOp


.. py:class:: ApplyGPUToROCDLConversionPatternsOp(chipset, *, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   Collects patterns that convert GPU dialect ops to ROCDL dialect ops. These
   patterns require an "LLVMTypeConverter".

   .. py:attribute:: OPERATION_NAME
      :value: 'transform.apply_conversion_patterns.gpu.gpu_to_rocdl'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: chipset() -> _ods_ir


.. py:function:: apply_conversion_patterns_gpu_gpu_to_rocdl(chipset, *, loc=None, ip=None) -> ApplyGPUToROCDLConversionPatternsOp


.. py:class:: ApplyGPUWwmaToNVVMConversionPatternsOp(*, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   Collects patterns that convert GPU dialect ops related to wmma ops to NVVM
   dialect ops. These patterns require an "LLVMTypeConverter".

   .. py:attribute:: OPERATION_NAME
      :value: 'transform.apply_conversion_patterns.gpu.gpu_wmma_to_nvvm'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)


.. py:function:: apply_conversion_patterns_gpu_gpu_wmma_to_nvvm(*, loc=None, ip=None) -> ApplyGPUWwmaToNVVMConversionPatternsOp


.. py:class:: ApplyUnrollVectorsSubgroupMmaOp(m, n, k, *, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   Unrolls contractions to the target ``m``, ``n``, and ``k`` native vector size,
   along with other vector operations based on expected usage. ``transfer_read``
   ops unroll based on the extract slice shape introduced by unrolling the
   contractions, while elementwise and ``transfer_write`` ops unroll to the shape
   of the C matrix (``m x n``).

   This operation applies to pure vector operations and should be applied before
   lowering to subgroup_mma ops.

   .. py:attribute:: OPERATION_NAME
      :value: 'transform.apply_patterns.gpu.unroll_vectors_subgroup_mma'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: m() -> _ods_ir

   .. py:method:: n() -> _ods_ir

   .. py:method:: k() -> _ods_ir


.. py:function:: apply_patterns_gpu_unroll_vectors_subgroup_mma(m, n, k, *, loc=None, ip=None) -> ApplyUnrollVectorsSubgroupMmaOp
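The ``apply_conversion_patterns.gpu.*`` ops above (``gpu_to_nvvm``, ``gpu_to_rocdl``,
``gpu_wmma_to_nvvm``, ``gpu_subgroup_reduce_to_nvvm``) are meant to be nested inside
a ``transform.apply_conversion_patterns`` region, which supplies the
``LLVMTypeConverter`` they require. A hedged sketch of a fragment of such a transform
script, assuming an NVVM target; the ``gpu.module`` matcher, the choice of type
converter op, and the legality attributes are illustrative assumptions that depend on
the actual lowering pipeline:

.. code:: mlir

   // Select the GPU kernel module; the matcher is an assumption.
   %gpu_module = transform.structured.match ops{["gpu.module"]} in %root
     : (!transform.any_op) -> !transform.any_op
   // Run a dialect conversion that applies the GPU-to-NVVM pattern sets
   // collected by the ops documented above.
   transform.apply_conversion_patterns to %gpu_module {
     transform.apply_conversion_patterns.gpu.gpu_to_nvvm
     transform.apply_conversion_patterns.gpu.gpu_wmma_to_nvvm
   } with type_converter {
     // The LLVM type converter required by these patterns; an illustrative choice.
     transform.apply_conversion_patterns.memref.memref_to_llvm_type_converter
   } {
     legal_dialects = ["llvm", "nvvm"],
     partial_conversion
   } : !transform.any_op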
.. py:class:: EliminateBarriersOp(*, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   Removes unnecessary GPU barriers from the function. If a barrier does not
   enforce any conflicting pair of memory effects, including a pair that is
   enforced by another barrier, it is unnecessary and can be removed.

   The approach is based on "High-Performance GPU-to-CPU Transpilation and
   Optimization via High-Level Parallel Constructs" by Moses, Ivanov, Domke,
   Endo, Doerfert, and Zinenko in PPoPP 2023. Specifically, it analyzes the
   memory effects of the operations before and after the given barrier and
   checks if the barrier enforces any of the memory effect-induced dependencies
   that aren't already enforced by another barrier.

   For example, in the following code

   .. code:: mlir

      store %A
      barrier // enforces load-after-store
      load %A
      barrier // load-after-store already enforced by the previous barrier
      load %A

   the second barrier can be removed.

   .. py:attribute:: OPERATION_NAME
      :value: 'transform.apply_patterns.gpu.eliminate_barriers'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)


.. py:function:: apply_patterns_gpu_eliminate_barriers(*, loc=None, ip=None) -> EliminateBarriersOp


.. py:class:: MapForallToBlocks(result, target, *, grid_dims=None, generate_gpu_launch=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   Targets the gpu_launch op and rewrites the top-level ``scf.forall`` to
   distribute over the ``gpu.block_id`` attribute. If the ``generate_gpu_launch``
   attribute is set, a ``gpu_launch`` op is generated first and the top-level
   ``scf.forall`` is moved inside it.

   The operation searches for top-level ``scf.forall`` ops under ``gpu_launch``
   and maps each such op to GPU blocks. The mapping is one-to-one and the
   induction variables of ``scf.forall`` are rewritten to ``gpu.block_id``
   according to the ``thread_dim_mapping`` attribute.

   Dynamic ``scf.forall`` trip counts are currently not supported. Dynamic block
   dim sizes are currently not supported. Only **bufferized** ``scf.forall`` are
   currently supported. Only ``scf.forall`` distributed to **at most 3
   dimensions** are currently supported.

   The operation alters the block size of the given gpu_launch using the
   ``grid_dims`` argument.

   Return modes:
   -------------

   This operation ignores non-gpu_launch ops and drops them in the return.

   If any ``scf.forall`` with tensors is found, the transform definitely fails.

   If all the ``scf.forall`` operations contained within the LaunchOp referred to
   by the ``target`` handle lower to GPU properly, the transform succeeds.
   Otherwise the transform definitely fails.

   The returned handle points to the same LaunchOp operand, consuming it and
   producing a new SSA value to satisfy chaining and linearity of the IR
   properties.

   .. py:attribute:: OPERATION_NAME
      :value: 'transform.gpu.map_forall_to_blocks'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: target() -> _ods_ir

   .. py:method:: grid_dims() -> Optional[_ods_ir]

   .. py:method:: generate_gpu_launch() -> bool

   .. py:method:: result() -> _ods_ir

      Shortcut to get an op result if it has only one (throws an error otherwise).


.. py:function:: gpu_map_forall_to_blocks(result, target, *, grid_dims=None, generate_gpu_launch=None, loc=None, ip=None) -> _ods_ir
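A hedged sketch of ``transform.gpu.map_forall_to_blocks`` in transform-dialect IR,
continuing a transform script fragment like the one shown earlier; it assumes the
payload contains a ``func.func`` whose top-level ``scf.forall`` carries a
``#gpu.block`` mapping, and the handle names and grid size are illustrative:

.. code:: mlir

   // Match the payload function, wrap it in a gpu.launch, and map the
   // top-level scf.forall to gpu.block_id.
   %func = transform.structured.match ops{["func.func"]} in %root
     : (!transform.any_op) -> !transform.any_op
   %gpu_launch = transform.gpu.map_forall_to_blocks %func
     generate_gpu_launch grid_dims = [12, 9, 1]
     : (!transform.any_op) -> !transform.any_op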
.. py:class:: MapNestedForallToThreads(result, target, *, block_dims=None, sync_after_distribute=None, warp_size=None, loc=None, ip=None)

   Bases: :py:obj:`_ods_ir`

   Targets the ``gpu.launch`` op and rewrites all ``scf.forall`` ops nested in it
   to distribute over the ``gpu.thread_id`` attribute. The operation searches for
   ``scf.forall`` ops nested under ``target`` and maps each such op to GPU
   threads. ``scf.forall`` induction variables are rewritten to ``gpu.thread_id``
   according to the ``mapping`` attribute.

   Different types of mapping attributes are supported:

   * ``block_dims`` is a list of integers that specifies the number of threads in
     each dimension. This is a mandatory attribute that is used to constrain the
     number of threads in each dimension. If an ``scf.forall`` op is mapped to
     fewer threads, predication occurs.

   * ``warp_dims`` is a list of integers that specifies the number of warps in
     each dimension. This is an optional attribute that is used to constrain the
     number of warps in each dimension. When present, this attribute must be
     specified in a way that is compatible with the ``block_dims`` attribute. If
     an ``scf.forall`` op is mapped to fewer warps, predication occurs.

   Dynamic ``scf.forall`` trip counts are currently not supported. Dynamic block
   dim sizes are currently not supported. Only **bufferized** ``scf.forall`` are
   currently supported. Only ``scf.forall`` distributed to **at most 3
   dimensions** are currently supported.

   The ``sync_after_distribute`` attribute controls whether a ``gpu.barrier`` is
   inserted after each ``scf.forall`` op. At this time, this is an all-or-nothing
   choice. This will need to be tightened in the future.

   The operation alters the block size of the given gpu_launch using the
   mandatory ``block_dims`` argument.

   Return modes:
   -------------

   This operation ignores non-``gpu_launch`` ops and drops them in the return.

   If any ``scf.forall`` with tensors is found, the transform definitely fails.

   If all the ``scf.forall`` operations with gpu.thread mapping contained within
   the ``LaunchOp`` referred to by the ``target`` handle lower to GPU properly,
   the transform succeeds. Otherwise the transform definitely fails.
   ``scf.forall`` operations with mappings other than gpu.thread are ignored.

   The returned handle points to the same LaunchOp operand, consuming it and
   producing a new SSA value to satisfy chaining and linearity of the IR
   properties.

   Example:
   --------

   .. code::

      gpu.launch blocks(%bx, %by, %bz) in (%x = %0, %y = %1, %z = %2)
                 threads(%tx, %ty, %tz) in (%tx = %3, %ty = %4, %tz = %5) {
        scf.forall (%i, %j) in (7, 9) {
          ... // body 1
        } {mapping = [#gpu.thread<y>, #gpu.thread<x>, #gpu.thread<z>]}
        scf.forall (%i) in (12) {
          ... // body 2
        } {mapping = [#gpu.thread<x>]}
        gpu.terminator
      }

   is translated to:

   .. code::

      %bdimX = arith.constant 12 : index
      %bdimY = arith.constant 9 : index
      gpu.launch blocks(%bx, %by, %bz) in (%x = %0, %y = %1, %z = %2)
                 threads(%tx, %ty, %tz) in (%tx = %bdimX, %ty = %bdimY, %tz = %5) {
        if (threadIdx.x < 9 && threadIdx.y < 7) {
          ... // body 1
        }
        gpu.barrier
        if (threadIdx.y < 1) {
          ... // body 2
        }
        gpu.barrier
        gpu.terminator
      }

   .. py:attribute:: OPERATION_NAME
      :value: 'transform.gpu.map_nested_forall_to_threads'

   .. py:attribute:: _ODS_REGIONS
      :value: (0, True)

   .. py:method:: target() -> _ods_ir

   .. py:method:: block_dims() -> _ods_ir

   .. py:method:: sync_after_distribute() -> _ods_ir

   .. py:method:: warp_size() -> _ods_ir

   .. py:method:: result() -> _ods_ir

      Shortcut to get an op result if it has only one (throws an error otherwise).


.. py:function:: gpu_map_nested_forall_to_threads(result, target, *, block_dims=None, sync_after_distribute=None, warp_size=None, loc=None, ip=None) -> _ods_ir
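A hedged sketch of the companion threads mapping, continuing from the
``%gpu_launch`` handle produced by ``transform.gpu.map_forall_to_blocks`` in the
previous sketch; the block dimensions are illustrative values and the nested
``scf.forall`` ops are assumed to carry ``#gpu.thread`` mappings:

.. code:: mlir

   // Map the scf.forall ops with #gpu.thread mappings that are nested inside
   // the matched gpu.launch to gpu.thread_id.
   %mapped = transform.gpu.map_nested_forall_to_threads %gpu_launch
     block_dims = [32, 4, 1]
     : (!transform.any_op) -> !transform.any_op

The Python builders documented above mirror this usage: ``gpu_map_forall_to_blocks``
and ``gpu_map_nested_forall_to_threads`` take the result handle type and the target
handle first, with ``grid_dims``/``block_dims``, ``sync_after_distribute``, and
``warp_size`` passed as keyword arguments.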