o
    3/iS                    @   sn  d dl Z d dlmZmZmZmZmZmZ d dlm	Z	 d dl
Z
d dlm  mZ d dlZd dlmZ d dlmZ d dlmZmZ d dlmZmZmZ d dlm  mZ d dlmZmZm Z m!Z!m"Z" d dl#m$Z$ d dl%m&Z& d d	l'm(Z( d d
l)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/ d dl0m1Z1m2Z2 d dl3m4Z4m5Z5 d dl6m7Z7 d dl8mZ9 d dl:m;Z; 	 G dd de j<Z=G dd dZ>dS )    N)TupleTypeCallableOptionalUnionLiteral)partial)pipeline_init_arrivepipeline_init_wait)cpasyncwarp	warpgroup)Int32Float32Float16Boolean
const_expr)
LayoutEnum)	dataclass)
ParamsBase)TileSchedulerOptionsTileSchedulerArgumentsTileSchedulerVarlenMTileSchedulerArgumentsVarlenMTileSchedulerPersistenceMode)VarlenArgumentsVarlenManager)make_pipeline_statePipelineTmaCpAsync)RoundingModec                   @   sD   e Zd Ze Ze Ze Ze Ze Z	e Z
e ZdS )NamedBarrierGemmN)__name__
__module____qualname__enumautoEpilogueEpilogueLoadMmaWG0MmaWG1EpiWG0EpiWG1TmemPtr r.   r.   \/lsinfo/ai/hellotax_ai/llm_service/venv_vllm/lib/python3.10/site-packages/quack/gemm_sm90.pyr!   O   s    r!   c                3   @   s  e Zd ZdZdZeG dd dZeZ					dde	e
j de	e
j d	eeef d
eeeef dededededefddZdefddZejdejdejdeej deej dededee dejfddZejdejdeej d ejd!ejd"ejd#eej d$eej d%eej d&eej d'ejd(ej d)ej!d*ej!d+ej!d,ej!d-e
j"e# f d.d/Zej	0	0dd1e
j$j%d2e
j$j&d3ee# d4e#d5e'd6ee# d7ee# d8e
j$j&fd9d:Z(ej	dd1e
j$j%d2e
j$j&d3e#d;ee# d4e#d5e'd<ed8e
j$j&fd=d>Z)ejd1e
j$j%d?e
j$j&d@e#dAejdBeej d5e'dCe'd8e
j$j&fdDdEZ*ejdFedGeejdHf dIe
j$j%dJe
j$j%dKe
j$j&dLee
j$j& dMej+dNe#dOejdPeej dQeej, dRej,dSejdTeej- dUeej dVeej dWee# dXee# dYej.dZed[e
j$j/d\e'd]e0d8ee
j$j&e
j$j&f f0d^d_Z1dd<efd`daZ2dejdejdeej fdbdcZ3ejddejdOejdeefdfdgZ4ejdFedGeejdHf dMej+dQeej, dRej,dYej.dZed[e
j$j/d\e'd8eejdHf fdhdiZ5dFedjeejdHf dkej.d8eejdHf fdldmZ6	0ddFedneejdHf dOejdPeej d8eej f
dodpZ7dFedAejdejdYej.d\e'd8d0fdqdrZ8ejdFedjeejdHf dMej+dQeej, dRej,dYej.d8d0fdsdtZ9d0d0dudved8efdwdxZ:d0d0dudFed8e;ej fdydzZ<e=dvee d{eeeef dMej+d8efd|d}Z>dFefd~dZ?dFed8eejdHf fddZ@dCe'deAd fddZBdCe'deAd fddZCdejd8ej,fddZDdejdeeE de	e
j deej d\e'd8eej,ejejf fddZFdejdeEde	e
j dejde
j d\e'd8eej,ejejf fddZGdeHejej,f dejd	ej+dMej+dejdYej.d8eejejf fddZIdejdej dejJfddZKdej ej!B dejJfddZLdd ZMd(ej dejJdefddZNeOd{eeeef dMeeef de	e
j de	e
j dee	e
j  dee	e
j  dededed8eeef fddZPe=	0	0dd{eeeef deeeef dee	e
j  deeef d0B d8eeef f
ddZQe=d{eeeef dMeeef de	e
j deEde	e
j deEdedee	e
j  deEdedee	e
j  deeE ded8eej!ej!ej!eej! f fddZRe=dejdej!dMeeef deAd d8eejejf f
ddZSe=dejdej!deeef ded8eejejf f
ddĄZTdddǄZUe=de	e
j de	e
j de	e
j dee	e
j  deVdeVd8efdd˄ZWd0S )GemmSm90aX  
    This class implements batched matrix multiplication (C = A x B) with support for various data types
    and architectural features specific to Hopper GPUs with persistent tile scheduling and warp specialization.

    :param acc_dtype: Data type for accumulation during computation
    :type acc_dtype: type[cutlass.Numeric]
    :param tile_shape_mn: Shape of the CTA tile (M,N)
    :type tile_shape_mn: Tuple[int, int, int]
    :param cluster_shape_mnk: Cluster dimensions (M,N,K) for parallel processing
    :type cluster_shape_mnk: Tuple[int, int, int]

    :note: Data type requirements:
        - For 16-bit types: A and B must have the same data type
        - For 8-bit types: A and B can have different types (Float8E4M3FN/Float8E5M2) as long as both are 8-bit
        - Float8 types only support k-major layout

    :note: Supported data types:
        - Float16
        - BFloat16
        - Float8E4M3FN/Float8E5M2

    :note: Supported accumulation types:
        - Float32 (for all floating point inputs)

    :note: Constraints:
        - Cluster shape M/N must be positive and power of 2, total cluster size <= 4

    Example:
        >>> gemm = GemmSm90(
        ...     acc_dtype=Float32,
        ...     tile_shape_mn=(128, 256),
        ...     cluster_shape_mnk=(1, 1, 1)
        ... )
        >>> gemm(a_tensor, b_tensor, c_tensor, stream)
    Z   c                   @   s   e Zd ZdS )zGemmSm90.EpilogueArgumentsN)r"   r#   r$   r.   r.   r.   r/   EpilogueArguments   s    r2   FT	acc_dtypea_dtypetile_shape_mncluster_shape_mnkpingpongis_persistentfp8_fast_accumgather_Ause_clc_persistencec
                 C   s  || _ || _|| _|	| _| jr| jdksJ | jr | js J d| o'|jdk| _|| _|r8|d dks8J d|| _g |dR | _	| j	d | j	d }
}| js|
dvrYt
d|
d	v rz|
d
krcdnd}|d dkro||ksyt
d|
 d| n@|d dkr|dks|d dkr|dkst
dn'|
dvrt
d|
dkrdn|
dkrdnd}|d dkr||kst
d| | js|
dkrd\}}n$|
d
kr|dkrd\}}nd\}}n| j	d dk r| j	d d nd}d}|dv r|dv sJ nd\}}||df| _| jd | _| jr| jdksJ | jd | _| jdk| _| jdk| _d| _t| j| js1dnd | _| jrA| jdksAJ | jdv sIJ d| _| jd | j | _tjd| _| jsc| jndd  | _| jsndnd | _| jd  | _t| j	d!d t| j| j  }| jr|d9 }| js| jd"krd#\| _| _n#|dk}|sd$nd%\| _| _n| jd"krd&\| _| _nd'\| _| _d!| _d!| _ d!| _!d!| _"d!| _#d!| _$d!| _%d(| _&d!S ))a5  
        Initializes the configuration for a Hopper dense GEMM kernel.

        This configuration includes data types for operands, tile shape, cluster configuration,
        and thread layout.

        :param acc_dtype: Data type for accumulation during computation
        :type acc_dtype: type[cutlass.Numeric]
        :param tile_shape_mn: Shape of the CTA tile (M,N)
        :type tile_shape_mn: Tuple[int, int]
        :param cluster_shape_mnk: Cluster dimensions (M,N,K) for parallel processing
        :type cluster_shape_mnk: Tuple[int, int, int]
        d   z+Pingpong gemm requires persistent scheduler      z'Cluster shape N must be 1 for gather A r   )@            @  z+CTA tile shape M must be 64/128/192/256/320)rA   rC   rA   rB          zIf tile_m == z2, CTA tile shape N must be divisible by 32 and <=    i   zRCTA tile shape N must be divisible by 16 and <= 256, or divisible by 32 and <= 512)r?   r@   rA   z/CTA tile shape M must be 64/128/192 if pingpongr?   r@      z0CTA tile shape N must be divisible by 16 and <= rC   )r>      )   r>   rH   )r>   rH   rI   )r>   r>   sm_90   NrI   )rE   rD   )(      )      )8      )rP         )'r3   r7   r8   r;   archwidthfp8_slow_accumr:   r6   cta_tile_shape_mnk
ValueErroratom_layout_mnknum_mcast_ctas_anum_mcast_ctas_b
is_a_mcast
is_b_mcast	occupancymathprodmma_warp_groupsnum_threads_per_warp_groupthreads_per_ctacutlassutilsget_smem_capacity_in_bytessmem_capacitynum_epi_warpsnum_ab_load_warpsab_load_warp_idnum_regs_loadnum_regs_mmaab_stage	epi_stagea_smem_layout_stagedb_smem_layout_stagedepi_smem_layout_stagedepi_tileshared_storagebuffer_align_bytes)selfr3   r4   r5   r6   r7   r8   r9   r:   r;   tile_Mtile_N
tile_N_maxatom_layout_matom_layout_nregs_per_threadheavy_register_pressurer.   r.   r/   __init__   s   (



zGemmSm90.__init__epilogue_argsc                 C   s  t j| j| j| j | j | j| jd| j	d | jd  fd| _
t| jd dkrR| jd }tjd| j	d | d |fdd}tjt| j
j| jd|dfd| _
tj| j
jd	gd
}d}| j	d | j	d || f| _	t| j| _| | j	| j| j| _| | j	| j| j| j| j| j|tjd| j | j	\| _ | _!| _"| j#rd	nd| _$| %| j	| j| j| j| j| j| j | j| j&| j!| j| j'| j"\| _(| _)| _*| _+dS )a  Set up configurations that are dependent on GEMM inputs

        This method configures various attributes based on the input tensor properties
        (data types, leading dimensions) and kernel settings:
        - Configuring tiled MMA
        - Computing MMA/cluster/tile shapes
        - Computing cluster layout
        - Computing multicast CTAs for A/B
        - Computing epilogue subtile
        - Setting up A/B/C stage counts in shared memory
        - Computing A/B/C shared memory layout
        r?   r>   )tiler_mnr=   )r   rH   r>   orderN)permutation_mnkrH   moderK   r   sm_),
sm90_utilsmake_trivial_tiled_mmar4   b_dtypea_layoutsm90_mma_major_modeb_layoutr3   rY   rW   	tiled_mmar   cutemake_ordered_layoutmake_tiled_mmamake_mma_atomopsize	shape_mnkmake_layoutr6   cluster_layout_mnk$_sm90_compute_tile_shape_or_overrided_dtyperr   _compute_stagesc_dtyperd   re   rf   rT   r^   rm   rn   epi_c_stager7   sched_stage_make_smem_layoutsd_layoutc_layoutro   rp   rq   epi_c_smem_layout_staged)ru   r~   atom_npermutation_nmma_inst_shape_kmma_inst_tile_kr.   r.   r/   _setup_attributes  s~   	
zGemmSm90._setup_attributesmAmBmDmCscheduler_argsvarlen_argsstreamc	                    s  |j _|j _|dur|j nd_|dur|j nd_t|_t|_|dur1t|nd_	|dur=t|nd_
tjjdkoLjjkrZtdj dj tjjjjkrrtdjj djj tjjdko~jjdkrtdt|du rt }|jdujksJ |jdu}	|jdu}
dtjfd	d
fdd||fD \}}| tjd}tjd}d\}}tj r|
rjstj|ddn||jd jd fjd \}}|
rtj|ddn||jd jd fjd \}}tj|_tj r- jtj|7  _d\}}t|dur\j |	rEtj|dddn|j!j"t#|drT|j$sVdndd\}}d\}}t|durtj |j%j"dd\}}&|t'(|}j)|	d}*||||||}|(|}|+||j,}|durt-j!nd|durt-j%nd tj.G  fddd}|_/0j1|tj r|n||||||||j2jjj!j%||j3|j4ddgj|dd dS )a  Execute the GEMM operation in steps:
        - Setup static attributes
        - Setup TMA load/store atoms and tensors
        - Compute grid size
        - Define shared storage for kernel
        - Launch the kernel synchronously

        :param mA: Input tensor A
        :type mA: cute.Tensor
        :param mB: Input tensor B
        :type mB: cute.Tensor
        :param mD: Output tensor D
        :type mD: cute.Tensor
        :param stream: CUDA stream for asynchronous execution
        :type stream: cuda.CUstream
        NrF   zType mismatch: z != zType width mismatch: r=   z#a_dtype should be float16 or float8tc                    s   t  fdd jD S )Nc                 3   s4    | ]}t |st j|d  jj dn|V  qdS )r@   )divbyN)r   	is_staticassumeelement_typerU   ).0sr   r.   r/   	<genexpr>  s
    "
z8GemmSm90.__call__.<locals>.new_stride.<locals>.<genexpr>)tuplestrider   r.   r   r/   
new_stride  s   z%GemmSm90.__call__.<locals>.new_stridec              	      s6   g | ]}|d urt |jt j|j |dnd qS )Nr   )r   make_tensoriteratorr   shape)r   r   )r   r.   r/   
<listcomp>  s    z%GemmSm90.__call__.<locals>.<listcomp>NNr   NNr>   )
ragged_dimr   rH   T)r   	ptr_shiftadd_to_outputstoreadd)op_typeloadvarlen_mc                       sF  e Zd ZU ejjejjd f e	d< ejjejj
d f e	d< ejjejjd f e	d< ejjejd f e	d< ejjejjjdurJjnef jf e	d< ejjejjjdurcjne f jf e	d	< e	d
< ejjejjjejf jf e	d< ejjejjjejf jf e	d< dS )z(GemmSm90.__call__.<locals>.SharedStoragerH   ab_pipeline_array_ptrepi_pipeline_array_ptrsched_pipeline_array_ptrrK   
sched_dataNsDsCepisAsB)r"   r#   r$   r   structMemRangerd   Int64rm   __annotations__r   r   r   Alignr   rt   r   epi_get_smem_structr4   cosizero   r   rp   r.   )epi_c_smem_sizeepi_smem_sizeepilogue_paramsru   r.   r/   SharedStorage  s<   
 r   )gridblockclusterr   min_blocks_per_mp)5r   r4   r   r   r   r   from_tensorr   r   r   r   r   rU   	TypeErrorr   mAIdxr:   mCuSeqlensMmCuSeqlensKr   Tensorr   slice_ro   rp   _make_tma_atoms_and_tensors
copy_utilscreate_ragged_tensor_for_tmarW   r6   size_in_bytesnum_tma_load_bytes_make_tma_epi_atoms_and_tensorsrq   rr   hasattrr   r   epi_to_underlying_argumentsr   to_underlying_argumentsget_scheduler_classget_scheduler_argumentsget_grid_shapemax_active_clustersr   r   rs   kernelr   r   launchrc   )ru   r   r   r   r   r~   r   r   r   r   varlen_ka_smem_layoutb_smem_layout
tma_atom_atma_tensor_a
tma_atom_btma_tensor_b
tma_atom_dtma_tensor_d
tma_atom_ctma_tensor_cvarlen_paramsTileSchedulerClstile_sched_argstile_sched_paramsr   r   r.   )r   r   r   r   ru   r/   __call__g  s   










zGemmSm90.__call__r   r   mA_mklr   mB_nklr   mD_mnlr   mC_mnlr  r   r   r   epi_smem_layoutepi_c_smem_layoutr  c           [      C   s@  t |jdu}t |jdu}|r|rJ t | jr|s|sJ t |du}t |	du}tjtj }|| jkrM||||fD ]}t |durLt	
| q?tj }|| j}| j|tdg|jR |j d}d}t |r| jt|d|j d}d}d}t | jr| j||j |d}|jd| jf}t| jdd d	d
 |j j|j!|j"d}|j#j|j!|j"d} d}!t |r|j$j|j!|j"d}!d}"t |r|j%j|j!|j"d}"| &|
|}#t'j(|t)|s|j*du r|jd n|j*jd t)|jd d}$t+|j(|||}t,| jdd d || jkrtj-| j. || jkr|| j| j/ k r| j/dkp8|| jk}%tjtj0 }&|1|&}'tj2||'dd}(tj2||'dd})| j3r]|(nd}(| j4re|)nd})| j/dkpq|| jk}*t t5|dkr|*otj0 dk}*| }+|+6 },t7t8j9j:| j;}-|,j<r|,j=}.|.d }/t | j r|$>||/}0t?|0t@| jAddg|.d df}1n9|$B|/}2t |rt?|2| jAd f|.d f}3|}0n|sJ tC|2| jAd f}3t?|| jAd f|.d df}0t?|$D||/t@| jAddg|.d df}4|$E|/}5|$F|/}6d}7t | j r;tGjH||'d tt|dj|1||(d\}7}8}8nS| I|jJ| jK| j/d }9tjL d tjjM| j  }:|9N|:};d\}7}<t |rxtGjO|;|0||3|5|.d | jAd   |6d}7ntGjP|;|0||3|5|.d | jAd   |6d\}7}<tGjH||'d tt|dj|4| |)d\}=}8}8tQ|6| jAd }>t | j r| R||-|7|=|>}-n| jS||-|7|<|=|>|d}-|+jT|*d |+U },|,j<st | jVo| r|*r|+W|, |+U },|X|- |*r|+X  || jk rtjY| jZ t[| jV r|dkp | jVo |dkp |dk}%tjL \}:}8}8tj|:| j\ }?t | jVr>|:| j\ }:tjt | jV rJ| j]nd| j\d}@|N|@| jVsY|?nd}At^_|A| jA|| \}B}C}Dd}Et | j`ryta|Bj| jb}Et+t^jc||B|C|D}Ft | jVr|?dkr| jdddd | jdddd tQ|jd | jAd }Gt5tQ| jAdd | je}Ht7t8j9jf| j;}I| g }Jt7t8j9jf| jh}Kt7t8j9j:| jh}L| }+|+6 },t | jVr|dkr|Ki|H |Li|H t | r|Ii|G n|$jF|,j=d d}6tQ|6| jAd }>|Ii|> |+T  |+U },|,j<r|,j=}.|.d }/|$F|/}6tQ|6| jAd }>| j||I|F|B|E|>|?}It |rL|>dkrL|Bkd t | jVrX| l|?d t8jmtntojp| jqtjjM d }Md}Nt |r| r||$s||/| jAdd | je|!|.\}N}8}8d}Ot |r| r||$s|	|/| jAdd | je|"|.\}P}8}8tGt|P|}O| judur| juntjv}Q| w|| jx|Q|!|:\}R}S}TtC|B|Sjy}Ut+| jz|U}Vt |r| {|| j|| j}|"|Sjy|:\}W}X}Y}Znd!\}W}Z}X}Y| ~|
|B||.|: | |
|#||J|K|L| je|V|S|Xd|R|T|W|Y|Z|N|O|.|$|M|+|:|%\}K}Lt | jVr*|%r!|JX  | jdd|? dd" t | jV r:|+T  |+U },nL|Ki|H |Li|H t | r[|Ii|G |+jT| j]d# |+U },n+|+T  |+U },|,j<r|$jF|,j=d d}6tQ|6| jAd }>|Ii|> |+T  |+U },|,j<st | jV r|%r|JX  dS dS dS dS )$a7  
        GPU device kernel performing the batched GEMM computation.

        :param tma_atom_a: TMA copy atom for A tensor
        :type tma_atom_a: cute.CopyAtom
        :param mA_mkl: Input tensor A
        :type mA_mkl: cute.Tensor
        :param tma_atom_b: TMA copy atom for B tensor
        :type tma_atom_b: cute.CopyAtom
        :param mB_nkl: Input tensor B
        :type mB_nkl: cute.Tensor
        :param tma_atom_d: TMA copy atom for D tensor
        :type tma_atom_d: cute.CopyAtom
        :param mD_mnl: Output tensor D
        :type mD_mnl: cute.Tensor
        :param tiled_mma: Tiled MMA object
        :type tiled_mma: cute.TiledMma
        :param cluster_layout_mnk: CTA layout
        :type cluster_layout_mnk: cute.Layout
        :param a_smem_layout: Shared memory layout for A
        :type a_smem_layout: cute.ComposedLayout
        :param b_smem_layout: Shared memory layout for B
        :type b_smem_layout: cute.ComposedLayout
        :param epi_smem_layout: Shared memory layout for epilogue
        :type epi_smem_layout: cute.ComposedLayout
        Nr>   )r   cluster_layout_vmnkab_pipeline_mbar_ptrr   )c_smem_layoutepi_pipeline_mbar_ptr)sched_pipeline_mbar_ptrr   rK   T)cluster_shape_mn
is_relaxed)swizzler   )len_m_staticlen_k_static)r  r   rI   rH   )r   Nr   )	cta_coord
cta_layout
src_tensor
dst_tensor
mcast_maskrE   r   )limit_mlimit_k)Nr   r   r   )is_scheduler_warpr   mma)warp_group_idxstager   )	batch_idxg        )
barrier_idnum_threads)NNNNr!  )advance_count)r   cu_seqlens_mcu_seqlens_kr:   r   rT   make_warp_uniformwarp_idxrj   r   prefetch_descriptorrd   re   SmemAllocatorallocaters   make_ab_pipeliner   r   r   data_ptrmake_epi_pipeliner   r   r8   make_sched_pipeliner   r   
get_tensorr   r	   r6   r   outerinnerr   r   r   epi_get_smem_tensorsr   creater   r   r   r
   setmaxregister_decreaserk   ri   block_idx_in_clusterget_flat_coordmake_layout_image_maskr\   r]   r   initial_work_tile_infor   pipelinePipelineUserTypeProducerrm   is_valid_tiletile_idxoffset_batch_A
local_tileselectrW   offset_batch_AIdxflat_divideoffset_batch_Blen_mlen_kr   tma_get_copy_fn_make_gmem_tiled_copy_Ar   r   
thread_idx	WARP_SIZE	get_slicegather_m_get_copy_fngather_k_get_copy_fnceil_divload_ABload_AB_gather_Aadvance_to_next_workget_current_workr7   write_work_tile_to_smemproducer_tailsetmaxregister_increaserl   r   rb   ra   quack_sm90_utilspartition_fragment_ABCrV   make_rmem_tensorr3   
gemm_w_idxpingpong_barrier_arriverr   Consumermake_epi_store_pipeliner   advance_itersr  fillpingpong_barrier_syncNamedBarrierintr!   r'   rh   epilog_gmem_copy_and_partitionoffset_batch_epitma_producer_copy_fnr   BFloat16epilog_smem_store_and_partitionr   layoutepi_load_acc_subtileepilog_smem_load_and_partitionr   r   epi_visit_accepilogue)[ru   r   r   r  r   r  r   r  r   r	  r   r  r   r   r   r
  r  r  r  r   r   has_Dhas_Cr*  tma_atomsmemstorageab_pipelineepi_pipelinesched_pipeliner   r   r   r   r   epi_smem_tensorsvarlen_manageris_tma_warpcta_rank_in_clusterblock_in_cluster_coord_mnka_mcast_maskb_mcast_maskr  tile_scheduler	work_tileab_producer_statetile_coord_mnklr"  mA_mkgA_mkmAIdx_mkgAIdxgB_nkrG  rH  copy_A_tiled_copy_Atidx
thr_copy_A
prefetch_Acopy_B
k_tile_cntr   warp_group_thread_layoutthr_mmaacctCrAtCrBacc_slowmma_fnk_tile_cnt_static
c_tile_cntab_read_stateepi_store_pipelineepi_read_stateepi_producer_stateepilogue_barriercopy_Dcopy_C	copy_C_fnd_dtype_for_layouttiled_copy_r2stRS_rDtRS_sDtRS_rAccload_acc_subtiletiled_copy_s2rtRS_rCtSR_rCtSR_sCr.   r.   r/   r   #  s  1





















	
	

	i





























v  4zGemmSm90.kernelNrs  r  r  r  r  copy_SFAcopy_SFBreturnc                 C   s   t |d u}t |r|d usJ td}	d|k r||}	tj|ddD ]N}
|||	 ||}|j}t |d urA||
||d ||
||d t |rZ||
||d ||
||d || |	  td}	|
d |k rr||}	q$|S )NTr   r>   unrolltma_bar_ptr)
r   r   producer_try_acquirerd   rangeproducer_acquireproducer_get_barrierindexproducer_commitadvance)ru   rs  r  r  r  r  r  r  blockscaledpeek_ab_empty_statusk_tiler  smem_idxr.   r.   r/   rQ    s.   



zGemmSm90.load_ABr  r   c                 C   s  t jt j }td}	d|k r||}	tj|d ddD ]R}
d}t|d ur.||
f}|| j	|
| j
  k}|||	| |j}|rP||}||
||d ||
|g|R   || |  td}	|
d |k rq||}	qd|k r|d }
d}t|d ur||
ddf}|| j	t|r|
| j
 nd k}|||	| |j}|r||}||
||d ||
|g|R ddi || |  |S )	NTr   r>   r  r.   r  )predr  )r   rT   r)  r*  r   r  rd   r  r   rj   ri   r  r  r  producer_cpasync_commitr  )ru   rs  r  r  r  r  r  r   r*  r  r  prefetch_outrx  r  r  r.   r.   r/   rR  
  sL   






zGemmSm90.load_AB_gather_Ar  r  r  r  r   c                 C   s   d}|  }	t||}
t| jr| j|dd td}d|k r$||}td}t|
D ](}|	|| ||j
|j
|d td}|  td}|d |k rU||}q-t| jrgtd ||  tj|
|ddD ]V}|	|| t| jrtd}||j
|j
|d td}t| j rt| ntd || |   ||	 |  |	  td}|d |k r||}qot| jr| jd| dd t| j rtd tj|
ddD ]}||	 |	  qt| jr||  |S )	Nr>   r  r%  Tr   )A_idxB_idx	zero_initFr  )cloneminr   r7   ra  r   consumer_try_waitrd   r  consumer_waitr  r  rV   r   
wait_groupr   r   consumer_releaser\  )ru   rs  r  r  r  r  r  r   k_pipe_mmasab_release_statenum_prologue_mmapeek_ab_full_statusr  r  r.   r.   r/   r  C  s`   














zGemmSm90.mmaparamsrv  .rt  r  r  r  rr   r  r  r  tiled_copy_t2rr  r  r  r  r  r  r  r  rw  r  r  rx  c           ,      C   s  t |
d u}t |d u}| |||||||}tt| jd d |jd }tj|dd}t|}|j	| }| 
|||||||||	} t |d urutjt|| jddD ]}!||!}"|rp|| ||"|d || |  qWt|D ]C}!||!}#||	|! | || |#}$t |r|| t||d d d |jf | tj  tj  tj  || W d    n1 sw   Y  |  t |d uo|!| j |k r||!| j }"|r|| ||"|d || |  | ||$|	|
}%t |d ur| |%|$d ||||!}&|r|  |  ||! | j  }'t |rzt | j!t"j#ko>| j$tj%ko>| j&tj'krm|$d |d d	 |d d
  |d d  ||! d   }(t()||	|d d d |'f |(| nt(*||	|d d d |'f  t |d ur|\})}*}+t|)|)+|&|*d d d |'f  tj  |  |rt |r||'|#d t |d ur|+|'|#d |  qz| ,|| |||||| ||fS )NrH   r>   )r>   r   r   r  )src_idxproducer_statesr_seedr   i  i  rI         )r  dst_idx)-r   epi_setup_postactr   zipped_divider   rW   r   r   r   num_tiles_executed	epi_beginrd   r  r  r   get_hier_coordr  r  r  range_constexprepi_begin_loopr  copyr  rT   fence_view_async_shared	sync_warp	elect_oner  epi_visit_subtileepi_convert_postactarrive_and_waitrn   rounding_moder    RSr3   r   r   rg  r   sr_cvt_copycvt_copyretileepi_end),ru   r  rv  rt  r  r  r  rr   r  r  r  r  r  r  r  r  r  r  r  r  rw  r  r}  r  rx  ro  rn  postact_ctxepi_tile_shapeepi_tile_layoutepi_tile_numnum_prev_subtilesepi_tensorsepi_idxgmem_coord_C
gmem_coordepi_loop_tensorstRS_rPostActtRS_rPostAct_out
epi_bufferseedtiled_copy_postact_r2stRS_sPostActcopy_postactr.   r.   r/   rm    s   























zGemmSm90.epiloguec                 C   s   |st S tS )zPReturn the scheduler class to use. Override in subclasses for custom schedulers.)r   r   )ru   r   r.   r.   r/   r   ,  s   zGemmSm90.get_scheduler_classc                 C   sx  t | j r
tj}nt | jdko| jrtj}nt |jdur#tj}ntj	}t |j
du rs|dur6|jd n|jdu r@|jd n|jjd d }t|jd | jd t|jd | jd |f}	t|	|j|j| j|j|j|d}
|
S |dus|jdus| jrJ dt|jd | jd |j
jd d f}	t|	|dur|jd n|jjd |j
|j|j| jdd | j|j|d	}
|
S )zICreate scheduler arguments. Override in subclasses for custom schedulers.r<   NrH   r   r>   )problem_shape_ntile_mnlraster_order
group_sizer6   tile_count_semaphorebatch_idx_permutepersistence_mode)	r  total_mr'  r  r  r5   r6   r  r  )r   r8   r   NONErT   r;   CLCr  DYNAMICSTATICr   r   r   r   rP  rW   r   r  max_swizzle_sizer6   r  mPostActr:   r   r   )ru   r   r   r   r   r   r~   r  num_problemsr  r  r.   r.   r/   r   0  sZ   




z GemmSm90.get_scheduler_argumentsr  r  c                 C   s   t |d d d |f | d S N)r   autovec_copy)ru   r  r  r  r.   r.   r/   rj  o  s   zGemmSm90.epi_load_acc_subtilec
           
      C      dS Nr.   r.   )
ru   r  rv  rr   r  r  r  rw  r  r  r.   r.   r/   r  s  s   zGemmSm90.epi_beginr  	epi_coordc                 C   r  r	  r.   )ru   r  r  r
  r.   r.   r/   r    s   zGemmSm90.epi_begin_loopr  c                 C      d S r  r.   )ru   r  r  r  r  r.   r.   r/   r    s   zGemmSm90.epi_visit_subtilec                 C   r  r  r.   )ru   r  r  r   r  r  r.   r.   r/   rl    s   zGemmSm90.epi_visit_accc	           	      C   r  r  r.   )	ru   r  r  rr   r  r  r  rw  r  r.   r.   r/   r    s   zGemmSm90.epi_end)locipargsc                C   s   |   S r  )EpilogueParams)ru   r  r  r  r.   r.   r/   r     s   z$GemmSm90.epi_to_underlying_argumentsc                C   s   g S )zSubclasses can override thisr.   )ru   r  r  r  r.   r.   r/   epi_get_tma_atoms  s   zGemmSm90.epi_get_tma_atomsrW   c                 C   r  Nr   r.   )r  rW   rr   r.   r.   r/   epi_smem_bytes_per_stage  s   z!GemmSm90.epi_smem_bytes_per_stagec                 C   s   t jjtdf S r  )r   r   r   r   )ru   r  r.   r.   r/   r     s   zGemmSm90.epi_get_smem_structc                 C   s   t  S r  )r   )ru   r  rr  r.   r.   r/   r5    s   zGemmSm90.epi_get_smem_tensorsr!  r  r   c                 C   B   |dv sJ |dkrt jnt j}tjjt|| d| j d d S Nr  r  rH   )r#  number_of_threads)r!   r)   r+   r   rT   barrierrc  rb   ru   r   r!  r  r.   r.   r/   ra       

zGemmSm90.pingpong_barrier_syncc                 C   r  r  )r!   r)   r+   r   rT   barrier_arriverc  rb   r  r.   r.   r/   r\    r  z GemmSm90.pingpong_barrier_arrivec                 C   sP   t tj| jd ur| j nd| jd d dkrdnddt}t ||}|S )NFr>   rF   r   rK   rH   )num_matrices)	r   make_copy_atomr   StMatrix8x8x16bOpr   is_m_major_crr   r   make_tiled_copy_C_atom)ru   r   copy_atom_Ctiled_copy_C_atomr.   r.   r/   epilog_smem_copy_atom  s   zGemmSm90.epilog_smem_copy_atomr   dtyper   c                 C   s   |d u rt j}| |}tj||| jd}t||}||}	|d ur)|		|nd }
|d ur6|j
d d n| j}|	t|j
}t|| j}|||
fS )N)	elem_ty_delem_ty_accrH   )r   	ROW_MAJORr"  r   sm90_get_smem_store_opr3   r   make_tiled_copy_SrM  partition_Dr   rr   partition_Smake_identity_tensorrZ  )ru   r   r   r#  r   r  r!  copy_atom_r2sr  thr_copy_r2sr  sD_shapetRS_rD_shaper  r.   r.   r/   rh    s   


z(GemmSm90.epilog_smem_store_and_partitionr   r   tRS_rD_layoutc                 C   sX   |  |}t||}t||}	|	|}
|
|}t||}|
|}|	|||fS r  )	r"  r   sm90_get_smem_load_opr   r(  rM  r*  rZ  r  )ru   r   r   r#  r   r0  r  r!  copy_atom_s2rr  thr_copy_s2rr  r  r  r.   r.   r/   rk    s   
	


z'GemmSm90.epilog_smem_load_and_partitionatommD_mnc                 C   sh   t |||d d }t ||}t|jtjtjf}	|	r!||fn||f\}
}tj	|dt 
d|
|dS )NrH   r   r>   )r  r  r  r  )r   rB  r  
isinstancer   r   CopyBulkTensorTileS2GOpCopyReduceBulkTensorTileS2GOpr   rI  r   )ru   r4  r5  r5   rr   r   r  gDtDgD_for_tma_partitionis_s2gr  r  r.   r.   r/   rd    s   
z'GemmSm90.epilog_gmem_copy_and_partitionr  r  c           
   	   C   s   t | j rdnd| jd  }ttjj|}| j| j d }||j	 t
jj }ttjj|}| js6tjnt}	|	j|| j||| j|ddS )Nr>   rE   T)barrier_storage
num_stagesproducer_groupconsumer_grouptx_countcta_layout_vmnk
defer_sync)r   r:   ri   r<  CooperativeGroupAgentThreadrZ   r[   r   r   rT   rL  PipelineTmaAsyncr   r6  rm   r   )
ru   r   r  r  producer_cntab_pipeline_producer_group
mcast_sizeconsumer_arrive_cntab_pipeline_consumer_grouppipeline_clsr.   r.   r/   r.  !  s"   zGemmSm90.make_ab_pipeliner  r  c                 C   sL   t t jj}| j}t t jj|}t| j|}t jj	|| j
|||ddS )NT)r<  r=  r>  r?  r@  rB  )r<  rC  rD  rE  rh   r   r   r   rF  r6  r   )ru   r  r  epi_pipeline_producer_grouprJ  epi_pipeline_consumer_grouptma_copy_c_bytesr.   r.   r/   r0  ;  s   zGemmSm90.make_epi_pipelinec                 C   s0   | j tjj }ttjj|}tjj	| j
|dS )N)r=  r>  )rh   r   rT   rL  r<  rC  rD  rE  PipelineTmaStorer6  rn   )ru   num_epi_threadsepi_store_producer_groupr.   r.   r/   r^  O  s
   z GemmSm90.make_epi_store_pipeliner  r   c                 C   sz   t t jj}t|}| jr|r| jndd | j | }t t jj|}t j	j
|| j||t|dkr8d ddS dddS )Nr>   rK   r   T)r<  r=  r>  r?  consumer_maskrB  )r<  rC  rD  rE  r   r   r7   ra   ri   PipelineAsyncr6  r   r   )ru   r   r  r   sched_pipeline_producer_groupcluster_sizerJ  sched_pipeline_consumer_groupr.   r.   r/   r1  W  s*   
zGemmSm90.make_sched_pipeliner   r   r   rg   r^   c
                 C   s  |d dkrdnd}
|durt ||j d nd}|| ||| }||
 }|du r-dn	|d dkr5dnd}|durI|t ||j d | 7 }t |d}t |d	}t ||j d t ||j d  }d
}||	 | | }|| }|dkr|
|||  | 7 }
||
|fS )a  Computes the number of stages for A/B/C operands based on heuristics.

        :param cta_tile_shape_mnk: The shape (M, N, K) of the CTA tile.
        :type cta_tile_shape_mnk: Tuple[int, int, int]
        :param a_dtype: Data type of operand A.
        :type a_dtype: type[cutlass.Numeric]
        :param b_dtype: Data type of operand B.
        :type b_dtype: type[cutlass.Numeric]
        :param smem_capacity: Total available shared memory capacity in bytes.
        :type smem_capacity: int
        :param occupancy: Target number of CTAs per SM (occupancy).
        :type occupancy: int

        :return: A tuple containing the computed number of stages for:
                 (A/B operand stages, epilogue stages)
        :rtype: Tuple[int, int]
        r>   rF   rK   rH   Nr=   r   Nr   Nr   NNrS   )r   r   rU   r  r   )clsrW   rr   r4   r   r   r   r~   rg   r^   rn   d_bytes_per_stageepi_bytes_per_stage	epi_bytesr   a_shapeb_shapeab_bytes_per_stagembar_helpers_bytesremaining_bytesrm   r.   r.   r/   r   q  s&     &
zGemmSm90._compute_stagesrY   r   epi_tile_overridec                 C   s  |dur|S | d d dkr0|d dkr0t dtj| dgd}t dtj| dgd}||fS | d d dkrZ|d dkrZt dtj| dgd}t dtj| dgd}||fS |dure|jdkred	nd}t d	tj| dgd}t |tj| dgd}||fS )
aV  Compute the epilogue tile shape or use override if provided.

        :param cta_tile_shape_mnk: CTA tile shape (M,N,K)
        :type cta_tile_shape_mnk: Tuple[int, int, int]
        :param element_type: Data type of elements
        :type element_type: type[cutlass.Numeric]
        :param is_cooperative: Whether to use cooperative approach
        :type is_cooperative: bool
        :param epi_tile_override: Optional override for epilogue tile shape
        :type epi_tile_override: Tuple[int, int] or None

        :return: Computed epilogue tile shape
        :rtype: Tuple[int, int]
        Nr   r@   r>   r   rE   rA   r=   r?   )r_   gcdr   r   rU   )rW   rY   r   rc  tile_mtile_nn_perfr.   r.   r/   r     s   z-GemmSm90._sm90_compute_tile_shape_or_overrider   r   rm   rn   r   c                 C   s  t | d}| tjjk}| tjjk}| |rdnd }tt||||}t j	|t 
|||r5dndd}t | d}| |rDdnd }tt||||}t j	|t 
|||r^dndd}d	}|d	urpt||||	}d	}|
d	ur|d	us|J t|
|||}||||fS )
ag  Create shared memory layouts for A, B, and C tensors.

        :param cta_tile_shape_mnk: CTA tile shape (M,N,K)
        :type cta_tile_shape_mnk: Tuple[int, int, int]
        :param epi_tile: Epilogue tile shape
        :type epi_tile: Tuple[int, int]
        :param a_dtype: Data type for matrix A
        :type a_dtype: type[cutlass.Numeric]
        :param a_layout: Layout enum for matrix A
        :type a_layout: LayoutEnum
        :param b_dtype: Data type for matrix B
        :type b_dtype: type[cutlass.Numeric]
        :param b_layout: Layout enum for matrix B
        :type b_layout: LayoutEnum
        :param ab_stage: Number of stages for A/B tensors
        :type ab_stage: int
        :param d_dtype: Data type for output matrix D
        :type d_dtype: type[cutlass.Numeric]
        :param d_layout: Layout enum for the output matrix C
        :type d_layout: LayoutEnum
        :param epi_stage: Number of epilogue stages
        :type epi_stage: int

        :return: Tuple of shared memory layouts for A, B, and C
        :rtype: Tuple[cute.ComposedLayout, cute.ComposedLayout, cute.ComposedLayout]
        rX  rH   r   )r   r>   rH   )r>   r   rH   r   rY  r>   N)r   r   r   r   OperandMajorModeKmake_smem_layout_atomr   get_smem_layout_atomtile_to_shapeappendrX  make_smem_layout_epi)rW   rr   r4   r   r   r   rm   r   r   rn   r   r   r   a_smem_shapea_is_k_majorb_is_k_majora_major_mode_sizea_smem_layout_atomro   b_smem_shapeb_major_mode_sizeb_smem_layout_atomrp   rq   r   r.   r.   r/   r     sP   ,



zGemmSm90._make_smem_layoutstensor_drq   r   )r   r   r   c           	      C   sv   |dv sJ t |d}t t | j|}|dkrt n|dkr&t ntt j	j
}t|| ||\}}||fS )a  Create TMA atoms and tensors for storing D or loading C.

        :param tensor_d: Output tensor D
        :type tensor_d: cute.Tensor
        :param epi_smem_layout_staged: Shared memory layout for epilogue
        :type epi_smem_layout_staged: cute.ComposedLayout
        :param epi_tile: Epilogue tile shape
        :type epi_tile: Tuple[int, int]

        :return: TMA atom and tensor for C
        :rtype: Tuple[cute.CopyAtom, cute.Tensor]
        )r   r   r   r   r   r   )r   r   compositionmake_identity_layoutr   r   CopyBulkTensorTileG2SOpr7  r8  ReductionOpADDmake_tiled_tma_atom)	rw  rq   rr   r   r
  d_cta_v_layoutr   r   r   r.   r.   r/   r   0  s   z(GemmSm90._make_tma_epi_atoms_and_tensorstensorsmem_layout	smem_tile	mcast_dimc                 C   s8   |dkrt  nt  }t j|| |||d\}}||fS )a  Create TMA atoms and tensors for input tensors.

        :param tensor: Input tensor (A or B)
        :type tensor: cute.Tensor
        :param smem_layout: Shared memory layout for the tensor
        :type smem_layout: cute.ComposedLayout
        :param smem_tile: Shared memory tile shape
        :type smem_tile: Tuple[int, int]
        :param mcast_dim: Multicast dimension
        :type mcast_dim: int

        :return: TMA atom and tensor
        :rtype: Tuple[cute.CopyAtom, cute.Tensor]
        r>   )num_multicast)r   rz   CopyBulkTensorTileG2SMulticastOpr}  )r  r  r  r  r   rp  
tma_tensorr.   r.   r/   r   R  s   
z$GemmSm90._make_tma_atoms_and_tensorsr@   c                 C   s   t jtjtjjd||d}||j }d| }t | jd | }||kr+t	
||}t j|| |f|dfd}	|tjkr^t | jd | }
|
|krQt	
|
|}
t j|
||
 fd|
fd}	|tjkrjt d|fnt |df}t ||	|S )N)
cache_mode)num_bits_per_copyrS   rH   r>   r   r   )r   r  r   	CopyG2SOpLoadCacheModeGLOBALrU   r   rW   r_   rd  r   r   r&  make_tiled_copy_tv)ru   r#  
major_moder$  	copy_bitsatom_async_copy
copy_elemsloads_per_cache_lineshape_dim_1thread_layoutshape_dim_0value_layoutr.   r.   r/   rJ  u  s2   


z GemmSm90._make_gmem_tiled_copy_Aa_majorb_majorc                 C   s   d}| t tjtjtjhvrd}|t tjtjtjhvrd}|tt hvr$d}|dtt tjtjtjhvr3d}| jdkr>| |kr>d}| j|jkrFd}| jdkrO|dksX|jdkrZ|dkrZd}|S )a  
        Check if the dtypes are valid

        :param a_dtype: The data type of tensor A
        :type a_dtype: Type[cutlass.Numeric]
        :param b_dtype: The data type of tensor B
        :type b_dtype: Type[cutlass.Numeric]
        :param acc_dtype: The data type of the accumulator
        :type acc_dtype: Type[cutlass.Numeric]
        :param d_dtype: The data type of the output tensor
        :type d_dtype: Type[cutlass.Numeric]
        :param a_major: major mode of tensor A
        :type a_major: str
        :param b_major: major mode of tensor B
        :type b_major: str

        :return: True if the dtypes are valid, False otherwise
        :rtype: bool
        TFNrF   r=   k)r   rd   rg  Float8E4M3FN
Float8E5M2r   rU   )r4   r   r3   r   r  r  is_validr.   r.   r/   is_valid_dtypes  s.   $zGemmSm90.is_valid_dtypes)FTFFFr   )T)Fr  )r@   )Xr"   r#   r$   __doc__rT   r   r2   r   r  r   rd   Numericr   rc  boolr}   r   r   jitr   r   r   r   r   cudaCUstreamr  r   TiledMmaCopyAtomr   ParamsLayoutComposedLayout	Constexprr   r<  rT  PipelineStater   rQ  rR  r  Tile	TiledCopyThrCopyCoordrb  r   rm  r   r   rj  r  r  r  rl  r  r   listr  staticmethodr  r   r5  r   ra  r\  r"  r   rh  rk  r   rd  Pointerr.  r0  r^  r1  classmethodr   r   r   r   r   rJ  strr  r.   r.   r.   r/   r0   [   s   $
	

 Y	 <	
   A		
'		8	D
	
 $
?	



	












	

9
(
	
[
!

"r0   )?r%   typingr   r   r   r   r   r   	functoolsr   r_   cuda.bindings.driverbindingsdriverr  rd   cutlass.cuter   cutlass.pipeliner<  r	   r
   cutlass.cute.nvgpur   r   r   cutlass.utils.hopper_helpersre   hopper_helpersr   r   r   r   r   r   cutlass.utilsr   dataclassesr   quack.cute_dsl_utilsr   quack.tile_schedulerr   r   r   r   r   r   quack.varlen_utilsr   r   quack.pipeliner   r   quack.copy_utilsr   quack.sm90_utilsrX  quack.roundingr    IntEnumr!   r0   r.   r.   r.   r/   <module>   s0     &