o
    3/i1f                     @   s0  d Z ddlZddlZddlmZ ddlZddlmZ ddlmZm	Z	m
Z
 ddlmZmZ ddlmZ ddlmZ ddlmZ ddlmZ G dd dZdd
dZG dd dZG dd deZG dd deZG dd deZG dd deZG dd deZejdd ZejdddZ G dd deZ!dS ) a  Composable epilogue operations (EpiOps) for GEMM kernels.

Each EpiOp encapsulates a single tensor kind's behavior across the epilogue lifecycle:
smem allocation, begin (one-time per-tile setup), begin_loop (per-subtile extraction),
end (cleanup).

The ops are composed via ComposableEpiMixin which iterates over a static _epi_ops tuple
to generate epi_smem_bytes_per_stage, epi_get_smem_struct, epi_get_smem_tensors,
epi_begin, and epi_begin_loop automatically.
    N)partial)BooleanFloat32
const_expr)assume_stride_divisibilitysetup_epi_tensor)partition_for_epiloguec                   @   s   e Zd ZdZdZdd ZdS )
EpiContextzGShared context passed to EpiOp.begin methods. Bundles common arguments.)epi_tiletiled_copy_t2rtiled_copy_r2stile_coord_mnklvarlen_managerepilogue_barriertidxpartition_for_epilogue_fnnum_epi_threads	batch_idxtile_Mtile_Nc	           	      C   s   || _ || _|| _|| _|| _|| _|| _|jd | _|jd | _	|d | _
|jtjj | _tt||d ur7|n|||d u d| _d S )Nr         r
   
tiled_copyr   reference_src)r
   r   r   r   r   r   r   cta_tile_shape_mnkr   r   r   num_epi_warpscutearch	WARP_SIZEr   r   r   r   )	selfgemmr
   r   r   r   r   r   r    r"   Z/lsinfo/ai/hellotax_ai/llm_service/venv_vllm/lib/python3.10/site-packages/quack/epi_ops.py__init__.   s$   
zEpiContext.__init__N)__name__
__module____qualname____doc__	__slots__r$   r"   r"   r"   r#   r	      s    r	   Tc                 C   s  |r| j n| j}t|}t| jd t| jd }}t|t||f}t| tjj	 }tjtjj	|dfdd}t||}	t
t|	dg}
t
t|	dg}t|
|}tjtjj	|dfdd}t||}t
t|dg}t
t|dg}t||}||fS )u  Derive lane and warp layouts along M and N from the epilogue tiled_copy.

    Follows the CUTLASS Sm90RowReduction / Sm90ColReduction pattern.
    Uses layout_src_tv_tiled (SM90, reference_src=True) or
    layout_dst_tv_tiled (SM100, reference_src=False), matching the C++ impl's
    get_layoutS_TV / get_layoutD_TV selection.

    Returns (lane_layout_MN, warp_layout_MN) where each is a 2D layout (M, N):
      lane_layout_MN[0] = lane_M: (lanes_in_M):(lane_stride_M) — e.g. 8:4
      lane_layout_MN[1] = lane_N: (lanes_in_N):(lane_stride_N) — e.g. 4:1
      warp_layout_MN[0] = warp_M: (warps_in_M):(warp_stride_M) — e.g. 4:1
      warp_layout_MN[1] = warp_N: (warps_in_N):(warp_stride_N) — e.g. 1:0

    For RowVecReduce (reduce along M): shuffle across lane_M, smem reduce across warp_M.
    For ColVecReduce (reduce along N): shuffle across lane_N, direct write (warps_in_N == 1).
    r   r   )r   r   r   stride)r   r   r   )layout_src_tv_tiledlayout_dst_tv_tiledr   right_inversesizetiler_mncompositionmake_layoutr   r   filterselectlayout_utilsconcat_layout)r   r   	layout_tv
ref_layouttile_M_sizetile_N_sizeref_layout_MN	num_warpstv2laneref2lanelane_Mlane_Nlane_layout_MNtv2warpref2warpwarp_Mwarp_Nwarp_layout_MNr"   r"   r#   _get_lane_warp_layoutsM   s$   
"rG   c                   @   sn   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d Zdd Z	dd Z
ejdd Zdd Zdd Zdd ZdS )EpiOpz.Base class for composable epilogue operations.c                 C   s
   || _ d S Nname)r    rK   r"   r"   r#   r$   ~      
zEpiOp.__init__c                 C      g S )zReturn [(field_name, type, default), ...] for auto-generating EpilogueParams.
        Must match the keys returned by to_params().r"   r    r"   r"   r#   param_fields      zEpiOp.param_fieldsc                 C   s   i S )zConvert this op's arg field(s) to param dict entries.
        Returns dict of {param_name: value}. Like EVT's to_underlying_arguments.r"   r    r!   argsr"   r"   r#   	to_params   rP   zEpiOp.to_paramsc                 C      dS )zJBytes of smem needed per stage. arg_tensor is the EpilogueArguments field.r   r"   r    
arg_tensorr   r
   r"   r"   r#   
smem_bytes      zEpiOp.smem_bytesc                 C   rT   )z~Return (field_name, field_type) for @cute.struct, or None if no smem needed.
        params is the full EpilogueParams object.Nr"   r    r!   paramsr"   r"   r#   smem_struct_field   rP   zEpiOp.smem_struct_fieldc                 C   rT   )zoExtract smem tensor from storage.epi. Returns tensor or None.
        params is the full EpilogueParams object.Nr"   r    r!   rZ   storage_epir"   r"   r#   get_smem_tensor   rP   zEpiOp.get_smem_tensorc                 C   rM   )z%Return list of TMA atoms for this op.r"   rY   r"   r"   r#   	tma_atoms   rX   zEpiOp.tma_atomsc                 C   rT   )z6One-time per-tile setup. Returns state for begin_loop.Nr"   )r    r!   paramsmem_tensorctxr"   r"   r#   begin   rP   zEpiOp.beginc                 C   s   |S )z<Per-subtile extraction. Returns value for epi_visit_subtile.r"   )r    r!   state	epi_coordr"   r"   r#   
begin_loop   rX   zEpiOp.begin_loopc                 C   rT   )z6Whether this op issues async copies that need a fence.Fr"   rN   r"   r"   r#   needs_async_fence   rX   zEpiOp.needs_async_fencec
           
      C   rT   )z7Cleanup after all subtiles (reductions, direct writes).Nr"   )
r    r!   r`   rd   r
   r   r   r   r   r   r"   r"   r#   end   s   z	EpiOp.endN)r%   r&   r'   r(   r$   rO   rS   rW   r[   r^   r_   r   jitrc   rf   rg   rh   r"   r"   r"   r#   rH   {   s    
rH   c                       s@   e Zd ZdZd fdd	Zdd Zdd Zejd	d
 Z	  Z
S )Scalarz>Loads a scalar value or device pointer once per tile. No smem.Nc                       t  | || _d S rI   )superr$   dtype)r    rK   rm   	__class__r"   r#   r$         
zScalar.__init__c                 C      | j td fgS rI   rK   objectrN   r"   r"   r#   rO         zScalar.param_fieldsc                 C   s   | j t|| j iS rI   )rK   getattrrQ   r"   r"   r#   rS         zScalar.to_paramsc                 C   s<   d }t |d urt | jd urtj|| jdnt|}|S )N)rm   )r   rm   utilsload_scalar_or_pointer)r    r!   r`   ra   rb   resultr"   r"   r#   rc      s   zScalar.beginrI   )r%   r&   r'   r(   r$   rO   rS   r   ri   rc   __classcell__r"   r"   rn   r#   rj      s    rj   c                   @   s   e Zd ZdZdZdd Zdd Zdd Zd	d
 Zdd Z	dd Z
dd Zdd Zdd Zdd Zdd Zejdd Zejdd ZdS )VecLoadzBase class for broadcast vector loads (row or col) via cp_async.

    Subclasses set `dim` to 0 (M/col) or 1 (N/row) and override `_get_gmem_vec`
    for varlen handling.
    Nc                 C   rq   rI   rr   rN   r"   r"   r#   rO      rt   zVecLoad.param_fieldsc                 C      | j tt|| j iS rI   rK   r   ru   rQ   r"   r"   r#   rS         zVecLoad.to_paramsc                 C   s
   || j  S rI   dim)r    r   r"   r"   r#   
_tile_size   rL   zVecLoad._tile_sizec                 C   s   | j dkrdS dS )Nr   )r   r   r   r   r   rN   r"   r"   r#   _broadcast_stride   s   zVecLoad._broadcast_stridec                 C   s   | j dkr|jS |jS )Nr   )r   r   r   )r    rb   r"   r"   r#   	_tile_dim   r~   zVecLoad._tile_dimc                 C   s   | j dkrdS dS )Nr   r   r   rN   r"   r"   r#   
_coord_idx   rv   zVecLoad._coord_idxc                 C   s"   |d u rdS |  ||jjd  S Nr      )r   element_typewidthrU   r"   r"   r#   rW      s   zVecLoad.smem_bytesc                 C   s\   t || jd }|d u rdt}}n	| |j}|j}d| j tjjtjj	||f df fS )Nr   s_   )
ru   rK   r   r   r   r   r   structAlignMemRange)r    r!   rZ   tensorr/   rm   r"   r"   r#   r[      s   (zVecLoad.smem_struct_fieldc                 C   s<   t || jd d u rd S t |d| j t| |jS )Nr   )ru   rK   
get_tensorr   r2   r   r   r\   r"   r"   r#   r^      s
   zVecLoad.get_smem_tensorc                 C   rT   )NTr"   rN   r"   r"   r#   rg     s   zVecLoad.needs_async_fencec                 C   s   ||j df S )z@Get the global memory vector for this tile. Override for varlen.N)r   )r    r`   rb   r"   r"   r#   _get_gmem_vec
  s   zVecLoad._get_gmem_vecc              	   C   sh  d }t |d ur|j}t td|j|j }tj||j|dd|j}| 	||}	| 
|}
|j|   }t|	|
f|f}||}||}|t|
}t|	jd ||
  |
}tdt|jd ft}tjt|jd ddD ]}|d|f |k |d|f< qxtj||||d |t|jtj|j|jf|   d}t |j!d ur|j"#|}|S )	N    Tis_asyncr   r   unroll_fullpredr*   )$r   r   maxr   
copy_utilstiled_copy_1dr   	get_slicer   r   r   r   r   r   
local_tilepartition_Spartition_Dmake_identity_tensorminshapemake_rmem_tensorr/   r   cutlassrangecopyr   make_tensoriteratorr2   r   r   r   r   r   retile)r    r!   r`   ra   rb   tDsVrm   num_copy_elemsthr_copymVectile_dim	coord_idxgVectVgVtVsVtVcVlimitr   mr"   r"   r#   rc     s<   



zVecLoad.beginc                 C   s   d }t |d ur>t|dt|d d d |f }t|j|j}tt|t| t	||j
}|| |j
 |S Nr   )r   r   group_modesrankr   layoutr   autovec_copyfilter_zerosmake_rmem_tensor_like	acc_dtypestoreloadto)r    r!   rd   re   tDrV_cvttDsV_curtDrVr"   r"   r#   rf   -  s    zVecLoad.begin_loop)r%   r&   r'   r(   r   rO   rS   r   r   r   r   rW   r[   r^   rg   r   r   ri   rc   rf   r"   r"   r"   r#   r{      s$    	
r{   c                   @   s   e Zd ZdZdZdS )
RowVecLoadzKLoads a row vector (N,) via cp_async, broadcasts along M with stride (0,1).r   N)r%   r&   r'   r(   r   r"   r"   r"   r#   r   9  s    r   c                   @   s>   e Zd ZdZdZejdd Zejdd Zejdd Z	d	S )

ColVecLoadu+  Loads a col vector (M,) via cp_async, broadcasts along N with stride (1,0).

    Optimization: with N-major subtile loop, consecutive epi_n iterations for the same
    epi_m share the same column data. The smem→register copy only runs when epi_n == 0.
    Supports varlen_m via domain_offset.
    r   c                 C   s>   t |jj r||jd f }|S t|jjj|j f|}|S rI   )r   r   varlen_mr   r   domain_offsetrZ   cu_seqlens_m)r    r`   rb   r   r"   r"   r#   r   I  s   zColVecLoad._get_gmem_vecc              	   C   s  d }d }t |d ur|j}t td|j|j }tj||j|dd|j}	| 	||}
| 
|}|j|   }t|
|f|f}|	|}|	|}|	t|}t|j|j||  |}tdt|jd ft}tjt|jd ddD ]}|d|f |k |d|f< q|tj|	|||d |t|jtj |j!|j"f| # d}t |j$d ur|j%&|}t'|d	t(|d
 }t|j)|j*}||gS )Nr   Tr   r   r   r   r   r*   r   )NNNr   )+r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   len_mr   r   r/   r   r   r   r   r   r   r   r   r2   r   r   r   r   r   r   r   r   r   r   )r    r!   r`   ra   rb   r   r   rm   r   r   r   r   r   r   r   r   r   r   r   r   tDsV_subr"   r"   r#   rc   S  sH   



zColVecLoad.beginc           	      C   s   |d |d }}t |d urF|d }|dkrFt|dt|d d d |f }t|j|j}tt|t| |	|
 |j |S )Nr   r   r   )r   r   r   r   r   r   r   r   r   r   r   r   r   )	r    r!   rd   re   r   r   epi_nr   r   r"   r"   r#   rf   z  s    zColVecLoad.begin_loopN)
r%   r&   r'   r(   r   r   ri   r   rc   rf   r"   r"   r"   r#   r   ?  s    
	
&r   c                       sj   e Zd ZdZd fdd	Zdd Zdd Zd	d
 Zdd Zdd Z	dd Z
dd Zdd Zdd Z  ZS )	TileStorezTile-sized output tensor stored via TMA (e.g. postact).

    Args:
        name: field name in EpilogueArguments/Params (e.g. "mPostAct")
        epi_tile_fn: optional (gemm, epi_tile) -> epi_tile for half-tile (GemmGated)
    Nc                    rk   rI   )rl   r$   epi_tile_fn)r    rK   r   rn   r"   r#   r$     rp   zTileStore.__init__c                 C      d| j  S )N	tma_atom_rJ   rN   r"   r"   r#   _tma_atom_key     zTileStore._tma_atom_keyc                 C   s   d| j  dS )Nepi__smem_layout_stagedrJ   rN   r"   r"   r#   _smem_layout_key  rt   zTileStore._smem_layout_keyc                 C   r   )N	epi_tile_rJ   rN   r"   r"   r#   _epi_tile_key  r   zTileStore._epi_tile_keyc                 C   s>   ddl m} |  t|f| jt|f|  t|f|  t|fgS )Nr   )MISSING)dataclassesr   r   rs   rK   r   r   )r    r   r"   r"   r#   rO     s   
zTileStore.param_fieldsc           	      C   s\   t || j}| jr| ||jnd }t|||d\}}}}|  || j||  ||  |iS )N)r
   )ru   rK   r   r
   r   r   r   r   )	r    r!   rR   r   r
   tma_atom
tma_tensorsmem_layoutepi_tile_outr"   r"   r#   rS     s   zTileStore.to_paramsc                 C   s>   |d u rdS | j d ur|  d |}tt||jjd  S r   )r   r   r/   r   r   r   rU   r"   r"   r#   rW     s
   
zTileStore.smem_bytesc              	   C   sf   |   }t||sd| j tjjtdf fS d| j tjjtjj|jt	t
||f |jf fS )Nr   r   )r   hasattrrK   r   r   r   r   r   postact_dtypecosizeru   buffer_align_bytes)r    r!   rZ   smem_layout_keyr"   r"   r#   r[     s   

zTileStore.smem_struct_fieldc                 C   s@   |   }t||sd S t||}t|d| j j|j|jdS )Nr   )swizzle)r   r   ru   rK   r   outerinner)r    r!   rZ   r]   r   r   r"   r"   r#   r^     s   

zTileStore.get_smem_tensorc                 C   s"   |   }t||rt||gS g S rI   )r   r   ru   )r    r!   rZ   tma_keyr"   r"   r#   r_     s   
zTileStore.tma_atomsrI   )r%   r&   r'   r(   r$   r   r   r   rO   rS   rW   r[   r^   r_   rz   r"   r"   rn   r#   r     s    

r   c                 C   s  t |dur_t | jdk r%tjt|ddD ]}||  || 9  < qn:tjt|d ddD ]-}tj|d|  |d| d  f|d|  |d| d  f\|d| < |d| d < q1t |durt | jdk rtjt|ddD ]}||  || 9  < qvdS tjt|d ddD ]/}tj|d|  |d| d  f|d|  |d| d  f\|d| < |d| d < qdS dS )zNMultiply tRS_rD by colvec and/or rowvec in-place. Uses packed f32x2 on SM100+.Nd   Tr      r   )r   r   r   r   r   r/   mul_packed_f32x2)r!   tRS_rD	tDrColVec	tDrRowVecir"   r"   r#   vec_multiply  s,   r   c              
      s  t |durt |du rdd }t | jdk r@tjt|ddD ]}||| }||  t |dur9|||  n|7  < q!dS t||j}t||jt |dur[t||j}tjtj|dgdddD ]z  fd	d}	||	d}
t |durtj	|
| df | d
f f}n|
}tjd
tj|d
gdd ddD ].}||	|}t |durtj
|| d| f | d| d
 f f|}qtj||}q| df  |d |d
  7  < qhdS dS )a$  Accumulate transform_fn(input) or input * rScale into a ColVecReduce buffer.

    If transform_fn is provided, accumulates transform_fn(input[i]).
    If rScale is provided, accumulates input[i] * rScale[i] (uses mul/fma for SM100).
    If neither, accumulates input directly (identity).
    Nc                 S   s   | S rI   r"   )xr"   r"   r#   <lambda>  s    z*colvec_reduce_accumulate.<locals>.<lambda>r   Tr   r   modec                    s$    d|  f  d|  d f fS )Nr   r   r"   )nr   tRS_rInput_mnr"   r#   r   
  s   $ r   r   )r   r   r   r   r   r/   r5   convert_layout_zero_strider   r   fma_packed_f32x2add_packed_f32x2)r!   	tDrReduce
tRS_rInputtransform_fnrScaler   valtDrReduce_mn	rScale_mninpval0row_sumr   r"   r   r#   colvec_reduce_accumulate  s8   *$$&"r  c                   @   sJ   e Zd ZdZdd Zdd Zejdd Zejdd	 Z	ejd
d Z
dS )ColVecReduceaV  Column vector reduction: accumulates across N subtiles in registers,
    then warp-reduces and writes to gmem in epi_end.

    No smem. The accumulation itself happens in epi_visit_subtile (user code).
    This op handles the register allocation (begin), per-subtile slicing (begin_loop),
    and final warp reduction + gmem write (end).
    c                 C   rq   rI   rr   rN   r"   r"   r#   rO   $  rt   zColVecReduce.param_fieldsc                 C   r|   rI   r}   rQ   r"   r"   r#   rS   '  r~   zColVecReduce.to_paramsc                 C   sZ   d }t |d ur+tj|j|jfdd}|t|tj}t|t}t	|
d |S )Nr   r*   g        )r   r   r2   r   r   r   r   r   r   r   fill)r    r!   r`   ra   rb   r   colvec_mma_layouttDrReduce_layoutr"   r"   r#   rc   *  s   
zColVecReduce.beginc                 C   s4   d }t |d urt|dt|d d d |f }|S r   )r   r   r   r   )r    r!   rd   re   ry   r"   r"   r#   rf   6  s    zColVecReduce.begin_loopc
                  C   sF  t |dur|}
|dur|n|}|du }t||\}}tj|dgd}|dtt|> ks3J dt |dkrb|jd dksBJ t|
}t	j
t|ddD ]}tjj|| tj|d||< qQ|d }t|dksqJ dtt|||	|du d	}|jdd
 \}}|d }|js|jd
 n|jd }|d |k rt |j r||d|d f }nt|jj| f|d|d f }t||f|d f}t|||d |  |}|t||f}t|
|
jd }t||
jd }|d d dkrt	
tj|dgdD ]}|| d }||k r|| ||< qdS dS dS dS )zDIntra-warp shuffle reduction across N lanes, then direct gmem write.Nr   r   z7lanes_in_N must be a power of 2 for butterfly reductionTr   )threads_in_groupz@ColVecReduce assumes all reduction cols are within the same warpr   r   r   r   )Nr   )r   rG   r   r/   intmathlog2r+   r   r   r   r   warp_reductionoperatoraddr   r   r   r   r   r   rZ   r   r   r   r   r   r5   r   r   ) r    r!   r`   rd   r
   r   r   r   r   r   r   r   r   rA   rF   
lanes_in_NtDrReduce_fltr   rE   r   r   r   r   limit_nmColVecgColVeclimit_mtDcDtDrReduce_mtDcD_mr   row_idxr"   r"   r#   rh   =  sr   

'zColVecReduce.endN)r%   r&   r'   r(   rO   rS   r   ri   rc   rf   rh   r"   r"   r"   r#   r    s    

r  )T)NN)"r(   r	  r  	functoolsr   r   cutlass.cuter   r   r   r   quack.epi_utilsr   r   quack.sm90_utilsr   quack.utilsrw   quack.copy_utilsr   quack.layout_utilsr5   r	   rG   rH   rj   r{   r   r   r   ri   r   r  r  r"   r"   r"   r#   <module>   s2   
1.BcJS
%