o
    3/iD                  
   @  s  d dl mZ d dlmZmZmZmZ d dlmZ d dl	m
Z
 d dlZd dlmZ d dlm  mZ d dlm  mZ d dlmZmZmZ d dlmZ d dlmZ d d	lmZmZm Z m!Z!m"Z" d d
l#m$Z$ d dl%m&Z& d dl'm(Z( d dl)m*Z* d dl+m,Z,m-Z-m.Z.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4 d dl5m6Z6 d dl7m8Z8 d dl9m:Z:m;Z; d dl<m=Z= G dd de*Z>G dd de>e&Z?G dd de>e(Z@dd ZAG dd de>ZBG dd deBe&ZCG dd  d eBe(ZDe6e=jEd fd!d"ZFd#d$d#d%dddde=jEd f
dEdCdDZGeGZHdS )F    )annotations)
NamedTupleTupleOptionalCallable)partial)TensorN)Int32Float32
const_expr)make_ptr)make_fake_tensor)
ParamsBasemlir_namedtupleget_device_capacityget_max_active_clusterstorch2cute_dtype_map)	TileStore)GemmSm90)	GemmSm100)GemmDefaultEpiMixin)		get_majorperm3d_singlemake_scheduler_argsmake_varlen_argsmake_fake_scheduler_argsmake_fake_varlen_argsdiv_for_dtypemake_fake_gemm_tensorscompile_gemm_kernel)	jit_cache)permute_gated_Cregs_b16)
act_fn_mapgate_fn_map)RoundingModec                   @  s   e Zd Zg ejedR ZdejdffZe	fZ
eG dd deZdddddd	Zd
d Zejdd Zej	ddddZdS )GemmActMixinmPostActact_fnNc                   @  sl   e Zd ZU ded< dZded< dZded< dZded< dZd	ed
< dZd	ed< e	j
Zded< dZded< dS )GemmActMixin.EpilogueArgumentscute.Tensorr&   Nz%cutlass.Constexpr[Optional[Callable]]r'   zOptional[Float32 | cute.Tensor]alphabetaOptional[cute.Tensor]mRowVecBroadcastmColVecBroadcastzcutlass.Constexpr[int]rounding_modezOptional[Int32 | cute.Tensor]sr_seed)__name__
__module____qualname____annotations__r'   r*   r+   r-   r.   r$   RNr/   r0    r6   r6   [/lsinfo/ai/hellotax_ai/llm_service/venv_vllm/lib/python3.10/site-packages/quack/gemm_act.pyEpilogueArguments1   s   
 r8   locipargsc                C  sX   |j | _ |jj| _tjj|j| _| j	d d | _
| |}|j|d< | jdi |S )N   r'   r6   )r/   r&   element_typepostact_dtypecutlassutils
LayoutEnumfrom_tensorpostact_layoutcta_tile_shape_mnkcta_tile_shape_postact_mn_epi_ops_to_params_dictr'   EpilogueParamsselfr<   r:   r;   dr6   r6   r7   epi_to_underlying_arguments>   s   


z(GemmActMixin.epi_to_underlying_argumentsc                 C  s   || j d  }| jdkrttj|dntj}	|	| j| j| j	}
t
|
|}|||}|d }| |j||j|| j|j||\}}}|||fS )zASetup postact TMA copies and partitions before the epilogue loop.r&   d   )tiled_tmem_load   )_epi_smem_maparchr   sm100_utilsget_smem_store_opsm90_utils_ogsm90_get_smem_store_oprD   r?   	acc_dtypecutemake_tiled_copy_S	get_slicepartition_Depilog_gmem_copy_and_partitiontma_atom_mPostActoffset_batch_epir&   rF   epi_tile_mPostAct)rJ   paramsepi_smem_tensorstiled_copy_r2stiled_copy_t2rtile_coord_mnklvarlen_managertidxsPostActrS   copy_atom_postact_r2stiled_copy_postact_r2stRS_sPostAct	batch_idxcopy_postact_r6   r6   r7   epi_setup_postactJ   s(   


zGemmActMixin.epi_setup_postactc                 C  s   t | jtjko|jtjko| jtjkrZddl	m
} ddlm} |d |d d |d d  |d d	  || d
   }	t|| j}
| }|||	|}|
|||j| j |
S t|| j}
|
| | j |
S )zTConvert postact from acc_dtype to postact_dtype. Override for custom postprocessing.r   )convert_f32_to_bf16_sr)	TensorSSAl   yn< i     i  rO         )r   r/   r$   RSr>   r@   r
   r?   BFloat16quack.roundingrn   cutlass.cute.tensorro   rW   make_rmem_tensor_likeloadstoreshapeto)rJ   tRS_rPostActr0   re   rc   num_prev_subtilesepi_idxrn   ro   seedtRS_rPostAct_outsrc_vecraw_vecr6   r6   r7   epi_convert_postactk   s8   






z GemmActMixin.epi_convert_postactepi_loop_tensorsTuple[cute.Tensor, ...]tRS_rDr)   tRS_rCr,   returnc                 C  s   t | |||| t|jd urft|jj| j}t| j	dk r8t
jt|ddD ]}||| ||< q*|S t
jt|d ddD ]}||d|  |d| d  f\|d| < |d| d < qD|S |}|S )NrM   Tunroll_fullr=   rp   )r   epi_visit_subtiler   r'   rW   make_rmem_tensorlayoutrz   rV   rQ   r@   rangesize)rJ   r_   r   r   r   r|   ir6   r6   r7   r      s   zGemmActMixin.epi_visit_subtile)r<   r8   N)r   r   r   r)   r   r,   r   r,   )r1   r2   r3   r   _epi_opsr   r@   	Constexpr_extra_param_fieldsr   _epi_param_basesr   r   r8   rL   rm   rW   jitr   r   r6   r6   r6   r7   r%   ,   s    !
 r%   c                   @     e Zd ZdS )GemmActSm90Nr1   r2   r3   r6   r6   r6   r7   r          r   c                   @  r   )GemmActSm100Nr   r6   r6   r6   r7   r      r   r   c                 C  s>   t |d tjr|d tdd|d fS |d |d d fS )z8Halve the N dimension of the epi_tile for gated postact.rp   r   r=   )
isinstancerW   Layoutrecast_layout)gemmepi_tiler6   r6   r7   _gated_epi_tile_fn   s   r   c                   @  sX   e Zd Zg ejededR Zddddd	d
Zej		ddddZ
ej	dd ZdS )GemmGatedMixinr&   )epi_tile_fnNr9   r<   r(   r   GemmActMixin.EpilogueParamsc                C  s   |j jjdksJ d| jd u s| j sJ tjj|j  s#J | j	dkr5| j
d d dks5J d|j| _|j j| _tjj|j | _| j
d | j
d d f| _| |}|j|d	< | jd
i |S )N   z-GemmGated only supports 16bit postact for nowZ   rp       r   z2GemmGatedSm90 requires tileN to be divisible by 32r=   r'   r6   )r&   r>   widthd_layoutis_n_major_cr@   rA   rB   rC   rQ   rE   r/   r?   rD   rF   rG   r'   rH   rI   r6   r6   r7   rL      s$   



z*GemmGatedMixin.epi_to_underlying_argumentsr_   r   r   r   r)   r   r,   c                 C  s   t | |||| tdd|j}t|j| j}t| j	dk rAt
jt|ddD ]}||d|  |d| d  ||< q*|S t
jt|d ddD ].}||d|  |d| d  f|d| d  |d| d  f\|d| < |d| d < qM|S )Nr=   rp   rM   Tr      rO   )r   r   rW   r   r   r   rz   rV   r   rQ   r@   r   r   r'   )rJ   r_   r   r   r   tRS_rPostAct_layoutr|   r   r6   r6   r7   r      s   &8z GemmGatedMixin.epi_visit_subtilec              	   C  s0   t | ||||||}t| jdkrt| |S )Nr   )r%   r   r   rQ   r!   )rJ   r|   r0   re   rc   r}   r~   r   r6   r6   r7   r      s   z"GemmGatedMixin.epi_convert_postact)r<   r(   r   r   r   )
r_   r   r   r   r   r)   r   r,   r   r,   )r1   r2   r3   r   r   r   r   rL   rW   r   r   r   r6   r6   r6   r7   r      s    
r   c                   @  r   )GemmGatedSm90Nr   r6   r6   r6   r7   r      r   r   c                   @  r   )GemmGatedSm100Nr   r6   r6   r6   r7   r      r   r   c           /      C  s  |d dkrt td| nttd| }|	dkrdnd}t| |||||||||d
\}}}}}} }!}"t|}#|dkr>t n| }$|dkrFdn|}%|rN||$fn||$|"f}&t||&|%|#d}'t||"| fdd	d}(|d
krtt||"|fdd	d})n|dkrt||fdd	d})nd })|dkrt	| nt
| }*tfdd}+|j|'|*|(|)||+|d},t|o|d dkd|"}-t|d||r|nd }.t|| |
|||||||||||,|-|.S )Nr   	   )actgatednrp   )varlen_mgather_Ar   )leading_dimdivisibilityr   r=   r   c                 S  s0   | dkrd S | dkr|dS t |dtjjddS )Nr   rp   r   )assumed_align)r   rW   AddressSpacegmem)modedtyper6   r6   r7   fake_scalar;  s
   z&_compile_gemm_act.<locals>.fake_scalarr-   r.   r/   r0   F)r   r   r   r   r   r   rW   sym_intfake_tensorr"   r#   r	   r8   r   r   r   )/a_dtypeb_dtyped_dtypec_dtyper?   a_majorb_majord_majorc_majorpostact_majortile_shape_mncluster_shape_mnkpingpong
persistentis_dynamic_persistent
activationrowvec_dtypecolvec_dtypecolvec_ndimr   r   device_capacitygemm_cls_namer/   sr_seed_modeGemmCls
pa_leadingmAmBmDmCmr   kldiv_papa_npa_leading_dimpa_shaper&   mRowVecmColVecr'   r   epi_argsscheduler_argsvarlen_argsr6   r6   r7   _compile_gemm_act   sx   r   FT   Ar   BDOptional[Tensor]CPostActtile_count_semaphorer   Optional[str]tile_Minttile_N	cluster_M	cluster_Nr   boolr   r   max_swizzle_sizerowvec_biascolvec_biascu_seqlens_mA_idxr/   r0   int | Tensorr   Nonec           1      C  s0  |t v rd}n|tv sJ d| d}|d u}|d u}|rI|s$J d| ddks/J d|d ur>|ddks>J d|ddksIJ d	|r[|d usSJ d
|
dks[J dt| |}t|}t||}t||}t||}t|dd}t|dd}|d urt|ddnd }|d urt|ddnd } t|dd}!t| j }"t|j }#|d urt|j nd }$|d urt|j nd }%t|j }&|d ur|jnd}'t| j	}(|(d dv sJ d|t
jkr|(d dksJ d|r|(d dkr|d usJ dt|trdn	|t
jkr	dnd})t|"|#|$|%|&|||| |!||f|	|
df|||||d ur+t|j nd |d ur6t|j nd |'|||(|||)d}*ddlm}+ |+rLd S |rUt|	|
 nd},tfdd}-tj|d ||d |-||)d}.t|,||}/t|d |}0|(d dkr|*|||||.|/|0d d 	 d S |*|||||.|/|0 d S )Nr   zUnsupported activation r   z!varlen_m requires persistent=Truerp   z!varlen_m requires A to be k-majorz!varlen_m requires D to be n-majorz'varlen_m requires PostAct to be n-majorzgather_A requires varlenzgather_A requires cluster_N=1r   r   r   r   )r   
      z)Only SM90, SM100, and SM110 are supportedr   zAStochastic rounding (RoundingMode.RS) requires SM100+ (Blackwell)r   zFDynamic persistent tile scheduler in SM90 requires a semaphore in GMEMr=   )r/   r   )COMPILE_ONLYc                 S  s$   |dkrd S |dkr|| S |   S )Nr   rp   )data_ptr)scalarr   r   r6   r6   r7   
scalar_arg  s
   zgemm_act.<locals>.scalar_argr   )r#   r"   strider   r   r   r   ndimr   devicer$   rs   r   r   r   quack.cache_utilsr  r   r	   r%   r8   r   r   )1r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r/   r0   r   r   r   A_pB_pD_pC_p	PostAct_pr   r   r   r   r   r   r   r   r   r?   r   r   r   compiled_fnr  max_active_clustersr  r   r   r   r6   r6   r7   gemm_actc  s   









"r  ),r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r/   r   r0   r   r   r   )I
__future__r   typingr   r   r   r   	functoolsr   torchr   r@   cutlass.cuterW   cutlass.utils.hopper_helpersrA   hopper_helpersrT   cutlass.utils.blackwell_helpersblackwell_helpersrR   r	   r
   r   cutlass.cute.runtimer   quack.compile_utilsr   r   quack.cute_dsl_utilsr   r   r   r   r   quack.epi_opsr   quack.gemm_sm90r   quack.gemm_sm100r   quack.gemm_default_epir   quack.gemm_tvm_ffi_utilsr   r   r   r   r   r   r   r   r   r  r    quack.layout_utilsr!   quack.activationr"   r#   ru   r$   r%   r   r   r   r   r   r   r5   r   r  
gemm_gatedr6   r6   r6   r7   <module>   sZ   ,z@q 