o
    3/iH                     @  s  d dl mZ d dlmZmZmZmZ d dlZd dlmZ d dl	Z	d dl
mZ d dl	mZmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZ d dlmZ d dlmZmZm Z m!Z!m"Z" d dl#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z, d dl-m.Z. d dl/m0Z0 d dl1m2Z2 d dl3m4Z4m5Z5 G dd deZ6G dd de6eZ7G dd de6eZ8G dd deZ9G dd de9eZ:G dd de9eZ;e.dd Z<			 	!					 d?d@d=d>Z=e=Z>dS )A    )annotations)
NamedTupleOptionalTupleCallableN)Tensor)Int32Float32
const_expr)GemmSm90)	GemmSm100)GemmDefaultEpiMixin)GemmActMixin)ColVecReducecolvec_reduce_accumulate)make_fake_tensor)
ParamsBasemlir_namedtupletorch2cute_dtype_mapget_device_capacityget_max_active_clusters)		get_majorperm3d_singlemake_scheduler_argsmake_varlen_argsmake_fake_scheduler_argsmake_fake_varlen_argsdiv_for_dtypemake_fake_gemm_tensorscompile_gemm_kernel)	jit_cache)RoundingMode)dact_fn_mapdgate_fn_mapc                   @  s&   e Zd ZejZej	ddd	d
ZdS )GemmDActMixinNepi_loop_tensorsTuple[cute.Tensor, ...]tRS_rDcute.TensortRS_rCOptional[cute.Tensor]returnc                 C  sJ  |d usJ t j| |||d d t|| j}|| | j t|j	d urt
|jj| j}t| jdk rXtjt|ddD ]}|	|| || \||< ||< qC|S tjt|d ddD ]:}|	|d|  |d| d  f|d|  |d| d  f\\|d| < |d| d < \|d| < |d| d < qd|S |}|S )N)r)   d   Tunroll_full      )r   epi_visit_subtilecutemake_rmem_tensor_like	acc_dtypestoreloadtor
   act_fnmake_rmem_tensorlayoutshapearchcutlassrangesize)selfparamsr%   r'   r)   
tRS_rC_acctRS_rPostActi rE   \/lsinfo/ai/hellotax_ai/llm_service/venv_vllm/lib/python3.10/site-packages/quack/gemm_dact.pyr1   .   s*   "zGemmDActMixin.epi_visit_subtileNr%   r&   r'   r(   r)   r*   r+   r*   )__name__
__module____qualname__r   EpilogueArgumentsr2   jitr1   rE   rE   rE   rF   r$   )   s
    r$   c                   @     e Zd ZdS )GemmDActSm90NrI   rJ   rK   rE   rE   rE   rF   rO   O       rO   c                   @  rN   )GemmDActSm100NrP   rE   rE   rE   rF   rR   S   rQ   rR   c                   @  sn   e Zd Zg ejedR ZdejdffZe	fZ
eG dd deZdddddd	Zej	ddddZdS )GemmDGatedMixinmColVecReduce
act_bwd_fnNc                   @  sx   e Zd ZU ded< dZded< dZded< dZded< dZd	ed
< dZd	ed< dZ	d	ed< e
jZded< dZded< dS )z!GemmDGatedMixin.EpilogueArgumentsr(   mPostActNzcutlass.Constexpr[Callable]rU   zOptional[Float32 | cute.Tensor]alphabetar*   mRowVecBroadcastmColVecBroadcastrT   zcutlass.Constexpr[int]rounding_modezOptional[Int32 | cute.Tensor]sr_seed)rI   rJ   rK   __annotations__rU   rW   rX   rY   rZ   rT   r!   RNr[   r\   rE   rE   rE   rF   rL   ^   s   
 rL   )locipargsc                C  s   | j jdks
J d| jjdksJ d| jjdksJ d|j| _|jj| _tj	j
|j| _| jd d | _| |}|j|d< | jdi |S )	N   z&GemmDGated only supports 16bit for now    zD storage type must be 32 bitzC storage type must be 32 bitr/   rU   rE   )implicit_dtypewidthd_dtypec_dtyper[   rV   element_typepostact_dtyper=   utils
LayoutEnumfrom_tensorpostact_layoutcta_tile_shape_mnkcta_tile_shape_postact_mn_epi_ops_to_params_dictrU   EpilogueParams)r@   ra   r_   r`   drE   rE   rF   epi_to_underlying_argumentsl   s   


z+GemmDGatedMixin.epi_to_underlying_argumentsr%   r&   r'   r(   r)   r*   r+   c              	   C  sd  |d }|d }|d }|d }|d }	|d u r |d u r |d u s"J |d us(J | j }
|
jdks4J dt||
}t|jt}|| 	t t
|t}t
|t}t
|}t|d urt| jdk ry|| | 	|j  npt||j}t||j}t||j}tjtj|d	gd
ddD ]E}tjtj|dgd
d ddD ]3}tj||d| f ||d| d f f||d	f ||d	f f\||d| f< ||d| d f< qqn||  t| jdk rtt|D ]$}||d|  |d| d  || \|d| < |d| d < ||< qndtt|d D ]Y}||d|  |d| d  f|d| d  |d| d  f|d|  |d| d  f\\|d| < |d| d < \|d| d < |d| d < \|d| < |d| d < q)t|	d urt| |	||d t|d urt| jdk r|| | 	|j  nct||j}t||j}tjtj|d	gd
ddD ]G}tjtj|dgd
d ddD ]4}tj||d| f ||d| d f f||d	f ||d	f f\||d| f< ||d| d f< qݐqt|j|
}|| 	|
 |t|t  |S )NrW   rX   rY   rZ   rT   rb   z+GemmDGatedMixin only supports 16bit for nowr,   r   )modeTr-   r0   r/         )rScale)rd   re   r2   recast_tensorr9   r:   r	   r5   r6   r7   r3   r
   r<   rh   layout_utilsconvert_layout_zero_strider=   r>   r?   mul_packed_f32x2rU   r   )r@   rA   r%   r'   r)   rW   rX   	tDrRowVec	tDrColVectDrColVecReducerd   tRS_rXY_f16x2tRS_rXY_f32x2tRS_rdXY_f32x2tRS_rOuttRS_rD_scaledtDrColVec_mn	tRS_rD_mntRS_rD_scaled_mnmnrD   tRS_rOut_mntRS_rdXY_f16x2rE   rE   rF   r1   |   s   
 
"
	 
"&
z!GemmDGatedMixin.epi_visit_subtile)ra   rL   rG   rH   )rI   rJ   rK   r   _epi_opsr   r=   	Constexpr_extra_param_fieldsr   _epi_param_basesr   r   rL   rs   r2   rM   r1   rE   rE   rE   rF   rS   W   s    rS   c                   @  rN   )GemmDGatedSm90NrP   rE   rE   rE   rF   r      rQ   r   c                   @  rN   )GemmDGatedSm100NrP   rE   rE   rE   rF   r      rQ   r   c           0        s  |dk}|r|d dkrt nt}n
|d dkrtnt}t| |||||||	||d
\}}}}}} }!}"t|}#|
dkr;dnd}$|rC|| fn|| |"f}%t||%|$|#d}&|rt| }'d }(|dkrgt||"|fdd	d}(n|dkrtt||fdd	d}(d })t	 }*|d
krt||"||*fddd})n|dkrt|||*fddd})|j
|&|'|(|)d}+ fdd},|,}-nt| }'|
|&|'}+d }-t|o|d dkd|"}.t|d||r|nd }/t|| ||||||||||||+|.|/|-dS )Ndgatedr   	   )varlen_mgather_Ar   r0   )leading_dimdivisibilityr/   ru   rv   )rZ   rT   c                   s
    | _ d S rG   rd   )gemm_objr   rE   rF   _set_implicit_dtype3  s   
z/_compile_gemm_dact.<locals>._set_implicit_dtypeF)	post_init)r   r   rR   rO   r   r   fake_tensorr#   r2   sym_intrL   r"   r   r   r   )0a_dtypeb_dtyperf   rg   ri   rd   a_majorb_majord_majorc_majorpostact_majortile_shape_mncluster_shape_mnkpingpong
persistentis_dynamic_persistent
activationcolvec_scale_dtypecolvec_scale_ndimcolvec_reduce_dtypecolvec_reduce_ndimr   r   device_capacitygemm_cls_name	is_dgatedGemmClsmAmBmDmCr   r   kldiv_pa
pa_leadingpa_shaperV   r8   mColVecrT   n_tilesepi_argsr   r   scheduler_argsvarlen_argsrE   r   rF   _compile_gemm_dact   s   r   TF   Ar   BOutPreActPostActtile_count_semaphoreOptional[Tensor]r   Optional[str]tile_Minttile_N	cluster_M	cluster_Nr   boolr   r   max_swizzle_sizecolvec_scalecolvec_reducecu_seqlens_mA_idxuse_clc_persistencer+   Nonec           0      C  s  |t v }|s!|tv sJ d| |d u sJ d|d u s!J d|r%dnd}|d u}|d u}|rc|s7J d| ddksBJ d	|ddksMJ d
|ddksXJ d|ddkscJ d|ru|d usmJ d|
dksuJ dd }|r|ddk}t|j }| dksJ d| dksJ d|s|s|tj}|tj}n|j	tjj	}|j	tjj	}t
| |}t
|}t
||}t
||}t
||}t|dd}t|dd} t|dd}!t|dd}"t|dd}#t| j }$t|j }%t|j }&t|j }'t|j }(t| j})|)d dv sJ d|r,|)d dkr,|d us,J dt|$|%|&|'|(||| |!|"|#||f|	|
df|||||d urMt|j nd |d urV|jnd|d urat|j nd |d urj|jnd|||)|}*ddlm}+ |+r|d S |rt|	|
 nd},|rtj|d ||d d d}-n	tj|d d d d}-t|,||}.t|d |}/|)d dkr|*|||||-|.|/d d 	 d S |*|||||-|.|/ d S )NzUnsupported activation z4colvec_scale is only supported for gated activationsz5colvec_reduce is only supported for gated activationsr   dactz!varlen_m requires persistent=Truer0   z!varlen_m requires A to be k-majorz#varlen_m requires Out to be n-majorz&varlen_m requires PreAct to be n-majorz'varlen_m requires PostAct to be n-majorzgather_A requires varlenzgather_A requires cluster_N=1r/   zOut dtype must be fp16 or bf16z!Preact dtype must be fp16 or bf16r   r   r   r   )r   
      z)Only SM90, SM100, and SM110 are supportedr   zFDynamic persistent tile scheduler in SM90 requires a semaphore in GMEM)COMPILE_ONLY)rZ   rT   r[   r\   )r[   r\   )r#   r"   strider   dtypeelement_sizeviewtorchfloat32mTr   r   r   devicer   ndimquack.cache_utilsr   r   rS   rL   r$   r   r   )0r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rd   
AB_swappedA_pB_pOut_pPreAct_p	PostAct_pr   r   r   r   r   r   r   rf   rg   ri   r   compiled_fnr   max_active_clustersr   r   r   rE   rE   rF   	gemm_dactU  s   










	r   )	TTFr   NNNNF)*r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r+   r   )?
__future__r   typingr   r   r   r   r   r   r=   cutlass.cuter2   r   r	   r
   quack.gemm_sm90r   quack.gemm_sm100r   quack.gemm_default_epir   quack.gemm_actr   quack.epi_opsr   r   quack.compile_utilsr   r   quack.cute_dsl_utilsr   r   r   r   r   quack.gemm_tvm_ffi_utilsr   r   r   r   r   r   r   r   r   r   r    quack.roundingr!   quack.layout_utilsry   quack.activationr"   r#   r$   rO   rR   rS   r   r   r   r   gemm_dgatedrE   rE   rE   rF   <module>   sP   ,& 
~ 