o
    3/i                   3   @   s  d dl mZ d dlmZ d dlmZ d dlmZmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZmZmZ d d	lmZmZmZ d d
lmZ d dlmZmZmZmZmZm Z m!Z!m"Z"m#Z# edd Z$dddddddddddddej%d fdedededee dee de&de&de&de&de'de'de'de&dee d ee d!e(eB d"e(eB d#ee d$ee d%ee d&ee d'e'd(e&d)e&eB d*df2d+d,Z)dS )-    )Optional)TensorN)Int32Float32)make_ptr)	jit_cache)make_fake_tensor)get_device_capacityget_max_active_clusterstorch2cute_dtype_map)GemmDefaultEpiMixinGemmDefaultSm90GemmDefaultSm100)RoundingMode)	
get_majors
get_dtypesperm3dmake_scheduler_argsmake_varlen_argsmake_fake_scheduler_argsmake_fake_varlen_argsmake_fake_gemm_tensorscompile_gemm_kernelc           *      C   s(  |d dkrt nt}t| ||||||||||d\}}}}}} }!}"tfdd}#t||"| fddd}$|d	kr@t||"|fddd}%n|dkrNt||fddd}%nd }%|j|#||#||$|%|||#|td
d}&t|ol|d dk||"}'|rt|n|rx|!nd }(t||||(})t	|| ||	|
|||||||||&|'|)S )Nr   	   )varlen_mvarlen_kgather_Ac                 S   s>   | dkrd S | dkr||t krdS dS t|dtjjddS )Nr            ?   )assumed_align)r   r   cuteAddressSpacegmem)modedtype r&   W/lsinfo/ai/hellotax_ai/llm_service/venv_vllm/lib/python3.10/site-packages/quack/gemm.pyfake_scalarJ   s
   z"_compile_gemm.<locals>.fake_scalarr   r   )leading_dimdivisibility   r%   alphabetamRowVecBroadcastmColVecBroadcastadd_to_outputrounding_modesr_seed)
r   r   r   r   fake_tensorEpilogueArgumentsr   r   r   r   )*a_dtypeb_dtyped_dtypec_dtypea_majorb_majord_majorc_majortile_shape_mncluster_shape_mnkpingpong
persistentis_dynamic_persistentrowvec_dtypecolvec_dtypecolvec_ndim
alpha_mode	beta_moder2   r   r   r   has_batch_idx_permutedevice_capacityr3   sr_seed_modeGemmClsmAmBmDmCmnklr(   mRowVecmColVecepi_argsscheduler_argsaidx_lenvarlen_argsr&   r&   r'   _compile_gemm   sj   
	r[   FT   r   ABDCtile_count_semaphoretile_Mtile_N	cluster_M	cluster_NrA   rB   rC   max_swizzle_sizerowvec_biascolvec_biasr.   r/   cu_seqlens_mcu_seqlens_kA_idxbatch_idx_permuter2   r3   r4   returnc           4      C   s  |d u}|d u}|p|}|d u}|r|rJ d|r(|s J d|dks(J d|r0|
s0J d|r8|r8J d|rP|  ddksEJ d| ddksPJ d	|rh|  d
dks]J d| d
dkshJ dt| j}|d dv swJ d|tjkr|d dksJ d|r|d dkr|d usJ dt| |||||d\}}}} t|||| \}!}"}#}$t| |||\}%}&}'}(t|t	rdn|dkrdnd})t|t	rdn|dkrdnd}*|d ur|j
nd}+t|t	rdn|tjkrdnd},t|%|&|'|(|!|"|#|$||f||df|	|
||d urt|j nd |d urt|j nd |+|)|*|||||d u|||,}-ddlm}. |.r4d S tfdd}/|
rCt|| nd}0tj|/||)|/||*||d d |/||,tdd}1t|0|||}2t|||}3|d dkr~|-|||| |1|2|3d d 	 d S |-|||| |1|2|3 d S )Nz)Only one of cu_seqlens_m and cu_seqlens_kzgather_A requires varlenr   zgather_A requires cluster_N=1zvarlen requires persistent=Truez)Add to output not supported with varlen_mz!varlen_m requires A to be k-majorz!varlen_m requires D to be n-majorz!varlen_k requires A to be m-majorz!varlen_k requires B to be n-majorr   )r   
      z)Only SM90, SM100, and SM110 are supportedrp   zAStochastic rounding (RoundingMode.RS) requires SM100+ (Blackwell)r   zFDynamic persistent tile scheduler in SM90 requires a semaphore in GMEM)r   r   r+   r   )COMPILE_ONLYc                 S   s$   |dkrd S |dkr|| S |   S )Nr   r   )data_ptr)scalarr$   r%   r&   r&   r'   
scalar_arg   s
   zgemm.<locals>.scalar_argr,   r-   )strider	   devicer   RSr   r   r   
isinstancer   ndimr[   r   r%   quack.cache_utilsrr   r   r
   r   r6   r   r   r   )4r]   r^   r_   r`   ra   rb   rc   rd   re   rA   rB   rC   rf   rg   rh   r.   r/   ri   rj   rk   rl   r2   r3   r4   r   r   varlenr   rJ   A_pB_pD_pC_pr;   r<   r=   r>   r7   r8   r9   r:   rG   rH   rF   rK   compiled_fnrr   ru   max_active_clustersrW   rX   rZ   r&   r&   r'   gemm|   s   


	r   )*typingr   torchr   cutlass.cuter!   cutlassr   r   cutlass.cute.runtimer   r{   r   quack.compile_utilsr   r5   quack.cute_dsl_utilsr	   r
   r   quack.gemm_default_epir   r   r   quack.roundingr   quack.gemm_tvm_ffi_utilsr   r   r   r   r   r   r   r   r   r[   RNintboolfloatr   r&   r&   r&   r'   <module>   s   ,
h	
