o
    3/iV5                  !   @   s  d dl mZmZmZ d dlmZ d dlZd dlmZ d dlm	Z	m
Z
mZmZ d dlmZ d dlmZ d dlmZmZmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZm Z m!Z!m"Z"m#Z#m$Z$m%Z% d dl&m'Z' d dl(m)Z) d dl*m+Z+ d dl,m-Z- d dl.m/Z/ G dd deZ0G dd de0eZ1G dd de0eZ2e'dd Z3						d/dededee d ee d!ee d"e4d#e4d$e4d%e4d&e5d'e5d(e5d)e4d*e6eB d+e6eB d,df d-d.Z7dS )0    )TupleOptionalCallable)TensorN)Int32Float32Boolean
const_expr)make_ptr)make_fake_tensor)get_device_capacityget_max_active_clusterstorch2cute_dtype_map)
act_fn_map)GemmActMixin)GemmSm90)	GemmSm100)div_for_dtypeperm3d
get_majors
get_dtypesmake_scheduler_argsmake_fake_scheduler_argscompile_gemm_kernel)	jit_cacheTriangularTileScheduler)VarlenManager)RoundingModec                3   @   s   e Zd Zd!defddZejdejde	ej
df dejjd	ejjd
ejjdejjdejdedej
deej
 deej dejdej
deej deej
 deej
 dee dee dejdedejjdedede	ejjejjf f0ddZd S )"GemmSymmetricMixinFvarlen_mc                 C   s   t S )Nr   )selfr     r"   a/lsinfo/ai/hellotax_ai/llm_service/venv_vllm/lib/python3.10/site-packages/quack/gemm_symmetric.pyget_scheduler_class!   s   z&GemmSymmetricMixin.get_scheduler_classparamsepi_smem_tensors.epi_pipelineepi_store_pipelineepi_read_stateepi_producer_stateepi_tileload_acc_subtiletRS_rDtRS_rCtiled_copy_t2rtiled_copy_r2stRS_sDtiled_copy_s2rtSR_rCtSR_sCcopy_Dcopy_Ctile_coord_mnklvarlen_managerepilogue_barriertidxis_tma_warpreturnc           /      C   s  t |
d u}t |d u}| |||||||\}}}tt| jd d |jd }tj||d dfd}t|} |j|  }!| 	|||||||||	}"t |d ur|t
jt| | jddD ]}#||#}$|rw|| ||$|d || |  q^t
| D ]D}#||#}%||	|# | ||"|%}&t |r|| t||d d d |jf | tj  tj  tj  || W d    n1 sw   Y  |  t |d uo|#| j | k r||#| j }$|r|| ||$|d || |  | ||&|	|
}'| |'|&d |||!|#}(|r|  |  |!|# | j })t |rzt | j t!j"ko>| j#t
j$ko>| j%t
j&krm|&d |d d |d d	  |d
 d  |!|# d   }*t'(||	|d d d |)f |*| nt')||	|d d d |)f  t||*|(|d d d |)f  |d }+|d },tj  |  |r|+| j+d  }-|,| j+d  }.t |r||)|%d |-|.kr||)|%d |  q| ,||"|||||| ||fS )N      )stride)unroll)src_idxproducer_statesr_seedr   i  i           )rA   dst_idx)-r	   epi_setup_postactcutezipped_dividemake_layoutcta_tile_shape_mnkshapesizenum_tiles_executed	epi_begincutlassrangeminepi_c_stageget_hier_coordproducer_acquireproducer_commitadvancerange_constexprepi_begin_loopconsumer_waitcopyindexarchfence_view_async_shared	sync_warp	elect_oneconsumer_releaseepi_visit_subtileepi_convert_postactarrive_and_wait	epi_stagerounding_moder   RS	acc_dtyper   d_dtypeBFloat16
copy_utilssr_cvt_copycvt_copyretilecluster_shape_mnkepi_end)/r!   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   tile_schedulerr:   r;   has_Chas_Dtiled_copy_postact_r2stRS_sPostActcopy_postactepi_tile_shapeepi_tile_layoutepi_tile_numnum_prev_subtilesepi_tensorsepi_idxgmem_coord_C
gmem_coordepi_loop_tensorstRS_rPostActtRS_rPostAct_out
epi_bufferseedpid_mpid_nsquare_tile_msquare_tile_nr"   r"   r#   epilogue$   s   























zGemmSymmetricMixin.epilogueN)F)__name__
__module____qualname__boolr$   rI   jitr   EpilogueParamsr   r   rQ   pipelinePipelineAsyncPipelineStateTiler   r   	TiledCopyCoordr   NamedBarrierr   r   r   r"   r"   r"   r#   r       sh    	
r   c                   @      e Zd ZdS )GemmSymmetricSm90Nr   r   r   r"   r"   r"   r#   r          r   c                   @   r   )GemmSymmetricSm100Nr   r"   r"   r"   r#   r      r   r   c           +      C   s  |d dkrt nt}t t t }}}|dkrdnd}|dkr&dnd}|dkr.dnd}|dkr6dnd}t| t|}}t||rJt|nd}}t| |||f||d}t||||f||d}t||||f||d} t||||f||d}!t|}"|	dkrdnd}#t||||f|#|"d}$dd }%d }&t|& }'|j|$|'|%||%|d	}(t|o|d dkd
|})d }*t	|| |
|||d
||||| |!|(|)|*S )Nr   	   kr>   n)leading_dimdivisibilityc                 S   s0   | dkrd S | dkrt dS tt dtjjddS )Nr   r>         ?   )assumed_align)r   r
   rI   AddressSpacegmem)moder"   r"   r#   fake_scalar   s
   z,_compile_gemm_symmetric.<locals>.fake_scalar)alphabetaF)
r   r   rI   sym_intr   fake_tensorr   EpilogueArgumentsr   r   )+a_dtypeb_dtyperj   c_dtypec_majorpostact_dtypea_majorb_majord_majorpostact_majortile_shape_mnrp   pingpong
persistentis_dynamic_persistent
alpha_mode	beta_modedevice_capacityGemmClsmr   l	a_leading	b_leading	d_leading	c_leadingdiv_adiv_bdiv_ddiv_cmAmBmDmCdiv_papostact_leadingmPostActr   
activationact_fnepi_argsscheduler_argsvarlen_argsr"   r"   r#   _compile_gemm_symmetric   s`   r   FT   r   ABDCtile_count_semaphoretile_Mtile_N	cluster_M	cluster_Nr   r   r   max_swizzle_sizer   r   r<   c           +      C   s  |j }t| |||\}}}}|jdkr|dddn|}t||||\}}}}t| |||\}}}}t|j }|ddkr@dnd}t	| j
}|d dv sQJ d|ra|d d	kra|d usaJ d
||f} ||df}!t|trqdn|dkrwdnd}"t|trdn|dkrdnd}#t||||||||||| |!|	|
||"|#|}$ddlm}% |%rd S |
rt|| nd}&dd }'tj|d |'||"|'||#d d d}(t|&||})d }*|d d	kr|$|||||(|)|*d d 	 d S |$|||||(|)|* d S )NrD   r>   r=   r   r   r   )r   
      z)Only SM90, SM100, and SM110 are supportedr   zFDynamic persistent tile scheduler in SM90 requires a semaphore in GMEMr   )COMPILE_ONLYc                 S   s$   |dkrd S |dkrt | S |  S )Nr   r>   )r   data_ptr)scalarr   r"   r"   r#   
scalar_arge  s
   z"gemm_symmetric.<locals>.scalar_arg)r   r   rg   rC   )mTr   ndimpermuter   r   r   dtyper?   r   device
isinstancer   r   quack.cache_utilsr   r   r   r   r   )+r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   PostActA_pB_pD_pC_p	PostAct_pr   r   r   r   r   r   rj   r   r   r   r   r   rp   r   r   compiled_fnr   max_active_clustersr   r   r   r   r"   r"   r#   gemm_symmetric   sv   



r   )FTFr   r   r   )8typingr   r   r   torchr   rQ   cutlass.cuterI   r   r   r   r	   cutlass.cute.runtimer
   quack.compile_utilsr   r   quack.cute_dsl_utilsr   r   r   quack.activationr   quack.gemm_actr   quack.gemm_sm90r   quack.gemm_sm100r   quack.gemm_tvm_ffi_utilsr   r   r   r   r   r   r   r   r   quack.tile_schedulerr   quack.varlen_utilsr   quack.copy_utilsrl   quack.roundingr   r   r   r   r   intr   floatr   r"   r"   r"   r#   <module>   s~    $	 (
Z	
