o
    3/i                  !   @   s  d dl mZmZ d dlmZ d dlZd dlmZ d dlmZm	Z	 d dl
mZmZmZmZ d dlmZmZmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dl m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z) d dl*m+Z+ G dd deZ,G dd de,eZ-G dd de,eZ.edd Z/					d+dedededee dedee de0d e0d!e0d"e0d#e1d$e1d%e1d&e0d'ee d(df d)d*Z2dS ),    )
NamedTupleOptional)TensorN)Float32
const_expr)mlir_namedtupletorch2cute_dtype_mapget_device_capacityget_max_active_clusters)ColVecReducecolvec_reduce_accumulatevec_multiply)GemmSm90)	GemmSm100)GemmDefaultEpiMixin)RoundingMode)make_fake_tensor)	jit_cache)	
get_majors
get_dtypesperm3dmake_scheduler_argsmake_varlen_argsmake_fake_scheduler_argsmake_fake_varlen_argsmake_fake_gemm_tensorscompile_gemm_kernelc                   @   sX   e Zd ZdZg ejedR ZeG dd deZ	dddddZ
ejdd	d
ZdS )GemmSqReduceMixinzGEMM + sq_reduce + optional rowvec scaling.

    D_raw = A @ B (+ C), reduce[m] = sum_n(D_raw[m,n]^2), D_out = D_raw * rowvec.
    The sq_sum is computed BEFORE the rowvec scaling.
    mColVecReducec                   @   s   e Zd ZU dZeeejB  ed< dZ	eeejB  ed< dZ
eej ed< dZeej ed< dZeej ed< dZeje ed< ejZeje ed	< dZded
< dS )z#GemmSqReduceMixin.EpilogueArgumentsNalphabetamRowVecBroadcastmColVecBroadcastr   Fadd_to_outputrounding_modesr_seed)__name__
__module____qualname__r   r   r   cuter   __annotations__r    r!   r"   r   r#   cutlass	Constexprboolr   RNr$   intr%    r0   r0   a/lsinfo/ai/hellotax_ai/llm_service/venv_vllm/lib/python3.10/site-packages/quack/gemm_sq_reduce.pyEpilogueArguments1   s   
 r2   N)locipc                C   s"   |j | _ | |}| jdi |S )Nr0   )r$   _epi_ops_to_params_dictEpilogueParams)selfargsr3   r4   dr0   r0   r1   epi_to_underlying_arguments>   s   
z-GemmSqReduceMixin.epi_to_underlying_argumentsc           
      C   s   |d }|d }|  }tt|do|jd ur"t|j}||9 }t|d urRtt|d p3|jd u r@||  |j7 }nt|j}	||	|  |j 7 }|	| t
| |||d t| |d | d S )Nr   r!   r   r    )rScale)loadr   hasattrr   utilsload_scalar_or_pointerr    toelement_typestorer   r   )
r7   paramsepi_loop_tensorstRS_rDtRS_rCtDrColVecReduce	tDrRowVecrDr   r    r0   r0   r1   epi_visit_subtileC   s   
z#GemmSqReduceMixin.epi_visit_subtile)N)r&   r'   r(   __doc__r   _epi_opsr   r   r   r2   r:   r)   jitrJ   r0   r0   r0   r1   r   (   s    r   c                   @      e Zd ZdS )GemmSqReduceSm90Nr&   r'   r(   r0   r0   r0   r1   rO   Z       rO   c                   @   rN   )GemmSqReduceSm100NrP   r0   r0   r0   r1   rR   ^   rQ   rR   c                  C   s   |d dkrt nt}t| |||||||\}}}}}}}}t }|dkr1t||||fddd}n
t|||fddd}t|||fddd}|j||d}t|oT|d dkd	|}td	d	d	d }t	|| ||	|
|d	|||||||||S )
Nr   	            )leading_dimdivisibility   )r!   r   F)
rR   rO   r   r)   sym_intfake_tensorr2   r   r   r   ) a_dtypeb_dtyped_dtypec_dtypea_majorb_majord_majorc_majortile_shape_mncluster_shape_mnkpingpong
persistentis_dynamic_persistentcolvec_reduce_dtypecolvec_reduce_ndimrowvec_dtypedevice_capacityGemmClsmAmBmDmCmnkln_tilesr   mRowVecepi_argsscheduler_argsvarlen_argsr0   r0   r1   _compile_gemm_sq_reduceb   sh   
r{   FT   ABDCcolvec_reducetile_count_semaphoretile_Mtile_N	cluster_M	cluster_Nrf   rg   rh   max_swizzle_sizerowvecreturnc           "      C   s^  t | j}|d dv sJ dt| |||\}}}}t||||\}}}}t| |||\}}}}|r@|d dkr@|dus@J dt||||||||||f||	df|
||t|j |j|durbt|j nd|}ddl	m
} |rpdS |rxt||	 nd}tj||ddd	}t|||} tddd}!|d dkr||||||| |!dd	 dS ||||||| |! dS )
zGEMM + sq_reduce + optional rowvec scaling.

    D_raw = A @ B (+ C), colvec_reduce[m] = sum_n(D_raw[m,n]^2), D_out = D_raw * rowvec.
    r   )rS   
      z)Only SM90, SM100, and SM110 are supportedrS   NzFDynamic persistent tile scheduler in SM90 requires a semaphore in GMEMrV   )COMPILE_ONLY)r!   r   r#   r$   )r	   devicer   r   r   r{   r   dtypendimquack.cache_utilsr   r
   r   r2   r   r   )"r}   r~   r   r   r   r   r   r   r   r   rf   rg   rh   r   r   rl   A_pB_pD_pC_pr`   ra   rb   rc   r\   r]   r^   r_   compiled_fnr   max_active_clustersrx   ry   rz   r0   r0   r1   gemm_sq_reduce   sZ   

r   )FTFr|   N)3typingr   r   torchr   r+   cutlass.cuter)   r   r   quack.cute_dsl_utilsr   r   r	   r
   quack.epi_opsr   r   r   quack.gemm_sm90r   quack.gemm_sm100r   quack.gemm_default_epir   quack.roundingr   quack.compile_utilsr   r[   r   r   quack.gemm_tvm_ffi_utilsr   r   r   r   r   r   r   r   r   quack.utilsr>   r   rO   rR   r{   r/   r-   r   r0   r0   r0   r1   <module>   st   ,2
U	
