o
    3/i.                 $   @   s  d dl mZmZmZ d dlmZ d dlZd dlm  m	Z
 d dlmZ d dlmZmZ d dlmZmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dl m!Z" d dl#m$Z% d dl&m'Z' d dl(m)Z) dd e
j*dd ee
j+dddZ,dd dd dd dd dd dZ-ed Z.ed Z/ed Z0dd Z1dd  Z2d!e3fd"d#Z4ed$d% e D d&gd'e4id(ddd)d)ddddd*d*de)j5d fd+ed,ed-ed.ee d/ee d0e6eB d1e6eB d2ee d3ee d4ee d5ee d6e7d&e7d7ee d8e8d9e8eB d:df"d;d<Z9ed=d% e D d>d&gd'e4id(						*	dd+ed,ed?ee d@ed.ee d/ee d>e.d2ee d4ee d&e7d7ee d:dfdAdBZ:edCd% e D d>d&gd'e4id(				D	dd+ed,edEedFed@ed>e.d2ee d4ee d&e7d7ee d:dfdGdHZ;ddd)dddddd*dDe)j5d fd+ed,ed-ee d/ee d0e6eB dIeej< d2ee d3ee d4ee d5ee d&e7dJe7d8e8d9e8eB d:efdKdLZej=j>dMdNdOdPdd)dddddd*dDe)j5d dfd+ed,ed-ed/ee d0e6dQee d2ee d3ee d4ee d5ee d&e7dJe7d8e8d9e8dRee d:df dSdTZ?			)				dd+ed,ed-ee d/ee d0e6eB d2ee d3ee d4ee dIeej< d:efdUdVZ@		)	)						*	Ddd+ed,ed.ed-ee d0e6eB d1e6eB dIeej< d2ee d3ee d4ee d5ee d&e7dJe7d:efdWdXZAej=j>dYdNdOdP	)	)							*	*	Ddd+ed,ed.ed-ed0e6d1e6dQee dZee d2ee d3ee d4ee d5ee d6e7d&e7dJe7d:df d[d\ZB			)	)				dd+ed,ed.ed/ee d-ee d0e6eB d1e6eB d2ee d3ee d4ee dIeej< d:efd]d^ZC	)	)					*	Ddd+ed,ed-ed0e6eB d1e6eB d2ee d3ee d4ee d5ee d&e7dJe7d:dfd_d`ZDej=j>dadNdOdP	)	)							*	Ddd+ed,ed-ed0e6d1e6dQee dZee d2ee d3ee d4ee d5ee d&e7dJe7d:dfdbdcZE										D	*	Ddd+ed,ed.ee d/ee d>e0d?ee d@ee dIeej< ddeej< d2ee d4ee dee7d&e7dJe7d:eee ef fdfdgZeZFej=j>dhdidOdjdk						*	Ddd+ed,ed?ee d@ed.ee d/ee d>e.d2ee d4ee d&e7dJe7d:dfdldmZG								Ddd+ed,ed.ee d/ee d>e0d2ee d4ee dIeej< ddeej< dee7d:eee ef fdndoZHeHZI							*			D	Ddd+ed,edEed>e0dFee d@ee dIeej< ddeej< dpee dqe7d2ee d4ee d&e7dJe7fdrdsZeZJej=j>dtdudOdvdk				D	Ddd+ed,edEedFed@ed>e.d2ee d4ee d&e7dJe7d:dfdwdxZK					dd+ed,edEed>e0d2ee d4ee dIeej< ddeej< d:eeef fdydzZLeLZMej=j>d{dNdOd|dk		*	)	)dd+ed,ed-ed.ee d&e7d0e6d1e6d:dfd}d~ZN				*	)	)dd+ed,ed.ee d-ee dIeej< d&e7d0e6eB d1e6eB d:eee ef fddZedd% edD d>d&gd'e4id(						*	dd+ed,ed?ee d@ed.ee d/ee d>e/d2ee d4ee d&e7d7ee d:dfddZOd!e3fddZPedd% edD g dd'ePid(			*			D	dd+ed,edEedFed@edpee d>e/dqe7d2ee d4ee d&e7d7ee d:ee fddZQej=j>ddidOddk						*	Ddd+ed,ed?ee d@ed.ee d/ee d>e/d2ee d4ee d&e7dJe7d:dfddZRej=j>ddudOddk			*			D	Ddd+ed,edEedFed@edpee d>e/dqe7d2ee d4ee d&e7dJe7d:efddZSej=Td			*			D	Ddd+ed,edEedFed@edpee d>eUdqe7d2ee d4ee d&e7dJe7d:efddZVdd ZWeEjT	)	)							*	Ddd+ed,ed-ed0e6d1e6dQee dZee d2ee d3ee d4ee d5ee d&e7dJe7d:dfddZXdddZYdd ZZdd Z[eYe?e9eZd eYeBe9e[d eYeGe: eYeKe; eYeReO eNjT		*	)	)dd+ed,ed-ed.ee d&e7d0e6d1e6d:dfddZ\d!e3fddZ]edd% e D d&gd'e]id(				*	dd+ed,ed-ed.ee dee de6d&e7d7ee d:efddZ^ej=j>ddNdOddk				*	Ddd+ed,ed-ed.ee dee de6d&e7dJe7d:efddZ_ej=Td				*	Ddd+ed,ed-ed.ee dee de6d&e7dJe7d:efddZ`			dd+ed,ed.ee dee de6d:eeef fddZa						*	Ddd+ed,ed.ee dee d-ee dIeej< de6d&e7dJe7d:eeef fddZbedd% e D d>d&gd'e4id(				*	dd+ed,ed?ee d@ed.ee dee d>e.d&e7d7ee d:dfddZcedd% edD d>d&gd'e4id(				*	dd+ed,ed?ee d@ed.ee dee d>e/d&e7d7ee d:dfddZdej=j>ddidOddk				*	Ddd+ed,ed?ee d@ed.ee dee d>e.d&e7dJe7d:dfddZeej=Td				*	Dd	dddZfej=j>ddidOddk				*	Ddd+ed,ed?ee d@ed.ee dee d>e/d&e7dJe7d:dfddƄZgej=Tdá				*	Dd	dddȄZh								*	*	Ddd+ed,edee d.ee d>e0d?ee d@ee dIeej< ddeej< dee7d&e7dJe7d:eee ef fddʄZieiZj				*		dd+ed,edee d.ee d>e0dee7dIeej< ddeej< d:eee ef fdd̄ZkekZldS )    )OptionalTupleLiteral)partialN)Tensor)
GemmConfigget_all_configs)autotuneAutotuneConfig)get_device_capacity)gemm)gemm_act)	gemm_dact)gemm_symmetric)gemm_sq_reduce)gemm_norm_act_fn)rms_final_reduce)RoundingModec                 C   s   | S N xr   r   a/lsinfo/ai/hellotax_ai/llm_service/venv_vllm/lib/python3.10/site-packages/quack/gemm_interface.py<lambda>   s    r   c                 C   s   t |  S r   )Frelusquarer   r   r   r   r          tanhapproximate)Nr   relu_sqgelu_tanh_approxc                 C      t | | S r   )r   silugateupr   r   r   r   #   r   c                 C   s   | t d|   |d  S )NgZd;?   torchsigmoidr%   r   r   r   r   $   s    c                 C   r#   r   )r   r   r%   r   r   r   r   %   r   c                 C   s   t j| dd| S )Nr   r   )r   gelur%   r   r   r   r   &   s    c                 C   r#   r   r)   r%   r   r   r   r   '   r   )swiglu
swiglu_oaireglugegluglu)	Nr   r!   r"   r-   r.   r/   r0   r1   c                 C   s8   t | d dkrtddddddd	S td
d
ddddd	S )Nr   
            r(   TF)tile_mtile_n	cluster_m	cluster_npingpongis_dynamic_persistent   )r   r   )devicer   r   r   default_config:   s"   	r>   c                 C   s0   zddl m} || ||W S  ty   Y dS w )zUse nvMatmulHeuristics to pick a config for pure GEMM (no varlen/gather/epilogue).

    Returns None if unavailable, caller should fall back to default_config.
    r   )nvmmh_default_configN)quack.nvmmh_heuristicr?   	Exception)ABdevice_capacityr?   r   r   r   nvmmh_configO   s   rE   
named_argsc                    s   ||B }t |d jd   fdd| D } |dd d u}|dd d u}|s*|r1dd | D } |rLdd | D }  d	krLd
d | D } dd | D } | S )NrB   r   c                    s    g | ]}|j d  j kr|qS config)kwargsrD   .0confrD   r   r   
<listcomp>_        z.prune_invalid_gemm_configs.<locals>.<listcomp>A_idxcu_seqlens_mc                 S      g | ]
}|j d  js|qS rG   rI   swap_abrJ   r   r   r   rN   c       c                 S   s    g | ]}|j d  jdkr|qS )rH   r(   )rI   r9   rJ   r   r   r   rN   e   rO   	   c                 S   s    g | ]}|j d  jdkr|qS )rH      )rI   r7   rJ   r   r   r   rN   g   rO   c                 S   rR   rG   )rI   clcrJ   r   r   r   rN   h   rU   )r   r=   get)configsrF   rI   gather_Avarlen_mr   rM   r   prune_invalid_gemm_configs\   s   r]   c                 C      g | ]}t |d qS rG   r
   rK   cr   r   r   rN   m       rN   dynamic_schedulerearly_config_prune)rZ   keyprune_configs_by      ?FrB   rC   outCbiasalphabetarQ   cu_seqlens_krP   batch_idx_permuteadd_to_outputrH   rounding_modesr_seedreturnc                 C   s  |d u r3|d u o|d u o|	d u o|d u o|d u o| }|r*t | jd }t| ||}|d u r3t| j}|d u}|d u}|p>|}|	d u}|rT|sKJ d|jdksTJ d|r]|jr]J d| jdkri|si| d} |j}|jdkrx|sx|d}|d ur|jdkr|s|d}|jdkr|s|d}|d ur|jdkr|d}|s|j	d n|j	d d }|r|	d ur|	j	d n| j	d }||j	d f}n|| j	d |j	d f}|j	|ksJ d|j	 d	| |p|j
}|rt | d d
krtjdtj| jdnd }t|js| n||js|n| |js|n|j|d ur#|js |n|jnd ||j|j|j|j|jf
d||j|js:|nd |jrA|nd |||||	|
|||d d S )Nr   z-gather_A requires either varlen_m or varlen_kr(   zgather_A requires cluster_n=14Variable-length sequences not supported with swap_abr5   zout shape mismatch: z vs rV   dtyper=   T)
persistentr;   max_swizzle_sizerowvec_biascolvec_biasrl   rm   rQ   rn   rP   ro   rp   rq   rr   )r   r=   rE   r>   r9   rT   ndim	unsqueezemTshaper;   r*   zerosint32gemm_sm90_sm100r6   r7   r8   r:   ry   )rB   rC   ri   rj   rk   rl   rm   rQ   rn   rP   ro   rp   rd   rH   rq   rr   is_pure_gemmrD   r\   varlen_kvarlenr[   
batch_sizetotal_m	out_shapetile_count_semaphorer   r   r   
gemm_tunedl   s   





 
r   c                 C   r^   r_   r`   ra   r   r   r   rN      rc   
activation
preact_outpostact_outc                 C     |
d u r	t | j}
|d u}|r|
jrJ d| jdkr"|s"| d} |j}|jdkr/|d}|d ur?|jdkr?|s?|d}|d urP|jdkrP|sP|d}n|}|jdkr_|s_|d}n|}|d uro|jdkro|d}|	ps|
j}	|	rt| d dkrtj	dtj
| jdnd }t|
js| n||
js|n| |d ur|
js|n|jnd |d ur|
js|n|jnd |
js|n|j|||
j|
j|
j|
j|
jd|	|
j|
js|nd |
jr|nd ||d d S 	Nrt   r5   r   r(   rV   rv   T)rx   r;   ry   rz   r{   rQ   rP   r>   r=   rT   r|   r}   r~   r;   r   r*   r   r   gemm_act_sm90_sm100r6   r7   r8   r9   r:   ry   rB   rC   r   r   rj   rk   r   rQ   rP   rd   rH   r\   DPostActr   r   r   r   gemm_act_tuned   `   







r   c                 C   r^   r_   r`   ra   r   r   r   rN     rc   TPreActdx_outc
                 C   sZ  |	d u r	t | j}	|d u}
|
r|	jrJ d| jdkr"|
s"| d} |j}|jdkr/|d}|jdkr;|
s;|d}|jdkrH|
sH|d}n|}|jdkrW|
sW|d}n|}|p]|	j}|rrt| d dkrrtj	dtj
| jdnd }t|	jsz| n||	js|n| |	js|n|j|	js|n|j|	js|n|j|||	j|	j|	j|	j|	jd||	j||d d S )	Nrt   r5   r   rV   r(   rv   T)rx   r;   ry   rQ   rP   )r>   r=   rT   r|   r}   r~   r;   r   r*   r   r   gemm_dact_sm90_sm100r6   r7   r8   r9   r:   ry   )rB   rC   r   r   r   r   rQ   rP   rd   rH   r\   r   r   r   r   r   r   gemm_dact_tuned  sX   






r   	out_dtypetunedc                 C   sB  |du rh|du r| j n|}|du}|du}|r-|dur |jd n| jd }||jd f}n2|rB|jd d }|| jd |jd f}n| jdkrQ| jd |jd fn| jd | jd |jd f}tj||| jd}t|tso|nd}t|trx|nd}t|tr|nd}t|t	r|nd}t
| |||||||||	|
||||d	 |S )
z4GEMM with optional output tensor and tuning control.Nr   r(   r5   ru   rv   rh   )rk   rl   alpha_tensorrQ   rn   rP   ro   rd   r   rq   rr   sr_seed_tensor)rw   r   r|   r*   emptyr=   
isinstancefloatr   intgemm_out)rB   rC   ri   rk   rl   r   rQ   rn   rP   ro   rd   r   rq   rr   r\   r   r   r   Lr   r   sr_seed_intr   r   r   r   Y  sF   8r   zquack::gemm_outri   cuda)mutates_argsdevice_typesr   r   c                 C   s\   |rt ntt jdd}|dur|n|}|dur|n|}|| ||d||||||	|
||d dS )z&GEMM with pre-allocated output tensor.NrG   )
rj   rk   rl   rQ   rn   rP   ro   rd   rq   rr   r   r   fn)rB   rC   ri   rk   rl   r   rQ   rn   rP   ro   rd   r   rq   rr   r   r   sr_seed_argr   r   r   r     s$   
r   c	              	   C   s  |du r| j n|}|du rG|du rG| jdkrtjntj}	|	| |||d}t|tr-|dkr1||9 }|durE| jdkr<|n|d}||7 }|S |dur|du rk|durX|jd n| jd }
tj	|
|jd f|| j
d	}t|jd d D ]c}|dur| ||| ||d    n| || ||d   }tj||| ||| ||d   d
 t|tr|dkr||| ||d    |9  < |dur||| ||d    || 7  < qt|S |jd d }|du rtj	|| jd |jd f|| j
d	}t|D ]A}|dur| dd||| ||d   f n| dd|| ||d  f }tj|||| ||d  ddf || d
 qt|trH|dkrL||9 }|durU||7 }|S )z<Reference implementation for GEMM with pre-allocated output.N   )r   ri   rh   r5   r(   r   r   rv   r   )rw   r|   r*   bmmmmr   r   r}   r   r   r=   range)rB   rC   ri   rk   rl   rQ   rn   rP   r   r   r   iA_slicer   r   r   r   gemm_ref  sP   !( $$
$2
r   c                 C   sl  |du rh|du r| j n|}|du}|du}|r-|	dur |	jd n| jd }||jd f}n2|rB|jd d }|| jd |jd f}n| jdkrQ| jd |jd fn| jd | jd |jd f}tj||| jd}||u oxt|tox|dkox|du }t|ts|nd}t|tr|nd}t|ts|nd}t|tr|nd}t| ||s|nd||||||||	|
|||d	 |S )
z.GEMM with addition and optional output tensor.Nr   r   r(   r5   ru   rv   rh   )rQ   rn   rP   ro   rp   rd   r   )	rw   r   r|   r*   r   r=   r   r   gemm_add_out)rB   rC   rj   ri   rl   rm   r   rQ   rn   rP   ro   rd   r   r\   r   r   r   r   rp   r   beta_tensorr   r   r   gemm_add  sH   8"
r   zquack::gemm_add_outr   c                 C   sZ   |rt ntt jdd}|dur|n|}|dur|n|}|| |||||||	|
|||d dS )z3GEMM with addition and pre-allocated output tensor.NrG   rl   rm   rQ   rn   rP   ro   rp   rd   r   )rB   rC   rj   ri   rl   rm   r   r   rQ   rn   rP   ro   rp   rd   r   r   r   r   r   r   7  s"   
r   c              	   C   s  |du r[|du r[t |trt |trtj|| ||
|||d}n&|dur&|jn|
dur,|
n| j}
|| |  ||  |
}|durE|| |durY| jdkrP|n|d}||7 }|S |dur|du r|	durl|	j	d n| j	d }|
durw|
n| j}
tj
||j	d f|
| jd}t|j	d d D ]U}|	dur| |	|| ||d    n| || ||d   }||| ||d   }||| ||d   }|t|||  ||  }|dur||| 7 }|| q|S |j	d d }|
dur|
n| j}
|du rtj
|| j	d |j	d f|
| jd}t|D ]O}|	dur-| dd|	|| ||d   f n| dd|| ||d  f }||| ||d  ddf }|t|| |||   }|| | q|durm||7 }|S )zIReference implementation for GEMM with addition and pre-allocated output.N)r   rl   rm   ri   r5   r(   r   r   rv   )r   r   r*   addmmrw   tocopy_r|   r}   r   r   r=   r   r   )rB   rC   rj   rk   ri   rl   rm   rQ   rn   rP   r   resultr   r   r   C_slice	out_slicer   B_slicer   r   r   gemm_add_refe  sX   
%
$
$ 
r   c                 C   sn   t |ts|nd}t |tr|nd}t |ts|nd}t |tr"|nd}t| |||||||||||	|
d dS )a  In-place GEMM with addition: out = alpha * A @ B + beta * out.
    Args:
        A: (M, K) or (L, M, K) or (total_M, K) if varlen_m or (M, total_K) if varlen_k - input tensor
        B: (K, N) or (L, K, N) or (total_K, N) if varlen_k - input tensor
        out: (M, N) or (L, M, N) or (total_M, N) if varlen_m or (L, M, N) if varlen_k - tensor to accumulate into (modified in-place)
        alpha: Scalar multiplier for A @ B
        beta: Scalar multiplier for out
        cu_seqlens_m: Optional cumulative sequence lengths for variable M
        cu_seqlens_k: Optional cumulative sequence lengths for variable K
        dynamic_scheduler: Whether to use dynamic scheduler
        tuned: Whether to use autotuned configuration
    Nrh   )rP   ro   rd   r   )r   r   gemm_add_inplace_op)rB   rC   ri   rl   rm   rQ   rn   rP   ro   rd   r   r   r   r   r   r   gemm_add_inplace  s&   
r   zquack::gemm_add_inplacec                 C   s|   |rt ntt jd d}|d ur|n|}|d ur|n|}t|to'|dko'|d u }|| |||s0|nd |||||	|
||d d S )NrG   rh   r   )r   r   r   r   r   )rB   rC   ri   rl   rm   r   r   rQ   rn   rP   ro   rd   r   r   rp   r   r   r   r     s$   

r   postact_dtypestore_preactc                 C   sJ  |t v }|du r| jn|}|du r| jn|}|	du}|r2|
dur%|
jd n| jd }||jd f}n| jdkrB| jd |jd f}n| jd | jd |jd f}|rag |dd |d d R n|}|du rr|rrtj||| jd}|du rtj||| jd}|rt| |||||||	|
|| ||fS t| |||||||	|
|| ||fS )zGGEMM with activation (or gated activation) and optional output tensors.Nr   r   r5   ru   rv   )	gated_to_pytorch_fn_maprw   r   r|   r*   r   r=   gemm_gated_outgemm_act_out)rB   rC   rj   rk   r   r   r   r   r   rQ   rP   r   rd   r   is_gatedr\   r   r   postact_shaper   r   r   r     sX   
&r   zquack::gemm_act_out)r   r   z(Tensor A, Tensor B, Tensor(a2!)? preact_out, Tensor(a3!) postact_out, Tensor? C=None, Tensor? bias=None, str? activation=None, Tensor? cu_seqlens_m=None, Tensor? A_idx=None, bool dynamic_scheduler=False, bool tuned=True) -> ())r   r   schemac                 C   4   |
rt ntt jdd}|| |||||||||	
 dS )z6GEMM with activation and pre-allocated output tensors.NrG   )r   r   r   rB   rC   r   r   rj   rk   r   rQ   rP   rd   r   r   r   r   r   r   H     r   c
                 C   s   |t v }
|d u r| jn|}|d u r| jn|}|d u r$t| ||||d}n
t| |||||d}|
rM|dd d df }|ddd df }t | |||}n	t| ||}|	r_|||fS d |fS )N)rk   rQ   rP   .r5   r(   )r   rw   r   r   r   act_to_pytorch_fn_map)rB   rC   rj   rk   r   rQ   rP   r   r   r   r   preactr&   r'   postactr   r   r   gemm_act_ref`  s   r   colvec_scalecolvec_reducec                 C   s  |t v }|du r| jn|}|du r|jn|}|
du}|r=|dur%|jd n| jd }|r5||jd d fn||jd f}n7| jdkr[|rP| jd |jd d fn	| jd |jd f}n|rd|jd d n|jd }| jd | jd |f}|rg |dd |d d R n|}|du rtj||| jd}|du rtj||| jd}|rt| |||||||	|
|||}|	s||fS |||fS t| ||||||
|||
 ||fS )zPGEMM with activation (or gated activation) gradient and optional output tensors.Nr   r   r5   ru   rv   )	r   rw   r   r|   r*   r   r=   gemm_dgated_outgemm_dact_out)rB   rC   r   r   r   r   r   r   r   r   rQ   rP   rd   r   	is_dgatedr\   r   r   nr   colvec_reduce_finalr   r   r   r     s^   &
2&
r   zquack::gemm_dact_out)r   r   z(Tensor A, Tensor B, Tensor PreAct, Tensor(a3!) dx_out, Tensor(a4!) postact_out, str? activation=None, Tensor? cu_seqlens_m=None, Tensor? A_idx=None, bool dynamic_scheduler=True, bool tuned=True) -> ()c
              
   C   s2   |	rt ntt jdd}
|
| ||||||||	 dS )z?GEMM with activation gradient and pre-allocated output tensors.NrG   )r   r   r   )rB   rC   r   r   r   r   rQ   rP   rd   r   r   r   r   r   r     s   r   c                 C   sd  |t v }|du r| jn|}|du r|jn|}t| |||d|}	|r||ddddf }
|ddddf }|
j|j}}|
d |d t | |
|}tjj||
|g|	dd\}}|
| || tj	||gd	d

|j}||||fS t| |}|du r|	}n|j}|d t| |}tjj|||	ddd }|| ||||fS )zQReference implementation for GEMM with activation (or gated activation) gradient.N)rQ   rP   .r5   r(   TF)create_graphr   dimr   )r   rw   r   r   requires_gradrequires_grad_r*   autogradgradstackreshaper   r   )rB   rC   r   r   rQ   rP   r   r   r   doutr&   r'   gate_requires_gradup_requires_gradr   dgatedupdxPreAct_requires_gradpostact_for_gradr   r   r   gemm_dact_ref  s2   





r   zquack::gemm_symmetric_outzz(Tensor A, Tensor B, Tensor(a2!) out, Tensor? C=None, bool dynamic_scheduler=False, float alpha=1.0, float beta=1.0) -> ()c           	      C   s   | j dkr
| d} |j}|j dkr|d}|dur%|j dkr%|d}|j dkr0|d}n|}|r>tjdtj| jdnd}t| jd dkrKdnd}t| ||durV|nd|dur]|nd||dddd	d
d||d dS )z&GEMM with guaranteed symmetric output.r5   r   Nr(   rv   r2   r<   r3   FT   	tile_Mtile_N	cluster_M	cluster_Nr:   rx   ry   rl   rm   )	r|   r}   r~   r*   r   r   r=   r   gemm_symmetric_sm90_sm100)	rB   rC   ri   rj   rd   rl   rm   r   r6   r   r   r   gemm_symmetric_out  s:   






r   c              	   C   s   |du r| j n|}| jdkr| jd |jd f}n| jd | jd |jd f}|du r4tj||| jd}t|tr;|nd}	t|trD|nd}
t| |||||	|
d |S )	zGEMM with symmetric output.Nr5   r   r   ru   rv   rh   )rd   rl   rm   )	rw   r|   r   r*   r   r=   r   r   r   )rB   rC   rj   ri   r   rd   rl   rm   r   	alpha_valbeta_valr   r   r   r   <  s   
r   c                 C   r^   r_   r`   ra   r   r   r   rN   Z  rc   gatedr-   c                 C   r   r   r   r   r   r   r   gemm_gated_tunedY  r   r   c                 K   sD   ||B }| dd d us| ddrdd | D } t| |fi |S )Nr   r   Fc                 S   rR   rG   rS   rJ   r   r   r   rN     rU   z5prune_invalid_gemm_dgated_configs.<locals>.<listcomp>)rY   r]   rZ   rF   rI   r   r   r   !prune_invalid_gemm_dgated_configs  s   r   c                 C   r^   r_   r`   ra   r   r   r   rN     rc   dgated)r   r   rd   c                 C   sH  |d u r	t | j}|d u}|r|jrJ d| jdko| }| jdkr*|s*| d} |j}|jdkr7|d}|jdkrC|sC|d}|jdkrP|sP|d}n|}|jdkr_|s_|d}n|}|d urq|jdkrq|sq|d}|d ur||jr|J d|r|j}|jd | d | }|r|	d ur|	jd n| jd }||f}n| jd | jd |f}tj	|tj
| jd}nd }|
p|j}
|
rt| d dkrtjdtj| jdnd }t|js| n||js|n| |js|n|j|js|n|j|js|n|j|||j|j|j|j|jd	|
|j||||	d
 |r |jdd}|r|d}|S d }|S )Nrt   r5   r   r(   z'colvec_scale not supported with swap_abru   rv   rV   T)rx   r;   ry   r   r   rQ   rP   r   r   )r>   r=   rT   r|   r}   r~   r7   r   r*   r   float32r;   r   r   r   r   r6   r8   r9   r:   ry   sumsqueeze)rB   rC   r   r   r   r   r   r   rQ   rP   rd   rH   r\   	og_ndim_2r   r   r7   shape_nr   colvec_shapecolvec_reduce_partialr   r   r   r   r   gemm_dgated_tuned  s   








r  zquack::gemm_gated_outz(Tensor A, Tensor B, Tensor(a2!)? preact_out, Tensor(a3!) postact_out, Tensor? C=None, Tensor? bias=None, str activation='swiglu', Tensor? cu_seqlens_m=None, Tensor? A_idx=None, bool dynamic_scheduler=False, bool tuned=True) -> ()c                 C   r   )z<GEMM with gated activation and pre-allocated output tensors.NrG   )r   r   r   r   r   r   r   r     r   r   zquack::gemm_dgated_outa  (Tensor A, Tensor B, Tensor PreAct, Tensor(a!) dx_out, Tensor(b!) postact_out, Tensor? colvec_scale=None, str activation='swiglu', bool colvec_reduce=False, Tensor? cu_seqlens_m=None, Tensor? A_idx=None, bool dynamic_scheduler=True, bool tuned=True) -> Tensorc                 C   sR   |rt ntt jdd}|| |||||||||	|
}|du r'tjd| jtjdS |S )zEGEMM with gated activation gradient and pre-allocated output tensors.NrG   r   )r=   rw   )r  r   r   r*   r   r=   r   )rB   rC   r   r   r   r   r   r   rQ   rP   rd   r   r   r   r   r   r   r     s"   r   c                 C   s   t t| |||||||||	|
d |stjdtj| jdS |d ur2|	d ur)|	jd n| jd }|f}n| jdkr>| jd f}n
| jd | jd f}tj|tj| jdS )N)r   r   r   rQ   rP   rd   r   rv   r5   ru   )_precompile_default_configr  r*   r   r   r=   r   r|   )rB   rC   r   r   r   r   r   r   rQ   rP   rd   r   r   r   r   r   r   gemm_dgated_out_fakeE  s.   
r  c                 O   st   ddl m} |r|d n|d}|r |du s t|jd tjr"dS z| j|ddi| W dS  ty9   Y dS w )ag  Compile the default config in COMPILE_ONLY mode.

    Checks COMPILE_ONLY flag and SymInt guard, then calls the unwrapped function with
    config=None (which selects the default config), triggering compilation (exports .o)
    without benchmarking or kernel launch.
    Tests use tuned=False which also selects the default config, so this is sufficient.
    r   COMPILE_ONLYrB   NrH   )	quack.cache_utilsr  rY   r   r   r*   SymIntr   rA   )autotuned_fnargsrI   r  rB   r   r   r   r  o  s   r  c                 C   sh   |d ur|n|}|d ur|n|}t |to|dko|d u }tt| |||s&|nd |||||	|
||d d S )Nrh   r   )r   r   r  r   )rB   rC   ri   rl   rm   r   r   rQ   rn   rP   ro   rd   r   r   r   rp   r   r   r   gemm_add_inplace_fake  s$   

r
  c                    s.   ddl }|| j| j fdd}dS )ao  Register a fake that precompiles the default config in COMPILE_ONLY mode.

    For custom_ops that forward args to their autotuned fn. Binds all args by name,
    strips 'tuned', applies optional rewrite(kw), then calls _precompile_default_config.
    PyTorch normalizes all custom_op args to positional, so we use inspect.signature
    to recover keyword names.
    r   Nc                     sR   j | i |}|  t|j}|dd  d ur| t fi | d S )Nr   )bindapply_defaultsdict	argumentspopr  )r	  rI   boundkwr  rewritesigr   r   _fake  s   
z(_register_precompile_fake.<locals>._fake)inspect	signature_init_fnregister_fake)	custom_opr  r  r  r  r   r  r   _register_precompile_fake  s   r  c                 C   s,   |  dd}|dur|| d< | dd dS )z9Merge alpha_tensor into alpha for gemm_tuned; add C=None.r   Nrl   rj   )r  
setdefault)rI   atr   r   r   _rewrite_merge_alpha  s   r  c                 C   s@   |  dd}|dur|| d< |  dd}|dur|| d< dS dS )z>Merge alpha_tensor/beta_tensor into alpha/beta for gemm_tuned.r   Nrl   r   rm   )r  )rI   r  btr   r   r   _rewrite_merge_alpha_beta  s   r   )r  c           	      C   s   ddl m} |rt| jd tjrd S t| jd dkrdnd}zQt| j	dkr,| 
dn| |j	dkr8|j
dn|j|j	dkrD|
dn||d urU|j	dkrS|
dn|nd |rbtjdtj| jdnd |dddd	d
d||d W d S  ty{   Y d S w )Nr   r  r2   r<   r3   r5   r(   rv   FTr   r   )r  r  r   r   r*   r  r   r=   r   r|   r}   r~   r   r   rA   )	rB   rC   ri   rj   rd   rl   rm   r  r6   r   r   r   gemm_symmetric_out_fake  s0   
"r!  c                 K   s   dd | D } t | ||B S )z!ColVecReduce requires no swap_ab.c                 S   rR   rG   rS   rJ   r   r   r   rN      rU   z+_prune_gemm_rms_configs.<locals>.<listcomp>)r]   r   r   r   r   _prune_gemm_rms_configs  s   r"  c                 C   r^   r_   r`   ra   r   r   r   rN     rc   ư>norm_weightepsc                 C   s  |d u r	t | j}| jdk}|jd }	| jdkr| d} |j}|jdkr*|d}|jdkr4|d}|d urB|jdkrB|d}|d urP|jdkrP|d}|j}
|	|
 d |
 }tj| jd | jd |ftj	| jd}|pr|j
}|rt| d dkrtjdtj| jdnd }t| ||||||j|j|j|j|jd||j|d d	|	 }|d|}t|||d
}|| jd d }|r|d}|S )Nr5   r   r   r(   rv   rV   T)rx   r;   ry   rowvecrh   )scaler%  )r>   r=   r|   r   r}   r~   r7   r*   r   r   r;   r   r   r   gemm_sq_reduce_sm90_sm100r6   r8   r9   r:   ry   r   r   r   )rB   rC   ri   rj   r$  r%  rd   rH   r   Nr7   n_tilesr   r   r'  flat_reduce	rstd_flatrstdr   r   r   _gemm_rms_tuned  sf   












r.  zquack::gemm_rms_outz(Tensor A, Tensor B, Tensor(a!) out, Tensor? C=None, Tensor? norm_weight=None, float eps=1e-6, bool dynamic_scheduler=False, bool tuned=True) -> Tensorc           	   	   C   s,   |rt ntt jdd}|| ||||||dS )zGEMM + RMS + optional rowvec scaling.

    D_raw = A @ B (+ C), rstd = rsqrt(mean(D_raw^2) + eps), D_out = D_raw * norm_weight.
    NrG   rj   r$  r%  rd   )r.  r   r   )	rB   rC   ri   rj   r$  r%  rd   r   r   r   r   r   _gemm_rms_outI  s   r0  c           	   
   C   s:   t t| ||||||d | jd d }tj|tj| jdS )Nr/  r   rv   )r  r.  r   r*   r   r   r=   )	rB   rC   ri   rj   r$  r%  rd   r   
rstd_shaper   r   r   _gemm_rms_out_fakei  s   
r2  c                 C   sf   | j dkrtjntj}|| |}|dur|| }t|  jdd| }|dur/|| }||fS )z[Reference: D_raw = A @ B (+ C), rstd = rsqrt(mean(D_raw^2) + eps), D = D_raw * norm_weight.r   Nr   r   )r|   r*   r   r   rsqrtr   r   mean)rB   rC   rj   r$  r%  r   r   r-  r   r   r   gemm_rms_ref  s   
r5  c	              
   C   sn   |du r| j n|}|jd }	|du r'g | jdd |	R }
tj|
|| jd}t| |||||||d}||fS )zGEMM + RMS statistics + optional rowvec scaling.

    D_raw = A @ B (+ C), rstd = rsqrt(mean(D_raw^2) + eps), D_out = D_raw * norm_weight.
    Returns (D_out, rstd).
    Nr   rv   )rj   r$  r%  rd   r   )rw   r   r*   r   r=   r0  )rB   rC   rj   r$  ri   r   r%  rd   r   r)  r   r-  r   r   r   gemm_rms  s    

r6  c                 C   r^   r_   r`   ra   r   r   r   rN     rc   r-  c	                 C     |d u r	t | j}| jdkr| d} |j}|jdkr |d}|d ur.|jdkr.|d}|d ur=|jdkr=|d}	n|}	|jdkrJ|d}
n|}
|d urZ|jdkrZ|d}|p^|j}|rst| d dkrstjdtj	| jdnd }t
|js{| n||js|n| |	d ur|js|	n|	jnd |d ur|js|n|jnd |js|
n|
j|||j|j|j|j|jd||j|js|nd |jr|nd d d S Nr5   r   r(   rV   rv   T)rx   r;   ry   colvecr&  r>   r=   r|   r}   r~   r;   r   r*   r   r   gemm_norm_act_sm90_sm100rT   r6   r7   r8   r9   r:   ry   rB   rC   r   r   rj   r-  r   rd   rH   r   r   r   r   r   r   gemm_norm_act_tuned  V   









r=  c                 C   r^   r_   r`   ra   r   r   r   rN     rc   c	                 C   r7  r8  r:  r<  r   r   r   gemm_norm_gated_tuned  r>  r?  zquack::gemm_norm_act_outz(Tensor A, Tensor B, Tensor(a2!)? preact_out, Tensor(a3!) postact_out, Tensor? C=None, Tensor? rstd=None, str? activation=None, bool dynamic_scheduler=False, bool tuned=True) -> ()c	           
   	   C   0   |rt ntt jd d}	|	| ||||||| d S NrG   )r=  r   r   
rB   rC   r   r   rj   r-  r   rd   r   r   r   r   r   gemm_norm_act_out5     rC  c	           	      C      d S r   r   	rB   rC   r   r   rj   r-  r   rd   r   r   r   r   _gemm_norm_act_out_fakeJ     rG  zquack::gemm_norm_gated_outz(Tensor A, Tensor B, Tensor(a2!)? preact_out, Tensor(a3!) postact_out, Tensor? C=None, Tensor? rstd=None, str activation='swiglu', bool dynamic_scheduler=False, bool tuned=True) -> ()c	           
   	   C   r@  rA  )r?  r   r   rB  r   r   r   gemm_norm_gated_outY  rD  rI  c	           	      C   rE  r   r   rF  r   r   r   _gemm_norm_gated_out_faken  rH  rJ  c              
   C   s
  |t v }|du r| jn|}|du r| jn|}| jdkr&| jd |jd f}n| jd | jd |jd f}|rEg |dd |d d R n|}|du rV|	rVtj||| jd}|du rctj||| jd}|rut| |||||||
|	 ||fS t| |||||||
|	 ||fS )zGEMM + normalize + activation: PostAct = act((A @ B + C) * rstd).

    rstd is a column vector (M,).
    Returns (preact, postact) where preact is the normalized value before activation.
    Nr5   r   r   ru   rv   )	r   rw   r|   r   r*   r   r=   rI  rC  )rB   rC   r-  rj   r   r   r   r   r   r   rd   r   r   r   r   r   r   r   gemm_norm_act}  sH   
&rK  c                 C   s   |t v }|du r| jn|}|du r| jn|}| jdkrtjntj}	|	| |}
|dur.|
| }
|dur9|
|d }
|r@|
|nd}i tdt	j
i}|rm|
ddddf }|
ddddf }t | |||}||fS || |
|}||fS )z>Reference: preact = (A @ B + C) * rstd, postact = act(preact).Nr   r   r$   .r5   r(   )r   rw   r|   r*   r   r   r}   r   r   r   r$   )rB   rC   r-  rj   r   r   r   r   r   r   r   r   _act_mapr&   r'   r   r   r   r   gemm_norm_act_ref  s$   
rM  )NNNNNFN)NNNTN)NNrh   NNNN)
Nrh   rh   NNNNNFT)rh   rh   NNNNNNFFT)NNrh   rh   NNNN)rh   rh   NNNNFT)
rh   rh   NNNNNNFT)NNNNNNNNNTFT)NNNNNFT)NNNNNNNT)NNNNNNFNNTT)NNNTT)NNNNN)NFrh   rh   )NNNFrh   rh   )NNr-   NNFN)Nr-   FNNTN)NNr-   NNFT)Nr-   FNNTTr   )NNr#  FN)NNr#  FT)NNr#  )NNNNr#  FT)NNNFN)NNr-   FN)NNNFT)rs   N)NNr-   FT)
NNNNNNNFFT)NNNFNN)mtypingr   r   r   	functoolsr   r*   torch.nn.functionalnn
functionalr   r   quack.gemm_configr   r   quack.autotunerr	   r
   quack.cute_dsl_utilsr   
quack.gemmr   r   quack.gemm_actr   r   quack.gemm_dactr   r   quack.gemm_symmetricr   r   quack.gemm_sq_reducer   r(  quack.gemm_norm_actr   r;  quack.rms_final_reducer   quack.roundingr   r   r,   r   r   ActActivationGatedActivation
Activationr>   rE   r  r]   RNr   boolr   r   r   r   rw   libraryr  r   r   r   r   r   r   r   
gemm_gatedr   r   gemm_gated_refgemm_dgatedr   r   gemm_dgated_refr   r   r   r  r   r   r  strr  r  r
  r  r  r   r!  r"  r.  r0  r2  r5  r6  r=  r?  rC  rG  rI  rJ  rK  gemm_norm_gatedrM  gemm_norm_gated_refr   r   r   r   <module>   s~
  	
	
`	
B	
@	

:	
+	

?	

;	
+	

H	

/	
(	

A	
	

	

G	

	
(
.	
	
B
X	
	

"	
)	

#



'		@
	
	

	


%
	
9
	
9	


	


	

:	
