o
    3/ib                     @   s  d dl Z d dlmZmZmZ d dlmZ d dlm  m	Z
 d dlZd dlmZ d dlmZmZmZ d dlZd dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	l m!Z! d d
l"m#Z# G dd deZ$ej%j&ddddd							dZdedee dedee dee dee dee dee de'de(ddfdd Z)e)j*							dZdedee dedee dee dee dee dee de'de(ddfd!d"Z+e!d#d$ Z,							dZdedee dee dee d%eej- d&eej- de'd'e(deeeee f fd(d)Z.d[d*d+Z/d\d,d-Z0G d.d/ d/eZ1d0e2d1ej3de2fd2d3Z4ej%j&d4h d5dd6d				d]dedee d7eded8ed9ee d:ee d;ee d<ee d=ee2 ddfd>d?Z5e5j*				d]dedee d7eded8ed9ee d:ee d;ee d<ee d=ee2 ddfd@dAZ6e!dBdC Z7			d^dedee d7eded;ee dDe(dEe(deeee ee ee f fdFdGZ8G dHdI dIej9j:Z;							dZdedee dee dee d%eej- d&eej- de'dJe(defdKdLZ<G dMdN dNej=j$Z>				d_dededee de'dOe(dPe(fdQdRZ?d\dedSede'defdTdUZ@d\dejde'fdVdWZAdejdejfdXdYZBdS )`    N)OptionalTupleType)partial)Float32Int32
const_expr)Tensor)make_fake_tensor)
row_reduce)ReductionBase)	jit_cache)torch2cute_dtype_mapc                       s  e Zd Zddeej dedef fddZdd Z	d	d
 Z
ejdejdeej deej deej dejdeej deej deej dedejfddZejdejdeej deej deej dejdeej deej deej dedejdejdeje fddZ  ZS )RMSNormFdtypeNis_layernormc                    sD   t  j|||r	dndd || _||rdndkrd nd| _d| _d S )N      )stage @      smemF)super__init__r   reload_fromdelay_w_load)selfr   r   r   	__class__ Z/lsinfo/ai/hellotax_ai/llm_service/venv_vllm/lib/python3.10/site-packages/quack/rmsnorm.pyr      s   
zRMSNorm.__init__c                 C   (   | j }dD ]\}}||kr|  S qdS )N)@            )i       )i   r$   )r   r'      r   r   r   limitthreadsr    r    r!   _threads_per_row!      zRMSNorm._threads_per_rowc                 C   sT   | j }t| jjdkrg d}ng d}|D ]\}}||kr$|| _ d S qd| _d S )Nr(   ))r   r   )   r   )      )   r%   ))r1   r   )r2   r   )r4   r3   )i   r%   )r   r   r   width	cluster_n)r   r   
thresholdsr-   clusterr    r    r!   _set_cluster_n(   s   

zRMSNorm._set_cluster_nmXmWmBmResmOmResOmRstdmMeanepsstreamc                    s  |j  jksJ    ttdd ||||||fD  }t jd| } j|d\}}|j	}fdd||fD \}} fdd||fD \}} 
|||||||||	||jt|jd d  jd	g|d	d	gt jd	kr{d	 jd	gnd |
d
 d S )Nc                 s        | ]}|d ur|j jV  qd S Nelement_typer5   .0tr    r    r!   	<genexpr>G       z#RMSNorm.__call__.<locals>.<genexpr>r'   vecsizec                    s0   g | ]}t |d urtj|d d dnd qS )Nr   dimsize)r   layout_utilsexpandrI   mT)tiler_mnr    r!   
<listcomp>L   s    "z$RMSNorm.__call__.<locals>.<listcomp>c                    s.   g | ]}t |d urtj|d jdnd qS )Nr   rO   )r   rR   rS   r   rT   r   r    r!   rW   P        r   r   gridblockr8   rC   )rG   r   r9   r   maxmathgcdr   _get_tiled_copyrQ   kernellaunchcuteceil_divshaper6   )r   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   largest_dtype_widthrN   
tiled_copythreads_per_rownum_threadsr    )r   rV   r!   __call__6   s.   




zRMSNorm.__call__rV   rg   rh   c           ?   
      s  t j \}}}t j \ }}t| jdkrtdnt j d |j}tj	 }|j
|jt jdddd}t|d urL|j
|jt jdddd}| ||\}}|j}t |} fdd|||||||fD \}}}}}}}fd	d||fD \}}||} t|d ur| |nd }!t|d ur| |nd }"| |}#| |}$t|d ur| |}%| |}&| |}'t|d ur| |}(t|d ur| |nd })t|d ur| |nd }*| |d
 }+t|d urt |!nd },t|d urt |"nd }-dd |#|'fD \}.}/t|d urt |%}0t |t jj }1| |||1 t|d d | j k}2|2sKtj| ||d dnd }3ttj|3d}4|+d d }5|5|d k rv|4|#|$dd t|d urv|4|%|&dd t j  t| j rt|d ur|4|!|, t|d ur|4|"|- t jd t |$|. |.  t j!}6t|d urt |&|0 |6|0  t j!7 }6t|d urt |(}7|7"|6 |7j |5|d k r|4|7|( d\}8}9t| j#rt$|6t j%j&||d t| jdkr|d nd dt| jdkrt jj'nd d}:|:|d  }8t|d urF|+d d dkrF|5|d k rF| jdksBt j( dkrF|8|*d< t| j)dkrtt |$|. |.  t j!}6t|d urst |&|0 |6|0  t j!7 }6n+t| j)dkr|4|#|. |.  t j!}6t|d ur|4|%|0 |6|0  t j!7 }6t$|6|8 |6|8  t j%j&||d t| jdkr|d nd dd};t j*j+|;|d  |	 dd}9n/td}8t$|6|6 t j%j&||d |dt| jdkrt jj'nd d}<t j*j+|<|d  |	 dd}9t|d ur&|+d d dkr&|5|d k r&| jdks"t j( dkr&|9|)d< t| jrDt|d ur8|4|!|, t|d urD|4|"|- t| j)dkpO| j)dkrt| j)dkrnt |$|. t|d urmt |&|0 n|4|#|. t|d ur|4|%|0 |.  t j!}6t|d ur|6|0  t j!7 }6t| j#r|6|8 |9 n|6|9 }=|=}>t|d ur|>|,  t j!9 }>t|d ur|>|-  t j!7 }>|/"|> |/j |5|d k r|4|/|' d S d S )Nr   r   r   r   orderr(   byte_alignmentc                    s*   g | ]}|d urt | fnd qS rE   rc   
local_tilerT   bidx	cluster_yrV   r    r!   rW          z"RMSNorm.kernel.<locals>.<listcomp>c                    s.   g | ]}t |d urt|d fnd qS )Nr   r   rc   rq   rT   rt   rV   r    r!   rW      rY   )r   NNNc                 S   s   g | ]}t |qS r    rc   make_rmem_tensor_likerH   r    r    r!   rW      s    r-   predTis_asyncNNNNr           )init_valhook_fnr   gmem)NNr   )r   )fastmath),rc   arch
thread_idx	block_idxr   r6   layout_tv_tiledcutlassutilsSmemAllocatorallocate_tensorrG   make_ordered_layout#_allocate_reduction_buffer_and_mbarre   make_identity_tensor	get_slicepartition_Spartition_Drz   rQ   	WARP_SIZE_initialize_cluster
copy_utilspredicate_kr   copycp_async_commit_groupr   cp_async_wait_groupautovec_copyloadtor   storer   r   ReductionOpADDcluster_waitblock_idx_in_clusterr   r^   rsqrt)?r   r:   r;   r<   r=   r>   r?   r@   rA   rB   rV   rg   rh   tidx_	tv_layoutr   sXsResreduction_buffermbar_ptrre   idXgXgResgOgResOgRstdgMeancXgWgB
thr_copy_XtXgWtXgBtXgXtXsXtXgRestXsRestXgOtXgResOtXrRstdtXrMeantXcXtXrWtXrBtXrXtXrOtXrRes	num_warps	is_even_NtXpXr   rowxtXrResOmeanrstdsum_xsum_sq_x_sub_meansum_sq_xx_hatyr    rr   r!   ra   ]   s4  $















	

	



 zRMSNorm.kernel)F)__name__
__module____qualname__r   r   Numericintboolr   r/   r9   rc   jitr	   r   r   cudaCUstreamrj   ra   Shape	TiledCopy	Constexpr__classcell__r    r    r   r!   r      sh    "	
&	
r   zquack::_rmsnorm_fwd)outr   r   residual_outr   z(Tensor x, Tensor? weight, Tensor(a2!) out, Tensor? bias, Tensor(a4!)? rstd, Tensor(a5!)? mean, Tensor? residual, Tensor(a7!)? residual_out, float eps=1e-6, bool is_layernorm=False) -> ())mutates_argsdevice_typesschemaư>Fr   weightr   biasr   r   residualr   rB   r   returnc
                 C   s   t jt jt jh}
| j|
v sJ d|dur|j|
v sJ d|dur+|j|
v s+J d| j\}}dd | |||||fD \}}}}}}t||||||||du|du|	
| ||||||||	 dS )aA  RMSNorm/LayerNorm forward pass.
    Args:
        x: Input tensor of shape (M, N)
        weight: Optional weight tensor of shape (N,)
        eps: Small value for numerical stability
        is_layernorm: If True, compute LayerNorm instead of RMSNorm
    Returns:
        Normalized output tensor of same shape as x
    Unsupported dtypeN+Weight must be float32, float16 or bfloat16.Residual must be float16, bfloat16, or float32c                 S   "   g | ]}|d urt |j nd qS rE   r   r   rH   r    r    r!   rW   F      z _rmsnorm_fwd.<locals>.<listcomp>)torchfloat16bfloat16float32r   re   _compile_rmsnorm_fwd)r   r   r   r   r   r   r   r   rB   r   supported_typesr   r   r   	out_dtypeweight_dtype
bias_dtype	res_dtyperes_out_dtyper    r    r!   _rmsnorm_fwd!  s0   
r   c
                 C   s   ddl m}
 |
rLt| dtjsN| d}dd | |||||fD \}}}}}}t||||||||d u|d u|	
 t||||||d u|||d u	 d S d S d S )Nr   COMPILE_ONLYr   c                 S   r   rE   r   rH   r    r    r!   rW   j  r   z%_rmsnorm_fwd_fake.<locals>.<listcomp>)quack.cache_utilsr   
isinstancerQ   r   SymIntr   _compile_rmsnorm_bwd)r   r   r   r   r   r   r   r   rB   r   r   r   r   r   r   r   r   r   r    r    r!   _rmsnorm_fwd_fakeX  s<   
r  c
                    s   t  | |||||g}
tj gdd |
D R   fdd| |||fD \}}}} fdd||fD \}}|rCttfnd }|rMttfnd }t jt|  |	d||||||||tdt jj	dd	d
dS )Nc                 s   "    | ]}|d urd|j  V  qd S Nr'   r5   rI   dtr    r    r!   rK          z'_compile_rmsnorm_fwd.<locals>.<genexpr>c                       g | ]
}t | fqS r    fake_tensorr  r   	batch_symdivr    r!   rW     s    z(_compile_rmsnorm_fwd.<locals>.<listcomp>c                    s   g | ]	}t | fqS r    r  r  )r   r  r    r!   rW     s    )r   r   Tuse_tvm_ffi_env_stream--enable-tvm-ffioptions)
rc   sym_intr^   r_   r  r   compiler   runtimemake_fake_stream)r   r   r   r   r   r   r   has_rstdhas_meanr   
all_dtypesx_cuteout_cuteres_cuteres_out_cuteweight_cute	bias_cute	rstd_cute	mean_cuter    r  r!   r     s.   
r   r   residual_dtype
store_rstdc                 C   s   |d u r| j n|}tj| |d}|rtj| jd | jtjdnd }	|d ur(|j }|d us5|d urD|| j krDtj| |d ur>|n| j d}
nd }
t| ||||	d ||
|d
 |
d u rY| }
||
|	fS )Nr   r   devicer   F)r   r   
empty_likeemptyre   r(  r   r   )r   r   r   r   r   r$  rB   r%  r   r   r   r    r    r!   rmsnorm_fwd  s   "
r+  c           	      C   s   |   }|d ur|  }||7 }|ttj| ddd|  }|d ur)|| n|}|d ur5||   }|d u r?|| jS || j||jfS )NTrP   keepdim)floatr   sqrtr   squarer   r   )	r   wr   r   rB   x_f32residual_f32x_normr   r    r    r!   rmsnorm_ref  s   "r6  c                 C   s   |   }||d }|dur|| }n|}|| jddd}|||  |d }	|durB|| jdd}
|	| j|
|jfS |	| jdfS )z3Reference implementation for RMSNorm backward pass.r   Nr,  Tr-  r   rP   )r/  	unsqueezer   sumr   r   )r   r2  doutr   rB   r3  r   wdyc1dxdwr    r    r!   rmsnorm_bwd_ref  s   
r?  c                       s  e Zd Zdejdef fddZdd Zdd Zd	d
 Z	e
jde
jdee
j de
jdee
j de
jde
jdee
j dee
j dee
j dedejfddZe
jde
jdee
j de
jdee
j de
jde
jdee
j dee
j dee
j de
jde
jdeje fddZ  ZS )RMSNormBackwardr   r   c                    sL   t  j||dtd |dkrd nd| _| jdkr"| jjdkr$tdd S d S )Nr   )r   reduction_dtyper   r   r4   r)   z?RMSNormBackward does not support N > 128k with dtype >= 32 bits)r   r   r   
reload_wdyr   r   r5   
ValueError)r   r   r   r   r    r!   r     s
   zRMSNormBackward.__init__c                 C   s   | j dkrdS dS )N   r'   r*   r+   rX   r    r    r!   _num_threads  s   zRMSNormBackward._num_threadsc                 C   r"   )N)r#   r&   )r*   r)   )i   r$   )rD  r'   r*   r+   r,   r    r    r!   r/     r0   z RMSNormBackward._threads_per_rowc                 C   s2   | j }dD ]\}}||kr|| _ d S qd| _d S )N))r   r   )r   r   )r1   r3   )r2   r%   r(   )r   r6   )r   r   r-   r8   r    r    r!   r9     s   
zRMSNormBackward._set_cluster_nr:   r;   mdOmdResOr@   mdXmdWmdResmdBsm_countrC   c                 C   s   |j | jksJ |   ttdd ||||||fD  }t| jd| }| j|d\}}}|j	}t|d urBt
j|d|d dnd }|
}| ||||||||	||||j|| jdg|ddg| jdkrjd| jdgnd |d d S )	Nc                 s   rD   rE   rF   rH   r    r    r!   rK     rL   z+RMSNormBackward.__call__.<locals>.<genexpr>r'   rM   r   rO   r   rZ   )rG   r   r9   r   r]   r^   r_   r   r`   rQ   rR   rS   ra   rb   r6   )r   r:   r;   rF  rG  r@   rH  rI  rJ  rK  rL  rC   rf   rN   rg   rV   rh   ri   
num_blocksr    r    r!   rj   	  s(   "

zRMSNormBackward.__call__rV   rg   rh   c           T   
      s  t j \}}}t j \ }}t j \}}}t| jdkr#tdnt j d |j}|j}|d |d }}t|d d | j k}t 	|}t
j }t jd d dfdd}|j|j|dd}|j|j|dd}| j||dd	\}}t|d ur||d }}nd
\}}||}fdd|||||	|fD \}} }!}"}#}$|d urt |dfnd }% fdd||fD \}&}'||}(||})|| }*||}+||"},t|d ur||!}-t|	d ur||#}.||$d }/dd |(|*|,fD \}0}1}2d }3t|d urt |-d }3d }4t|	d ur(t |.d }4|r-d ntj||$d |d d}5ttj|5d}6d
\}7}8d
\}9}:t|d ur\||&}7t |7t}8t|d urn||'}9t |9t}:t |t jj };| j|||;dd	 d }<t|d ur||%}=t |=}<t| r|<d |6|=|< |/d d d  f d d }>|>|k r|6|(d d d  f |)d dd |6|*d d d  f |+d dd n!td dkrtj|)d d |jj d tj|+d d |jj d t j!  t| jdkrt j"  t|d ur|8d t|d ur|:d t#d}?t#d}@t#d}At
$ t %|d |D ]}B|/d d d |Bf d d }>|>|d   |k r|6|(d d d |B| f |)d d d |?dA f dd |6|*d d d |B| f |+d d d |?dA f dd n-td dkrtj|)d d d |?dA f d |jj d tj|+d d d |?dA f d |jj d t j!  t
j&j }C|>|k sd dkr||> }Ct|d ur|>|k sوd dkr|6|-d d d |Bf |3 nd dkr|3d t j'd t (|)d d d |?f |0 |0) *t j}Dt (|+d d d |?f |1 |1) *t j}E|D|C }F|E}Gt|d ur5|G|<) *t9 }Gt| jdkrFt j+||? |@ t,|F|G t j-j.||d d |?f t| jdkr`||? nd |Add|d  }Ht| jdkrt j/  t j0  t j1 }I|I| jk rt jj2||? |Id t| j3dkrt (|+d d d |?f |1 |1) *t j}E|E}Gt|d ur|G|<) *t9 }G|G|F|H  |C }Jt|d ur|J|3) *t j7 }J|24|J*|2j |>|k sd dkr|6|2|,d d d |Bf  t|	d ur|44|J*|4j |>|k sd dkr|6|4|.d d d |Bf  t|d ur1|84|8) |E|F   t|d urA|:4|:) |E  |?dN }?|?dkrR|AdN }A|@dN }@q7td dkrSt|d urt 5t j6|j7t jdt jdd}K||K}Lt j8  |/d d d }>|>dkrt (|8|L t j8  |>dkrt
9dtd D ](}Mt |8}Nt 5|Lj7|M|Kj:d   |Lj;}Ot (|O|N |84|8) |N)   q|6|8|7 t j8  t|d urRt 5t j6|j7t jdt jdd}P||P}Qt j8  |/d d d }>|>dkrt (|:|Q t j8  |>dkrRt
9dtd D ](}Mt |:}Rt 5|Qj7|M|Pj:d   |Qj;}St (|S|R |:4|:) |R)   q$|6|:|9 nt|d ur_|6|8|7 t|d urk|6|:|9 t| jdkr|?dN }?|?dkr|@dN }@t j+||? |@ d S d S )Nr   r   r   )r   r   r   rl   r(   rn   T)is_persistentr   c                    s*   g | ]}|d urt |d  fnd qS rE   rp   rT   rw   r    r!   rW   W  ru   z*RMSNormBackward.kernel.<locals>.<listcomp>c                    s6   g | ]}t |d urt|dd f fnd qS )Nr   rv   rT   
bidx_startrt   rV   r    r!   rW   \  s    
)rx   NNNc                 S   s   g | ]	}t |d  qS )NNNr   ry   )rI   thrr    r    r!   rW   n  s    rQ  r   r{   r|   r   r~   )
fill_value)phaser   )peer_cta_rank_in_clusterr   r&  rk   )<rc   r   r   r   grid_dimr   r6   r   re   r   r   r   r   r   r   rG   r   r   rq   r   r   rz   r   r   r   r   r   rQ   r   r   fillfill_oobzeror   r   r   rangerd   Floatr   r   r   r   mbarrier_waitr   r   r   fence_view_async_shared	sync_warplane_idxmbarrier_arriverB  r   make_tensor
recast_ptriteratorbarrierrange_constexprstridelayout)Tr   r:   r;   rF  rG  r@   rH  rI  rK  rJ  rV   rg   rh   r   r   gdimr   re   Mr   r   r   r   smem_layoutr   sdOr   r   mbar_full_ptrmbar_empty_ptrr   r   gdOgdResOgdXgdResr   r   gdWgdBr   r   tXgdOtXsdOtXgdXtXgdResOtXgdResr   r   tXrdOtXrdXtXrdResOtXrdResr   r   tXgdWtXrdWtXgdBtXrdBr   r   r   r   r   producer_phaseconsumer_phasers   r   r   r:  r   r;  mean_xhat_wdyr_  r=  sdWtXsdWitXrdW_othertXsdW_othersdBtXsdBtXrdB_othertXsdB_otherr    rO  r!   ra   -  s  $


















 



 

	





















zRMSNormBackward.kernel)r   r   r   r   r   r   r   rE  r/   r9   rc   r   r	   r   r   r   r   rj   ra   r   r   r   r   r    r    r   r!   r@    sn    	
#	
r@  r   r(  c                 C   sv   | dkrdn| dkrdn| dkrdn| dkrdnd	}t j|j}| d
kr+|| }|S | dkr5|d }|S |d }|S )Nr*   r(   i   r%   i   r3   rD  r   r   r   r   )r   r   get_device_propertiesmulti_processor_count)r   r(  sm_count_multiplerL  r    r    r!   _get_sm_countH  s   2r  zquack::_rmsnorm_bwd>   r=  	dresidual
db_partial
dw_partialz(Tensor x, Tensor? weight, Tensor dout, Tensor rstd, Tensor(a4!) dx, Tensor(a5!)? dw_partial, Tensor(a6!)? db_partial, Tensor? dresidual_out, Tensor(a8!)? dresidual, int? sm_count) -> ()r:  r=  r  r  dresidual_outr  rL  c
                 C   s  |   dks
J d| jsJ dtjtjtjh}
| j|
v s"J d|durN|  dks0J d| jd |jd	 ks>J d
|jsEJ d|j|
v sNJ d|durh|j| jksZJ |js_J |j|
v shJ d|dur|j| jkstJ |jsyJ |j|
v sJ d| d}|du r|du r|	dusJ n|dur|jd	 n|jd	 }	dd | |||||fD \}}}}}}t	||||||du|||du	| |||||||||	
 dS )a  RMSNorm backward pass.
    Args:
        x: Input tensor of shape (M, N)
        weight: Optional weight tensor of shape (N,)
        dout: Upstream gradients tensor of shape (M, N)
        rstd: Reciprocal standard deviation tensor of shape (M,)
    Returns:
        Tuple of (dx, dw) where:
        - dx: Input gradients tensor of same shape as x
        - dw: Weight gradients tensor of same shape as weight (or None if weight is None)
    r   Input must be 2Dz#Input tensor must be on CUDA devicer   Nr   Weight must be 1Dr,  r   z3Last dimension of input must match weight dimensionz$Weight tensor must be on CUDA devicer   r   c                 S   r   rE   r   rH   r    r    r!   rW     r   z _rmsnorm_bwd.<locals>.<listcomp>)
rP   is_cudar   r   r   r   r   re   rQ   r  )r   r   r:  r   r=  r  r  r  r  rL  r   r   r   
dout_dtypedx_dtyper   
dres_dtypedres_out_dtyper    r    r!   _rmsnorm_bwdY  sN   



r  c
                 C   s   ddl m}
 |
rIt| dtjsK| d}|d u r$|d u r$|	d u r$d S dd | |||||fD \}}}}}}t||||||d u|||d u	 d S d S d S )Nr   r   r   c                 S   r   rE   r   rH   r    r    r!   rW     r   z%_rmsnorm_bwd_fake.<locals>.<listcomp>)r   r   r   rQ   r   r  r  )r   r   r:  r   r=  r  r  r  r  rL  r   r   r   r  r  r   r  r  r    r    r!   _rmsnorm_bwd_fake  s(   
r  c	                    s   t  t  }	|||||g}
tj gdd |
D R   fdd|||||fD \}}}}}t| f}ttf}|rJtt|	 fnd }|rVtt|	 fnd }t jt| |||||||||dt jj	dddd	S )
Nc                 s   r  r  r  r  r    r    r!   rK     r	  z'_compile_rmsnorm_bwd.<locals>.<genexpr>c                    r
  r    r  r  r  r    r!   rW     s    z(_compile_rmsnorm_bwd.<locals>.<listcomp>r   Tr  r  r  )
rc   r  r^   r_   r  r   r  r@  r  r  )r   r   r  r  r   has_db_partialr  r  has_dw_partialbatch_partial_symr  r  	dout_cutedx_cutedres_out_cute	dres_cuter   r"  dw_partial_cutedb_partial_cuter    r  r!   r    s2   r  has_biashas_residualc                 C   s   | j }| d}t| }	|d ur |j|	jkr tj| |jd}
nd }
t||}|d ur6tj|||tjd}nd }|rDtj|||tjdnd }t| ||||	||||
|
 |d ura|j	dd
|jnd }|ro|j	dd
|jnd }|ry|
d u ry|	}
|	|||
fS )Nr   r&  r'  r   r7  )r(  rQ   r   r)  r   r  r*  r   r  r9  r   )r   r   r:  r   r  r  r  r(  r   r=  r  rL  r  r  r>  dbr    r    r!   rmsnorm_bwd  s&   	


 r  c                   @   s2   e Zd Ze						dddZedd ZdS )	RMSNormFunctionNr   Fc	              
   C   s   |j }	|d|j d }|d ur|d|j d }t| jd d }
t||||||||
d\}}}| |d u r9|n||| |d u| _|| _|	| _|d urP|j	nd | _
|| _|d u s\|sa||	S ||	||	fS )Nr,     )r   r   r   r$  rB   r%  )re   reshapeanyneeds_input_gradr+  save_for_backwardr  rB   
x_shape_ogr   r$  prenorm)ctxr   r   r   r   r   r$  rB   r  r  	need_gradr   r   r   r    r    r!   forward  s0   


zRMSNormFunction.forwardc              	   G   s   | j \}}}| j}| jr| jd ur|d }|d|jd }nd }| j}|d|jd }t||||||| jd ud\}	}
}}|	|}	|d urM||}|	|
||gd gd R S )Nr   r,  )r  r3   )	saved_tensorsr  r  r$  r  re   r  viewr  )r  r:  argsr   r   r   r  r  r  r=  r>  r  r  r    r    r!   backward;  s*   
	
zRMSNormFunction.backward)NNNNr   F)r   r   r   staticmethodr  r  r    r    r    r!   r    s    &r  r  c              
   C   s   t | |||||||S )a  RMSNorm with automatic differentiation support.

    Args:
        x: Input tensor of shape (M, N)
        weight: Optional weight tensor of shape (N,)
        eps: Small value for numerical stability

    Returns:
        Normalized output tensor of same shape as x
    )r  apply)r   r   r   r   r   r$  rB   r  r    r    r!   rmsnormW  s   r  c                       sD   e Zd ZdZ	ddededef fdd	Zd
edefddZ	  Z
S )QuackRMSNorma  RMSNorm module that behaves like torch.nn.RMSNorm.

    This class provides a drop-in replacement for torch.nn.RMSNorm that uses
    the quack.rmsnorm implementation under the hood.

    Args:
        dim (int): The dimension to normalize over
        eps (float, optional): A small constant for numerical stability. Default: 1e-6

    Attributes:
        weight (torch.nn.Parameter): The learnable weight parameter
        eps (float): A small constant for numerical stability
    r   TNrP   rB   elementwise_affinec                    s   t  j|||||d d S )Nr'  )r   r   )r   rP   rB   r  r(  r   r   r    r!   r   }  s   zQuackRMSNorm.__init__r   r   c                 C   s   t || j| jdS )zApply RMSNorm to the input tensor.

        Args:
            x (Tensor): Input tensor

        Returns:
            Tensor: Normalized tensor
        )rB   )r  r   rB   )r   r   r    r    r!   r    s   	zQuackRMSNorm.forward)r   TNN)r   r   r   __doc__r   r/  r   r   r	   r  r   r    r    r   r!   r  n  s    r  return_rstdreturn_meanc                 C   s   |   dks
J d|  dksJ d| jtjtjtjfv s#J d|jtjks-J d|durE|  dks;J d|jtjksEJ d	| j\}}| j}t| }	|r]tj	||tjd
nd}
|rjtj	||tjd
nd}t
| ||	||
|dd|d
 |r|r|	|
|fS |r|	|
fS |r|	|fS |	S )au  LayerNorm forward pass using the unified RMSNorm/LayerNorm kernel.

    Args:
        x: Input tensor of shape (M, N)
        weight: Weight tensor of shape (N,). Must be float32.
        bias: Optional bias tensor of shape (N,). Must be float32.
        eps: Small value for numerical stability
        return_rstd: Whether to return the reciprocal standard deviation
        return_mean: Whether to return the mean

    Returns:
        Normalized output tensor of same shape as x
        If return_rstd is True, also returns rstd tensor of shape (M,)
        If return_mean is True, also returns mean tensor of shape (M,)
    r   r  r   r  r   zWeight must be float32NzBias must be 1DzBias must be float32r'  T)rP   r   r   r   r   r   re   r(  r)  r*  r   )r   r   r   rB   r  r  ri  r   r(  r   r   r   r    r    r!   layernorm_fwd  s(   


r  r2  c                 C   s(   |   }tjj||j|d|| jS )z'Reference implementation for LayerNorm.N)r/  r   nn
functional
layer_normre   r   r   )r   r2  rB   r3  r    r    r!   layernorm_ref  s    r  c                 C   s<   |   }|jddd}|| d jdd}dt||  S )Nr,  Tr-  r   r7  g      ?)r/  r   r   r0  )r   rB   r3  r   varr    r    r!   layernorm_rstd_ref  s   r  c                 C   s   |   jddS )Nr,  r7  )r/  r   )r   r    r    r!   layernorm_mean_ref  s   r  )NNNNNr   F)NNNr   )r   )NNNN)NFF)Nr   FF)Cr^   typingr   r   r   	functoolsr   cuda.bindings.driverbindingsdriverr   r   cutlass.cuterc   r   r   r   r   r	   quack.utilsr   quack.copy_utilsr   quack.layout_utilsrR   quack.compile_utilsr
   r  quack.reducer   quack.reduction_baser   r   r   quack.cute_dsl_utilsr   r   library	custom_opr/  r   r   register_faker  r   r   r+  r6  r?  r@  r   r(  r  r  r  r  r  autogradFunctionr  r  r  r  r  r  r  r  r    r    r    r!   <module>   s    		
0	
.
'	


  \	
A	
$
,
%F	
#
0