o
    3/i5                     @   s"  d dl Z d dlmZ d dlm  mZ d dlZd dlm	Z	 d dlm
Z
mZ d dlZd dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ G d
d deZedd Zejjdddddededededdf
ddZ e j!dd Z"	ddedededefddZ#dS )    N)Type)Float32
const_expr)Tensor)make_fake_tensor)
row_reduce)ReductionBase)	jit_cache)torch2cute_dtype_mapc                       s   e Zd ZdZdeej def fddZdd Z	dd	 Z
ejd
ejdejdededejf
ddZejd
ejdejdededejdejdeje fddZ  ZS )RmsFinalReducezReduce partial squared sums and compute rstd: rstd[m] = rsqrt(sum_n(x[m,n]) * scale + eps).

    Inherits from ReductionBase for tiled copy, reduction buffer, and cluster support.
    dtypeNc                    s   t  j||dd d S )N   )stage)super__init__)selfr   r   	__class__ c/lsinfo/ai/hellotax_ai/llm_service/venv_vllm/lib/python3.10/site-packages/quack/rms_final_reduce.pyr       s   zRmsFinalReduce.__init__c                 C   s(   | j }dD ]\}}||kr|  S qdS )N))@      )      )i       )i   r   )i @  r      )r   )r   r   limitthreadsr   r   r   _threads_per_row#   s   zRmsFinalReduce._threads_per_rowc                 C   s
   d| _ d S )Nr   )	cluster_n)r   r   r   r   _set_cluster_n*   s   
zRmsFinalReduce._set_cluster_nmXmRstdscaleepsstreamc              	   C   s   |j | jksJ |   t| jd| jj }| j|d\}}}	|j}
| 	|||||||	j
t|jd |d ddg|
ddg|d d S )Nr   )vecsizer   r   )gridblockr&   )element_typer   r!   mathgcdr   width_get_tiled_copysizekernellaunchcuteceil_divshape)r   r"   r#   r$   r%   r&   r'   
tiled_copytiler_mnthreads_per_rownum_threadsr   r   r   __call__-   s   	
zRmsFinalReduce.__call__r6   r5   r7   c                 C   s~  t j \}}	}	t j \}
}	}	|j}tj }| ||\}}|j	}t 
|}t |||
df}t |||
df}||}||}||d }t |}t |d t|d |d k}|sptj|||d dnd }|d d }||d k rtj|||d | t}t|t jj||d |dd}t jj|| | d	d
}|d d dkr||d k r|||< d S d S d S )Nr   ))r   NNNr   )r   )pred)NNr   g        )init_valT)fastmath)r2   arch
thread_idx	block_idxlayout_tv_tiledcutlassutilsSmemAllocator#_allocate_reduction_buffer_and_mbarr4   make_identity_tensor
local_tile	get_slicepartition_Smake_rmem_tensor_likefilter_zerosfillr   
copy_utilspredicate_kcopyloadtor   r   ReductionOpADDr+   rsqrt)r   r"   r#   r$   r%   r6   r5   r7   tidx_bidx	tv_layoutsmemreduction_buffermbar_ptrr4   idXgXcXthr_copytXgXtXcXtXrX	is_even_NtXpXrowxsum_xrstdr   r   r   r0   A   sF   




zRmsFinalReduce.kernel)__name__
__module____qualname____doc__r   rA   Numericintr   r   r!   r2   jitr   r   cudaCUstreamr9   r0   Shape	TiledCopy	Constexpr__classcell__r   r   r   r   r      sB    r   c              	   C   sf   t  }t|d| j }t| ||f|}tt|f}t jt| |||tdtdt j	j
ddddS )Nr   r   T)use_tvm_ffi_env_streamz--enable-tvm-ffi)options)r2   sym_intr+   r,   r-   fake_tensorr   compiler   runtimemake_fake_stream)r   r   	batch_symdivx_cute	rstd_cuter   r   r   _compile_rms_final_reducex   s   r   zquack::rms_final_reduce_out)rg   ro   )mutates_argsdevice_typesre   rg   r$   r%   returnc                 C   s0   t | j }| jd }t||}|| ||| dS )6Compute rstd[m] = rsqrt(sum_n(x[m, n]) * scale + eps).r   N)r
   r   r4   r   )re   rg   r$   r%   x_dtyper   compiled_fnr   r   r   _rms_final_reduce_out   s   


r   c                 C   sH   ddl m} |r t| jd tjs"t| j }t|| jd  d S d S d S )Nr   COMPILE_ONLYr   )	quack.cache_utilsr   
isinstancer4   torchSymIntr
   r   r   )re   rg   r$   r%   r   r   r   r   r   _rms_final_reduce_out_fake   s
   
r   ư>c                 C   sR   | j dksJ | jd }tj|tj| jd}ddlm} |r |S t| ||| |S )r      r   )r   devicer   )	ndimr4   r   emptyfloat32r   r   r   r   )re   r$   r%   Mrg   r   r   r   r   rms_final_reduce   s   
r   )r   )$r+   typingr   cuda.bindings.driverbindingsdriverro   rA   cutlass.cuter2   r   r   r   r   quack.copy_utilsrL   quack.compile_utilsr   rx   quack.reducer   quack.reduction_baser   r   r	   quack.cute_dsl_utilsr
   r   r   library	custom_opfloatr   register_faker   r   r   r   r   r   <module>   sZ   ^

