o
    3/i<                  	   @   s  d dl Z d dlmZ d dlmZ d dlZd dlm  mZ	 d dl
Z
d dlmZ d dl
mZmZmZ d dlmZ d dlmZ d dlmZ d dlmZmZ d dlmZ d dlmZ d d	lmZ G d
d deZ edd Z!ej"j#ddhddej$dej$ddfddZ%e%j&dej$dej$ddfddZ'dej$dej$fddZ(G dd deZ)edd Z*ej"j#ddhddej$d ej$dej$ddfd!d"Z+e+j&dej$d ej$dej$ddfd#d$Z,dej$d ej$dej$fd%d&Z-G d'd( d(ej.j/Z0dej$dej$fd)d*Z1dS )+    N)Type)partial)Int64Float32
const_expr)make_fake_tensor)
row_reduceonline_softmax_reduce)ReductionBase)	jit_cache)torch2cute_dtype_mapc                       s   e Zd Zddeej dedef fddZdd Z	d	d
 Z
ejdejdejdejfddZejdejdejdejdejdeje f
ddZ  ZS )SoftmaxTdtypeNonline_softmaxc                    s.   t  j|||s	dnd|stntd || _d S )N      stagereduction_dtype)super__init__r   r   r   )selfr   r   r   	__class__ Z/lsinfo/ai/hellotax_ai/llm_service/venv_vllm/lib/python3.10/site-packages/quack/softmax.pyr      s   


zSoftmax.__init__c                 C   (   | j }dD ]\}}||kr|  S qdS )N)@            i       i   r   ) @  r"      r   r   r   limitthreadsr   r   r   _threads_per_row#      zSoftmax._threads_per_rowc                 C   sT   | j }t| jjdkrg d}ng d}|D ]\}}||kr$|| _ d S qd| _d S )Nr#   )r'   r   )   r   )      )   r    ))r0   r   )r1   r   )r3   r2   )i   r    r   r   r   width	cluster_nr   r   
thresholdsr+   clusterr   r   r   _set_cluster_n*      

zSoftmax._set_cluster_nmXmOstreamc           	      C   s   |j | jksJ |   ttdd ||fD }| jd| d\}}}|j}| |||||jt	
|jd |d | jdg|ddgt| jdkrOd| jdgnd |d d S )Nc                 s       | ]}|j jV  qd S Nelement_typer5   .0tr   r   r   	<genexpr>?       z#Softmax.__call__.<locals>.<genexpr>r"   vecsizer   r   gridblockr9   r>   rB   r   r:   r   max_get_tiled_copysizekernellaunchcuteceil_divshaper6   )	r   r<   r=   r>   largest_dtype_width
tiled_copytiler_mnthreads_per_rownum_threadsr   r   r   __call__6   s   
zSoftmax.__call__rX   rW   rY   c           #   
      s  |j }tj \}}}tj \ }}t| jdkrtdntj d |j}	t|	}
 fdd|||
fD \}}}t	j
 }|j|jtjdddd}| ||\}}||}||}||}||}||d	 }d
d ||fD \}}t|	d d | j k}|rd ntj|||	d d}ttj|d}t|tjj }| ||| |d d |	d k r|||dd tj  tjd t| rt
|||jj  t|| |  tj!}t| j" rKt#|tj$j%||d t| jdkr|d nd t!j t| jdkrtjj&nd d}t'(t'j)}tj'j*|| ||  dd} t#| tj$j+||d t| jdkrE|d nd dd}!nt,|||d |t| jdkr^tjj&nd dd\}}!} | tj-|! }"|.|" |j |d d |	d k r||| d S d S )Nr   r   c                       g | ]}t | fqS r   rS   
local_tilerD   mTbidx	cluster_yrX   r   r   
<listcomp>]   s    z"Softmax.kernel.<locals>.<listcomp>r   r   orderr#   byte_alignment)r   NNNc                 S      g | ]}t |qS r   rS   make_rmem_tensor_likerD   thrr   r   r   rd   k       r+   predTis_asyncNNr   init_valhook_fn)fastmath)NNr           )rx   )ry   return_exp_x)/layout_tv_tiledrS   arch
thread_idx	block_idxr   r6   rU   make_identity_tensorcutlassutilsSmemAllocatorallocate_tensorrB   make_ordered_layout#_allocate_reduction_buffer_and_mbar	get_slicepartition_Spartition_D
copy_utilspredicate_kr   copyrP   	WARP_SIZE_initialize_clustercp_async_commit_groupcp_async_wait_groupfill_oobinfautovec_copyloadtor   r   r   ReductionOpMAXcluster_waitmathlog2eexp2ADDr	   
rcp_approxstore)#r   r<   r=   rX   rW   rY   	tv_layouttidx_rU   idXgXgOcXsmemsXreduction_buffermbar_ptr
thr_copy_XtXgXtXsXtXgOtXcXtXrXtXrO	is_even_NtXpXr   	num_warpsxmax_xlog2_eexp_xdenomyr   ra   r   rQ   K   s   	$
"






			zSoftmax.kernel)T)__name__
__module____qualname__r   r   Numericintboolr   r-   r:   rS   jitTensorcudaCUstreamr[   rQ   Shape	TiledCopy	Constexpr__classcell__r   r   r   r   r      s0    "
r   c                    s`   t  td| j   fdd| |fD \}}t|  }t j|||t jjddddS )Nr"   c                       g | ]
}t | fqS r   fake_tensorrD   dtr   	batch_symdivr   r   rd      s    z(_compile_softmax_fwd.<locals>.<listcomp>Tuse_tvm_ffi_env_stream--enable-tvm-ffioptions)	rS   sym_intr   gcdr5   r   compileruntimemake_fake_stream)r   	out_dtyper   x_cuteout_cute
softmax_opr   r   r   _compile_softmax_fwd   s   
r   zquack::_softmax_fwdout)mutates_argsr   returnc                 C   sv   |   dks
J d| jsJ d| jtjtjtjfv s J d| d}dd | |fD \}}t|||| | dS )	zSoftmax forward pass.
    Args:
        x: Input tensor of shape (M, N)
    Returns:
        Softmax output tensor of same shape as x
    r   zInput must be 2DzTensor must be on CUDA deviceUnsupported dtyper   c                 S      g | ]}t |j qS r   r   r   rC   r   r   r   rd      rp   z _softmax_fwd.<locals>.<listcomp>N)	dimis_cudar   torchfloat16bfloat16float32rP   r   )r   r   r   r   r   r   r   r   _softmax_fwd   s   
r   c                 C   sh   ddl m} |r0t| dtjs2| d}dd | |fD \}}t||| t|||| d S d S d S )Nr   COMPILE_ONLYr   c                 S   r   r   r   rC   r   r   r   rd      rp   z%_softmax_fwd_fake.<locals>.<listcomp>)quack.cache_utilsr   
isinstancerP   r   SymIntr   _compile_softmax_backward)r   r   r   r   r   r   r   r   r   _softmax_fwd_fake   s   
r   c                 C   s   t | }t| | |S r@   )r   
empty_liker   )r   r   r   r   r   softmax_fwd   s   

r   c                       s   e Zd Zdeej def fddZdd Zdd Z	d	d
 Z
ejdejdejdejdejfddZejdejdejdejdejdejdeje fddZ  ZS )SoftmaxBackwardr   r   c                    s   t  j||dtd d S )Nr   r   )r   r   r   )r   r   r   r   r   r   r      s   zSoftmaxBackward.__init__c                 C   r   )N)r   r!   r$   r&   )    r"   r(   r)   r*   r   r   r   r-      r.   z SoftmaxBackward._threads_per_rowc                 C   sT   | j }t| jjdkrg d}ng d}|D ]\}}||kr$|| _ d S qd| _d S )Nr#   r/   r4   r7   r   r   r   r:      r;   zSoftmaxBackward._set_cluster_nc                 C   s   | j dkrdS dS )Nr   r"   r(   r)   )r   r   r   r   _num_threads   s   zSoftmaxBackward._num_threadsmdYmYmdXr>   c           
      C   s   |j | jksJ |   ttdd |||fD }| jd| d\}}}|j}	| ||||||jt	
|jd |d | jdg|	ddgt| jdkrQd| jdgnd |d d S )Nc                 s   r?   r@   rA   rC   r   r   r   rF     rG   z+SoftmaxBackward.__call__.<locals>.<genexpr>r"   rH   r   r   rJ   rM   )
r   r   r   r   r>   rV   rW   rX   rY   rZ   r   r   r   r[      s   
zSoftmaxBackward.__call__rX   rW   rY   c           '   
      s  t j \}}}t j \ }}t| jdkrtdnt j d |j}	|j}
t |
} fdd||||fD \}}}}t	j
 }|j|jt jdddd}|j|jt jdddd}| ||	\}}||}||}||}||}||}||}||d	 }d
d |||fD \}}}t|
d d | j k}|rd ntj|||
d d} ttj| d}!t |t jj }"| |||" |d d |
d k r|!||dd |!||dd t j  t jd t || t || | t j}#| t j}$t |#|$ t j!j"||d t| jdkr#|nd dt| jdkr1t jj#nd d}%|$|#|%  }&|$|&|j |d d |
d k rV|!|| d S d S )Nr   r   c                    r\   r   r]   r_   ra   r   r   rd      s    z*SoftmaxBackward.kernel.<locals>.<listcomp>re   rf   r#   rh   rj   c                 S   rk   r   rl   rn   r   r   r   rd   5  rp   rq   rr   Trt   rv   r{   rw   )%rS   r~   r   r   r   r6   r}   rU   r   r   r   r   r   rB   r   r   r   r   r   r   r   r   r   rP   r   r   r   r   r   r   r   r   r   r   r   r   r   )'r   r   r   r   rX   rW   rY   r   r   r   rU   r   gdYgYgdXr   r   sdYsYr   r   thr_copytdYgdYtdYsdYtYgYtYsYtdXgdXr   tdYrdYtYrYtdXrdXr   r   r   r   dyr   dotdxr   ra   r   rQ     sl   
$









zSoftmaxBackward.kernel)r   r   r   r   r   r   r   r   r-   r:   r   rS   r   r   r   r   r[   rQ   r   r   r   r   r   r   r   r   r      s:    r   c                    sf   t  td| j   fdd| ||fD \}}}t|  }t j||||t jjddddS )Nr"   c                    r   r   r   r   r   r   r   rd   c  s    z-_compile_softmax_backward.<locals>.<listcomp>Tr   r   r   )	rS   r   r   r   r5   r   r   r   r   )r   y_dtypedx_dtyper   dy_cutey_cutedx_cutesoftmax_backward_opr   r   r   r   _  s   
r   zquack::_softmax_backwardr  r  r   c                 C   s   |   dks
J d|  dksJ d| j|jksJ d| jr$|js(J d| jtjtjtjfv s7J d|j| jksAJ d| d}d	d
 | ||fD \}}}t	||||| || dS )zSoftmax backward pass.
    Args:
        dy: Upstream gradients tensor of shape (M, N)
        y: Softmax output tensor of shape (M, N)
    Returns:
        Input gradients tensor of same shape as dy and y
    r   zdy must be 2Dzy must be 2Dzdy and y must have same shapezTensors must be on CUDA devicer   zdy and y must have same dtyper   c                 S   r   r   r   rC   r   r   r   rd     rp   z%_softmax_backward.<locals>.<listcomp>N)
r   rU   r   r   r   r   r   r   rP   r   )r  r   r  r   r   r  r  r   r   r   _softmax_backwardq  s   	
r  c                 C   s`   ddl m} |r,t| dtjs.| d}dd | ||fD \}}}t|||| d S d S d S )Nr   r   r   c                 S   r   r   r   rC   r   r   r   rd     rp   z*_softmax_backward_fake.<locals>.<listcomp>)r   r   r   rP   r   r   r   )r  r   r  r   r   r   r  r  r   r   r   _softmax_backward_fake  s   
r  c                 C   s   t | }t| || |S r@   )r   r   r  )r  r   r  r   r   r   softmax_bwd  s   
r  c                   @   s$   e Zd Zedd Zedd ZdS )SoftmaxFunctionc                 C   s   t |}| | |S r@   )r   save_for_backward)ctxr   r   r   r   r   forward     
zSoftmaxFunction.forwardc                 C   s   | j \}t||}|S r@   )saved_tensorsr  )r  r  r   r  r   r   r   backward  r  zSoftmaxFunction.backwardN)r   r   r   staticmethodr  r  r   r   r   r   r    s
    
r  c                 C   s
   t | S )zSoftmax forward pass with automatic differentiation support.

    Args:
        x: Input tensor of shape (M, N)

    Returns:
        Softmax output tensor of same shape as x
    )r  apply)r   r   r   r   softmax  s   
	r   )2r   typingr   	functoolsr   r   cuda.bindings.driverbindingsdriverr   r   cutlass.cuterS   r   r   r   quack.utilsr   quack.copy_utilsr   quack.compile_utilsr   r   quack.reducer   r	   quack.reduction_baser
   r   r   quack.cute_dsl_utilsr   r   r   library	custom_opr   r   register_faker   r   r   r   r  r  r  autogradFunctionr  r   r   r   r   r   <module>   sF    
 
""
