o
    3/i+\                     @   s`  d dl Z d dlmZ d dlmZmZ d dlZd dlm  m	Z
 d dlZd dlmZ d dlmZmZmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
l m!Z! G dd dZ"ej#j$dddhddej%de&de'dej%dej%ddfddZ(e(j)dej%de&de'dej%dej%ddfddZ*edd Z+d0dej%de&de'fddZ,G dd deZ-ej#j$d d!hdd"ej%deej% dej%de&de'd!ej%ddfd#d$Z.e.j)d"ej%deej% dej%de&de'd!ej%ddfd%d&Z/ed'd( Z0	d0d"ej%deej% dej%d)e&de'dej%fd*d+Z1G d,d- d-ej2j3Z4d0dej%de&de'fd.d/Z5dS )1    N)partial)TypeOptional)Int32Float32
const_expr)make_fake_tensor)ReductionBase)
row_reduce)	jit_cache)torch2cute_dtype_map)bitonic_topkc                   @   s   e Zd Zddeej dededefddZdd	 Z	d
d Z
ejdejdejdejdejfddZejdejdejdejdejdejdeje fddZdS )TopKFdtypeNksoftmaxc                 C   s|   || _ || _d|j | _|| _|| _|dtt| ks!J d|dtt| ks0J d|dks6J |dks<J d S )N      zN must be a power of 2i   )	r   r   widthvecsizer   r   intmathlog2selfr   r   r   r    r   W/lsinfo/ai/hellotax_ai/llm_service/venv_vllm/lib/python3.10/site-packages/quack/topk.py__init__   s   zTopK.__init__c                 C   s&   | j }tt|| j d|d d}|S )N    @      )r   maxminr   )r   r   num_threads_per_rowr   r   r   _threads_per_row%   s   zTopK._threads_per_rowc           	      C   sp   | j }| j}|dkrdnd}|  }|| }tt|d| |}||| | f}tj| j|||d}|||fS )N @  r      num_copy_elems)	r   r   r%   cuteceil_divr#   
copy_utilstiled_copy_2dr   	r   r   r   num_threadsthreads_per_rowcols_per_blocknum_blocks_Ntiler_mn
tiled_copyr   r   r   _get_tiled_copy,   s   

zTopK._get_tiled_copymXmValuesmIndicesstreamc           	      C   s   |j | jksJ |j | jksJ |j tksJ |  \}}}|j}| ||||||jt|j	d |d ddg|ddg|d d S )Nr   r!   gridblockr9   )
element_typer   r   r5   sizekernellaunchr*   r+   shape)	r   r6   r7   r8   r9   r4   r3   r0   r/   r   r   r   __call__9   s   
zTopK.__call__r3   r4   r0   c           1         sf  t j \}}}t j \ }}|j}	|j}
t |
} fdd||fD \}}||}||}||d }t 	|}t
|
d d k}|rMd ntj|||
d d}ttj|d}|d d |
d k ro||| t |jt}|| t tt| j}d|> d }t
t |	jd }t |t}tjt |dd	D ]*}t|||  d ||  }|| dkr| n|}||@ }|| | @ |B ||< qt
| rt|||jj   t!|| j"|d
}t
t#| j"|d|jj$ }| j"| dksJ t
t %| j"|| }t jj&| } | d> t jj&d B }!t ||ft}"tjt %| j"|dd	D ]Q}|| || k}#tj|dd	D ]>}$t
|dkrp|| |$ | j"k rnt jj'||| |$  d|!d}%|#rn|%|"|$|| f< qA||| |$  |"|$|| f< qAq0t |"t}&t |&jt}'tjt |&dd	D ]&}|&| |@ }|&| | @ |&|< || dkr| n|}t||@ |'|< qt
| j(r6tjt j|"dgddd	D ]&}|| ||  }(|(| j"| krtj|dd	D ]}$tj  |"|$|f< qqt jj'|d d|!d})ttj)}*t jj*|" |* |)|*  dd}+t jj+|+j,t j-j.ddd|d},|"|+t j/|,  t 	|"|j}-|-|" |j |d d }.d dks\|.|
d k rt 0||.d f |f}/t 0||.d f |f}0tjt |-jdgdd	D ]1}|| ||  }(|(| j"| k rt 1|-d |f |/d |(f  t 1|'d |f |0d |(f  qd S d S )Nc                       g | ]}t | d fqS r   r*   
local_tile.0mTbidxr3   r   r   
<listcomp>]       zTopK.kernel.<locals>.<listcomp>)r   NNNr!   limitpredr   Tunroll_full)
warp_widthr      )offsetmask_and_clamp)mode)fastmathg        )init_valreduction_profile)threads_in_group)2r*   arch
thread_idx	block_idxlayout_tv_tiledrA   make_identity_tensor	get_slicepartition_Smake_rmem_tensor_liker   r,   predicate_kr   copymake_rmem_tensorr   storeloadtor   r   r   r   r>   recast_tensorr   cutlassrangeutilsfill_oobr=   infr   r   r#   r   r+   	WARP_SIZEshuffle_syncr   eexp2warp_reduction_sumreduceReductionOpADD
rcp_approxtiled_divideautovec_copy)1r   r6   r7   r8   r3   r4   r0   tidx_	tv_layoutrA   idXgXcXthr_copytXgXtXcXtXrX	is_even_NtXpXrg   tXrX_f32log_Nidx_maskr   tXrX_i32icol_idxencoded_idx	topk_valsvecsize_outnvec_per_threadmaskrX   topk_vals_splitshould_receivevvaltopk_vals_i32topk_indicescolmax_vallog2_eexp_xdenomtopk_vals_outrowmValues_storemIndices_storer   rJ   r   r?   L   s   






 zTopK.kernelNF)__name__
__module____qualname__r   rm   Numericr   boolr   r%   r5   r*   jitTensorcudaCUstreamrB   r?   Shape	TiledCopy	Constexprr   r   r   r   r      s8    "r   zquack::_topk_fwdvaluesindices)mutates_argsxr   r   returnc                 C   s   |   dks
J d| jsJ d| jtjtjtjfv s J d|dkr+|| jd ks/J d| d}t	| j }t
||||| || dS )	a  Top-k forward pass.
    Args:
        x: Input tensor of shape (M, N)
        k: Number of top elements to return
        softmax: Whether to apply softmax to the top-k values
    Returns:
        Tuple of (values tensor of shape (M, k), indices tensor of shape (M, k))
    r   zInput must be 2DzTensor must be on CUDA deviceUnsupported dtyper   r!   zk must be positive and <= NN)dimis_cudar   torchfloat16bfloat16float32rA   r>   r   _compile_topk_fwd)r   r   r   r   r   r   r   r   r   r   	_topk_fwd   s   

r   c           
      C   s|   ddl m} t| dtjpt|tj}|r:|s<| d}t| j }t| j }	t|||| t	|||	||| d S d S d S Nr   )COMPILE_ONLYr!   )
quack.cache_utilsr   
isinstancer>   r   SymIntr   r   r   _compile_topk_bwd)
r   r   r   r   r   r   
has_symintr   r   dx_dtyper   r   r   _topk_fwd_fake   s   


r   c           
      C   sz   t  }td| j |}t| ||f|}t| ||f|}tt||f|}t| |||d}	t j|	|||t j	j
ddddS Nr   r   T)use_tvm_ffi_env_streamz--enable-tvm-ffi)options)r*   sym_intr   gcdr   fake_tensorr   r   compileruntimemake_fake_stream)
r   r   r   r   	batch_symdivx_cutevalues_cuteindices_cutetopk_opr   r   r   r      s   r   Fc                 C   sR   |  d}tj||f| j| jd}tj||ftj| jd}t| |||| ||fS )  Top-k operation.

    Args:
        x: Input tensor of shape (M, N)
        k: Number of top elements to return
        softmax: Whether to apply softmax to the top-k values

    Returns:
        Tuple of (values tensor of shape (M, k), indices tensor of shape (M, k))
    r   r   device)r>   r   emptyr   r   int32r   )r   r   r   Mr   r   r   r   r   topk_fwd  s
   
r   c                       s   e Zd Zddeej dededef fddZdd	 Z	ddede
e fddZejdejde
ej dejdejdejf
ddZejdejde
ej dejdejdejdejdeje fddZ  ZS )TopKBackwardFr   r   r   r   c                    sH   t  j||dtd || _|| _|| _|| _||ksJ |dks"J d S )Nr!   )stagereduction_dtypei   )superr   r   r   r   r   r   r   	__class__r   r   r   %  s   zTopKBackward.__init__c                 C   s   | j dkrdS dS )Nr&   r   r'   )r   )r   r   r   r   _num_threads.  s   zTopKBackward._num_threadsNr   c           	      C   s   |d u rt |d| jj }|| dksJ d| d| |  }t || |}|| }t|| |}||| | f}tj| j|||d}|||fS )Nr   r   zInput N z! is not divisible by vector size r(   )r#   r   r   r   r*   r+   r,   r-   r.   r   r   r   r5   1  s    

zTopKBackward._get_tiled_copymdValuesr7   r8   mdXr9   c              	   C   s   |j | jksJ t|d ur|j | jksJ |j tksJ |   ttdd ||||fD  }t| jd| }| j	| j|d\}}	}
|j
}| |||||	||
jt|jd |	d ddg|ddg|d d S )Nc                 s   s     | ]}|d ur|j jV  qd S N)r=   r   )rH   tr   r   r   	<genexpr>O  s    z(TopKBackward.__call__.<locals>.<genexpr>r   )r   r   r!   r:   )r=   r   r   r   _set_cluster_nr"   r   r   r   r5   r>   r?   r@   r*   r+   rA   )r   r   r7   r8   r   r9   largest_dtype_widthr   r4   r3   r0   r/   r   r   r   rB   ?  s6   	
zTopKBackward.__call__r3   r4   r0   c           2   
      s  t j \}}	}	t j \ }	}	|j}
|j}t |}t |j} fdd||fD \}} fdd||||fD \}}}}tj	 }|j
|jt jdddd}| ||
\}}||}||}t|d urq||nd }||}t |}t|d urt |nd }t |}||jj t|d ur||jj |d ||}||} ||d	 }!t | }"t|d
 d
 k}#tj|||jd
 d}$|#rd ntj|||d
 d}%ttj|$d}&ttj|%d}'|!d d }(tt j d d  })tj|d |jjd |(|d k r-|&|| t|d ur(|&|| |&|| t j  | t}*t| jrZ| t}+t |*|+ t j!j"||d },|+|*|,  }-n|*}-t #|j|j}.|.$|-|j |(|d k rtj%|jd d
 ddD ]?}/tj%|jd ddD ]1}0|$|/d|0f rtj%|jd d ddD ]}1|.|1|/fd|0f ||(|) ||1|/fd|0f f< qqqt j  t &||" |(|d k r|'|"|  d S d S )Nc                    rC   rD   rE   rG   rJ   r   r   rL   v  rM   z'TopKBackward.kernel.<locals>.<listcomp>c                    s*   g | ]}|d urt | dfnd qS )Nr   rE   rG   rJ   r   r   rL   w  s    )r!   r   )order   )byte_alignmentr   rN   r!   rO   rQ   )
fill_value)NNr   TrS   r   )'r*   r^   r_   r`   ra   rA   rb   rm   ro   SmemAllocatorallocate_tensorr=   make_ordered_layout#_allocate_reduction_buffer_and_mbarrc   rd   r   re   fillzeropartition_Dr,   rf   r   rg   r   rp   barrierrj   rk   r   r   r
   rx   ry   rh   ri   rn   r|   )2r   r   r7   r8   r   r3   r4   r0   r}   r~   r   rA   r   idTopKgdXr   gdValsgValsgIdxcTopKsmemsdXreduction_buffermbar_ptrr   tXgdVtXgVtXgItXrdVtXrVtXrItXsdXtXgdXr   tXrdXr   tXpVr   copy_kcopy_dxr   tile_row_start	dvals_f32vals_f32dotgradsgrad_cvtrest_vnr   r   rJ   r   r?   c  s   















"
zTopKBackward.kernelr   r   )r   r   r   r   rm   r   r   r   r   r   r   r5   r*   r   r   r   r   rB   r?   r   r   r   __classcell__r   r   r   r   r   $  s@    &	#r   zquack::_topk_bwddxdvaluesc           
      C   s   |   dks
J d|dur|  dksJ d|  dks"J d| jr(|js,J d| jtjtjtjfv s;J d|d}t| j }|durNt|j nd}t|j }	t	|||	|||| ||| dS )	ai  Top-k backward pass.
    Args:
        dvalues: Upstream gradients tensor of shape (M, k)
        values: Forward top-k values tensor of shape (M, k)
        indices: Indices tensor of shape (M, k) from forward pass
        k: Number of top elements
        softmax: Whether softmax was applied in forward
        dx: Output gradient tensor of shape (M, N)
    r   zdvalues must be 2DNzvalues must be 2Dzindices must be 2DzTensors must be on CUDA devicer   r!   )
r   r   r   r   r   r   r   r>   r   r   )
r  r   r   r   r   r  r   r   	val_dtyper   r   r   r   	_topk_bwd  s   


 r  c                 C   st   ddl m} |r6t|dtjs8|d}t| j }|d ur$t|j nd }	t|j }
t||	|
||| d S d S d S r   )	r   r   r   r>   r   r   r   r   r   )r  r   r   r   r   r  r   r   r   r  r   r   r   r   _topk_bwd_fake  s   



r  c              	   C   s   t  }td| j |}t| ||f|}|d ur!t|||f|nd }	tt||f|}
t|||f|}t| |||d}t j|||	|
|t j	j
ddddS r   )r*   r   r   r   r   r   r   r   r   r   r   )r   r  r   r   r   r   r   r   dvalues_cuter   r   dx_cutetopk_bwd_opr   r   r   r     s    r   r   c                 C   s8   | j \}}tj||f| j| jd}t| ||||| |S )a  Top-k backward pass.

    Args:
        dvalues: Upstream gradients tensor of shape (M, k)
        values: Forward top-k values tensor of shape (M, k), required if softmax=True
        indices: Indices tensor of shape (M, k) from forward pass
        N: Size of the original input dimension
        softmax: Whether softmax was applied in forward

    Returns:
        Input gradients tensor of shape (M, N)
    r   )rA   r   zerosr   r   r  )r  r   r   r   r   r   r   r  r   r   r   topk_bwd  s   
r   c                   @   sJ   e Zd ZeddejdedefddZeddejd	e	ej fd
dZ
dS )TopKFunctionFr   r   r   c                 C   sZ   t |||d\}}| |r|nd | || _|jd | _|| _| | | d ||fS )Nr   r!   F)r   save_for_backwardr   rA   r   r   mark_non_differentiableset_materialize_grads)ctxr   r   r   r   r   r   r   r   forward-  s   

zTopKFunction.forwardNr  	dindices_c                 C   s*   | j \}}t|||| j| jd}|d d fS )N)r   r   )saved_tensorsr   r   r   )r%  r  r'  r   r   r  r   r   r   backward8  s   

zTopKFunction.backwardr   r   )r   r   r   staticmethodr   r   r   r   r&  r   r)  r   r   r   r   r!  ,  s
    
"r!  c                 C   s   t | ||S )r   )r!  apply)r   r   r   r   r   r   topk?  s   r,  r   )6r   	functoolsr   typingr   r   r   cuda.bindings.driverbindingsdriverr   rm   cutlass.cuter*   r   r   r   quack.utilsro   quack.copy_utilsr,   quack.compile_utilsr   r   quack.reduction_baser	   quack.reducer
   r   r   quack.cute_dsl_utilsr   quack.sort.bitonic_sortr   r   library	custom_opr   r   r   r   register_faker   r   r   r   r  r  r   r   autogradFunctionr!  r,  r   r   r   r   <module>   s    B
 (

