o
    3/if                     @   s.  d dl Z d dlmZ d dlmZmZmZ d dlZd dlmZ d dl	m
  mZ d dlZd dlmZ d dlmZmZmZmZmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZm Z  d dl!m"Z" d d	l#m$Z$ d d
l%m&Z& G dd de"Z'e$dd Z(ej)j*dh dd	d;dededee dedee dee de+ddfddZ,e,j-	d;dededee dedee dee de+ddfddZ.					d<dejdejdeej de+d e/d!e/d"e/deje0ej B fd#d$Z1G d%d& d&Z2e$d'd( Z3	d;dejdejd)ejdejdejddfd*d+Z4ej)j*d,dhd	d;dejdejd)ejdejdejde+ddfd-d.Z5e5j-	d;dejdejd)ejdejdejde+ddfd/d0Z6		d=dejdejd)ejdejde+d"e/ddfd1d2Z7G d3d4 d4ej8j9Z:			5	d>dejdejd6eej de+d7ed8 d"e/dejfd9d:Z;dS )?    N)partial)OptionalTypeLiteral)Tensor)Int32Int64Float32Boolean
const_expr)make_fake_tensor)
row_reduceonline_softmax_reduce)ReductionBase)	jit_cache)torch2cute_dtype_mapc                       s   e Zd Zddeej dedef fddZdd Z	d	d
 Z
ejdejdejdeej dejdeej deej dedejfddZejdejdejdejdejdeej deej dedejdejdeje fddZ  ZS )CrossEntropyTdtypeNonline_softmaxc                    sP   || _ t j||| j sdnd| j stntd |dks| j r#d | _d S d| _d S )N      )stagereduction_dtype @  smem)r   super__init__r	   r   reload_from)selfr   r   r   	__class__ `/lsinfo/ai/hellotax_ai/llm_service/venv_vllm/lib/python3.10/site-packages/quack/cross_entropy.pyr      s   "zCrossEntropy.__init__c                 C   s(   | j }dD ]\}}||kr|  S qdS )N)@      )      )i       )i   r%   )r   r'      )r   r   r   limitthreadsr"   r"   r#   _threads_per_row&   s   zCrossEntropy._threads_per_rowc                 C   sT   | j }t| jjdkrg d}ng d}|D ]\}}||kr$|| _ d S qd| _d S )Nr(   )r   r   )i   r   )      )   r&   )r/   )r0   r   )r2   r1   )i   r&   )r   r   r   width	cluster_n)r   r   
thresholdsr,   clusterr"   r"   r#   _set_cluster_n-   s   

zCrossEntropy._set_cluster_nmXmTargetmTargetLogitmLossmLSEmdXignore_indexstreamc	                 C   s   |j | jksJ t|d u r|}t|d ur|j | jksJ |   t|j j}	t|d ur7tt|	|j j}	t| jd|	 }
| j	|
d\}}}|j
}| ||||||||||
jt|jd |d | jdg|ddgt| jdkryd| jdgnd |d d S )Nr'   vecsizer   r   )gridblockr6   r?   )element_typer   r   r7   r3   maxmathgcdr   _get_tiled_copysizekernellaunchcuteceil_divshaper4   )r   r8   r9   r:   r;   r<   r=   r>   r?   largest_dtype_widthrA   
tiled_copytiler_mnthreads_per_rownum_threadsr"   r"   r#   __call__9   s<   
zCrossEntropy.__call__rQ   rP   rR   c           2   
      s  t j \}}}t j \ }}t| jdkrtdnt j d |	j}|j}t |} fdd||fD \}}t	j
 }|j|jt jdddd}| ||\}}|	|}||}||}||d	 }t |}t|d d | j k}|rd ntj|||d d
}ttj|d}t |	t jj }| ||| |d d }tj} ||d k rt|| } ||d k r|||dd t j  t jd t| rt
|||jj   t !|| |" #t$}!t$j}"t%| |k}#||d k r0|d d dkr0|#s0tt &|jdkrt$||| f }"nt &|jdks*J t$|| }"t| j' rt(|!t j)j*|
|d t| jdkrL|d nd t$j  t| jdkr\t jj+nd d}$t| j,dkrut !|| |" #t$}!t-.t-j/}%t j-j0|!|% |$|%  dd}&t(|&t j)j1|
|d t| jdkr|d nd dd}'nt2|!|
|d |t| jdkrt jj+nd t|d ud\}$}'}&|d d dkr||d k r| jdkst j3 dkr|$t j-j4|'dd }(|#s|(|" nt$j})||)||< t|d ur|(||< t|d ur|'dks!|'|'ks!|#s!t j5|'nt$j}*|&|* }+t 6| f},||,}-t |-}.||}/t |t$}0|07|+ |#sqt	j8t |ddD ]}1|/|1 d | krg|0|1 n|0|1 d |0|1< qX|.7|0" #|.j ||d k r||.|- d S d S d S )Nr   r   c                       g | ]}t | fqS r"   rL   
local_tile.0mTbidx	cluster_yrQ   r"   r#   
<listcomp>y       z'CrossEntropy.kernel.<locals>.<listcomp>r   r   orderr(   byte_alignment)r   NNNr,   predTis_asyncr   )NNr   )init_valhook_fnr   Ffastmath)NNr   g        )rk   )rl   return_exp_xunroll_full      ?)9rL   arch
thread_idx	block_idxr   r4   layout_tv_tiledrN   make_identity_tensorcutlassutilsSmemAllocatorallocate_tensorrD   make_ordered_layout#_allocate_reduction_buffer_and_mbar	get_slicepartition_Spartition_Dmake_rmem_tensor_like
copy_utilspredicate_kr   copyrI   	WARP_SIZE_initialize_clusterr   zerocp_async_commit_groupcp_async_wait_groupfill_oobinfautovec_copyloadtor	   r
   rankr   r   ReductionOpMAXcluster_waitr   rF   log2eexp2ADDr   block_idx_in_clusterlog
rcp_approxrW   storerange)2r   r8   r9   r:   r;   r<   r=   r>   rQ   rP   rR   tidx_	tv_layoutrN   idXgXcXr   sXreduction_buffermbar_ptrthr_copytXgXtXsXtXcXtXrX	is_even_NtXpXr   	num_warpsrowtargetxtarget_logitshould_ignoremax_xlog2_eexp_xdenomlseloss_val	denom_invprobsgdXtXgdXtXrdXtXcFull	tXrdX_f32ir"   r[   r#   rJ   c   s   $







&		




.zCrossEntropy.kernel)T)__name__
__module____qualname__r   rx   Numericintboolr   r.   r7   rL   jitr   r   r   cudaCUstreamrT   rJ   Shape	TiledCopy	Constexpr__classcell__r"   r"   r    r#   r      sX    "	)	
r   c                 C   s   t  }td| j |}t| ||f|}	|rt| ||f|nd }
t||f}|d urA|dkr:t||t  f|}n	t||f}nd }tt|f}|rQtt|fnd }t| || d}t j||	|||||
t	dt j
jdddd
S )	Nr'   r   )r   r   Tuse_tvm_ffi_env_stream--enable-tvm-ffioptions)rL   sym_intrF   rG   r3   fake_tensorr	   r   compiler   runtimemake_fake_stream)r   target_dtypetarget_logit_dtyper   has_lsehas_dxtarget_logit_ndim	batch_symdivx_cutedx_cutetarget_cutetarget_logit_cute	loss_cutelse_cutecross_entropy_opr"   r"   r#   _compile_cross_entropy_fwd   s2   r   zquack::cross_entropy_fwd_out>   dxr   loss)mutates_argsr   r   r   r   r   r   r>   returnc              	   C   s4  |   dks
J d|  dksJ d| jr|jsJ d| jtjtjtjfv s-J d|jtjtjfv s:J d|durR|jsEJ d	|jtjtjtjfv sRJ |dur]|js]J d
| 	d}t
| j }t
|j }	|durut
|j nd}
|dur~|jnd}t||	|
||du|du|| |||||t| dS )a>  Cross entropy forward pass.

    Args:
        x: Input logits tensor of shape (M, N)
        target: Target class indices tensor of shape (M,)
        target_logit: (M, K) or (M,).
            If provided, the target logit will be read from this tensor instead of x.
        loss: Output loss tensor of shape (M,)
        lse: Optional output log-sum-exp tensor of shape (M,)
        dx: Optional output gradient tensor of shape (M, N)
        ignore_index: Index to ignore in loss computation

    Returns:
        None (mutates loss, lse, and optionally dx in-place)
    r   Input must be 2Dr   Target must be 1DTensors must be on CUDA deviceUnsupported input dtypeTarget must be int32 or int64Nz$Target logits must be on CUDA devicezdx must be on CUDA device)dimis_cudar   torchfloat16bfloat16float32int32int64rI   r   ndimr   r   )r   r   r   r   r   r   r>   r   r   r   r   r   r"   r"   r#   cross_entropy_fwd_out  s6   


r   c                 C   s   ddl m} |rJt| dtjsL| d}t| j }	t|j }
|d ur)t|j nd }|d ur2|jnd }t	|	|
|||d u|d u| t
|	|
| d S d S d S Nr   )COMPILE_ONLYr   )quack.cache_utilsr   
isinstancerI   r   SymIntr   r   r   r   _compile_cross_entropy_backward)r   r   r   r   r   r   r>   r   r   r   r   r   r   r"   r"   r#   _cross_entropy_fwd_out_fakeK  s&   


	r   F
return_lse	return_dxinplace_backwardc                 C   s   |  d}| j}tj||tjd}	|rtj||tjdnd }
|r)|s't| n| nd }t| |||	|
|| |r>|r>|	|
|fS |rD|	|
fS |rJ|	|fS |	S )Nr   )devicer   )rI   r  r   emptyr   
empty_liker   )r   r   r   r>   r  r  r  Mr  r   r   r   r"   r"   r#   cross_entropy_fwdl  s   
	
r  c                   @   s   e Zd Zdeej defddZdd Zdefdd	Z	e
jd
e
jde
jde
jde
jde
jdedejfddZe
jd
e
jde
jde
jde
jde
jdede
jde
jde
jdeje fddZdS )CrossEntropyBackwardr   r   c                 C   s   || _ || _d|j | _d S )Nr'   )r   r   r3   rA   )r   r   r   r"   r"   r#   r     s   zCrossEntropyBackward.__init__c                 C   s.   t | jd}dD ]\}}||kr|  S qdS )Nr   r$   r*   )minr   r+   r"   r"   r#   r.     s   z%CrossEntropyBackward._threads_per_rowrA   c           	      C   s   | j | dksJ d| j  d| t| j d}|dkrdnd}|  }|| }t|| |}||| | f}tj| j|||d}|||fS )Nr   zInput N z! is not divisible by vector size r   r'   r*   )num_copy_elems)r   r
  r.   rL   rM   r   tiled_copy_2dr   )	r   rA   r   rS   rR   cols_per_blocknum_blocks_NrQ   rP   r"   r"   r#   rH     s   $

z$CrossEntropyBackward._get_tiled_copyr8   r9   mDLossr=   r<   r>   r?   c                    s   |j  jksJ |j  jksJ t jd jj } j|d\}	}
}|	j} fdd|||fD \}}} |||||||j	|
|	|
j
t|j	d |
d t|j	d |
d dg|ddg|d d S )Nr'   r@   c                    s   g | ]}t j|d  jdqS )r   )r   rI   )layout_utilsexpandr   )rY   Xr   r"   r#   r^     s    z1CrossEntropyBackward.__call__.<locals>.<listcomp>r   r   )rB   rC   r?   )rD   r   rF   rG   r   r3   rH   rI   rJ   rN   rK   rL   rM   )r   r8   r9   r  r=   r<   r>   r?   rA   rP   rQ   rR   rS   r"   r  r#   rT     s8   

zCrossEntropyBackward.__call__rN   rQ   rP   rR   c           *         s  t j \}}}t j \ }tj }|j|jt j	dddd}t 
|} fdd|||fD \}}}|	|}||}||}||d }||}||}dd ||fD \}}t|d	 d	  d
k}|rtd ntj|||d	 d}ttj|d}|d
 d
 }||d
 k r|||dd t j  t jd
 t| rt|||jj  t || | t}tj} tj}!tj}"||d
 k rt|| } t| |k}#|#st|| }!t|| }"ttj }$t jj!||$ |"|$  dd}%|%d }&t "|t}'tj#t $|ddD ]}(||( d	 | k|'|(< qt %|' |&|%})|)|! })|&|)|j ||d
 k rD||| d S d S )Nr`   ra   r(   rc   c                    rU   r"   rV   rX   r\   bidyrQ   r"   r#   r^     r_   z/CrossEntropyBackward.kernel.<locals>.<listcomp>re   c                 S   s   g | ]}t |qS r"   )rL   r   )rY   thrr"   r"   r#   r^     s    r   r   rf   rg   Tri   rm   rr   rp   )'rL   rs   rt   ru   rx   ry   rz   r{   rD   r|   rw   r~   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r	   r   r   r
   rF   r   r   r   r   r   rI   wherer   )*r   r8   r9   r  r=   r<   r>   rN   rQ   rP   rR   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   dlossr   r   r   r   prob_shiftedmaskr   gradr"   r  r#   rJ     sb   

"






zCrossEntropyBackward.kernelN)r   r   r   r   rx   r   r   r   r.   rH   rL   r   r   r   r   r   rT   rJ   r   r   r   r"   r"   r"   r#   r	    sT    )	
r	  c                 C   s   t  }td| j |}t| ||f|gd \}}t||f}tt|fgd \}}	t| |}
t j|
|||||	t	dt j
jdddd	S )Nr'   r   r   Tr   r   r   )rL   r   rF   rG   r3   r   r	   r	  r   r   r   r   )r   r   r   r   r   r   r   r   
dloss_cuter   cross_entropy_backward_opr"   r"   r#   r     s"   
r   r  c           	      C   s<  |   dks
J d|  dksJ d|  dksJ d|  dks(J d| jd |jd ks6J d| jd |jd ksDJ d| jd |jd ksRJ d| jr^|jr^|jr^|jsbJ d	| jtjtjtjfv sqJ d
|jtjtj	fv s~J d| 
d}t| j }t|j }t|||| ||||t| dS )a<  Cross entropy backward pass.
    Args:
        x: Input logits tensor of shape (M, N)
        target: Target class indices tensor of shape (M,)
        dloss: Upstream gradients tensor of shape (M,)
        lse: Log-sum-exp values tensor of shape (M,)
    Returns:
        Input gradients tensor of shape (M, N)
    r   r   r   r   zdloss must be 1Dzlse must be 1Dr   zBatch dimensions must matchr   r   r   N)r   rN   r   r   r   r   r   r   r   r   rI   r   r   r   )	r   r   r  r   r   r>   r   r   r   r"   r"   r#   _cross_entropy_backward)  s$   



r  zquack::cross_entropy_bwd_outc                 C   s   t | ||||| d S )N)r  r   r   r  r   r   r>   r"   r"   r#   cross_entropy_bwd_outN  s   	r   c           
      C   sX   ddl m} |r(t| dtjs*| d}t| j }t|j }	t||	| d S d S d S r   )	r   r   r   rI   r   r   r   r   r   )
r   r   r  r   r   r>   r   r   r   r   r"   r"   r#   _cross_entropy_bwd_out_fakeZ  s   



r!  c                 C   sL   |rt j s| }t| |||| |d |S t | }t| |||||d |S )Nr  )r   compileris_compilingr  r  r   )r   r   r  r   r>   r  r   r"   r"   r#   cross_entropy_bwdm  s   
r$  c                   @   s&   e Zd ZedddZedd ZdS )	CrossEntropyFunctionNr   Fc                 C   sR   |d u rt |||dd\}}nt ||||dd\}}| ||| || _|| _|S )NT)r>   r  )r   r>   r  )r  save_for_backwardr>   r  )ctxr   r   lse_partialr>   r  r   r   r"   r"   r#   forward  s   

zCrossEntropyFunction.forwardc                 C   s2   | j \}}}t||||| j| jd}|d d d d fS )N)r  )saved_tensorsr$  r>   r  )r'  r  r   r   r   r   r"   r"   r#   backward  s
   zCrossEntropyFunction.backward)Nr   F)r   r   r   staticmethodr)  r+  r"   r"   r"   r#   r%    s
    r%  meanr(  	reduction)noner-  sumc                 C   s^   t | ||||}|dkr| ||k   S |dkr!| S |dkr'|S td| d)a  Cross entropy loss with automatic differentiation support.

    Args:
        x: Input logits tensor of shape (M, N)
        target: Target class indices tensor of shape (M,)
        lse_partial: Optional precomputed log-sum-exp partial results
        reduction: Specifies the reduction to apply to the output:
            'none': no reduction will be applied (default)
            'mean': the sum of the output will be divided by the number of elements
            'sum': the output will be summed
        inplace_backward: Whether to perform backward pass in-place
        ignore_index: Index to ignore in loss computation (loss will be 0 for these indices)

    Returns:
        Cross entropy loss tensor:
            - If reduction='none': tensor of shape (M,) with per-example losses
            - If reduction='mean': scalar tensor with mean loss
            - If reduction='sum': scalar tensor with sum of losses
    r-  r0  r/  zInvalid reduction mode: z*. Expected one of 'none', 'mean', or 'sum')r%  applyr0  float
ValueError)r   r   r(  r>   r.  r  r   r"   r"   r#   cross_entropy  s   
r4  )r   )Nr   FFF)r   F)Nr   r-  F)<rF   	functoolsr   typingr   r   r   r   r   cuda.bindings.driverbindingsdriverr   rx   cutlass.cuterL   r   r   r	   r
   r   quack.utilsry   quack.copy_utilsr   quack.layout_utilsr  quack.compile_utilsr   r   quack.reducer   r   quack.reduction_baser   r   r   quack.cute_dsl_utilsr   r   r   library	custom_opr   r   register_faker   r   tupler  r	  r   r  r   r!  r$  autogradFunctionr%  r4  r"   r"   r"   r#   <module>   sT   [
!4#
 

%
