o
    ~ri3                     @   s  d dl Z d dlZd dlmZ d dlZd dlmZmZ ddlmZ ddl	m
Z
 ddlmZ e
eZedG d	d
 d
ejZeZedG dd dejZedG dd dejZedG dd dejZedG dd dejZedG dd dejZG dd dejZG dd dejZG dd dejZG d d! d!ejZG d"d# d#ejZG d$d% d%ejZG d&d' d'eZG d(d) d)ejZi d*ed+ed,d-d.fd/ed0ed1ed2d3ifd4ed5ed6d3ifd7ed8ej d9ed:ej!d;ed<ed=ed>ej"d?ed@ej#ej$eej%ej&ej'edAZ(ee(Z)dBdC Z*e*d1Z+e*d0Z,e*d*Z-e*d/Z.e*d=Z/e*dDZ0e*d<Z1e*d;Z2dS )E    N)OrderedDict)Tensornn   )use_kernel_forward_from_hub)logging)is_torchdynamo_compilingGeluTanhc                       L   e Zd ZdZddef fddZdedefdd	Zdedefd
dZ  Z	S )GELUTanha&  
    A fast C implementation of the tanh approximation of the GeLU activation function. See
    https://huggingface.co/papers/1606.08415.

    This implementation is equivalent to NewGELU and FastGELU but much faster. However, it is not an exact numerical
    match due to rounding errors.
    Fuse_gelu_tanh_pythonc                    s2   t    |r| j| _d S tjtjjdd| _d S )Ntanh)approximate)	super__init___gelu_tanh_pythonact	functoolspartialr   
functionalgelu)selfr   	__class__ f/lsinfo/ai/hellotax_ai/llm_service/venv_embed/lib/python3.10/site-packages/transformers/activations.pyr   (   s   
zGELUTanh.__init__inputreturnc                 C   s6   |d dt tdtj |dt |d     S N      ?      ?       @Hm?g      @torchr   mathsqrtpipowr   r   r   r   r   r   /      6zGELUTanh._gelu_tanh_pythonc                 C   
   |  |S Nr   r)   r   r   r   forward2      
zGELUTanh.forwardF)
__name__
__module____qualname____doc__boolr   r   r   r.   __classcell__r   r   r   r   r      s
    r   NewGELUc                   @   "   e Zd ZdZdedefddZdS )NewGELUActivationz
    Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see
    the Gaussian Error Linear Units paper: https://huggingface.co/papers/1606.08415
    r   r   c                 C   s6   d| dt tdtj |dt |d     S r   r#   r)   r   r   r   r.   A   r*   zNewGELUActivation.forwardNr1   r2   r3   r4   r   r.   r   r   r   r   r9   :   s    r9   GeLUc                       r
   )GELUActivationa  
    Original Implementation of the GELU activation function in Google BERT repo when initially created. For
    information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 +
    torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) This is now written in C in nn.functional
    Also see the Gaussian Error Linear Units paper: https://huggingface.co/papers/1606.08415
    Fuse_gelu_pythonc                    s(   t    |r| j| _d S tjj| _d S r,   )r   r   _gelu_pythonr   r   r   r   )r   r=   r   r   r   r   N   s   
zGELUActivation.__init__r   r   c                 C   s    |d dt |td   S )Nr   r    r!   )r$   erfr%   r&   r)   r   r   r   r>   U   s    zGELUActivation._gelu_pythonc                 C   r+   r,   r-   r)   r   r   r   r.   X   r/   zGELUActivation.forwardr0   )
r1   r2   r3   r4   r5   r   r   r>   r.   r6   r   r   r   r   r<   E   s
    r<   SiLUc                   @   r8   )SiLUActivationa  
    See Gaussian Error Linear Units (Hendrycks et al., https://arxiv.org/abs/1606.08415) where the SiLU (Sigmoid Linear
    Unit) was originally introduced and coined, and see Sigmoid-Weighted Linear Units for Neural Network Function
    Approximation in Reinforcement Learning (Elfwing et al., https://arxiv.org/abs/1702.03118) and Swish: a Self-Gated
    Activation Function (Ramachandran et al., https://arxiv.org/abs/1710.05941v1) where the SiLU was experimented with
    later.
    r   r   c                 C   s   t j|S r,   )r   r   silur)   r   r   r   r.   f   s   zSiLUActivation.forwardNr:   r   r   r   r   rA   \   s    rA   FastGELUc                   @   r8   )FastGELUActivationz}
    Applies GELU approximation that is slower than QuickGELU but more accurate. See: https://github.com/hendrycks/GELUs
    r   r   c                 C   s*   d| dt |d dd| |     S )Nr   r    g3E?r"   )r$   r   r)   r   r   r   r.   p   s   *zFastGELUActivation.forwardNr:   r   r   r   r   rD   j       rD   	QuickGELUc                   @   r8   )QuickGELUActivationzr
    Applies GELU approximation that is fast but somewhat inaccurate. See: https://github.com/hendrycks/GELUs
    r   r   c                 C   s   |t d|  S )NgZd;?)r$   sigmoidr)   r   r   r   r.   z   s   zQuickGELUActivation.forwardNr:   r   r   r   r   rG   t   rE   rG   c                       s<   e Zd ZdZdedef fddZdedefdd	Z  ZS )
ClippedGELUActivationa  
    Clip the range of possible GeLU outputs between [min, max]. This is especially useful for quantization purpose, as
    it allows mapping negatives values in the GeLU spectrum. For more information on this trick, please refer to
    https://huggingface.co/papers/2004.09602.

    Gaussian Error Linear Unit. Original Implementation of the gelu activation function in Google Bert repo when
    initially created.

    For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 0.5 * x * (1 +
    torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))). See https://huggingface.co/papers/1606.08415
    minmaxc                    s8   ||krt d| d| dt   || _|| _d S )Nzmin should be < max (got min: z, max: ))
ValueErrorr   r   rJ   rK   )r   rJ   rK   r   r   r   r      s
   

zClippedGELUActivation.__init__xr   c                 C   s   t t|| j| jS r,   )r$   clipr   rJ   rK   )r   rN   r   r   r   r.         zClippedGELUActivation.forward)	r1   r2   r3   r4   floatr   r   r.   r6   r   r   r   r   rI   ~   s    rI   c                       s2   e Zd ZdZ fddZdedefddZ  ZS )AccurateGELUActivationz
    Applies GELU approximation that is faster than default and more accurate than QuickGELU. See:
    https://github.com/hendrycks/GELUs

    Implemented along with MEGA (Moving Average Equipped Gated Attention)
    c                    s    t    tdtj | _d S )N   )r   r   r%   r&   r'   precomputed_constantr   r   r   r   r      s   
zAccurateGELUActivation.__init__r   r   c                 C   s,   d| dt | j|dt |d     S )Nr   r   r"      )r$   r   rT   r(   r)   r   r   r   r.      s   ,zAccurateGELUActivation.forward)r1   r2   r3   r4   r   r   r.   r6   r   r   r   r   rR      s    rR   c                       sD   e Zd ZdZ fddZdedefddZdedefdd	Z  ZS )
MishActivationz
    See Mish: A Self-Regularized Non-Monotonic Activation Function (Misra., https://huggingface.co/papers/1908.08681). Also
    visit the official repository for the paper: https://github.com/digantamisra98/Mish
    c                    s   t    tjj| _d S r,   )r   r   r   r   mishr   rU   r   r   r   r      s   
zMishActivation.__init__r   r   c                 C   s   |t tj| S r,   )r$   r   r   r   softplusr)   r   r   r   _mish_python   rP   zMishActivation._mish_pythonc                 C   r+   r,   r-   r)   r   r   r   r.      r/   zMishActivation.forward)	r1   r2   r3   r4   r   r   rZ   r.   r6   r   r   r   r   rW      s
    rW   c                   @   r8   )LinearActivationz[
    Applies the linear activation function, i.e. forwarding input directly to output.
    r   r   c                 C   s   |S r,   r   r)   r   r   r   r.      s   zLinearActivation.forwardNr:   r   r   r   r   r[      s    r[   c                   @   s   e Zd ZdZdddZdS )LaplaceActivationz
    Applies elementwise activation based on Laplace function, introduced in MEGA as an attention activation. See
    https://huggingface.co/papers/2209.10655

    Inspired by squared relu, but with bounded range and gradient for better stability
    绹۞? ^/?c                 C   s*   ||  |td }ddt|  S )Nr!   r   r    )divr%   r&   r$   r?   )r   r   musigmar   r   r   r.      s   zLaplaceActivation.forwardN)r]   r^   r1   r2   r3   r4   r.   r   r   r   r   r\      s    r\   c                   @   s   e Zd ZdZdd ZdS )ReLUSquaredActivationz^
    Applies the relu^2 activation introduced in https://huggingface.co/papers/2109.08668
    c                 C   s   t j|}t|}|S r,   )r   r   relur$   square)r   r   relu_appliedsquaredr   r   r   r.      s   
zReLUSquaredActivation.forwardNrb   r   r   r   r   rc      s    rc   c                       s   e Zd Z fddZ  ZS )ClassInstantierc                    s4   t  |}t|tr|n|i f\}}|di |S )Nr   )r   __getitem__
isinstancetuple)r   keycontentclskwargsr   r   r   ri      s   zClassInstantier.__getitem__)r1   r2   r3   ri   r6   r   r   r   r   rh      s    rh   c                       sf   e Zd ZdZddddejdf fdd	Zded	efd
dZded	efddZ	ded	efddZ
  ZS )XIELUActivationz
    Applies the xIELU activation function introduced in https://arxiv.org/abs/2411.13010

    If the user has installed the nickjbrowning/XIELU wheel, we import xIELU CUDA
    Otherwise, we emit a single warning and use xIELU Python
    g?r   gưFc              
      sp  t    ttttj||dd| _	ttttj|| |dd| _
| dtj||d | dtj||d || _t|| _t|| _d | _zFdd l}tjj | _d}zddlm}	 |	| j| _|d7 }W n ty }
 z|d|
 d	7 }| j| _W Y d }
~
nd }
~
ww t| W d S  ty }
 ztd
|
 d W Y d }
~
d S d }
~
ww )N)dtyper   betaepszUsing experimental xIELU CUDA.)allow_in_graphz& Enabled torch._dynamo for xIELU CUDA.z+ Could not enable torch._dynamo for xIELU (z*) - this may result in slower performance.z CUDA-fused xIELU not available (u   ) – falling back to a Python version.
For CUDA xIELU (experimental), `pip install git+https://github.com/nickjbrowning/XIELU`)r   r   r   	Parameterr$   logexpm1tensor	unsqueezealpha_palpha_nregister_bufferwith_vector_loadsrQ   _beta_scalar_eps_scalar_xielu_cuda_obj	xielu.opsclassesxieluXIELUtorch.compilerrt   _xielu_cuda_xielu_cuda_fn	Exceptionloggerwarning_once)r   alpha_p_initalpha_n_initrr   rs   rq   r}   r   msgrt   errr   r   r   r      s>   
	("


zXIELUActivation.__init__rN   r   c              
   C   sh   t j| j}| jt j| j }t|dk|| | | j|  tt	|| j
| | | j|  S )Nr   )r   r   rY   rz   rr   r{   r$   whererw   rJ   rs   )r   rN   rz   r{   r   r   r   _xielu_python  s   $zXIELUActivation._xielu_pythonc                 C   s   |j }| dk r|d}| dk s	| dkr$|dd|d}||j kr1td||j  | j|| j	
|j| j
|j| j| j| j}||S )zDFirewall function to prevent torch.compile from seeing .item() callsrV   r   r   z_Warning: xIELU input tensor expects 3 dimensions but got (shape: %s). Reshaping to (shape: %s).)shapedimry   viewsizer   r   r   r.   rz   torq   r{   r~   r   r}   )r   rN   original_shaperesultr   r   r   r     s*   


	zXIELUActivation._xielu_cudar   c                 C   s4   | j d ur|jrt s| |S td | |S )Nz:torch._dynamo is compiling, using Python version of xIELU.)r   is_cudar   r   r   r   r   r)   r   r   r   r.   4  s
   


zXIELUActivation.forward)r1   r2   r3   r4   r$   bfloat16r   r   r   r   r.   r6   r   r   r   r   rp      s    	*	rp   r   gelu_10i
   )rJ   rK   	gelu_fastgelu_newgelu_pythonr=   Tgelu_pytorch_tanhgelu_python_tanhr   gelu_accurate	hardswishlaplace
leaky_relulinearrX   
quick_gelurd   relu2relu6)rH   rB   swishr   prelur   c                 C   s,   | t v rt |  S td|  dtt   )Nz	function z not found in ACT2FN mapping )ACT2FNKeyErrorlistkeys)activation_stringr   r   r   get_activationY  s   r   rB   )3r   r%   collectionsr   r$   r   r   integrations.hub_kernelsr   utilsr   utils.import_utilsr   
get_loggerr1   r   Moduler   PytorchGELUTanhr9   r<   rA   rD   rG   rI   rR   rW   r[   r\   rc   rh   rp   	Hardswish	LeakyReLUReLUReLU6Sigmoidr@   TanhPReLUACT2CLSr   r   r   r   r   r   r   rB   rX   
linear_actr   r   r   r   <module>   s   

			]	
