o
    ~ri]                     @   sZ  d dl mZ d dlmZ ddlmZ ddlmZ ddlm	Z	m
Z
mZ e	 r)d dlZeeZ		d8d	ejd
ejdejdB dedejf
ddZdejjdejdejdejdejf
ddZd	ejd
ejdejdejfddZd	ejd
ejdejdejfddZdd Zdd Ze	 rejjded d! ejde ejjdeed" d	ejd
ejdejdefd#d$Zd	ejd
ejdejdejfd%d&Z		d8d	ejd
ejdejdejdB dedejfd'd(Z dejjdejdejdejdejf
d)d*Z!G d+d, d,eZ"e" Z#d-ejdejfd.d/Z$	d9e#ddd0d1d2e%ejj dB d3e"ded4ed5ede%ejj fd6d7Z&dS ):    )Callable)wraps   )logging)GeneralInterface)is_torch_availableis_torch_less_or_equalis_torchdynamo_compilingNFinputweightbiasis_transposedreturnc                 C   sJ   |rt | d|d}nt || dd}|dur#|| }|S )a  Batched linear layer supporting optional bias and transposed weights.

    Args:
        input (`torch.Tensor`):
            Input tensor of shape (batch_size, input_dim).
        weight (`torch.Tensor`):
            Weight tensor of shape (batch_size, output_dim, input_dim) if transposed is `False`,
            else of shape (batch_size, input_dim, output_dim).
        bias (`torch.Tensor`, *optional*):
            Bias tensor of shape (batch_size, output_dim). Default is `None`.
        is_transposed (`bool`, *optional*, defaults to `False`):
            Whether the weight tensor is transposed.
    Returns:
        `torch.Tensor`: Output tensor of shape (batch_size, output_dim).
       N)torchbmm	unsqueezesqueeze)r
   r   r   r   out r   k/lsinfo/ai/hellotax_ai/llm_service/venv_embed/lib/python3.10/site-packages/transformers/integrations/moe.py_batched_linearE   s   r   selfhidden_statestop_k_indextop_k_weightsc                 C   sb  |j }|d}|d}|d}tj||ddd|d}|d}	|d}
|
| jk}|
d| jd }
|| }| j	rR| j
|
 }| jrO| j|
 nd }n| j|
 }| jr_| j|
 nd }t|||| jd}| j	rs| |}n| |}| j|
 }| jr| j|
 nd }t|||| jd}||	d }||dd ||||jdd}||jS )Nr   r   devicer   r   r   g        dim)r   sizer   aranger   expandreshapenum_expertsclamphas_gategate_up_projhas_biasgate_up_proj_biasup_projup_proj_biasr   r   _apply_gateact_fn	down_projdown_proj_biasmasked_fill_viewsumtodtype)r   r   r   r   r   	num_top_k
num_tokens
hidden_dim	token_idxsample_weights
expert_idsinvalid_maskselected_hidden_statesselected_weightsselected_biasesproj_outweighted_outfinal_hidden_statesr   r   r   batched_mm_experts_forwardg   s<   


"








rD   offsc                 C   sr   t j| d|d| j| jd}d}t| D ]\}}||kr"qt j| || || ||| d |}q|S )a(  
    Fallback grouped matrix multiplication used when `torch.nn.functional.grouped_mm` and `torch._grouped_mm`
    are unavailable or incompatible with `torch.compile` (e.g. non-bfloat16 weights).

    Args:
        input (`torch.Tensor`): Input of shape (S, input_dim), sorted by expert id.
        weight (`torch.Tensor`): Expert weights of shape (num_experts, input_dim, output_dim).
        offs (`torch.Tensor`): Cumulative token counts per expert of shape (num_experts,).
    Returns:
        `torch.Tensor`: Output of shape (S, output_dim).
    r   r   r   r6   r   )r   zerosr"   r   r6   	enumeratetolistmm)r
   r   rE   outputstartiendr   r   r   _grouped_mm_fallback   s   "$rP   c                 C   s  |   dksJ dt| j |  dks J dt|j |  dks0J dt|j |d|dksJJ d|d d	|d | d|dksdJ d
| d d|d |jtjtjfv suJ d|j tj| d|d| j	| jdS )zRShape/dtype inference stub for `_grouped_mm_fallback` required by `torch.compile`.r   z+input must be 2D (S, input_dim), got shape    zBweight must be 3D (num_experts, input_dim, output_dim), got shape r   z*offs must be 1D (num_experts,), got shape r   zoffs length z must match number of experts zinput_dim mismatch: input has z, weight has z$offs must be an integer tensor, got rF   )
r!   tupleshaper"   r6   r   int32int64emptyr   r
   r   rE   r   r   r   _grouped_mm_fallback_fake   s     4""rX   c                 C   s"   |  |d |d  |d | _dS )zjSaves input and weight for backward; offs is stored directly as it is a non-differentiable integer tensor.r   r   r   N)save_for_backwardrE   )ctxinputsrL   r   r   r   "_grouped_mm_fallback_setup_context   s   r\   c           	      C   s   | j \}}t|}t|}d}t| j D ]1\}}||kr!qtj||| || j||| d tj||| j||| || d |}q||dfS )zuBackward pass for `_grouped_mm_fallback`. Computes grad_input and grad_weight per expert group; offs has no gradient.r   rG   N)saved_tensorsr   
zeros_likerI   rE   rJ   rK   T)	rZ   grad_outputr
   r   
grad_inputgrad_weightrM   rN   rO   r   r   r   _grouped_mm_fallback_backward   s   


&&
rc   z!transformers::grouped_mm_fallbackr   )mutates_args)setup_contextc                 C   sf   t  r	|jtjks%|jjdkr'tdddr'| d dks%|  d dkr'dS ttj	j
dp2ttd	S )
a  
    Check if torch.nn.functional.grouped_mm or torch._grouped_mm can be used based on availability and compatibility with torch.compile.

    Args:
        input (`torch.Tensor`):
            Input tensor of shape (S, input_dim).
        weight (`torch.Tensor`):
            Weight tensor of shape (num_experts, input_dim, output_dim).
        offs (`torch.Tensor`):
            Offsets tensor indicating the boundaries of each group in the input tensor.
    Returns:
        `bool`: True if grouped_mm can be used, False otherwise.
    cpuz2.10.0T)
accept_dev   r   F
grouped_mm_grouped_mm)r	   r6   r   bfloat16r   typer   data_ptrhasattrnn
functionalrW   r   r   r   _can_use_grouped_mm   s   
 rq   c                 C   sl   t | ||r,ttjjdrtjjj| |j||dS ttdr,tj| |j||dS tj	j
j| ||dS )a  Grouped matrix multiplication dispatcher that uses torch.nn.functional.grouped_mm if available, else falls back to torch._grouped_mm.

    Args:
        input (`torch.Tensor`):
            Input tensor of shape (S, input_dim).
        weight (`torch.Tensor`):
            Weight tensor of shape (num_experts, input_dim, output_dim).
        offs (`torch.Tensor`):
            Offsets tensor indicating the boundaries of each group in the input tensor.
    Returns:
        `torch.Tensor`: Output tensor of shape (S, output_dim).
    ri   rE   rj   )rq   rn   r   ro   rp   ri   r5   r6   rj   opstransformersgrouped_mm_fallbackrW   r   r   r   rj     s   
rj   c                 C   s>   |r
t | ||d}nt | |dd|d}|dur|| }|S )a  Grouped linear layer supporting optional bias and transposed weights.

    Args:
        input (`torch.Tensor`):
            Input tensor of shape (S, input_dim).
        weight (`torch.Tensor`):
            Weight tensor of shape (num_experts, input_dim, output_dim) if `is_transposed`,
            else of shape (num_experts, output_dim, input_dim).
        offs (`torch.Tensor`):
            Offsets tensor indicating the boundaries of each group in the input tensor.
        bias (`torch.Tensor`, *optional*):
            Bias tensor of shape (num_experts, output_dim). Default is `None`.
        is_transposed (`bool`, *optional*, defaults to `False`):
            Whether the weight tensor is transposed.
    Returns:
        `torch.Tensor`: Output tensor of shape (S, output_dim).
    rr   r   N)rj   	transpose)r
   r   rE   r   r   r   r   r   r   _grouped_linear8  s   rx   c                 C   s  |j }|d}|d}|d}tj||ddd|d}|d}	|d}
|| }t|
}t|}tj|d|d||< |
| }|	| }|| }|j	dkr\|
 n| }tj|| jd| jd d}tj|dtjd}| jr| j}| jr| j| nd }n| j}| jr| j| nd }t||||| jd}| jr| |}n| |}| j}| jr| j| nd }t||||| jd}||d }|| }||||jdd	}||jS )
Nr   r   r   r   rf   )binsminmax)r!   r6   r   r    ) r   r"   r   r#   r   r$   r%   argsort
empty_likerl   floatinthistcr&   cumsumrT   r(   r)   r*   r+   r,   r-   rx   r   r.   r/   r0   r1   r3   r4   r5   r6   )r   r   r   r   r   r7   r8   r9   r:   r;   r<   r>   perminv_permexpert_ids_gsample_weights_gselected_hidden_states_ghistc_inputtokens_per_expertoffsetsr?   r@   rA   rB   rC   r   r   r   grouped_mm_experts_forward^  sJ   


"




r   c                       s8   e Zd ZdZeedZdededef fddZ	  Z
S )ExpertsInterfacez;Interface for registering custom experts forward functions.)
batched_mmri   experts_implementationdefaultr   c                    sB   |du r
t d n|dkr|| vrtd| dt ||S )zfReturn the requested `experts_implementation`. Also strictly check its validity, and raise if invalid.Na
  You tried to access the `ExpertsInterface` with a `config._experts_implementation` set to `None`. This is expected if you use an Expert Module as a standalone Module. If this is not the case, something went wrong with the dispatch of `config._experts_implementation`eager`zL` is not a valid experts implementation registered in the `ExpertsInterface`)loggerwarning_onceKeyErrorsuperget)r   r   r   	__class__r   r   get_interface  s   
zExpertsInterface.get_interface)__name__
__module____qualname____doc__rD   r   _global_mappingstrr   r   __classcell__r   r   r   r   r     s    "r   gate_up_outc                 C   s    |j ddd\}}| || S )a  
    Default gating mechanism: splits the gate_up_out into gate and up parts,
    applies the activation function to the gate part, and multiplies it with the up part.
    Args:
        gate_up_out (`torch.Tensor`):
            The output tensor from the gate and up projection of shape (S, 2 * intermediate_dim).
    Returns:
        `torch.Tensor`: The gated output tensor of shape (S, intermediate_dim).
    r   r   r    )chunkr/   )r   r   gateupr   r   r   _default_apply_gate  s   
r   T)experts_interfacer   r*   r(   experts_classr   r*   r(   c                   s@   dt tjj dt tjj f fdd}| dur|| S |S )a  Decorator to modify experts class to support different experts implementations.

    Args:
        experts_class (`type[torch.nn.Module]`, *optional*):
            The experts class to modify. If not provided, returns a decorator that can be applied to the class.
        experts_interface (`ExpertsInterface`, *optional*, defaults to `ALL_EXPERTS_FUNCTIONS`):
            The experts interface to use for dispatching the forward method.
        is_transposed (`bool`, *optional*, defaults to `False`):
            Whether the expert weights are stored in transposed format.
        has_bias (`bool`, *optional*, defaults to `False`):
            Whether the expert layers include bias terms.

    Returns:
        `type[torch.nn.Module]`: The modified experts class.
    r   r   c                    s\   | j | j tfdd}t  fdd}t| ds&t| _|| _ || _| S )Nc                    s4   | |g|R i | || _ | _ | _| _d S N)configr(   r*   r   )r   r   argskwargs)r*   r(   r   original_initr   r   __init__  s
   
z=use_experts_implementation.<locals>.wrapper.<locals>.__init__c                    s&     | jj}|| g|R i |S r   )r   r   _experts_implementation)r   r   r   experts_forward)r   original_forwardr   r   forward  s   z<use_experts_implementation.<locals>.wrapper.<locals>.forwardr.   )r   r   r   rn   r   r.   )r   r   r   r   r*   r(   r   )r   r   r   wrapper  s   
z+use_experts_implementation.<locals>.wrapperN)rl   r   ro   Module)r   r   r   r*   r(   r   r   r   r   use_experts_implementation  s   ,r   )NFr   )'collections.abcr   	functoolsr   utilsr   utils.genericr   utils.import_utilsr   r   r	   r   
get_loggerr   r   Tensorboolr   ro   r   rD   rP   rX   r\   rc   library	custom_opregister_fakeregister_autogradrq   rj   rx   r   r   ALL_EXPERTS_FUNCTIONSr   rl   r   r   r   r   r   <module>   s   
-
"
"G" 
#
&
U
