
    #j/c                        d dl mZ d dlmZ ddlmZ ddlmZ ddlm	Z	m
Z
mZmZ ddlmZ  e	            rd d	lZ ej        e          Z	 	 d/dej        dej        dej        d	z  dedej        f
dZdej        j        dej        dej        dej        dej        f
dZdej        dej        dej        dej        fdZdej        dej        dej        dej        fdZd Zd Z e	            rUej                            ded           ej                            de           ej                             dee           dej        dej        dej        defdZ!dej        dej        dej        dej        fd Z"	 	 d/dej        dej        dej        dej        d	z  dedej        fd!Z#dej        j        dej        dej        dej        dej        f
d"Z$ G d# d$e          Z% e%            Z&d%ej        dej        fd&Z'	 d0e&d'd
d
d'd(d)e(ej        j                 d	z  d*e%d+eded,ed-ede(ej        j                 fd.Z)d	S )1    )Callable)wraps   )logging)GeneralInterface)is_torch_availableis_torch_greater_or_equalis_torch_less_or_equalis_torchdynamo_compiling   )sonicmoe_experts_forwardNFinputweightbiasis_transposedreturnc                    |r<t          j        |                     d          |                              d          }n;t          j        ||                     d                                        d          }|||z   }|S )a  Batched linear layer supporting optional bias and transposed weights.

    Args:
        input (`torch.Tensor`):
            Input tensor of shape (batch_size, input_dim).
        weight (`torch.Tensor`):
            Weight tensor of shape (batch_size, output_dim, input_dim) if transposed is `False`,
            else of shape (batch_size, input_dim, output_dim).
        bias (`torch.Tensor`, *optional*):
            Bias tensor of shape (batch_size, output_dim). Default is `None`.
        is_transposed (`bool`, *optional*, defaults to `False`):
            Whether the weight tensor is transposed.
    Returns:
        `torch.Tensor`: Output tensor of shape (batch_size, output_dim).
    r   )torchbmm	unsqueezesqueeze)r   r   r   r   outs        g/lsinfo/ai/hellotax_ai/base_platform/venv/lib/python3.11/site-packages/transformers/integrations/moe.py_batched_linearr   L   s{    *  Ai**F33;;A>> i 3 344<<R@@DjJ    selfhidden_statestop_k_indextop_k_weightsc                    |j         }|                    d          }|                    d          }|                    d          }t          j        ||                              d                              d|                              d          }|                    d          }	|                    d          }
|
| j        k    }|
                    d| j        dz
            }
||         }| j	        r$| j
        |
         }| j        r| j        |
         nd }n#| j        |
         }| j        r| j        |
         nd }t          |||| j                  }| j	        r|                     |          }n|                     |          }| j        |
         }| j        r| j        |
         nd }t          |||| j                  }||	                    d          z  }|                    |                    d          d           |                    |||                              d          }|                    |j                  S )Nr   r   devicer   r   r           dim)r#   sizer   aranger   expandreshapenum_expertsclamphas_gategate_up_projhas_biasgate_up_proj_biasup_projup_proj_biasr   r   _apply_gateact_fn	down_projdown_proj_biasmasked_fill_viewsumtodtype)r   r   r   r    r#   	num_top_k
num_tokens
hidden_dim	token_idxsample_weights
expert_idsinvalid_maskselected_hidden_statesselected_weightsselected_biasesproj_outweighted_outfinal_hidden_statess                     r   batched_mm_experts_forwardrJ   n   si    !F  $$I##A&&J##B''J Z777AA!DDKKBPYZZbbceffI"**2..N$$R((J !11L!!!T%5%9::J +95 } S,Z8@DW$0<<SW<
3;?=R$+J77d  0VZVh  H
 } )##H-- ;;x(( ~j19=Pd)*55DO "HZ  H
 n66r:::Ll44R88#>>> '++J	:NNRRWXRYY!!-"5666r   offsc                 T   t          j        |                     d          |                    d          | j        | j                  }d}t          |                                          D ];\  }}||k    rt          j        | ||         ||         |||                    |}<|S )a(  
    Fallback grouped matrix multiplication used when `torch.nn.functional.grouped_mm` and `torch._grouped_mm`
    are unavailable or incompatible with `torch.compile` (e.g. non-bfloat16 weights).

    Args:
        input (`torch.Tensor`): Input of shape (S, input_dim), sorted by expert id.
        weight (`torch.Tensor`): Expert weights of shape (num_experts, input_dim, output_dim).
        offs (`torch.Tensor`): Cumulative token counts per expert of shape (num_experts,).
    Returns:
        `torch.Tensor`: Output of shape (S, output_dim).
    r   r   r#   r<   r   )r   zerosr(   r#   r<   	enumeratetolistmm)r   r   rK   outputstartiends          r   _grouped_mm_fallbackrW      s     [AAu|SXS^___FE DKKMM**  3C<<uSy!6!9&s2CDDDDMr   c                    |                                  dk    sJ dt          | j                               |                                 dk    sJ dt          |j                               |                                 dk    sJ dt          |j                               |                    d          |                    d          k    s6J d|                    d           d	|                    d                       |                     d          |                    d          k    s6J d
|                     d           d|                    d                       |j        t
          j        t
          j        fv sJ d|j                     t          j        |                     d          |                    d          | j	        | j                  S )zRShape/dtype inference stub for `_grouped_mm_fallback` required by `torch.compile`.r   z+input must be 2D (S, input_dim), got shape    zBweight must be 3D (num_experts, input_dim, output_dim), got shape r   z*offs must be 1D (num_experts,), got shape r   zoffs length z must match number of experts zinput_dim mismatch: input has z, weight has z$offs must be an integer tensor, got rM   )
r'   tupleshaper(   r<   r   int32int64emptyr#   r   r   rK   s      r   _grouped_mm_fallback_faker`      s   99;;!_5QVQ\K]K]__::<<1bUSYS_M`M`bb  88::???\tzIZIZ\\???99Q<<6;;q>>)))+v$))A,,+v+vflfqfqrsftft+v+v)))::a==FKKNN***UAUUV[[QR^^UU +** :%+u{33335h\`\f5h5h333;uzz!}}fkk!nnU\QVQ\]]]]r   c                 d    |                      |d         |d                    |d         | _        dS )zjSaves input and weight for backward; offs is stored directly as it is a non-differentiable integer tensor.r   r   r   N)save_for_backwardrK   )ctxinputsrS   s      r   "_grouped_mm_fallback_setup_contextre      s/    &)VAY///ayCHHHr   c                    | j         \  }}t          j        |          }t          j        |          }d}t          | j                                                  D ]r\  }}||k    rt          j        |||         ||         j        |||                    t          j        |||         j        |||         ||                    |}s||dfS )zuBackward pass for `_grouped_mm_fallback`. Computes grad_input and grad_weight per expert group; offs has no gradient.r   rN   N)saved_tensorsr   
zeros_likerP   rK   rQ   rR   T)	rc   grad_outputr   r   
grad_inputgrad_weightrT   rU   rV   s	            r   _grouped_mm_fallback_backwardrm      s    %ME6!%((J"6**KE CHOO--..  3C<<U3Y'*U3Y:OPPPPuSy!#[s%;QPPPP{D((r   z!transformers::grouped_mm_fallback )mutates_args)setup_contextc                     t                      r|j        t          j        k    sW|j        j        dk    rIt          dd          r8|                                dz  dk    s|                                 dz  dk    rdS |j        j        dk    rt          t          j	        j
        d	          r(t          j                            |j                  d
k    S t          t          d          rat          dd          r(t          j                            |j                  d
k    S t          j                            |j                  dk    S dS t          t          j	        j
        d	          pt          t          d          S )a  
    Check if torch.nn.functional.grouped_mm or torch._grouped_mm can be used based on availability and compatibility with torch.compile.

    Args:
        input (`torch.Tensor`):
            Input tensor of shape (S, input_dim).
        weight (`torch.Tensor`):
            Weight tensor of shape (num_experts, input_dim, output_dim).
        offs (`torch.Tensor`):
            Offsets tensor indicating the boundaries of each group in the input tensor.
    Returns:
        `bool`: True if grouped_mm can be used, False otherwise.
    cpuz2.10.0T)
accept_dev   r   Fcuda
grouped_mm)   r   _grouped_mmz2.9)	   r   )r   r<   r   bfloat16r#   typer
   data_ptrhasattrnn
functionalru   get_device_capabilityr	   r_   s      r   _can_use_grouped_mmr     sT    	!"" v|u~'E'Ee##"8=== 	$ __#q((ENN,<,<r,AQ,F,F u
 }V##58&55 	M:33FMBBfLL5-(( 	Q(4@@@ Qz77FF&PPz77FF&PPu58&55V9V9VVr   c                    t          | ||          rt          t          j        j        d          r?t          j        j                            |                     |j                  ||          S t          t          d          r/t          j        |                     |j                  ||          S t          j	        j
                            | ||          S )a  Grouped matrix multiplication dispatcher that uses torch.nn.functional.grouped_mm if available, else falls back to torch._grouped_mm.

    Args:
        input (`torch.Tensor`):
            Input tensor of shape (S, input_dim).
        weight (`torch.Tensor`):
            Weight tensor of shape (num_experts, input_dim, output_dim).
        offs (`torch.Tensor`):
            Offsets tensor indicating the boundaries of each group in the input tensor.
    Returns:
        `torch.Tensor`: Output tensor of shape (S, output_dim).
    rv   rK   rx   )r   r}   r   r~   r   rv   r;   r<   rx   opstransformersgrouped_mm_fallbackr_   s      r   rx   rx   .  s    $ 5&$// P
 58&55 	P8&11%((6<2H2H&W[1\\\UM** 	P$UXXfl%;%;V$OOOO9!55eV$5OOOr   c                     |rt          | ||          }n&t          | |                    dd          |          }|||z   }|S )a  Grouped linear layer supporting optional bias and transposed weights.

    Args:
        input (`torch.Tensor`):
            Input tensor of shape (S, input_dim).
        weight (`torch.Tensor`):
            Weight tensor of shape (num_experts, input_dim, output_dim) if `is_transposed`,
            else of shape (num_experts, output_dim, input_dim).
        offs (`torch.Tensor`):
            Offsets tensor indicating the boundaries of each group in the input tensor.
        bias (`torch.Tensor`, *optional*):
            Bias tensor of shape (num_experts, output_dim). Default is `None`.
        is_transposed (`bool`, *optional*, defaults to `False`):
            Whether the weight tensor is transposed.
    Returns:
        `torch.Tensor`: Output tensor of shape (S, output_dim).
    r   r   )rx   	transpose)r   r   rK   r   r   r   s         r   _grouped_linearr   M  s]    0  F%d333 %!1!1"b!9!9EEEDjJr   c                 \   |j         }|                    d          }|                    d          }|                    d          }t          j        ||                              d                              d|                              d          }|                    d          }	|                    d          }
|
| j        k    }|
                    d| j        dz
            }
t          j	        |
          }t          j
        |          }t          j        |                    d          |          ||<   |
|         }|	|         }|||                  }|j        dk    r|                                n|                                }t          j        || j        d| j        dz
            }t          j        |dt          j                  }| j        r| j        }| j        r| j        |         nd }n| j        }| j        r| j        |         nd }t/          ||||| j                  }| j        r|                     |          }n|                     |          }| j        }| j        r| j        |         nd }t/          ||||| j                  }||                    d          z  }||         }|                    |                    d          d	           ||         }|                    |||                              d
          }|                     |j!                  S )Nr   r   r"   r   rr   )binsminmax)r'   r<   r$   r%   r&   )"r#   r(   r   r)   r   r*   r+   r,   r-   argsort
empty_liker{   floatinthistccumsumr\   r.   r/   r0   r1   r2   r3   r   r   r4   r5   r6   r7   r8   r9   r:   r;   r<   )r   r   r   r    r#   r=   r>   r?   r@   rA   rB   rC   perminv_permexpert_ids_gsample_weights_gselected_hidden_states_ghistc_inputtokens_per_expertoffsetsrE   rF   rG   rH   invalid_mask_grI   s                             r   grouped_mm_experts_forwardr   s  s@    !F  $$I##A&&J##B''J Z777AA!DDKKBPYZZbbceffI"**2..N$$R((J !11L!!!T%5%9::J =$$D%%H\$))A,,v>>>HTNd#L%d+,Yt_=
 +1+*>*>,$$&&&LDTDTDVDVKKd6FASWScfgSghhhl,!5;GGGG } U,BF-Y$0>>UY<=A]T$+L99PT  "2G/aeas  H
 } )##H-- ;;x(( ~;?=Rd),77dO "G/QUQc  H
 .88<<<L!$'Nn66r::C@@@  )L '++J	:NNRRWXRYY!!-"5666r   c                   <     e Zd ZdZeeedZdede	de	f fdZ
 xZS )ExpertsInterfacez;Interface for registering custom experts forward functions.)sonicmoe
batched_mmrv   experts_implementationdefaultr   c                     |t                               d           n|dk    r|| vrt          d| d          t                                          ||          S )zfReturn the requested `experts_implementation`. Also strictly check its validity, and raise if invalid.Na
  You tried to access the `ExpertsInterface` with a `config._experts_implementation` set to `None`. This is expected if you use an Expert Module as a standalone Module. If this is not the case, something went wrong with the dispatch of `config._experts_implementation`eager`zL` is not a valid experts implementation registered in the `ExpertsInterface`)loggerwarning_onceKeyErrorsuperget)r   r   r   	__class__s      r   get_interfacezExpertsInterface.get_interface  s    !)N   
 $w..3IQU3U3Ux*xxx   ww{{17;;;r   )__name__
__module____qualname____doc__r   rJ   r   _global_mappingstrr   r   __classcell__)r   s   @r   r   r     so        EE -00 O<C <( <x < < < < < < < < < <r   r   gate_up_outc                 f    |                     dd          \  }}|                     |          |z  S )a  
    Default gating mechanism: splits the gate_up_out into gate and up parts,
    applies the activation function to the gate part, and multiplies it with the up part.
    Args:
        gate_up_out (`torch.Tensor`):
            The output tensor from the gate and up projection of shape (S, 2 * intermediate_dim).
    Returns:
        `torch.Tensor`: The gated output tensor of shape (S, intermediate_dim).
    r   r   r&   )chunkr5   )r   r   gateups       r   _default_apply_gater     s7        ++HD";;tr!!r   T)experts_interfaceis_concatenatedr   r0   r.   experts_classr   r   r0   r.   c                    dt           t          j        j                 dt           t          j        j                 ffd}|  ||           S |S )a  Decorator to modify experts class to support different experts implementations.

    Args:
        experts_class (`type[torch.nn.Module]`, *optional*):
            The experts class to modify. If not provided, returns a decorator that can be applied to the class.
        experts_interface (`ExpertsInterface`, *optional*, defaults to `ALL_EXPERTS_FUNCTIONS`):
            The experts interface to use for dispatching the forward method.
        is_concatenated (`bool`, *optional*, defaults to `True`):
            Whether the expert weights are stored in concatenated layout [gate;up]
            or interleaved layout [gate0, up0, gate1, up1, ...].
        is_transposed (`bool`, *optional*, defaults to `False`):
            Whether the expert weights are stored in transposed format.
        has_bias (`bool`, *optional*, defaults to `False`):
            Whether the expert layers include bias terms or not.
        has_gate (`bool`, *optional*, defaults to `True`):
            Whether the experts use a gating mechanism or not.
            Whether it has gate_up_proj weights or just up_proj weights.

    Returns:
        `type[torch.nn.Module]`: The modified experts class.
    r   r   c                     | j         | j        t                    	fd            }t                    fd            }t          | d          st          | _        || _         || _        | S )Nc                 h     | |g|R i | || _         | _        | _        | _        | _        d S N)configr.   r0   r   r   )	r   r   argskwargsr0   r.   r   r   original_inits	       r   __init__z=use_experts_implementation.<locals>.wrapper.<locals>.__init__  sP    M$8888888 DK$DM$DM!.D#2D   r   c                 \                         | j        j                  } || g|R i |S r   )r   r   _experts_implementation)r   r   r   experts_forwardr   original_forwards       r   forwardz<use_experts_implementation.<locals>.wrapper.<locals>.forward   s>    /==dk>acsttO"?49$999&999r   r4   )r   r   r   r}   r   r4   )
r   r   r   r   r   r   r0   r.   r   r   s
      @@r   wrapperz+use_experts_implementation.<locals>.wrapper  s    %.(0	}			3 	3 	3 	3 	3 	3 	3 	3 
		3 
	 	 	: 	: 	: 	: 	: 
!	 	: }m44 	<(;M%!) 'r   )r{   r   r~   Module)r   r   r   r   r0   r.   r   s    ````` r   use_experts_implementationr     sv    >tEHO4 eho9N          2  w}%%%Nr   )NFr   )*collections.abcr   	functoolsr   utilsr   utils.genericr   utils.import_utilsr   r	   r
   r   r   r   r   
get_loggerr   r   Tensorboolr   r~   r   rJ   rW   r`   re   rm   library	custom_opregister_fakeregister_autogradr   rx   r   r   r   ALL_EXPERTS_FUNCTIONSr   r{   r   rn   r   r   <module>r      s   % $ $ $ $ $             , , , , , ,            / . . . . .  LLL 
	H	%	%\ !%	 <L ,
 	
 \   DA7
(/A7<A7 A7 <	A7
 \A7 A7 A7 A7N el %, [`[g    4^U\ ^5< ^u| ^`e`l ^ ^ ^ ^  ) ) )&  	M?AUdfggg	M CE^___	M##+%8 $   *Wu| *WU\ *W *WZ^ *W *W *W *WZP<PLP ,P \	P P P PF !%# #<#L# ,# ,
	#
 # \# # # #LU7
(/U7<U7 U7 <	U7
 \U7 U7 U7 U7p< < < < <' < < <0 )(** "5< "EL " " " " 37; +@ ; ; ;(4/; (; 	;
 ; ; ; 
%(/; ; ; ; ; ;r   