
    )j"                        d dl mZ d dlmZmZmZ d dlmZ d dl	m
Z
 ddlmZ ddlmZmZmZ ddlmZ e G d d	e                      Z G d
 de
j                  Z G d de
j                  Z G d de
j                  Z G d de
j                  Z G d de
j                  Z G d de
j                  Z G d de
j                  ZdS )    )	dataclass)AnyDictOptionalN   )swiglu)BaseModelArgscreate_attention_maskscaled_dot_product_attention)	SwitchGLUc                   <   e Zd ZU dZeed<   dZeed<   dZeed<   dZ	eed<   d	Z
eed
<   dZeed<   dZeed<   dZeed<   dZee         ed<   dZee         ed<   dZee         ed<   dZeed<   dZeed<   dZeed<   dZeed<   dZeed<   dZee         ed<   dZeed <   dS )!	ModelArgsdeepseek
model_typei  
vocab_sizei   hidden_sizei +  intermediate_sizei  moe_intermediate_size   num_hidden_layers    num_attention_headsnum_key_value_headsNn_shared_expertsn_routed_expertsnum_experts_per_tokr   moe_layer_freqr   first_k_dense_replacei   max_position_embeddingsgư>rms_norm_epsg     @
rope_thetarope_scalingFattention_bias)__name__
__module____qualname__r   str__annotations__r   intr   r   r   r   r   r   r   r   r   r   r   r   r   r    floatr!   r"   r   r#   bool     `/lsinfo/ai/hellotax_ai/base_platform/venv/lib/python3.11/site-packages/mlx_lm/models/deepseek.pyr   r      sL         J   JK"s"""!%3%%%s!!!!!!!!&*hsm***&*hsm***)-#---NC!"3"""#'S'''L%J#'L(4.''' ND     r-   r   c            	       x     e Zd Zdef fdZ	 	 d	dej        deej                 dee         dej        fdZ	 xZ
S )
DeepseekAttentionconfigc                 Z   t                                                       || _        |j        | _        |j        | _        |j        | _        |j        |j        z  | _        | j        dz  | _        t          |dd          }t          j        | j        |j        | j        z  |          | _        t          j        | j        |j        | j        z  |          | _        t          j        | j        |j        | j        z  |          | _        t          j        | j        |j        | j        z  |          | _        d}|j        rC|j        d         dk    r2t#          |j        d         t$                    sJ d	|j        d         z  }t          j        | j        |j        |
          | _        d S )Ng      r#   Fbiasg      ?typelinearfactorr   )basescale)super__init__r1   r   r   r   num_kv_headshead_dimr9   getattrnnLinearq_projk_projv_projo_projr"   
isinstancer*   RoPEr!   rope)selfr1   r#   
rope_scale	__class__s       r.   r;   zDeepseekAttention.__init__#   s   !-#)#= "6*f.HH]D(
 )95AAi&6
 
 

 i&6
 
 

 i&6
 
 

 i&6
 
 
 
 	;6#6v#>(#J#Jf1(;UCCCCCV0::JGM"
 
 
			r-   Nxmaskcachereturnc                 |   |j         \  }}}|                     |          |                     |          |                     |          }	}}|                    ||| j        d                              dddd          }|                    ||| j        d                              dddd          }|	                    ||| j        d                              dddd          }	|R|                     ||j	                  }|                     ||j	                  }|
                    ||	          \  }}	n*|                     |          }|                     |          }t          |||	|| j        |          }
|
                    dddd                              ||d          }
|                     |
          S )Nr      r      )offset)rM   r9   rL   )shaperA   rB   rC   reshaper   	transposer<   rG   rS   update_and_fetchr   r9   rD   )rH   rK   rL   rM   BL_querieskeysvaluesoutputs              r.   __call__zDeepseekAttention.__call__M   s    '1a $AAAv//!Q(@"EEOOq!Q
 
 ||Aq$"3R88BB1aANN1d&7<<FFq!QPQRRiii==G99T%,977D 11$??LD&&ii((G99T??D-T6djt
 
 
 !!!Q1--55aB??{{6"""r-   NNr$   r%   r&   r   r;   mxarrayr   r   r_   __classcell__rJ   s   @r.   r0   r0   "   s        (
y (
 (
 (
 (
 (
 (
Z $(#	# #8# rx # }	#
 
# # # # # # # #r-   r0   c                   n     e Zd Z	 	 d	dedee         dee         f fdZdej        dej        fdZ	 xZ
S )
DeepseekMLPNr1   r   r   c                 r   t                                                       || _        |p|j        | _        |p|j        | _        t          j        | j        | j        d          | _        t          j        | j        | j        d          | _        t          j        | j        | j        d          | _	        d S NFr3   )
r:   r;   r1   r   r   r?   r@   	gate_projup_proj	down_proj)rH   r1   r   r   rJ   s       r.   r;   zDeepseekMLP.__init__m   s     	&<&*<!2!Nf6N4#3T5KRWXXXy!143IPUVVV4#94;KRWXXXr-   rK   rN   c                     |                      t          |                     |          |                     |                              S N)rl   r   rj   rk   )rH   rK   s     r.   r_   zDeepseekMLP.__call__{   s4    ~~fT^^A%6%6QHHIIIr-   r`   )r$   r%   r&   r   r   r)   r;   rb   rc   r_   rd   re   s   @r.   rg   rg   l   s         &*+/	Y YY c]Y $C=	Y Y Y Y Y YJ"( Jrx J J J J J J J Jr-   rg   c                   *     e Zd Zdef fdZd Z xZS )MoEGater1   c                     t                                                       || _        |j        | _        |j        | _        t          j        | j        |j        f          | _	        d S rn   )
r:   r;   r1   r   top_kr   rb   zerosr   weightrH   r1   rJ   s     r.   r;   zMoEGate.__init__   sU    /
 & 7h 5v7IJKKr-   c                     || j         j        z  }t          j        |dd          }| j        }t          j        t          j        | |dz
  d          dd |f                   }t          j        ||d          }||fS )NrP   T)axispreciser   )kthrw   .rw   )rt   Trb   softmaxrr   stop_gradientargpartitiontake_along_axis)rH   rK   gatesscoreskindss         r.   r_   zMoEGate.__call__   s    DKM!ED999JQU L L LSRTSTRTW UVV#FDr:::V|r-   r$   r%   r&   r   r;   r_   rd   re   s   @r.   rp   rp      sZ        Ly L L L L L L      r-   rp   c                   *     e Zd Zdef fdZd Z xZS )DeepseekMoEr1   c                 $   t                                                       || _        t          |j        |j        |j                  | _        t          |          | _	        |j
        '|j        |j
        z  }t          ||          | _        d S d S )N)r1   r   )r:   r;   r1   r   r   r   r   
switch_mlprp   gater   rg   shared_experts)rH   r1   r   rJ   s      r.   r;   zDeepseekMoE.__init__   s    # <f>U
 
 FOO	". & <v?V V"-1B# # #D /.r-   c                     |                      |          \  }}|                     ||          }||d         z                      d          }| j        j        ||                     |          z   }|S )N).Nrz   )r   r   sumr1   r   r   )rH   rK   r   r   ys        r.   r_   zDeepseekMoE.__call__   sp    yy||fOOAt$$	""''R'00;'3D''***Ar-   r   re   s   @r.   r   r      sS        y            r-   r   c            	       |     e Zd Zdedef fdZ	 	 d
dej        deej                 dee	         dej        fd	Z
 xZS )DeepseekDecoderLayerr1   	layer_idxc                    t                                                       t          |          | _        |j        (||j        k    r||j        z  dk    rt          |          nt          |          | _	        t          j        |j        |j                  | _        t          j        |j        |j                  | _        d S )Nr   eps)r:   r;   r0   	self_attnr   r   r   r   rg   mlpr?   RMSNormr   r    input_layernormpost_attention_layernorm)rH   r1   r   rJ   s      r.   r;   zDeepseekDecoderLayer.__init__   s    *622 '3!=== 55::	  V$$ 	  "z&*<&BUVVV(*
F$7)
 )
 )
%%%r-   NrK   rL   rM   rN   c                     |                      |                     |          ||          }||z   }|                     |                     |                    }||z   }|S rn   )r   r   r   r   )rH   rK   rL   rM   rhouts          r.   r_   zDeepseekDecoderLayer.__call__   s]     NN4//22D%@@EHHT2215566!e
r-   r`   )r$   r%   r&   r   r)   r;   rb   rc   r   r   r_   rd   re   s   @r.   r   r      s        
y 
S 
 
 
 
 
 
( $(#	
 
8
 rx 
 }	

 

 
 
 
 
 
 
 
r-   r   c                   \     e Zd Zdef fdZ	 ddej        dee         dej        fdZ	 xZ
S )	DeepseekModelr1   c                 4   t                                                       | _        t          j        j        j                  | _        fdt          j	                  D             | _
        t          j        j        j                  | _        d S )Nc                 0    g | ]}t          |          S r,   )r   ).0idxr1   s     r.   
<listcomp>z*DeepseekModel.__init__.<locals>.<listcomp>   s1     
 
 
25 --
 
 
r-   r   )r:   r;   r1   r?   	Embeddingr   r   embed_tokensranger   layersr   r    normru   s    `r.   r;   zDeepseekModel.__init__   s    L):F<NOO
 
 
 
9>v?W9X9X
 
 
 Jv1v7JKKK			r-   NrK   rM   rN   c                    |                      |          }|d gt          | j                  z  }t          ||d                   }t	          | j        |          D ]\  }} ||||          }|                     |          S )Nr   )r   lenr   r
   zipr   )rH   rK   rM   r   rL   layercs          r.   r_   zDeepseekModel.__call__   s    
 a  =FS---E$Qa11DK// 	" 	"HE1aq!!AAyy||r-   rn   ra   re   s   @r.   r   r      s        Ly L L L L L L  $ 8 } 
	       r-   r   c                   j     e Zd Zdef fdZ	 d	dej        dee         fdZ	d Z
ed             Z xZS )
Modelr1   c                     t                                                       || _        |j        | _        t	          |          | _        t          j        |j        |j	        d          | _
        d S ri   )r:   r;   argsr   r   modelr?   r@   r   r   lm_headru   s     r.   r;   zModel.__init__   s[    	 +"6**
y!3V5FUSSSr-   NinputsrM   c                 X    |                      ||          }|                     |          S rn   )r   r   )rH   r   rM   r   s       r.   r_   zModel.__call__   s)    
 jj''||C   r-   c           
      
   t          | j        j                  D ]d}d| dD ]ZdD ]U d d v rGfdt          | j        j                  D             }t	          j        |           d d <   V[eS )Nzmodel.layers.)rj   rl   rk   )rt   scalesbiasesz.mlp.experts.0..c                 P    g | ]"}                      d | d d           #S )z.mlp.experts.r   )pop)r   er   mprefixweightss     r.   r   z"Model.sanitize.<locals>.<listcomp>   sS     # # # ! $KK6(K(K(K(KA(K(K(K(KLL# # #r-   z.mlp.switch_mlp.)r   r   r   r   rb   stack)rH   r   lto_joinr   r   r   s    `  @@@r.   sanitizezModel.sanitize   s    ty233 		X 		XA(Q((F: X X7 X XA 8888Q88GCC# # # # # # #%*49+E%F%F# # # GIhwFWFW6 B B1 B Bq B BCXX r-   c                     | j         j        S rn   )r   r   )rH   s    r.   r   zModel.layers  s    z  r-   rn   )r$   r%   r&   r   r;   rb   rc   r   r   r_   r   propertyr   rd   re   s   @r.   r   r      s        Ty T T T T T T  $! !! }! ! ! !   ! ! X! ! ! ! !r-   r   )dataclassesr   typingr   r   r   mlx.corecorerb   mlx.nnr?   activationsr   r8   r	   r
   r   switch_layersr   r   Moduler0   rg   rp   r   r   r   r   r,   r-   r.   <module>r      s    ! ! ! ! ! ! & & & & & & & & & &                   T T T T T T T T T T $ $ $ $ $ $ ! ! ! ! ! ! ! !*G# G# G# G# G#	 G# G# G#TJ J J J J") J J J&    bi   "    ")   2    29   >    BI   8! ! ! ! !BI ! ! ! ! !r-   