
    )jC                        d dl mZ d dlmZ d dlmZmZmZmZ d dl	m
Z d dlmZ d dlmZmZmZ ddlmZ ddlmZmZmZ dd	lmZmZ dd
lmZ ddlmZmZ  eej         d          d             Z! G d dej"                  Z#e G d de                      Z$ G d dej"                  Z% G d dej"                  Z&ej         d             Z' G d dej"                  Z( G d dej"                  Z) G d dej"                  Z* G d dej"                  Z+ G d  d!ej"                  Z, G d" d#ej"                  Z-dS )$    )	dataclass)partial)AnyDictListOptionalN)shard_inplaceshard_linearsum_gradients   )swiglu)BaseModelArgscreate_attention_maskscaled_dot_product_attention)KVCacheRotatingKVCache)initialize_rope)SwiGLU	SwitchGLUT)	shapelessc                     t          j        t          j        |          d |          }t          j        | | |          } || z  S )N)a_mina_max)mxclipnnsilu)xgatelimits      _/lsinfo/ai/hellotax_ai/base_platform/venv/lib/python3.11/site-packages/mlx_lm/models/step3p5.pyclamped_swiglur"      sA    7274==E:::D
%u---A!8O    c                   V     e Zd Zdef fdZdej        dej        dej        fdZ xZS )ClampedSwiGLUr    c                 V    t                                                       || _        d S N)super__init__r    )selfr    	__class__s     r!   r)   zClampedSwiGLU.__init__   s$    


r#   r   r   returnc                 .    t          ||| j                  S r'   )r"   r    )r*   r   r   s      r!   __call__zClampedSwiGLU.__call__   s    atz222r#   )	__name__
__module____qualname__floatr)   r   arrayr.   __classcell__r+   s   @r!   r%   r%      ss        e      3"( 3"( 3rx 3 3 3 3 3 3 3 3r#   r%   c                      e Zd ZU eed<   eed<   eed<   eed<   eed<   eed<   eed<   eed<   d	Zeed
<   dZeed<   dZ	e
e         ed<   dZeed<   dZeed<   dZe
ee                  ed<   dZe
ee                  ed<   dZe
ee                  ed<   dZe
e         ed<   dZeed<   dZeed<   dZeed<   dZeed<   dZeed<   dZe
e         ed <   d!Zeed"<   dZeed#<   dZe
ee                  ed$<   dZe
ee                  ed%<   d&Zeed'<   dS )(	ModelArgs
model_typehidden_sizenum_hidden_layers
vocab_sizenum_attention_headsnum_attention_groupshead_dimintermediate_sizeh㈵>rms_norm_epsg     @
rope_thetaNrope_scalingi   max_position_embeddingsi   sliding_windowlayer_typesyarn_only_typespartial_rotary_factorsattention_other_settingTuse_head_wise_attn_gatei   moe_num_experts   	moe_top_ki   moe_intermediate_sizeshare_expert_dimmoe_layers_enumg      @moe_router_scaling_factornorm_expert_weightswiglu_limitsswiglu_limits_sharedFtie_word_embeddings)r/   r0   r1   str__annotations__intrA   r2   rB   rC   r   r   rD   rE   rF   r   rG   rH   rI   rJ   boolrK   rM   rN   rO   rP   rQ   rR   rS   rT   rU    r#   r!   r7   r7   "   s        OOOOOOMMML%J#'L(4.'''#)S)))NC'+K$s)$++++/OXd3i(///48HT%[1888.2Xd^222$(T(((OSIs!%3%%% c   %)OXc])))'*u***####+/M8DK(///26(4;/666 %%%%%%r#   r7   c                   N     e Zd Zddedef fdZdej        dej        fdZ xZ	S )	ZeroCenteredRMSNormr@   dimsepsc                     t                                                       t          j        |f          | _        || _        d S r'   )r(   r)   r   onesweightr^   )r*   r]   r^   r+   s      r!   r)   zZeroCenteredRMSNorm.__init__C   s7    gtg&&r#   r   r,   c                 X    t           j                            || j        | j                  S r'   )r   fastrms_normra   r^   r*   r   s     r!   r.   zZeroCenteredRMSNorm.__call__H   s     w4;999r#   )r@   )
r/   r0   r1   rX   r2   r)   r   r3   r.   r4   r5   s   @r!   r\   r\   B   sv         S u      
:"( :rx : : : : : : : :r#   r\   c                   T     e Zd Z	 d	dededef fdZdej        dej        fdZ	 xZ
S )

Step3p5MLPr   argsr?   swiglu_limitc                 t   t                                                       |j        | _        || _        t	          j        | j        | j        d          | _        t	          j        | j        | j        d          | _        t	          j        | j        | j        d          | _        |r|dk    r|nd | _	        d S )NFbiasr   )
r(   r)   r9   r?   r   Linear	gate_projup_proj	down_projr    )r*   rh   r?   ri   r+   s       r!   r)   zStep3p5MLP.__init__M   s     	+!24#3T5KRWXXXy!143IPUVVV4#94;KRWXXX%1PlQ6F6F\\D


r#   r   r,   c                 @   | j         O|                     t          |                     |          |                     |          | j                             S |                     t          |                     |          |                     |                              S r'   )r    rp   r"   ro   rn   r   re   s     r!   r.   zStep3p5MLP.__call__Z   s{    :!>>t||Aq0A0A4:NN   ~~fT^^A%6%6QHHIIIr#   )r   )r/   r0   r1   r7   rX   r2   r)   r   r3   r.   r4   r5   s   @r!   rg   rg   L   s        MNQ QQ25QEJQ Q Q Q Q QJ"( Jrx J J J J J J J Jr#   rg   c                 2   t          j        |                     t           j                            }||z   }t          j        | |dz
  d          dd |f         }t          j        ||d          }|r|t          j        |dd          dz   z  }|||z  fS )	Nr   )kthaxis.ru   T)ru   keepdimsg#B;)r   sigmoidastypefloat32argpartitiontake_along_axissum)	gatesrouter_biastop_krouted_scaling_factornorm_topk_probscorescorrected_scorestopk_indicestopk_weightss	            r!   moe_gate_selectr   b   s    ZRZ0011F+?$4#4%!)"MMMVeVL %flDDDL 
#F<b48885@
 (====r#   c                   :     e Zd Zdef fdZdej        fdZ xZS )Step3p5MoEGaterh   c                 2   t                                                       |j        | _        |j        | _        |j        | _        |j        | _	        t          j        |j        | j        d          | _        t          j        | j        f          | _        d S NFrk   )r(   r)   rM   r   rK   n_routed_expertsrQ   r   rR   r   r   rm   r9   r   r   zerosr   r*   rh   r+   s     r!   r)   zStep3p5MoEGate.__init__u   s    ^
 $ 4%)%C""5Id.0EERRR	8T%:$<==r#   r   c                 v    t          |                     |          | j        | j        | j        | j                  S r'   )r   r   r   r   r   r   re   s     r!   r.   zStep3p5MoEGate.__call__   s8    IIaLLJ&
 
 	
r#   )	r/   r0   r1   r7   r)   r   r3   r.   r4   r5   s   @r!   r   r   t   sa        >Y > > > > > >
"( 
 
 
 
 
 
 
 
r#   r   c                   L     e Zd Zdedef fdZdej        dej        fdZ xZ	S )
Step3p5MoErh   	layer_idxc                 
   t                                                       d}|j        r'|t          |j                  k     r|j        |         pd}d}|j        r'|t          |j                  k     r|j        |         pd}t          |          | _        |dk    rt          |          nt                      }t          |j
        |j        |j        |          | _        t          ||j        |          | _        d | _        d S )Nr   )
activationr?   ri   )r(   r)   rS   lenrT   r   r   r%   r   r   r9   rN   rK   
switch_mlprg   rO   share_expertsharding_group)r*   rh   r   ri   swiglu_limit_sharedr   r+   s         r!   r)   zStep3p5MoE.__init__   s    	>)c$2D.E.E"E"E-i8=AL$ 	LS9R5S5S)S)S"&";I"F"K!"4((	4@14D4D]<000&((
#& !	
 
 
 '"3,
 
 
 #r#   r   r,   c                    | j          t          | j                   |          }|                     |          \  }}|                     ||          }||d         z                      d                              |j                  }||                     |          z   }| j         &t          j	        
                    || j                   }|S )N.Nrv   group)r   r   r   r   r}   ry   dtyper   r   distributedall_sum)r*   r   r   r   routed_outputys         r!   r.   zStep3p5MoE.__call__   s    *2d122155A%)YYq\\"l<88\)44SbS\\VM'(( 	
 D--a000*&&q0C&DDAr#   )
r/   r0   r1   r7   rX   r)   r   r3   r.   r4   r5   s   @r!   r   r      sq        #Y #3 # # # # # #:"( rx        r#   r   c            	       |     e Zd Zdedef fdZ	 	 d
dej        deej                 dee	         dej        fd	Z
 xZS )Step3p5Attentionrh   r   c                     t                                                       |j        }|j        pg }|r||         dk    | _        n|dz  dk    | _        | j        r,|j        r%|j        d         | _        |j        d         | _        n|j        | _        |j	        | _        |j
        | _
        | j
        dz  | _        t          j        || j        | j
        z  d          | _        t          j        || j        | j
        z  d          | _        t          j        || j        | j
        z  d          | _        t          j        | j        | j
        z  |d          | _        t%          | j
        |j        	          | _        t%          | j
        |j        	          | _        |j        | _        | j        r!t          j        || j        d          | _        |j        }t3          |t4                    r||         }d
}|j        r%|t9          |j                  k     r|j        |         }t;          | j
        |z            }|j        pg }|r||         nd}	|r|	|vrd }
n|j        }
tA          ||d|
|j!                  | _"        d S )Nsliding_attention   r   r<   r=   g      Frk   r^   g      ?full_attention)r]   basetraditionalscaling_configrD   )#r(   r)   r9   rF   
is_slidingrI   	num_headsnum_kv_headsr<   r=   r>   scaler   rm   q_projk_projv_projo_projr\   rA   q_normk_normrJ   g_projrB   
isinstancelistrH   r   rX   rG   rC   r   rD   rope)r*   rh   r   dimrF   rB   partial_rotary_factor	rope_dimsrG   
layer_typerC   r+   s              r!   r)   zStep3p5Attention.__init__   s   &," 	1))48KKDOO'!mq0DO? 	:t; 	:!9:OPDN $ <=S TD!5DN $ 9D]D(
iT^dm%C%PPPiT%6%FUSSSiT%6%FUSSSi >%PPP)$-T=NOOO)$-T=NOOO'+'C$' 	E)CeDDDDK_
j$'' 	/#I.J #& 	K9s4;V7W7W+W+W$($?	$J!(==>>	.4"/:P[++@P
 	-z@@LL,L#'$($@
 
 
			r#   Nr   maskcacher,   c                 6   |j         \  }}}|                     |          |                     |          |                     |          }	}}|                     |                    ||| j        d                                        dddd          }|                     |                    ||| j	        d                                        dddd          }|	                    ||| j	        d                              dddd          }	|R| 
                    ||j                  }| 
                    ||j                  }|                    ||	          \  }}	n*| 
                    |          }| 
                    |          }t          |||	|| j        |          }
|
                    dddd          }
| j        r0|
t!          j        |                     |                    d         z  }
|                     |
                    ||d                    S )	Nrs   r   r   r      )offset)r   r   r   r   )shaper   r   r   r   reshaper   	transposer   r   r   r   update_and_fetchr   r   rJ   r   rx   r   r   )r*   r   r   r   BL_querieskeysvaluesoutputs              r!   r.   zStep3p5Attention.__call__   s    '1a $AAAv++gooaDNBGGHHRRq!Q
 
 {{4<<1d.?DDEEOOq!Q
 
 1d&7<<FFq!QPQRRiii==G99T%,977D 11$??LD&&ii((G99T??D-T6djt
 
 
 !!!Q1--' 	DbjQ88CCF{{6>>!Q33444r#   NNr/   r0   r1   r7   rX   r)   r   r3   r   r   r.   r4   r5   s   @r!   r   r      s        7
Y 7
3 7
 7
 7
 7
 7
 7
x $(#	"5 "58"5 rx "5 }	"5
 
"5 "5 "5 "5 "5 "5 "5 "5r#   r   c            	       |     e Zd Zdedef fdZ	 	 d
dej        deej                 dee	         dej        fd	Z
 xZS )Step3p5DecoderLayerrh   r   c                    t                                                       t          ||          | _        | j        j        | _        t                      }|j        r7d |j                                                            d          D             }n"t          t          d|j
                            }||v | _        | j        rt          ||          | _        nLd}|j        r'|t          |j                  k     r|j        |         pd}t!          ||j        |          | _        t%          |j        |j                  | _        t%          |j        |j                  | _        d S )Nc                 ,    h | ]}t          |          S rZ   )rX   ).0is     r!   	<setcomp>z/Step3p5DecoderLayer.__init__.<locals>.<setcomp>"  s    VVVc!ffVVVr#   ,r   r   r   r   )r(   r)   r   	self_attnr   setrP   stripsplitranger:   is_moe_layerr   mlprT   r   rg   r?   r\   r9   rA   input_layernormpost_attention_layernorm)r*   rh   r   moe_layers_idxri   r+   s        r!   r)   zStep3p5DecoderLayer.__init__  sk   )$	::.3 	CVVd.B.H.H.J.J.P.PQT.U.UVVVNN q$*@!A!ABBN%7 
	!$	22DHHL( IYT=V9W9W-W-W#8CHq!"&"8)  DH  3$"3 
  
  
 )<$"3)
 )
 )
%%%r#   Nr   r   r   r,   c                     |                      |                     |          ||          }||z   }|                     |                     |                    }||z   S )Nr   r   )r   r   r   r   )r*   r   r   r   rhs         r!   r.   zStep3p5DecoderLayer.__call__;  sZ     NN4//22UNKKEHHT22155661ur#   r   r   r5   s   @r!   r   r     s        
Y 
3 
 
 
 
 
 
H $(#		 	8	 rx 	 }		
 
	 	 	 	 	 	 	 	r#   r   c                   h     e Zd Zdef fdZ	 ddej        deee	                  dej        fdZ
 xZS )	Step3p5Modelrh   c                    t                                                       | _        j        | _        j        | _        t          j        j        j                  | _	        fdt          j                  D             | _        t          j        j                  | _        t          d t!          | j                  D             d           | _        t          d t!          | j                  D             d           | _        d S )Nc                 0    g | ]}t          |          S rZ   )r   )r   r   rh   s     r!   
<listcomp>z)Step3p5Model.__init__.<locals>.<listcomp>O  s3     
 
 
  i00
 
 
r#   r   c              3   .   K   | ]\  }}|j         |V  d S r'   r   r   r   ls      r!   	<genexpr>z(Step3p5Model.__init__.<locals>.<genexpr>V  s-      BB41aQ\BQBBBBBBr#   c              3   .   K   | ]\  }}|j         |V  d S r'   r   r   s      r!   r   z(Step3p5Model.__init__.<locals>.<genexpr>Y  s-      FF41aFQFFFFFFr#   )r(   r)   rh   r;   r:   
num_layersr   	Embeddingr9   embed_tokensr   layersr\   rA   normnext	enumerate_swa_idx	_full_idxr   s    `r!   r)   zStep3p5Model.__init__H  s    	/0L$:JKK
 
 
 
"4#9::
 
 
 ((8d>OPPP	BB9T[11BBBD
 
 FF9T[11FFF
 
r#   Nr   r   r,   c                    |                      |          }|d g| j        z  }d }d }| j        t          ||| j                           }| j        't          ||| j                 | j        j                  }t          | j        |          D ]\  }}|j	        r|n|} ||||          }| 
                    |          S )N)window_sizer   )r   r   r   r   r   rh   rE   zipr   r   r   )	r*   r   r   r   	full_maskswa_masklayercr   s	            r!   r.   zStep3p5Model.__call__\  s    
 a  =FT_,E	>%-at~1FGGI=$,5'TY5M  H DK// 	- 	-HE1$/>88YDad!,,,AAyy||r#   r'   )r/   r0   r1   r7   r)   r   r3   r   r   r   r.   r4   r5   s   @r!   r   r   G  s        
Y 
 
 
 
 
 
. &* 8 S	" 
	       r#   r   c                        e Zd Zdef fdZ	 ddej        deee	                  fdZ
ed             Zd Zd	 Zed
             Zed             Zddeej        j                 fdZ xZS )Modelrh   c                     t                                                       || _        |j        | _        t	          |          | _        t          j        |j        |j	        d          | _
        d S r   )r(   r)   rh   r8   r   modelr   rm   r9   r;   lm_headr   s     r!   r)   zModel.__init__y  sY    	/!$''
y!14?OOOr#   Ninputsr   c                 X    |                      ||          }|                     |          S r'   )r  r  )r*   r	  r   outs       r!   r.   zModel.__call__  s)    
 jj''||C   r#   c                     | j         j        S r'   )r  r   r*   s    r!   r   zModel.layers  s    z  r#   c                 *      fd j         D             S )Nc                 n    g | ]1}|j         rt          j        j                   nt	                      2S ))max_size)r   r   rh   rE   r   )r   r  r*   s     r!   r   z$Model.make_cache.<locals>.<listcomp>  sO     
 
 
  #)ABBBBYY	
 
 
r#   )r   r  s   `r!   
make_cachezModel.make_cache  s2    
 
 
 
 
 
 
 	
r#   c                   	 g d	t          	fd|D                       }i }|                                D ]\  }}d|v r
d|v rf|                    d          }t          |          dk    r>|d                                         r$t          |d                   | j        j        k    rt	D ]%\  }}||v r||vr|                    ||          } n&|r|	                    d          r	d|v r|d	z   }|||<   |S )
N))z.moe.gate_proj.z.mlp.switch_mlp.gate_proj.)z.moe.up_proj.z.mlp.switch_mlp.up_proj.)z.moe.down_proj.z.mlp.switch_mlp.down_proj.)z
.moe.gate.z.mlp.gate.gate.)z.moe.router_biasz.mlp.gate.router_bias)z.share_expert.z.mlp.share_expert.c              3   8   K   | ]}D ]\  }}||v o||vV  d S r'   rZ   )r   ksrcdst
remappingss       r!   r   z!Model.sanitize.<locals>.<genexpr>  sY       
 
*+z
 
;C3C1H%A
 
 
 
 
 
 
r#   z.mtpzmodel.layers..r   z.weightr   r   )
anyitemsr   r   isdigitrX   rh   r:   replaceendswith)
r*   weights
is_vanillanew_weightsr  vpartsr  r  r  s
            @r!   sanitizezModel.sanitize  sM   
 
 

  
 
 
 
/6
 
 
 
 

 MMOO 	 	DAq{{!##u::>>eAh&6&6&8&8>58}}	(CCC &  S!881		#s++AE ajj33 !EKNNr#   c                     d }|S )Nc                 
    d| vS )Nr   rZ   )r  s    r!   	predicatez'Model.cast_predicate.<locals>.predicate  s     ))r#   rZ   r*   r&  s     r!   cast_predicatezModel.cast_predicate  s    	* 	* 	* r#   c                     d }|S )Nc                     d| v rdddS dS )Nzmlp.gate.gate@   rL   )
group_sizebitsTrZ   )pathr   s     r!   r&  z(Model.quant_predicate.<locals>.predicate  s     $&&&(!4444r#   rZ   r'  s     r!   quant_predicatezModel.quant_predicate  s    	 	 	
 r#   r   c                 8   |pt           j                                        }|                                }| j        j        D ]W}t          |j        j        d|          |j        _        t          |j        j	        d|          |j        _	        t          |j        j
        d|          |j        _
        t          |j        j        d|          |j        _        |j        xj        |z  c_        |j        xj        |z  c_        |j        j        r&t          |j        j        d|          |j        _        t!          |j        t$                    rtt          |j        j        d|          |j        _        t          |j        j        d|          |j        _        t          |j        j        d|          |j        _        ||j        _        t/          |j        j        j        d|           t/          |j        j        j        d|           t/          |j        j        j        d|           t/          |j        j        j        d|           t/          |j        j        j        d|           t/          |j        j        j        d|           Yd S )Nzall-to-shardedr   zsharded-to-all)r   r   initsizer  r   r
   r   r   r   r   r   r   r   rJ   r   r   r   rg   rn   ro   rp   r   r	   r   r   )r*   r   Nr  s       r!   shardzModel.shard  s   .,,..JJLLZ& 2	 2	E%1&(8& & &EO" &2&(8& & &EO" &2&(8& & &EO" &2&(8& & &EO" O%%!+%%O((Q.((6 )5O*,<E* * *& %)Z00 &2I')9' ' '	# %1I%'7u% % %	! '3I')9' ' '	## ,1	(I*46Fe    I*24DE    I*46Fe    I(24DE    I(02B%    I(24DE    a2	 2	r#   r'   )r/   r0   r1   r7   r)   r   r3   r   r   r   r.   propertyr   r  r#  r(  r/  r   Groupr4  r4   r5   s   @r!   r  r  x  s        PY P P P P P P &*! !! S	"! ! ! ! ! ! X!
 
 
" " "H   X   X6 68BN$89 6 6 6 6 6 6 6 6r#   r  ).dataclassesr   	functoolsr   typingr   r   r   r   mlx.corecorer   mlx.nnr   mlx.nn.layers.distributedr	   r
   r   activationsr   r   r   r   r   r   r   r   
rope_utilsr   switch_layersr   r   compiler"   Moduler%   r7   r\   rg   r   r   r   r   r   r   r  rZ   r#   r!   <module>rC     s7   " ! ! ! ! !       , , , , , , , , , , , ,             P P P P P P P P P P       T T T T T T T T T T + + + + + + + + ' ' ' ' ' ' , , , , , , , , 	t$$$  %$3 3 3 3 3BI 3 3 3 & & & & & & & &>: : : : :") : : :J J J J J J J J, > > >"
 
 
 
 
RY 
 
 
*. . . . . . . .b\5 \5 \5 \5 \5ry \5 \5 \5~+ + + + +") + + +\. . . . .29 . . .bH H H H HBI H H H H Hr#   