
    )j2                        d dl Z d dlmZ d dlmZmZmZmZmZ d dl	m
Z d dlmZ ddlmZ ddlmZmZmZ ddlmZmZ ddlmZ dd	lmZ e G d
 de                      Z G d dej                  Z G d dej                  Z G d dej                  Z G d dej                  Z  G d dej                  Z! G d dej                  Z" G d dej                  Z#dS )    N)	dataclass)AnyDictListOptionalUnion   )swiglu)BaseModelArgscreate_attention_maskscaled_dot_product_attention)KVCacheRotatingKVCache)initialize_rope)	SwitchGLUc                      e Zd ZU eed<   ee         ed<   dZeed<   dZeed<   dZ	eed<   d	Z
eed
<   dZeed<   dZeed<   dZeed<   dZeed<   dZeed<   dZeed<   dZeed<   dZeeeeeef         f                  ed<   dZeed<   dZeed<   dZeed<   d Zeed!<   d"Zeed#<   d$Zeed%<   d&Zeed'<   d(Zeed)<   d Z eed*<   d Z!eed+<   dZ"eed,<   d$Z#eed-<   dS ).	ModelArgs
model_typelayer_typesi  
vocab_sizei   hidden_sizei   intermediate_sizei   moe_intermediate_size    num_hidden_layersnum_attention_heads   num_key_value_heads@   head_dimi   max_position_embeddingsgh㈵>rms_norm_epsi'  
rope_thetaNrope_scalingFtie_word_embeddings   num_experts   num_experts_per_tokr	   num_shared_experts   num_dense_layersT
route_normgS㥛@route_scalesigmoid
score_funcn_group
topk_groupsliding_windowmup_enabled)$__name__
__module____qualname__str__annotations__r   r   intr   r   r   r   r   r   r    r!   r"   floatr#   r$   r   r   r   r%   boolr'   r)   r*   r,   r-   r.   r0   r1   r2   r3   r4        ]/lsinfo/ai/hellotax_ai/base_platform/venv/lib/python3.11/site-packages/mlx_lm/models/afmoe.pyr   r      s        OOOcJK!s!!!!%3%%%s!!!!    Hc#)S)))L%J;?L(4U5#:%6 678??? %%%%K    cJKJGSJNCKr>   r   c            	       ~     e Zd Zddedef fdZ	 	 ddej        deej                 dee	         d	ej        fd
Z
 xZS )	AttentionFargsis_local_attentionc                    t                                                       |j        | _        |j        | _        |j        | _        |j        | _        || _        | j        dz  | _	        t          j        | j        | j        | j        z  d          | _        t          j        | j        | j        | j        z  d          | _        t          j        | j        | j        | j        z  d          | _        t          j        | j        | j        z  | j        d          | _        t          j        | j        |j                  | _        t          j        | j        |j                  | _        t          j        | j        | j        | j        z  d          | _        |r.t+          | j        |j        d|j        |j                  | _        d S d | _        d S )Ng      Fbiaseps)super__init__r   r   n_headsr   
n_kv_headsr    rC   scalennLinearq_projk_projv_projo_projRMSNormr"   q_normk_norm	gate_projr   r#   r$   r!   rope)selfrB   rC   	__class__s      r?   rJ   zAttention.__init__1   s   +/2"4]D(
idlT]:
 
 
 ido=E
 
 
 ido=E
 
 
 iL4=($*:
 
 
 jD4EFFFjD4EFFFdlT]:
 
 
  		'!, DIII DIIIr>   Nxmaskcachereturnc                 f   |j         \  }}}|                     |          }|                     |          }|                     |          }	|                    ||| j        | j                                      dddd          }|                    ||| j        | j                                      dddd          }|	                    ||| j        | j                                      dddd          }	| 	                    |          }| 
                    |          }| j        rl| j        e|9|                     ||j                  }|                     ||j                  }n*|                     |          }|                     |          }||                    ||	          \  }}	t          |||	|| j        |          }
|
                    dddd                              ||d          }
t#          j        |                     |                    }|
|z  }
|                     |
          S )Nr   r+   r	      )offset)r]   rM   r\   )shaperP   rQ   rR   reshaperK   r    	transposerL   rU   rV   rC   rX   ra   update_and_fetchr   rM   mxr/   rW   rS   )rY   r[   r\   r]   BLDquerieskeysvaluesoutputgates               r?   __call__zAttention.__call__[   s
    '1a++a..{{1~~Q//!QdmDDNNq!Q
 
 ||Aq$/4=AAKKAqRSUVWW1dot}EEOOq!Q
 
 ++g&&{{4  " 	'ty'< ))GEL)AAyyely;;))G,,yy 11$??LD&-T6djt
 
 
 !!!Q1--55aB??z$..++,,${{6"""r>   FNN)r5   r6   r7   r   r<   rJ   rg   arrayr   r   rp   __classcell__rZ   s   @r?   rA   rA   0   s        ( (Y (D ( ( ( ( ( (Z $(#	+# +#8+# rx +# }	+#
 
+# +# +# +# +# +# +# +#r>   rA   c                   L     e Zd Zddedee         f fdZdej        fdZ	 xZ
S )MLPNrB   r   c                    t                                                       |j        }||n|j        }t	          j        ||d          | _        t	          j        ||d          | _        t	          j        ||d          | _        d S NFrE   )	rI   rJ   r   r   rN   rO   rW   	down_projup_proj)rY   rB   r   dim
hidden_dimrZ   s        r?   rJ   zMLP.__init__   s     !, ' 	 3
???:s???yju===r>   r^   c                     |                      t          |                     |          |                     |                              S N)rz   r
   rW   r{   rY   r[   s     r?   rp   zMLP.__call__   s4    ~~fT^^A%6%6QHHIIIr>   r   )r5   r6   r7   r   r   r:   rJ   rg   rs   rp   rt   ru   s   @r?   rw   rw      sz        > >Y >8C= > > > > > >JRX J J J J J J J Jr>   rw   c                   L     e Zd ZdZdef fdZdej        dej        fdZ xZ	S )	MoERouterz;Router module that wraps the gate for proper weight naming.rB   c                     t                                                       t          j        |j        |j        d          | _        d S ry   )rI   rJ   rN   rO   r   r'   ro   rY   rB   rZ   s     r?   rJ   zMoERouter.__init__   s;    Id.0@uMMM			r>   r[   r^   c                 ,    |                      |          S r   )ro   r   s     r?   rp   zMoERouter.__call__   s    yy||r>   )
r5   r6   r7   __doc__r   rJ   rg   rs   rp   rt   ru   s   @r?   r   r      sw        EENY N N N N N N"( rx        r>   r   c                   H     e Zd Zdef fdZdej        dej        fdZ xZS )AfmoeMoErB   c                    t                                                       || _        |j        | _        |j        | _        |j        | _        |j        | _        |j        | _        |j        | _        |j	        | _	        t          |          | _        t          j        |j        f          | _        t          |j        |j        |j                  | _        |j        dk    r'|j        |j        z  }t)          ||          | _        d S d S )Nr   )r   )rI   rJ   rB   r'   r)   r-   r.   r0   r1   r2   r   routerrg   zerosexpert_biasr   r   r   expertsr*   rw   shared_experts)rY   rB   shared_intermediate_sizerZ   s      r?   rJ   zAfmoeMoE.__init__   s    	+#'#; /+/|/oo8T%5$788 &
 
 "Q&&*T-DD % #&d>V"W"W"WD	 '&r>   r[   r^   c                    |                      |          }| j        dk    r2t          j        |                    t          j                            }n3t          j        |                    t          j                  d          }|| j        z   }| j        dk    rt          j	        |d| j        df          }t          j
        |dd                              dd          }| j        | j        z
  }t          j        ||dz
  d	
          dd |d d f         }t          j        |t          j        |          t          j        d          d	          }t          j        |d	d          }| j        }t          j        | |dz
  d
          dd |f         }t          j        ||d          }	| j        r'| j        dk    r|	                    dd          }
|	|
z  }	|	| j        z  }	|                     ||          }||	d         z                      d	                              |j                  }| j        j        dk    r||                     |          z   }|S )Nr/   rb   )axisr	   )r   rc   r+   T)r   keepdims)kthr   .g        ).Nr   )r   r0   rg   r/   astypefloat32softmaxr   r1   	unflattentopksumr2   argpartitionput_along_axisstop_gradientrs   flattenr)   take_along_axisr-   r.   r   dtyperB   r*   r   )rY   r[   gatesscoresselection_scoresgroup_scoresk	group_idxindsselected_scoresdenominatorys               r?   rp   zAfmoeMoE.__call__   sf   A?i''ZRZ 8 899FFZRZ 8 8rBBBF "D$44 <!!| r$,1C      7#3QR@@@DD$ E  L t.A!a%bIII#rPQrSTSTST*UI!0 ""29"="=rx}}SU       "z*:BCC $ 00a!e"EEEc2A2gN,VTCCC? 	<t7!;;)--2-EEK-;O)D,<<LLD!!++00b099@@II9'!++D''***Ar>   	r5   r6   r7   r   rJ   rg   rs   rp   rt   ru   s   @r?   r   r      sq        XY X X X X X X6,"( ,rx , , , , , , , ,r>   r   c            	            e Zd Zddededef fdZ	 	 ddej        de	ej                 d	e	e
         d
ej        fdZ xZS )DecoderLayerFrB   	layer_idxuse_slidingc                 8   t                                                       |j        | _        || _        || _        t          ||          | _        ||j        k     rt          |          | _	        nt          |          | _	        t          j        |j        |j                  | _        t          j        |j        |j                  | _        t          j        |j        |j                  | _        t          j        |j        |j                  | _        d S )N)rC   rG   )rI   rJ   r   r   r   rA   	self_attnr,   rw   mlpr   rN   rT   r"   input_layernormpost_attention_layernormpre_mlp_layernormpost_mlp_layernorm)rY   rB   r   r   rZ   s       r?   rJ   zDecoderLayer.__init__   s    +&""4KHHHt,,,4yyDHH~~DH!z$*:@QRRR(*
$"3)
 )
 )
% "$D,<$BS!T!T!T"$*T-=4CT"U"U"Ur>   Nr[   r\   r]   r^   c                    |                      |                     |          ||          }|                     |          }||z   }|                     |                     |                    }|                     |          }||z   S r   )r   r   r   r   r   r   )rY   r[   r\   r]   rhs         r?   rp   zDecoderLayer.__call__  s~     NN4//22D%@@))!,,EHHT++A..//##A&&1ur>   rq   rr   )r5   r6   r7   r   r:   r<   rJ   rg   rs   r   r   rp   rt   ru   s   @r?   r   r      s        V VY V3 VT V V V V V V. $(#	 8 rx  }	
 
       r>   r   c                   >     e Zd Zdef fdZ	 ddej        fdZ xZS )
AfmoeModelrB   c                 f   t                                                       | _        j        | _        j        | _        j        | _        j        | _        j        | _        j        | _        t          j
        j        j                  | _        fdt          | j                  D             | _        t          j        j        j                  | _        | j                            d          | _        d | _        t          | j                  D ]\  }}|j        r
|| _         d S d S )Nc                 B    g | ]\  }}t          ||d k              S )sliding_attention)rB   r   r   )r   ).0idx
layer_typerB   s      r?   
<listcomp>z'AfmoeModel.__init__.<locals>.<listcomp>"  sK     
 
 
  Z SjDW6W  
 
 
r>   rG   full_attention)rI   rJ   rB   r   r   r   r3   r4   r   rN   	Embeddingembed_tokens	enumeratelayersrT   r"   normindexfa_idxswa_idxr   )rY   rB   r   layerrZ   s    `  r?   rJ   zAfmoeModel.__init__  s4   	/!%!7+"1++L$:JKK
 
 
 
 $-T-=#>#>	
 
 
 Jt/T5FGGG	&,,-=>>#DK00 	 	JC  "	 	r>   Ninputsc                    |                      |          }| j        r|t          j        | j                  z  }|d gt          | j                  z  }t          ||| j                           }d }| j	        "t          ||| j	                 | j
                  }t          | j        |          D ]\  }}|j        r|n|} ||||          }|                     |          S )N)window_size)r]   )r   r4   mathsqrtr   lenr   r   r   r   r3   zipr   r   )	rY   r   r]   r   fa_maskswa_maskr   cr\   s	            r?   rp   zAfmoeModel.__call__1  s    
 f%% 	0DId.///A=FS---E'5+=>><#,5&D4G  H DK// 	( 	(HE1$0=88gDaQ'''AAyy||r>   r   r   ru   s   @r?   r   r     sl        Y      :         r>   r   c                        e Zd Zdef fdZ	 ddej        fdZd Ze	d             Z
d Ze	d	             Ze	d
             Z xZS )ModelrB   c                     t                                                       || _        |j        | _        t	          |          | _        |j        s(t          j        |j	        |j
        d          | _        d S d S ry   )rI   rJ   rB   r   r   modelr%   rN   rO   r   r   lm_headr   s     r?   rJ   zModel.__init__M  sq    	/%%
' 	T9T%5tUSSSDLLL	T 	Tr>   Nr   c                     |                      ||          }| j        j        r | j         j                            |          }n|                     |          }|S r   )r   rB   r%   r   	as_linearr   )rY   r   r]   outs       r?   rp   zModel.__call__V  sT    
 jj''9( 	$*)33C88CC,,s##C
r>   c           
         d                                  D             | j        j        r                    dd            t	          | j        j                  D ]u}|| j        j        k     rd| dD ]ZdD ]U d d v rGfdt	          | j        j                  D             }t          j	        |           d	 d <   V[vS )
Nc                 "    i | ]\  }}d |v	||S )zrotary_emb.inv_freqr=   )r   r   vs      r?   
<dictcomp>z"Model.sanitize.<locals>.<dictcomp>d  s*    VVVDAq7LTU7U7U1a7U7U7Ur>   zlm_head.weightzmodel.layers.)r{   rz   rW   )weightscalesbiasesz.mlp.experts.0..c                 P    g | ]"}                      d | d d           #S ).mlp.experts.r   )pop)r   er   nprefixweightss     r?   r   z"Model.sanitize.<locals>.<listcomp>q  sS     # # # ! $KK6(K(K(K(KA(K(K(K(KLL# # #r>   r   )
itemsrB   r%   r   ranger   r,   r'   rg   stack)rY   r   lto_joinr   r   r   s    `  @@@r?   sanitizezModel.sanitizeb  sL   VVGMMOOVVV9( 	0KK($/// ty233 	U 	UA49---(Q((F: U U7 U UA 8888Q88GCC# # # # # # #%*49+@%A%A# # # DF8GCTCT6 ? ? ? ?A ? ?@UU r>   c                     | j         j        S r   )r   r   rY   s    r?   r   zModel.layersy  s    z  r>   c                 *      fd j         D             S )Nc                 n    g | ]1}|j         rt          j        j                   nt	                      2S ))max_size)r   r   r   r3   r   )r   r   rY   s     r?   r   z$Model.make_cache.<locals>.<listcomp>~  sO     
 
 
  $)BCCCCYY	
 
 
r>   )r   r   s   `r?   
make_cachezModel.make_cache}  s2    
 
 
 
 
 
 
 	
r>   c                     d }|S )Nc                 
    d| vS )Nr   r=   )r   s    r?   	predicatez'Model.cast_predicate.<locals>.predicate  s     ))r>   r=   rY   r   s     r?   cast_predicatezModel.cast_predicate  s    	* 	* 	* r>   c                     d }|S )Nc                     d| v rdddS dS )Nzrouter.gater   r(   )
group_sizebitsTr=   )path_s     r?   r   z(Model.quant_predicate.<locals>.predicate  s     $$&(!4444r>   r=   r   s     r?   quant_predicatezModel.quant_predicate  s    	 	 	
 r>   r   )r5   r6   r7   r   rJ   rg   rs   rp   r   propertyr   r   r   r  rt   ru   s   @r?   r   r   L  s        TY T T T T T T 
 

 
 
 
  . ! ! X!
 
 
   X   X    r>   r   )$r   dataclassesr   typingr   r   r   r   r   mlx.corecorerg   mlx.nnrN   activationsr
   baser   r   r   r]   r   r   
rope_utilsr   switch_layersr   r   ModulerA   rw   r   r   r   r   r   r=   r>   r?   <module>r     sv    ! ! ! ! ! ! 3 3 3 3 3 3 3 3 3 3 3 3 3 3                   T T T T T T T T T T + + + + + + + + ' ' ' ' ' ' $ $ $ $ $ $        <V# V# V# V# V#	 V# V# V#rJ J J J J") J J J&    	   H H H H Hry H H HV! ! ! ! !29 ! ! !H3 3 3 3 3 3 3 3lI I I I IBI I I I I Ir>   