
    )j                     `   d dl Z d dlmZ d dlmZmZmZmZ d dlm	Z
 d dlmZ ddlmZmZmZ ddlmZ ddlmZ e G d d	e                      Z G d
 dej                  Z G d dej                  Z G d dej                  Z G d dej                  Z G d dej                  ZdS )    N)	dataclass)DictListOptionalUnion   )BaseModelArgscreate_attention_maskscaled_dot_product_attention)SuScaledRoPE)	SwitchGLUc                       e Zd ZU dZeed<   dZeed<   dZeed<   dZ	eed<   d	Z
eed
<   d	Zeed<   dZeed<   dZeed<   dZeed<   dZeed<   dZeeeeee         f         f         ed<   dZeed<   dZeed<   dZeed<   dS )	ModelArgsphimoe
model_typei@}  
vocab_sizei   hidden_sizei   intermediate_size    num_hidden_layersnum_attention_heads   num_key_value_headsi   max_position_embeddings original_max_position_embeddingsgư>rms_norm_epsNrope_scaling   num_local_experts   num_experts_per_tokg     @
rope_theta)__name__
__module____qualname__r   str__annotations__r   intr   r   r   r   r   r   r   r   floatr   r   r   r   r   r!   r"        ^/lsinfo/ai/hellotax_ai/base_platform/venv/lib/python3.11/site-packages/mlx_lm/models/phimoe.pyr   r      s        JJK!s!!!s!!!!    #)S))),0$c000L%9=L$sE%e"4556===s    Jr+   r   c                   h     e Zd Zdef fdZ	 	 ddej        deej                 dej        fdZ xZ	S )		Attentionargsc                 \   t                                                       |j        }|j        x| _        }|j        x| _        }|j        |z  }|dz  | _        t          j	        |||z  d          | _
        t          j	        |||z  d          | _        t          j	        |||z  d          | _        t          j	        ||z  |d          | _        t          ||j        |j        |j        |j        d         |j        d         |j        d         |j        d                   | _        d S )	Ng      Tbiasshort_factorlong_factorshort_mscalelong_mscale)baser   r   r3   r4   r5   r6   )super__init__r   r   n_headsr   
n_kv_headsscalennLinearq_projk_projv_projo_projr   r"   r   r   r   rope)selfr/   dimr:   r;   head_dim	__class__s         r,   r9   zAttention.__init__!   s%   !%!99w'+'??*#w.t^
iWx%7dCCCiZ(%:FFFiZ(%:FFFi( 2CdCCC $($@-1-R*>:)-8*>:)-8	
 	
 	
			r+   Nxmaskreturnc                 |   |j         \  }}}|                     |          |                     |          |                     |          }	}}|                    ||| j        d                              dddd          }|                    ||| j        d                              dddd          }|	                    ||| j        d                              dddd          }	|R|                     ||j	                  }|                     ||j	                  }|
                    ||	          \  }}	n*|                     |          }|                     |          }t          |||	|| j        |          }
|
                    dddd                              ||d          }
|                     |
          S )Nr   r    r      )offset)cacher<   rI   )shaper?   r@   rA   reshaper:   	transposer;   rC   rN   update_and_fetchr   r<   rB   )rD   rH   rI   rO   BLDquerieskeysvaluesoutputs              r,   __call__zAttention.__call__;   s    '1a $AAAv //!Qb99CCAq!QOO||Aq$/266@@Aq!LL1dor::DDQ1aPPiii==G99T%,977D 11$??LD&&ii((G99T??D-T6djt
 
 
 !!!Q1--55aB??{{6"""r+   NN
r#   r$   r%   r   r9   mxarrayr   r[   __classcell__rG   s   @r,   r.   r.       s        
Y 
 
 
 
 
 
: $(	# #8# rx #
 
# # # # # # # #r+   r.   c                   H     e Zd Zdef fdZdej        dej        fdZ xZS )PhiMoESparseMoeBlockr/   c                 >   t                                                       |j        | _        |j        | _        |j        | _        |j        | _	        t          j        | j        | j        d          | _        t          | j        | j        | j                  | _        d S )NFr1   )r8   r9   r   
hidden_dimr   ffn_dimr   num_expertsr!   top_kr=   r>   gater   
switch_mlprD   r/   rG   s     r,   r9   zPhiMoESparseMoeBlock.__init__Z   s}    *-1-
Idot/?eLLL	#DOT\4CSTTr+   rH   rJ   c                 r   |                      |          }| j        }t          j        t          j        | |dz
  d          dd |f                   }t          j        ||d          }t          j        |dd          }|                     ||          }||d         z                      d	          }|S )
Nr   rL   )kthaxis.)rn   T)rn   precise).N)	ri   rh   r^   stop_gradientargpartitiontake_along_axissoftmaxrj   sum)rD   rH   gateskindsscoresys          r,   r[   zPhiMoESparseMoeBlock.__call__d   s    		!JAE K K KCQSRSQSG TUU#E4b999FT:::OOAt$$	""''R'00r+   	r#   r$   r%   r   r9   r^   r_   r[   r`   ra   s   @r,   rc   rc   Y   sq        UY U U U U U U"( rx        r+   rc   c                   h     e Zd Zdef fdZ	 	 ddej        deej                 dej        fdZ xZ	S )	PhiMoEDecoderLayerr/   c                 D   t                                                       |j        | _        t          |          | _        t          |          | _        t          j        |j        |j	                  | _
        t          j        |j        |j	                  | _        d S )Neps)r8   r9   r   r.   	self_attnrc   block_sparse_moer=   	LayerNormr   input_layernormpost_attention_layernormrk   s     r,   r9   zPhiMoEDecoderLayer.__init__s   s    +"4 4T : :!|D,<$BSTTT(*$"3)
 )
 )
%%%r+   NrH   rI   rJ   c                     |}|                      |          }|                     |||          }||z   }|}|                     |          }|                     |          }||z   }|S )N)rI   rO   )r   r   r   r   )rD   rH   rI   rO   residualhidden_statess         r,   r[   zPhiMoEDecoderLayer.__call__~   sx     ,,Q//}4uMM =0 55mDD--m<< =0r+   r\   r]   ra   s   @r,   r}   r}   r   s        	
Y 	
 	
 	
 	
 	
 	
 $(	 8 rx 
 
       r+   r}   c                   L     e Zd Zdef fdZ	 ddej        dej        fdZ xZS )PhiMoEModelr/   c                 L   t                                                       | _        j        | _        t	          j        j        j                  | _        fdt          j	                  D             | _
        t	          j        j        j                  | _        d S )Nc                 .    g | ]}t                    S r*   )r}   ).0_r/   s     r,   
<listcomp>z(PhiMoEModel.__init__.<locals>.<listcomp>   s"    WWWA)$//WWWr+   r   )r8   r9   r/   r   r=   	Embeddingr   embed_tokensranger   layersr   r   normrk   s    `r,   r9   zPhiMoEModel.__init__   s    	/L$:JKKWWWWt?U9V9VWWWL!1t7HIII			r+   NinputsrJ   c                    |                      |          }|d gt          | j                  z  }t          ||d                   }t	          | j        |          D ]\  }} ||||          }|                     |          S )Nr   )r   lenr   r
   zipr   )rD   r   rO   hrI   layercs          r,   r[   zPhiMoEModel.__call__   s    
 f%%=FS---E$Qa11DK// 	" 	"HE1aq!!AAyy||r+   Nr{   ra   s   @r,   r   r      s        JY J J J J J J   
	       r+   r   c                   Z     e Zd Zdef fdZ	 ddej        fdZd Ze	d             Z
 xZS )	Modelr/   c                     t                                                       |j        | _        || _        t	          |          | _        t          j        |j        |j	        d          | _
        d S )NTr1   )r8   r9   r   r/   r   modelr=   r>   r   r   lm_headrk   s     r,   r9   zModel.__init__   sY    /	 &&
y!14?NNNr+   Nr   c                 X    |                      ||          }|                     |          S r   )r   r   )rD   r   rO   outs       r,   r[   zModel.__call__   s)    
 jj''||C   r+   c           
         dvrS t          | j        j                  D ]g}d| dD ]]\  }dD ]U d d v rGfdt          | j        j                  D             }t	          j        |           d| d <   V^hS )	Nz3model.layers.0.block_sparse_moe.experts.0.w1.weightzmodel.layers.))w1	gate_proj)w2	down_proj)w3up_proj)weightscalesbiasesz.block_sparse_moe.experts.0..c                 P    g | ]"}                      d | d d           #S )z.block_sparse_moe.experts.r   )pop)r   erw   nprefixweightss     r,   r   z"Model.sanitize.<locals>.<listcomp>   sZ     # # # !" $KK#) P PQ P P P PQ P P # # #r+   z.block_sparse_moe.switch_mlp.)r   r/   r   r   r^   stack)rD   r   lmto_joinrw   r   r   s    `   @@@r,   sanitizezModel.sanitize   s	   @OONty233 	 	A(Q((FU  17 
 
A EEaEE!EEPP# # # # # # # &+49+F%G%G	# # # HW--  6 O O O OA O OP
 r+   c                     | j         j        S r   )r   r   )rD   s    r,   r   zModel.layers   s    z  r+   r   )r#   r$   r%   r   r9   r^   r_   r[   r   propertyr   r`   ra   s   @r,   r   r      s        OY O O O O O O ! !! ! ! !  ( ! ! X! ! ! ! !r+   r   )mathdataclassesr   typingr   r   r   r   mlx.corecorer^   mlx.nnr=   r7   r	   r
   r   
rope_utilsr   switch_layersr   r   Moduler.   rc   r}   r   r   r*   r+   r,   <module>r      s    ! ! ! ! ! ! . . . . . . . . . . . .             T T T T T T T T T T $ $ $ $ $ $ $ $ $ $ $ $                "6# 6# 6# 6# 6#	 6# 6# 6#r    29   2       >    ")   6&! &! &! &! &!BI &! &! &! &! &!r+   