
    )j                     L   d dl mZ d dlmZmZmZmZ d dlmZ	 d dl
mZ ddlmZmZmZ ddlmZ e G d de                      Z G d	 d
ej                  Z G d dej                  Z G d dej                  Z G d dej                  Z G d dej                  ZdS )    )	dataclass)AnyDictOptionalUnionN   )BaseModelArgscreate_attention_maskscaled_dot_product_attention)	SwitchGLUc                      e Zd ZU eed<   dZeed<   dZeed<   dZeed<   dZ	eed	<   dZ
eed
<   dZeed<   dZeed<   dZeed<   dZeed<   dZeed<   dZeed<   dZeeeeeef         f                  ed<   dZeed<   d ZdS )	ModelArgs
model_typei }  
vocab_sizei   hidden_sizei 8  intermediate_size    num_hidden_layersnum_attention_heads   num_experts_per_tok   num_key_value_headsnum_local_expertsgh㈵>rms_norm_epsg    .A
rope_thetaFrope_traditionalNrope_scalingtie_word_embeddingsc                 0    | j         | j        | _         d S d S N)r   r   selfs    _/lsinfo/ai/hellotax_ai/base_platform/venv/lib/python3.11/site-packages/mlx_lm/models/mixtral.py__post_init__zModelArgs.__post_init__   s$    #+'+'?D$$$ ,+    )__name__
__module____qualname__str__annotations__r   intr   r   r   r   r   r   r   r   floatr   r   boolr   r   r   r   r   r%    r&   r$   r   r      s#        OOOJK"s"""s!!!!        sL%J"d""";?L(4U5#:%6 678??? %%%%@ @ @ @ @r&   r   c            	       x     e Zd Zdef fdZ	 	 d	dej        deej                 dee         dej        fdZ	 xZ
S )
MixtralAttentionargsc                    t                                                       |j        | _        |j        | _        | j        | j        z  | _        |j        | _        |j        | _        | j        dz  | _        t          j
        | j        | j        | j        z  d          | _        t          j
        | j        | j        | j        z  d          | _        t          j
        | j        | j        | j        z  d          | _        t          j
        | j        | j        z  | j        d          | _        t          j        | j        |j        |j                  | _        d S )Ng      Fbias)traditionalbase)super__init__r   r   	num_headshead_dimr   r   scalennLinearq_projk_projv_projo_projRoPEr   roper#   r2   	__class__s     r$   r9   zMixtralAttention.__init__$   sC   +1(DN:#'#; /]D(
idnt}<5
 
 
 id6FU
 
 
 id6FU
 
 
 iNT]*D,<5
 
 
 GM-
 
 
			r&   Nxmaskcachereturnc                 |   |j         \  }}}|                     |          |                     |          |                     |          }	}}|                    ||| j        d                              dddd          }|                    ||| j        d                              dddd          }|	                    ||| j        d                              dddd          }	|R|                     ||j	                  }|                     ||j	                  }|
                    ||	          \  }}	n*|                     |          }|                     |          }t          |||	|| j        |          }
|
                    dddd                              ||d          }
|                     |
          S )Nr   r   r      )offset)rI   r<   rH   )shaper?   r@   rA   reshaper:   	transposer   rD   rN   update_and_fetchr   r<   rB   )r#   rG   rH   rI   BLDquerieskeysvaluesoutputs              r$   __call__zMixtralAttention.__call__A   s    '1a $AAAv //!Q;;EEaAqQQ||Aq$":B??II!QPQSTUU1d&>CCMMq!Q
 
 iii==G99T%,977D 11$??LD&&ii((G99T??D-T6djt
 
 
 !!!Q1--55aB??{{6"""r&   NNr'   r(   r)   r   r9   mxarrayr   r   rZ   __classcell__rF   s   @r$   r1   r1   #   s        
Y 
 
 
 
 
 
@ $(#	# #8# rx # }	#
 
# # # # # # # #r&   r1   c                   H     e Zd Zdef fdZdej        dej        fdZ xZS )MixtralSparseMoeBlockr2   c                 >   t                                                       |j        | _        |j        | _        |j        | _        |j        | _        t          j
        | j        | j        d          | _        t          | j        | j        | j                  | _        d S NFr4   )r8   r9   r   
hidden_dimr   ffn_dimr   num_expertsr   r=   r>   gater   
switch_mlprE   s     r$   r9   zMixtralSparseMoeBlock.__init__b   s    *-1#'#;  Idot/?eLLL	#DOT\4CSTTr&   rG   rJ   c                 r   |                      |          }| j        }t          j        t          j        | |dz
  d          dd |f                   }t          j        ||d          }t          j        |dd          }|                     ||          }||d         z                      d	          }|S )
Nr   rL   )kthaxis.)rl   T)rl   precise).N)	rh   r   r]   stop_gradientargpartitiontake_along_axissoftmaxri   sum)r#   rG   gateskindsscoresys          r$   rZ   zMixtralSparseMoeBlock.__call__n   s    		!$AE K K KCQSRSQSG TUU#E4b999FT:::OOAt$$	""''R'00r&   )	r'   r(   r)   r   r9   r]   r^   rZ   r_   r`   s   @r$   rb   rb   a   sq        
UY 
U 
U 
U 
U 
U 
U"( rx        r&   rb   c            	       x     e Zd Zdef fdZ	 	 d	dej        deej                 dee         dej        fdZ	 xZ
S )
MixtralDecoderLayerr2   c                 D   t                                                       |j        | _        t          |          | _        t          |          | _        t          j        |j        |j	                  | _
        t          j        |j        |j	                  | _        d S )Neps)r8   r9   r   r1   	self_attnrb   block_sparse_moer=   RMSNormr   input_layernormpost_attention_layernormrE   s     r$   r9   zMixtralDecoderLayer.__init__}   s    +)$// 5d ; ;!z$*:@QRRR(*
$"3)
 )
 )
%%%r&   NrG   rH   rI   rJ   c                     |                      |                     |          ||          }||z   }|                     |                     |                    }||z   }|S r!   )r~   r   r   r   )r#   rG   rH   rI   rhouts          r$   rZ   zMixtralDecoderLayer.__call__   s_     NN4//22D%@@E!!$"?"?"B"BCC!e
r&   r[   r\   r`   s   @r$   rz   rz   |   s        

Y 

 

 

 

 

 

 $(#	
 
8
 rx 
 }	

 

 
 
 
 
 
 
 
r&   rz   c                   h     e Zd Zdef fdZ	 	 ddej        deej                 dej        fdZ xZ	S )	MixtralModelr2   c                 V   t                                                       j        | _        j        | _        t	          j        j        j                  | _        fdt          j                  D             | _	        t	          j
        j        j                  | _        d S )Nc                 0    g | ]}t                     S ))r2   )rz   ).0_r2   s     r$   
<listcomp>z)MixtralModel.__init__.<locals>.<listcomp>   s2     
 
 
/0T***
 
 
r&   r|   )r8   r9   r   r   r=   	Embeddingr   embed_tokensrangelayersr   r   normrE   s    `r$   r9   zMixtralModel.__init__   s    /!%!7L$:JKK
 
 
 
49$:P4Q4Q
 
 
 Jt/T5FGGG			r&   Ninputsinput_embeddingsrJ   c                    ||}n|                      |          }|d gt          | j                  z  }t          ||d                   }t	          | j        |          D ]\  }} ||||          }|                     |          S )Nr   )r   lenr   r
   zipr   )r#   r   rI   r   r   rH   layercs           r$   rZ   zMixtralModel.__call__   s     ' AA!!&))A=FS---E$Qa11DK// 	" 	"HE1aq!!AAyy||r&   r[   )
r'   r(   r)   r   r9   r]   r^   r   rZ   r_   r`   s   @r$   r   r      s        	HY 	H 	H 	H 	H 	H 	H /3	  #28,	
 
       r&   r   c                        e Zd Zdef fdZ	 	 d
dej        deej                 dej        fdZd Z	e
d	             Z xZS )Modelr2   c                     t                                                       || _        |j        | _        t	          |          | _        |j        s(t          j        |j	        |j
        d          | _        d S d S rd   )r8   r9   r2   r   r   modelr   r=   r>   r   r   lm_headrE   s     r$   r9   zModel.__init__   sq    	/!$''
' 	T9T%5tUSSSDLLL	T 	Tr&   Nr   r   rJ   c                     |                      |||          }| j        j        r| j         j                            |          S |                     |          S r!   )r   r2   r   r   	as_linearr   )r#   r   rI   r   r   s        r$   rZ   zModel.__call__   sR     jj(8999( 	%:*44S999<<$$$r&   c           
      `   | j         j        r                    dd            dvrS t          | j         j                  D ]g}d| dD ]]\  }dD ]U d d v rGfdt          | j         j                  D             }t          j        |           d	| d <   V^hS )
Nzlm_head.weightz3model.layers.0.block_sparse_moe.experts.0.w1.weightzmodel.layers.))w1	gate_proj)w2	down_proj)w3up_proj)weightscalesbiasesz.block_sparse_moe.experts.0..c                 P    g | ]"}                      d | d d           #S )z.block_sparse_moe.experts.r   )pop)r   eru   nprefixweightss     r$   r   z"Model.sanitize.<locals>.<listcomp>   sZ     # # # !" $KK#) P PQ P P P PQ P P # # #r&   z.block_sparse_moe.switch_mlp.)r2   r   r   r   r   r   r]   stack)r#   r   lmto_joinru   r   r   s    `   @@@r$   sanitizezModel.sanitize   s+   9( 	0KK($///@OONty233 	 	A(Q((FU  17 
 
A EEaEE!EEPP# # # # # # # &+49+F%G%G	# # # HW--  6 O O O OA O OP
 r&   c                     | j         j        S r!   )r   r   r"   s    r$   r   zModel.layers   s    z  r&   r[   )r'   r(   r)   r   r9   r]   r^   r   rZ   r   propertyr   r_   r`   s   @r$   r   r      s        TY T T T T T T /3	
% 
%
% #28,	
%
 

% 
% 
% 
%  * ! ! X! ! ! ! !r&   r   )dataclassesr   typingr   r   r   r   mlx.corecorer]   mlx.nnr=   r7   r	   r
   r   switch_layersr   r   Moduler1   rb   rz   r   r   r/   r&   r$   <module>r      s   " ! ! ! ! ! - - - - - - - - - - - -             T T T T T T T T T T $ $ $ $ $ $ @ @ @ @ @ @ @ @*;# ;# ;# ;# ;#ry ;# ;# ;#|    BI   6    ")   4    29   D,! ,! ,! ,! ,!BI ,! ,! ,! ,! ,!r&   