
    )jrS                        d dl Z d dlmZ d dlmZ d dlmZmZmZm	Z	 d dl
mZ d dlmZ d dlmZmZ ddlmZmZmZ ddlmZmZ e G d	 d
e                      Ze G d de                      Z G d dej                  Z G d dej                  Z G d dej                  Z eej        d          d             Z  G d dej                  Z! G d dej                  Z" G d dej                  Z# eej        d          d             Z$ G d dej                  Z% G d d ej                  Z& G d! d"ej                  Z'dS )#    N)	dataclass)partial)AnyDictListOptional)tree_flattentree_unflatten   )BaseModelArgscreate_attention_maskscaled_dot_product_attention)KVCacheRotatingKVCachec                   2   e Zd ZU eed<   eed<   eed<   eed<   eed<   eed<   eed<   eed<   eed	<   eed
<   eed<   eed<   eed<   eed<   eed<   eed<   ee         ed<   ee         ed<   eed<   eed<   eed<   eed<   eed<   eed<   dZ	e
e         ed<   dS )
TextConfig
model_typehidden_sizenum_hidden_layersintermediate_sizenum_attention_headshead_dimrms_norm_eps
vocab_sizenum_key_value_headsnum_kv_shared_layersvocab_size_per_layer_inputsliding_windowmax_position_embeddingsrope_local_base_freq
rope_thetafinal_logit_softcappinglayer_typesactivation_sparsity_patternhidden_size_per_layer_inputaltup_num_inputsaltup_coef_clipaltup_correct_scalealtup_active_idxlaurel_rankNrope_scaling)__name__
__module____qualname__str__annotations__intfloatr   boolr+   r   r        _/lsinfo/ai/hellotax_ai/base_platform/venv/lib/python3.11/site-packages/mlx_lm/models/gemma3n.pyr   r      sE        OOOMMMOOO ####    """"c!%e,,,!$$$$#'L(4.'''''r5   r   c                   $    e Zd ZU eed<   eed<   dS )	ModelArgsr   text_configN)r,   r-   r.   r/   r0   dictr4   r5   r6   r8   r8   -   s'         OOOr5   r8   c                   ,     e Zd Zddef fdZd Z xZS )
RMSNoScaleh㈵>epsc                 V    t                                                       || _        d S N)super__init__r>   )selfr>   	__class__s     r6   rB   zRMSNoScale.__init__4   s$    r5   c                 N    t           j                            |d | j                  S r@   )mxfastrms_normr>   )rC   xs     r6   __call__zRMSNoScale.__call__8   s    w4222r5   )r=   )r,   r-   r.   r2   rB   rJ   __classcell__rD   s   @r6   r<   r<   3   sX         E      3 3 3 3 3 3 3r5   r<   c                   L     e Zd ZdZdef fdZdej        dej        fdZ xZ	S )Gemma3nLaurelBlockz Learned Augmented Residual Layerconfigc                 t   t                                                       || _        t          j        | j        j        | j        j        d          | _        t          j        | j        j        | j        j        d          | _        t          j	        | j        j        | j        j
                  | _        d S )NFbiasdimsr>   )rA   rB   rO   nnLinearr   r*   linear_leftlinear_rightRMSNormr   post_laurel_normrC   rO   rD   s     r6   rB   zGemma3nLaurelBlock.__init__?   s    9K#T[%<5
 
 
 IK#T[%<5
 
 
 !#
((!
 !
 !
r5   rI   returnc                     |                      |          }|                     |          }|                     |          }||z   S r@   )rW   rX   rZ   )rC   rI   laurel_xnormed_laurel_xs       r6   rJ   zGemma3nLaurelBlock.__call__N   sF    ##A&&$$X..//99?""r5   )
r,   r-   r.   __doc__r   rB   rF   arrayrJ   rK   rL   s   @r6   rN   rN   <   sp        **
z 
 
 
 
 
 
#"( #rx # # # # # # # #r5   rN   c            	            e Zd Zdededef fdZ	 	 ddej        de	ej                 de	e
         d	ej        fd
Z xZS )Gemma3nAttentionrO   	layer_idxis_kv_shared_layerc                 2   t                                                       |j        |         dk    | _        |j        }|j        x| _        }|j        x| _        }||z  | _	        |j
        x| _
        }|| _        d| _        t          j        |||z  d          | _        t          j        |||z  d          | _        t          j        |||z  d          | _        t          j        ||z  |d          | _        t          j        |j
        |j                  | _        t          j        |j
        |j                  | _        t/          |j                  | _        || _        t          j        |d| j        r|j        n|j                  | _        d S )Nsliding_attention      ?FrQ   rS   r>   )traditionalbase)rA   rB   r#   
is_slidingr   r   n_headsr   
n_kv_headsrepeatsr   rd   scalerU   rV   q_projk_projv_projo_projrY   r   q_normk_normr<   v_normre   RoPEr    r!   rope)	rC   rO   rd   re   dimrm   rn   r   rD   s	           r6   rB   zGemma3nAttention.__init__V   sz    ,Y7;NN !'!;;w'-'AA**,#)?2"
iWx%7eDDDiZ(%:GGGiZ(%:GGGi( 2CeDDDjfo6;NOOOjfo6;NOOO V%8999"4G/3U++FDU	
 
 
			r5   NrI   maskcacher\   c                    |j         \  }}}|                     |          }|                    ||d| j                  }|                     |          }d}| j        r||j        \  }	}
|j        }n||j        }|                     |                              ||d| j                  }	| 	                    |	          }	|	
                    dddd          }	|                     |	|          }	|                     |                              ||d| j                  }
|                     |
          }
|

                    dddd          }
||                    |	|
          \  }	}
|
                    dddd          }|                     ||          }t          ||	|
|| j        |          }|
                    dddd                              ||d          }|                     |          S )Nr      r      )offset)r|   rp   r{   )shaperq   reshaper   ru   re   stater   rr   rv   	transposery   rs   rw   update_and_fetchr   rp   rt   )rC   rI   r{   r|   BL_queriesr   keysvaluesoutputs               r6   rJ   zGemma3nAttention.__call__v   s    '1a++a..//!QDM::++g&&" 	Du'8 ;LD&\FF  ;;q>>))!QDMBBD;;t$$D>>!Q1--D99T&911D[[^^++Aq"dmDDF[[((F%%aAq11F $55dFCCf##Aq!Q//))GF)33-T6djt
 
 
 !!!Q1--55aB??{{6"""r5   NNr,   r-   r.   r   r1   r3   rB   rF   ra   r   r   rJ   rK   rL   s   @r6   rc   rc   U   s        
z 
c 
t 
 
 
 
 
 
F $(#	*# *#8*# rx *# }	*#
 
*# *# *# *# *# *# *# *#r5   rc   T)	shapelessc                     t          j        | dd          }t          j        | dd          }|||                    |j                  z  z   }t          j        t          j        d| |z
                      S )Nr~   Taxiskeepdimsr   )rF   meanstdastypedtyperU   gelu_approxmaximum)inputsstd_multiplierinputs_mean
inputs_stdcutoff_xs        r6   	gelu_topkr      so    '&rD999KR$777JZ.*?*?
@P*Q*QQQH>"*Q(9::;;;r5   c                   @     e Zd Zddedef fdZdej        fdZ xZ	S )MLPr   rO   rd   c                    t                                                       || _        |j        | _        t	          |j        t                    r|j        |         n|j        | _        t          j        | j        | j        d          | _	        t          j        | j        | j        d          | _
        t          j        | j        | j        d          | _        |j        |j        |         | _        nd| _        | j        dk    r;t          j        d          t!          j        d| j        z  dz
            z  | _        d S d S )NFrQ           r   g       @r   r   )rA   rB   rO   r   
isinstancer   listrU   rV   	gate_projup_proj	down_projr$   activation_sparsitymathsqrtrF   erfinv_std_multiplier)rC   rO   rd   rD   s      r6   rB   zMLP.__init__   s/   !- &2D99*F$Y//) 	
 4#3T5KRWXXXy!143IPUVVV4#94;KRWXXX-9'-'I)'TD$$'*D$#a''#'9S>>BID,,q05 5 $D    ('r5   rI   c                     |                      |          }| j        dk    rt          || j                  }nt	          j        |          }|                     |          }|                     ||z            }|S )Nr   )r   r   r   r   rU   r   r   r   )rC   rI   r   activationsr   r   s         r6   rJ   zMLP.__call__   so    NN1%%	#c))#It/CDDKK.33K,,q//NN;#899	r5   )r   )
r,   r-   r.   r   r1   rB   rF   ra   rJ   rK   rL   s   @r6   r   r      sm         z c      *"(        r5   r   c                        e Zd ZdZdef fdZdej        dej        fdZdej        dej        fdZ	dej        d	ej        fd
Z
 xZS )Gemma3nAltUpzAlternating Updates (AltUp)rO   c                 "   t                                                       || _        t          j        | j        j        f          | _        t          j        | j        j	        | j        j	        d          | _
        t          j        | j        j	        | j        j	        dz  d          | _        t          j        | j        j        | j        j	        d          | _        t          j        | j        j        | j        j                  | _        d S )NFrQ   r   rS   )rA   rB   rO   rF   zerosr   correct_output_scalerU   rV   r&   correction_coefsprediction_coefsmodality_routerrY   r   router_normr[   s     r6   rB   zGemma3nAltUp.__init__   s    $&Hdk.E-G$H$H! "	K($+*FU!
 !
 !
 !#	K($+*F*IPU!
 !
 !
  "yK#T[%A 
  
  
 :((
 
 
r5   rI   r\   c                     |                      |          | j        j        dz  z  }|                     |                              t
          j                  }t          j        |          S )Ng      )r   rO   r   r   r   rF   float32tanh)rC   rI   router_inputsrouteds       r6   compute_router_modalitiesz&Gemma3nAltUp.compute_router_modalities   sU    ((++t{/F/LM%%m44;;BJGGwvr5   c                    |                      || j        j                           }| j        j                            t          j                  | j        _        | j        j        ?t          j	        | j        j        | j        j         | j        j                  | j        _         |                     |          j
        g |j        d d         | j        j        | j        j        R                      dddd          }|                    t          j                  }|                    dddd          }t          j        ||          }|                    dddd          }||z  }|                    |j                  S )Nr~   r   r   r   r   )r   rO   r)   r   weightr   rF   r   r'   clipr   r   r&   r   matmulr   )rC   rI   
modalities	all_coefsx_up
x_permutedpredictionss          r6   predictzGemma3nAltUp.predict   sg   33Adk6R4STT
'+'<'C'J'J2:'V'V$;&2+-7%,,,+, ,D!(D!!*--!#2#& , ,	   Yq!Q"" 	 xx
##^^Aq!Q//
i
I66!++Aq!Q77t!!!'***r5   r   	activatedc                    |                      |          }| j        j                            t          j                  | j        _        | j        j        ?t	          j        | j        j        | j        j         | j        j                  | j        _        |                     |          dz   }|| j        j	                 }||z
  }|
                    dd          }|d          |d         z  }||z  }|                    |j                  S )Nrh   r   r   ).N)r   r   r   r   rF   r   rO   r'   r   r)   moveaxisr   )rC   r   r   r   r   active_x
innovation	correcteds           r6   correctzGemma3nAltUp.correct  s    33I>>
'+'<'C'J'J2:'V'V$;&2+-7%,,,+, ,D!( ))*55;	t{;<)
&&q!,,	t$y';;	[ 		000r5   )r,   r-   r.   r`   r   rB   rF   ra   r   r   r   rK   rL   s   @r6   r   r      s        %%
z 
 
 
 
 
 
&28     
+ +bh + + + +:128 1 1 1 1 1 1 1 1 1r5   r   c            
            e Zd Zdededef fdZ	 	 	 ddej        de	ej                 de	e
         d	e	ej                 fd
Z xZS )Gemma3nDecoderLayerrO   rd   re   c                 h   t                                                       || _        |j        | _        || _        t          |||          | _        t          ||          | _        t          j
        | j        |j                  | _        t          j
        | j        |j                  | _        t          j
        | j        |j                  | _        t          j
        | j        |j                  | _        | j        j        | _        |j        | _        t%          |          | _        t)          |          | _        t          j        | j        | j        d          | _        t          j        | j        | j        d          | _        t          j
        | j        |j                  | _        d S )N)rd   ri   FrQ   )rA   rB   rO   r   rd   rc   	self_attnr   mlprU   rY   r   input_layernormpost_attention_layernormpre_feedforward_layernormpost_feedforward_layernormrl   r%   r   altuprN   laurelrV   per_layer_input_gateper_layer_projectionpost_per_layer_input_norm)rC   rO   rd   re   rD   s       r6   rB   zGemma3nDecoderLayer.__init__  s   !-")&)=OPPv333!z# 
  
  

 )+
#)
 )
 )
% *,#*
 *
 *
& +-*#+
 +
 +
' .3+1+M(!&))
(00$&Id>U%
 %
 %
! %'I,d.>U%
 %
 %
! *,#*
 *
 *
&&&r5   NrI   r{   r|   per_layer_inputc                 0   | j                             |          }|| j        j                 }|                     |          }|                     |          }|                     |||          }	|                     |	          }	||	z   }
|
|z   dz  }|                     |          }| 	                    |          }| 
                    |          }||z   }| j                             ||          }|| j        j                 }| j        j        r|| j         j        z  }|                     |          }t          j        |          }t#          j        ||          }|                     |          }|                     |          }|dd          |z   |dd <   |S )N;f?r   )r   r   rO   r)   r   r   r   r   r   r   r   r   r(   r   r   rU   r   rF   multiplyr   r   )rC   rI   r{   r|   r   r   active_predictionactive_prediction_normedlaurel_outputattn
attn_gatedattn_laurel	attn_normattn_ffwattn_ffw_normattn_ffw_laurel_gatedcorrected_predictionsfirst_predictions                     r6   rJ   zGemma3nDecoderLayer.__call__E  s    j((++'(DE#'#7#78I#J#J $<==~~$
 
 ,,T22&-
!M1i@22;??	88I&&77AA +m ; $
 2 2;@U V V01MN;* 	R/$*2QQ445EFF>*:;;;'7II445EFF99:JKK$9!""$=@P$Pabb!$$r5   NNNr   rL   s   @r6   r   r     s        '
z '
c '
t '
 '
 '
 '
 '
 '
X $(#.2,% ,%8,% rx ,% }	,%
 ""(+,% ,% ,% ,% ,% ,% ,% ,%r5   r   c                 >    t          j        || z            }|| z  }|S r@   )rF   r   )softcaprI   outs      r6   logit_softcapr   t  s#    
'!g+

C
-CJr5   c                        e Zd Zdef fdZ	 	 	 ddej        dej        fdZdej        dej        fd	Zd
ej        dej        dej        fdZ	d Z
 xZS )LanguageModelrO   c                 X    t                                                        _        j         _        j         _        j         _        j         _        j         _        j         _        j        j	        z
   _
        t          j        j        j                   _         fdt          j                  D              _        t          j        j        j        j        z             _        t          j        j        j        j        z  d           _        t          j        j        j                   _        fdt          d j        j                  D              _        fdt          d j        j                  D              _        t          j        j        j                   _        j                            d	           _        j                            d
           _        j         _        j        d  j
                 }t?          |          dz
  |d d d                             d
          z
  }t?          |          dz
  |d d d                             d	          z
  }g  _         tC           j        j                  D ]~\  }}| j
        k     r j         "                    |           +|d
k    r j         "                    |           L|d	k    r j         "                    |           mtG          d|           d S )Nc                 F    g | ]}t          ||j        k               S ))rO   rd   re   )r   first_kv_shared_layer_idx).0rd   rO   rC   s     r6   
<listcomp>z*LanguageModel.__init__.<locals>.<listcomp>  sK     
 
 
   ##,0N#N  
 
 
r5   FrQ   rS   c                 R    g | ]#}t          j        j        j        d           $S FrQ   rU   rV   r   r   r   rO   s     r6   r   z*LanguageModel.__init__.<locals>.<listcomp>  s@     "
 "
 "
 If(&*<5III"
 "
 "
r5   r   c                 R    g | ]#}t          j        j        j        d           $S r   r   r   s     r6   r   z*LanguageModel.__init__.<locals>.<listcomp>  s@     *
 *
 *
 If(&*<5III*
 *
 *
r5   ri   rg   full_attentionr~   Unknown layer type: )$rA   rB   rO   r   r%   r   r   r   r"   r   r   rU   	Embeddingembed_tokensrangelayersembed_tokens_per_layerrV   per_layer_model_projectionrY   r   per_layer_projection_normr&   altup_projectionsaltup_unembed_projectionsnormr#   indexfirst_sliding_idxfirst_full_idxr   lenlayer_idx_to_cache_idx	enumerateappendNotImplementedError)rC   rO   concrete_layersshared_full_idxshared_sliding_idxi
layer_typerD   s   ``     r6   rB   zLanguageModel.__init__|  ss   !-+1+M( +*0*K'!'!9'-'E$$v'BB 	& L):F<NOO
 
 
 
 
 #6#;<<
 
 
 ')l-$v'II'
 '
#
 +-)$v'II+
 +
 +
' *,3#*
 *
 *
&
"
 "
 "
 "
1dk:;;"
 "
 "

*
 *
 *
 *
1dk:;;*
 *
 *
&
 J#
 
 
	
 "(!3!9!9:M!N!N$0667GHH$3 ,-Mt/M-MN  1$ttt'<'B'BCS'T'TT 	   1$ttt'<'B'BCV'W'WW 	 ')#&t{'>?? 		S 		SMAz4111+2215555!111/66GGGG#666/667IJJJJ-.QZ.Q.QRRR		S 		Sr5   Nr   input_embeddingsc                    |!|                      |          | j        dz  z  }n|}|                     |          }|                     ||          }|d gt	          | j                  z  }t          ||| j                           }t          ||| j                 | j	                  }|t          j        dz  dd          dz  }g}	|	                    fd| j        D                        t          j        |	d	          }t          j        |d
d          dz  dd          dz  }
|d
d          |t          j        |
t          j        j                  j                  z  z  |d
d <   t'          | j                  D ]S\  }}|d d d d |d d f         }| j        j        |         dk    }|r|}n|} ||||| j        |                  |          }Tt          j        |d         dz  dd          dz  }t'          | j                  D ]\  }} |||d
z                      ||d
z   <    t          j        |d
d          dz  dd          dz  }
|d
d          |t          j        |
t          j        j                  j                  z  z  |d
d <   t          j        |d	          }|                     |          }| j                             |          }| j        t7          | j        |          }|S )N      ?)window_sizer   r~   Tr   c                 &    g | ]} |          S r4   r4   )r   projh0s     r6   r   z*LanguageModel.__call__.<locals>.<listcomp>  s!    CCCDttBxxCCCr5   r   )r   r   r   )r  r   get_per_layer_inputsproject_per_layer_inputsr  r  r   r  r  r   rF   r   extendr	  stackr   finfor   minr  rO   r#   r  r
  r  	as_linearr"   r   )rC   r   r|   r  hper_layer_inputsglobal_masksliding_window_masktarget_magnitudeh_listmagsr  layerr   	is_globalr{   r  r   r  s                     @r6   rJ   zLanguageModel.__call__  s>    #!!&))T-=s-BCAA A44V<<88<LMM=FS---E+$%&
 
 4$()+
 
 

  72q5rDAAASHCCCCD,BCCCDDDHV!$$$wquzT:::cA!"")BJtRXbh=O=O=S,T,TTU!""!$+.. 	 	HAu.qqq!!!Qz:O/26FFI +"*d1!45	 AA 71Q4192EEEL !?@@ 	& 	&GAttAa!eH~~Aa!eHHwquzT:::cA!"")BJtRXbh=O=O=S,T,TTU!""GAAiill))#..'3 <cBBC
r5   	input_idsr\   c                     || j         k     }t          j        ||t          j        |                    }|                     |          | j        dz  z  } |j        g |j        | j        | j        R  S )Nr  )	r   rF   where
zeros_liker  r%   r   r   r   )rC   r0  per_layer_inputs_masktokensresults        r6   r   z"LanguageModel.get_per_layer_inputs  s     )D,K K/BM)<T<TUU,,V44,c1
 v~ 
_
"
 ,
 
 
 	
r5   inputs_embedsr(  c                     |                      |          | j        dz  z  } |j        g |j        d d         | j        j        | j        j        R  }|                     |          }||z   dz  S )Ng      r~   r   )r  r   r   r   rO   r   r%   r  )rC   r7  r(  r   s       r6   r!  z&LanguageModel.project_per_layer_inputs  s    
  $>>}MMd" 
  <3;  
 "% 
K) 
 K3 
  
  

  $==>RSS$'77IFFr5   c                    g }| j         j        d | j                 D ]p}|dk    r"|                    t	                                 *|dk    r/|                    t          | j         j        d                     _t          d|           |S )Nr   rg   r   )max_sizekeepr  )rO   r#   r   r  r   r   r   r  )rC   cachesr  s      r6   
make_cachezLanguageModel.make_cache*  s    +12RD4R2RS 	O 	OJ---gii((((222#T[-GaPPP    **M*M*MNNNr5   r   )r,   r-   r.   r   rB   rF   ra   rJ   r   r!  r=  rK   rL   s   @r6   r   r   {  s       LSz LS LS LS LS LS LS`  %)	B BB (	B B B BH

bh 

28 

 

 

 

GxG (G 
	G G G G       r5   r   c                   `     e Zd Zdef fdZ	 	 ddej        deej                 fdZd Z	 xZ
S )	Gemma3nargsc                     t                                                       t          t                              |j                            | _        d S r@   )rA   rB   r   r   	from_dictr9   language_modelrC   r@  rD   s     r6   rB   zGemma3n.__init__9  s@    +J,@,@AQ,R,RSSr5   Nr   r  c                 2    |                      |||          S N)r|   r  )rC  rC   r   r|   r  s       r6   rJ   zGemma3n.__call__=  s)     ""%2B # 
 
 	
r5   c                 4    | j                                         S r@   )rC  r=  rC   s    r6   r=  zGemma3n.make_cacheG  s    "--///r5   r   )r,   r-   r.   r8   rB   rF   ra   r   rJ   r=  rK   rL   s   @r6   r?  r?  8  s        TY T T T T T T /3	
 

 #28,	
 
 
 
0 0 0 0 0 0 0r5   r?  c                   |     e Zd Zdef fdZ	 	 d
dej        deej                 fdZd Z	e
d             Zd	 Z xZS )Modelr@  c                     t                                                       || _        t          |          | _        |j        | _        d S r@   )rA   rB   r@  r?  modelr   rD  s     r6   rB   zModel.__init__L  s:    	T]]
/r5   Nr   r  c                 2    |                      |||          S rF  )rM  rG  s       r6   rJ   zModel.__call__R  s     zz&@PzQQQr5   c                     t          t          |                                                    }dD ]}|d                             |d            t	          t          |                    S )N)vision_toweraudio_towerembed_audioembed_visionrM  )r
   r   itemspopr:   r	   )rC   weightsks      r6   sanitizezModel.sanitizeZ  sb     gmmoo!6!677O 	* 	*AG  D))))L))***r5   c                 $    | j         j        j        S r@   )rM  rC  r  rI  s    r6   r  zModel.layers`  s    z(//r5   c                 4    | j                                         S r@   )rM  r=  rI  s    r6   r=  zModel.make_cached  s    z$$&&&r5   r   )r,   r-   r.   r8   rB   rF   ra   r   rJ   rX  propertyr  r=  rK   rL   s   @r6   rK  rK  K  s        *Y * * * * * * /3	R RR #28,	R R R R+ + + 0 0 X0' ' ' ' ' ' 'r5   rK  )(r   dataclassesr   	functoolsr   typingr   r   r   r   mlx.corecorerF   mlx.nnrU   	mlx.utilsr	   r
   rk   r   r   r   r|   r   r   r   r8   Moduler<   rN   rc   compiler   r   r   r   r   r   r?  rK  r4   r5   r6   <module>re     sK    ! ! ! ! ! !       , , , , , , , , , , , ,             2 2 2 2 2 2 2 2 T T T T T T T T T T + + + + + + + + ( ( ( ( ( ( ( (8        
3 3 3 3 3 3 3 3# # # # # # # #2K# K# K# K# K#ry K# K# K#\ 	t$$$< < %$<    ")   BL1 L1 L1 L1 L129 L1 L1 L1^V% V% V% V% V%") V% V% V%r 	t$$$  %$z z z z zBI z z zz0 0 0 0 0bi 0 0 0&' ' ' ' 'BI ' ' ' ' 'r5   