
    )jJ                     B   d dl mZ d dlmZmZmZmZ d dlmZ	 d dl
mZ ddlmZ ddlmZmZmZmZ ddlmZmZ ddlmZ dd	lmZ dd
lmZ e G d de                      Z G d dej                  Z G d dej                  Z G d dej                  Z  G d dej                  Z! G d dej                  Z" G d dej                  Z# G d dej                  Z$ G d dej                  Z% G d dej                  Z& G d d ej                  Z'dS )!    )	dataclass)AnyListOptionalTupleN   )swiglu)BaseModelArgscreate_attention_maskcreate_ssm_maskscaled_dot_product_attention)ArraysCacheKVCache)initialize_rope)
ssm_update)	SwitchGLUc                   $   e Zd ZU eed<   eed<   eed<   eed<   eed<   eed<   eed<   eed<   eed	<   eed
<   eed<   eed<   eed<   ee         ed<   eed<   eed<   dZ	e
e         ed<   dZe
e         ed<   dZe
e         ed<   dZe
e         ed<   dZe
e         ed<   dZe
e         ed<   dZe
e         ed<   dZe
e         ed<   dZe
e         ed<   dZe
e         ed<   dZeed<   dZeed<   d Zeed!<   d"Zeeef         ed#<   ed$efd%            ZdS )&	ModelArgs
model_type
vocab_sizehidden_sizeintermediate_sizenum_hidden_layersmax_position_embeddingsnum_attention_headsnum_key_value_headsattention_biasembedding_multiplierattention_multiplierlogits_scalingresidual_multiplierlayer_typesrms_norm_eps
rope_thetaNnum_local_expertsnum_experts_per_tokshared_intermediate_sizemamba_n_headsmamba_d_headmamba_proj_biasmamba_d_statemamba_d_convmamba_n_groupsmamba_conv_biasFmlp_biasropeposition_embedding_typeTtie_word_embeddings)gMbP?g      Y@time_step_limitreturnc                 *    t          | j                  S N)boolr%   selfs    h/lsinfo/ai/hellotax_ai/base_platform/venv/lib/python3.11/site-packages/mlx_lm/models/granitemoehybrid.pyuse_moezModelArgs.use_moeB   s    D*+++    )__name__
__module____qualname__str__annotations__intr7   floatr   r%   r   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r1   r2   r3   r   propertyr;    r<   r:   r   r      s         OOOOOO    c (,x}+++)-#---.2hsm222 $(M8C='''"&L(3-&&&&*OXd^***#'M8C='''"&L(3-&&&$(NHSM(((&*OXd^*** Hd $*S))) $$$$+9OU5%<(999 , , , , X, , ,r<   r   c                   ^     e Zd Zd
dedef fdZddej        dej        dej        fd	Z xZ	S )GraniteMoeHybridRMSNormGatedư>r   epsc                     t                                                       || _        t          j        |          | _        d S r6   )super__init__rI   mxonesweight)r9   r   rI   	__class__s      r:   rL   z%GraniteMoeHybridRMSNormGated.__init__H   s5    gk**r<   Nhidden_statesgater4   c                 |    |t          ||          }t          j                            || j        | j                  S r6   )r	   rM   fastrms_normrO   rI   )r9   rQ   rR   s      r:   __call__z%GraniteMoeHybridRMSNormGated.__call__M   s5    "477Mwt{DHEEEr<   )rH   r6   )
r=   r>   r?   rB   rC   rL   rM   arrayrV   __classcell__rP   s   @r:   rG   rG   G   s        + +C +e + + + + + +
F Fbh Fbh F"( F F F F F F F Fr<   rG   c                   <    e Zd Zdef fdZdej        dee         deej                 dej        fdZ	dej        d	ej        d
ej        dej        dee         deej                 dej        fdZ
	 ddej        deej                 dee         dej        fdZ xZS )GraniteMoeHybridMamba2Mixerargsc                    t                                                       |j        | _        |j        | _        |j        | _        |j        | _        |j        |j	        z  | _
        |j        | _        |j	        | _        |j        | _        | j        | j        z  | _        | j
        d| j        z  | j        z  z   | _        t#          j        | j        | j        |j        d| j        |j                  | _        | j
        | j        z   | j        z   }t#          j        | j        ||j                  | _        t1          j        | j                  | _        t1          j        t1          j        d| j        dz   t0          j                            | _        t1          j        | j                  | _        tA          | j
        |j!                  | _"        t#          j        | j
        | j        |j                  | _#        d S )N   r   )in_channelsout_channelskernel_sizepaddinggroupsbiasrd   r   dtyperI   )$rK   rL   r(   	num_headsr   r+   ssm_state_sizer,   conv_kernel_sizer)   r   r-   n_groupshead_dimr3   heads_per_groupconv_dimnnConv1dr.   conv1dLinearr*   in_projrM   rN   dt_biaslogarangefloat32A_logDrG   r#   normout_proj)r9   r\   projection_sizerP   s      r:   rL   z$GraniteMoeHybridMamba2Mixer.__init__T   s   ++"0 $ 1!%!3d6G!G+)#3#~>.T]1BTEX1XXi)=%
 
 
 04=@4>QyoD4H
 
 
 wt~..VBIa!);2:NNNOO
((0"(9
 
 
	 	"D$44;O
 
 
r<   
conv_inputcachemaskr4   c                    |t          j        |d         |d          }||d         7t          j        |j        d         | j        dz
  | j        f|j                  }n|d         }t          j        ||gd          }| j        dz
  }|j        m|j        d         }t          j	        |j        d||z
            }|d d d f         t          j
        |          z   d         }	t          j        ||	d          |d<   n8|d d | d d d f         |d<   n"t          j        |d| j        dz
  dfdg          }|                     |          }
t          j        |
          S )N.Nr   r   rf   axis)r   r   )rM   wherezerosshaperk   ro   rg   concatenatelengthscliprw   take_along_axispadrr   rp   silu)r9   r~   r   r   
conv_statepadded_inputn_keeptends	positionsconv_outputs              r:   _convz!GraniteMoeHybridMamba2Mixer._conv{   s}    $y/:qAAJQxX%a($*?!*CT]S$*  


 #1X
>:z*BKKKL*Q.F}( &q)wu}aV<<!!!!T']RYv->->>	J	-lIANNNa'F788QQQ7a6Vd&;a&?%CVL L kk,//w{###r<   rQ   BCdtc                    |j         \  }}}	|                    ||| j        | j                  }|                    ||| j        | j                  }|                    ||| j        | j                  }|r|d         }
|j        }nd\  }
}t          || j        ||| j	        
                    |j                  || j        |
| j        |
  
        \  }}
|r|
|d<   |                    ||| j                  S )Nr   NN)r   reshaperi   rm   rl   rj   r   r   ry   rz   astyperg   ru   r3   r   )r9   rQ   r   r   r   r   r   
batch_sizeseq_len_stater   ys                r:   _ssmz GraniteMoeHybridMamba2Mixer._ssm   s    "/!4
GQ%--
 
 IIj'4=$:MNNIIj'4=$:MNN 	(!HEmGG'NE7JFMM--..L 
 
5  	E!HyyWd.DEEEr<   Nc                    |                      |          }t          j        || j        | j        | j        z   gd          \  }}}|                     |||          }t          j        || j        | j        | j        | j        z  z   gd          \  }	}
}|                     |	|
||||          }|r |	                    |j
        d                    |                     ||          }|                     |          S )Nr   r   )rt   rM   splitr   ro   r   rl   rj   r   advancer   r{   r|   )r9   rQ   r   r   	projectedrR   r~   r   r   hidden_states_ssmr   r   r   s                r:   rV   z$GraniteMoeHybridMamba2Mixer.__call__   s    LL//	!x#T%;dm%KL 
  
  
j"
 jjUD99"$(&&9L)LL #
 #
 #
1a II'Ar5$?? 	&MM!'!*%%%IIa}}Qr<   r6   )r=   r>   r?   r   rL   rM   rW   r   r   r   r   rV   rX   rY   s   @r:   r[   r[   S   s^       %
Y %
 %
 %
 %
 %
 %
N $H $ $ $ rx 	 $
 
 $  $  $  $D%Fx%F 8%F 8	%F
 H%F $%F rx %F 
%F %F %F %FV (,	   x  rx   $	 
 
               r<   r[   c            	       x     e Zd Zdef fdZ	 	 d	dej        deej                 dee         dej        fdZ	 xZ
S )
GraniteMoeHybridAttentionr\   c                 H   t                                                       |j        }|j        x| _        }|j        x| _        }|j        |z  x| _        }|j        | _	        |j
        }t          j        |||z  |          | _        t          j        |||z  |          | _        t          j        |||z  |          | _        t          j        ||z  ||          | _        |j        dk    }|r)t%          | j        |j        dd |j                  | _        d S d | _        d S )Nre   nopeF)rK   rL   r   r   n_headsr   
n_kv_headsrm   r   scaler   rp   rs   q_projk_projv_projo_projr1   r   r$   r   r0   )	r9   r\   dimr   r   rm   r   use_roperP   s	           r:   rL   z"GraniteMoeHybridAttention.__init__   s(   !%!99w'+'??*#'#3w#>>.
,iWx%7nMMMiZ(%:PPPiZ(%:PPPi( 2CnMMM /69 		', DIII DIIIr<   Nxr   r   r4   c                    |j         \  }}}|                     |          |                     |          |                     |          }	}}|                    ||| j        d                              dddd          }|                    ||| j        d                              dddd          }|	                    ||| j        d                              dddd          }	| j        e|9|                     ||j	                  }|                     ||j	                  }n*|                     |          }|                     |          }||
                    ||	          \  }}	t          |||	|| j        |          }
|
                    dddd                              ||d          }
|                     |
          S )Nr   r   r^   r      )offset)r   r   r   )r   r   r   r   r   r   	transposer   r0   r   update_and_fetchr   r   r   )r9   r   r   r   r   Lrz   querieskeysvaluesoutputs              r:   rV   z"GraniteMoeHybridAttention.__call__  s    '1a $AAAv//!Qb99CCAq!QOO||Aq$/266@@Aq!LL1dor::DDQ1aPP 9  ))GEL)AAyyely;;))G,,yy 11$??LD&-T6djt
 
 
 !!!Q1--55aB??{{6"""r<   r   )r=   r>   r?   r   rL   rM   rW   r   r   rV   rX   rY   s   @r:   r   r      s        Y      B $(#'	# #8# rx #  	#
 
# # # # # # # #r<   r   c                   B     e Zd Zdededef fdZdej        fdZ xZS )GraniteMoeHybridTopKGating
input_sizenum_expertstop_kc                     t                                                       || _        || _        || _        t          j        ||d          | _        d S NFre   )rK   rL   r   r   r   rp   rs   layer)r9   r   r   r   rP   s       r:   rL   z#GraniteMoeHybridTopKGating.__init__#  sK    &$
Yz;UCCC


r<   rQ   c                     |                      |          }t          j        || j         d          d| j         d f         }t          j        ||d          }t          j        |dd          }||fS )Nr   )kthr   .r   T)preciser   )r   rM   argpartitionr   r   softmax)r9   rQ   logits	top_k_idxtop_k_logitstop_k_gatess         r:   rV   z#GraniteMoeHybridTopKGating.__call__*  s|    M**OF"EEE$*
	 )&)"EEEjt"EEE+%%r<   )	r=   r>   r?   rB   rL   rM   rW   rV   rX   rY   s   @r:   r   r   "  sx        D3 DS D D D D D D D&bh & & & & & & & &r<   r   c                   H     e Zd Zdef fdZdej        dej        fdZ xZS )GraniteMoeHybridMoEr\   c                    t                                                       |j        | _        |j        | _        t          | j        | j        |j                  | _        t          | j        |j        |j	                  | _
        d S )N)r   r   r   )rK   rL   r   r   r   r   r%   
switch_mlpr   r&   routerr9   r\   rP   s     r:   rL   zGraniteMoeHybridMoE.__init__5  s{    *1#OT-t/E
 
 1.*
 
 
r<   r   r4   c                     |                      |          \  }}|                     ||          }||d         z                      d          S )Nr   r   )r   r   sum)r9   r   	token_idsgatesr   s        r:   rV   zGraniteMoeHybridMoE.__call__C  sJ    ;;q>>	5OOAy))E)$$))r)222r<   	r=   r>   r?   r   rL   rM   rW   rV   rX   rY   s   @r:   r   r   4  sj        
Y 
 
 
 
 
 
3"( 3rx 3 3 3 3 3 3 3 3r<   r   c                   H     e Zd Zdef fdZdej        dej        fdZ xZS )GraniteMoeHybridSharedMLPr\   c                     t                                                       t          j        |j        |j        dz  d          | _        t          j        |j        |j        d          | _        d S )Nr^   Fre   )rK   rL   rp   rs   r   r'   input_linearoutput_linearr   s     r:   rL   z"GraniteMoeHybridSharedMLP.__init__J  sr    Id;a?e
 
 
  Y)4+;%
 
 
r<   r   r4   c                     t          j        |                     |          dd          \  }}|                     t	          ||                    S )Nr^   r   r   )rM   r   r   r   r	   )r9   r   rR   ups       r:   rV   z"GraniteMoeHybridSharedMLP.__call__S  sH    8D--a00!"===b!!&r"2"2333r<   r   rY   s   @r:   r   r   I  sj        
Y 
 
 
 
 
 
4"( 4rx 4 4 4 4 4 4 4 4r<   r   c                   :     e Zd Zdef fdZdej        fdZ xZS )GraniteMoeHybridMLPr\   c                    t                                                       |j        }|j        }|j        }t          j        |||          | _        t          j        |||          | _        t          j        |||          | _	        d S )Nre   )
rK   rL   r   r   r/   rp   rs   	gate_proj	down_projup_proj)r9   r\   r   
hidden_dimr/   rP   s        r:   rL   zGraniteMoeHybridMLP.__init__Y  s|    +
=3
BBB:sBBByjx@@@r<   r4   c                     |                      t          |                     |          |                     |                              S r6   )r   r	   r   r   )r9   r   s     r:   rV   zGraniteMoeHybridMLP.__call__c  s4    ~~fT^^A%6%6QHHIIIr<   r   rY   s   @r:   r   r   X  sq        AY A A A A A AJRX J J J J J J J Jr<   r   c            	       |     e Zd Zdedef fdZ	 	 d
dej        deej                 dee	         dej        fd	Z
 xZS )GraniteMoeHybridLayerr\   
layer_typec                 2   t                                                       || _        |j        | _        |j        | _        t          j        |j        |j                  | _	        |dk    rt          |          | _        n-|dk    rt          |          | _        nt          d|           | j        r)t          |          | _        t#          |          | _        nt'          |          | _        t          j        |j        |j                  | _        d S )Nrh   mamba	attentionzUnknown layer type: )rK   rL   r   r!   r;   rp   RMSNormr   r#   input_layernormr[   r   r   	self_attn
ValueErrorr   
shared_mlpr   block_sparse_moer   mlppost_attention_layernorm)r9   r\   r   rP   s      r:   rL   zGraniteMoeHybridLayer.__init__h  s	   $#'#; |!z$*:@QRRR  4T::DJJ;&&6t<<DNN@J@@AAA < 	17==DO$7$=$=D!! +400DH(*
$"3)
 )
 )
%%%r<   Nr   r   r   r4   c                    |}|                      |          }| j        dk    r|                     |||          }n|                     |||          }||| j        z  z   }|}|                     |          }| j        r0|                     |          }|                     |          }||z   }	n| 	                    |          }	||	| j        z  z   }|S )Nr   r   r   )
r   r   r   r   r!   r   r;   r   r   r   )
r9   r   r   r   residualrQ   normedmoe_out
shared_outmlp_outs
             r:   rV   zGraniteMoeHybridLayer.__call__  s     ,,Q//?g%% JJ}4uJMMMM NN=t5NQQM =43K#KK !..}==< 	'++F33G00J
*GGhhv&&G 7T-E#EEr<   r   )r=   r>   r?   r   r@   rL   rM   rW   r   r   rV   rX   rY   s   @r:   r   r   g  s        
Y 
C 
 
 
 
 
 
< $(#	 8 rx  }	
 
       r<   r   c                   \     e Zd Zdef fdZ	 ddej        dee         dej        fdZ	 xZ
S )	GraniteMoeHybridModelr\   c                    t                                                       | _        t          j        j        j                  | _        fdj        D             | _	        t          j
        j        j                  | _        j        | _        dj        v rj                            d          nd | _        dj        v rj                            d          nd | _        d S )Nc                 0    g | ]}t          |          S rE   )r   ).0r   r\   s     r:   
<listcomp>z2GraniteMoeHybridModel.__init__.<locals>.<listcomp>  s1     
 
 
8B!$
33
 
 
r<   rh   r   r   )rK   rL   r\   rp   	Embeddingr   r   embed_tokensr"   layersr   r#   r{   r   indexfa_idxssm_idxr   s    `r:   rL   zGraniteMoeHybridModel.__init__  s    	L$:JKK
 
 
 
FJFV
 
 
 Jt/T5FGGG	$($=!
 d... "";/// 	 07$:J/J/JD""7+++PT 	r<   Ninputsr   r4   c                    |                      |          | j        z  }|d gt          | j                  z  }d }d }| j        t          ||| j                           }| j        t          ||| j                           }t          | j        |          D ]"\  }}|j	        dk    r|n|} ||||          }#| 
                    |          S )Nr   r   )r	  r   lenr
  r  r   r  r   zipr   r{   )	r9   r  r   rQ   	attn_mask
mamba_maskr   cr   s	            r:   rV   zGraniteMoeHybridModel.__call__  s    
 ))&11D4MM=FS---E 	
;"-mU4;=OPPI<#(dl8KLLJDK// 	E 	EHE1 % 0K ? ?99ZD!E-d!DDDMMyy'''r<   r6   )r=   r>   r?   r   rL   rM   rW   r   r   rV   rX   rY   s   @r:   r  r    s        
Y 
 
 
 
 
 
.  $( (( }( 
	( ( ( ( ( ( ( (r<   r  c                        e Zd Zdef fdZ	 ddej        dee         dej        fdZ	e
d             Zd	 Zd
 Ze
d             Z xZS )Modelr\   c                    t                                                       || _        |j        | _        t	          |          | _        |j        s&t          j        |j	        |j
        d          | _        |j        | _        d S r   )rK   rL   r\   r   r  modelr2   rp   rs   r   r   lm_headr    r   s     r:   rL   zModel.__init__  sr    	/*400
' 	T9T%5tUSSSDL"1r<   Nr  r   r4   c                     |                      ||          }| j        j        r | j         j                            |          }n|                     |          }|| j        z  S )N)r   )r  r\   r2   r	  	as_linearr  r    )r9   r  r   outs       r:   rV   zModel.__call__  s_    
 jjuj--9( 	$*)33C88CC,,s##CT(((r<   c                     | j         j        S r6   )r  r
  r8   s    r:   r
  zModel.layers  s    z  r<   c                     g }| j         D ]]}|j        dk    r$|                    t          d                     1|j        dk    r!|                    t	                                 ^|S )Nr   r^   )sizer   )r
  r   appendr   r   )r9   cachesr   s      r:   
make_cachezModel.make_cache  sp    [ 	) 	)E7**kq1112222![00gii(((r<   c                    |                                 D ]3\  }}d|v r*|j        d         dk    r|                    dd          ||<   4| j        j        rd|v rt          | j        j                  D ]}d| d}|                    | d          }|j        \  }}}|d d d |dz  d d f         }	|d d |dz  d d d f         }
|	|| d	<   |
|| d
<   |                    | d          || d<   n| j        j        sd|v rt          | j        j                  D ]k}d| d}|                    | d          }t          j	        |dd          \  }	}
|	|d| d<   |
|d| d<   |                    | d          |d| d<   l|S )Nzconv1d.weightr   r   r^   z3model.layers.0.block_sparse_moe.input_linear.weightzmodel.layers.z.block_sparse_moez.input_linear.weightz.switch_mlp.gate_proj.weightz.switch_mlp.up_proj.weightz.output_linear.weightz.switch_mlp.down_proj.weightz-model.layers.0.shared_mlp.input_linear.weightz.shared_mlpr   r   z.mlp.gate_proj.weightz.mlp.up_proj.weightz.mlp.down_proj.weight)
itemsr   moveaxisr\   r;   ranger   poprM   r   )r9   weightskvlprefixinput_weightr   expert_hiddenr   r   s              r:   sanitizezModel.sanitize  sK   MMOO 	. 	.DAq!##q(8(8ZZ1--
 I%	EPP49677  ====&{{f+J+J+JKK&2&8#=! ),@mq.@,@!!!)CD	&qqq-1*<*>*>'ABCL6???@AH6===>CJ;;444D D6???@@$ 	!	?7JJ49677  7777  '{{f+J+J+JKK%'XlAA%F%F%F"	7DM@@@@ABI>>>>?DKKK444E E@@@@AA r<   c                       fd}|S )Nc                 T    j         j        r|                     d          rdddS dS )Nzrouter.layer@      )
group_sizebitsT)r\   r;   endswith)pathr   r9   s     r:   	predicatez(Model.quant_predicate.<locals>.predicate*  s6    y  5T]]>%B%B 5&(!4444r<   rE   )r9   r8  s   ` r:   quant_predicatezModel.quant_predicate(  s$    	 	 	 	 	
 r<   r6   )r=   r>   r?   r   rL   rM   rW   r   r   rV   rD   r
  r"  r/  r9  rX   rY   s   @r:   r  r    s        2Y 2 2 2 2 2 2  $) )) }) 
	) ) ) ) ! ! X!  . . .`   X    r<   r  )(dataclassesr   typingr   r   r   r   mlx.corecorerM   mlx.nnrp   activationsr	   baser
   r   r   r   r   r   r   
rope_utilsr   ssmr   switch_layersr   r   ModulerG   r[   r   r   r   r   r   r   r  r  rE   r<   r:   <module>rE     s   " ! ! ! ! ! - - - - - - - - - - - -                              ( ' ' ' ' ' ' ' ' ' ' ' ' '       $ $ $ $ $ $ -, -, -, -, -, -, -, -,`	F 	F 	F 	F 	F29 	F 	F 	FL  L  L  L  L ") L  L  L ^=# =# =# =# =#	 =# =# =#@& & & & & & & &$3 3 3 3 3") 3 3 3*4 4 4 4 4	 4 4 4J J J J J") J J J: : : : :BI : : :z,( ,( ,( ,( ,(BI ,( ,( ,(^\ \ \ \ \BI \ \ \ \ \r<   