
    )j2                        d dl mZ d dlmZ d dlmZmZmZ d dlm	Z
 d dlmZ d dlmZmZmZ ddlmZmZmZ ddlmZ e G d	 d
e                      Zed             Z G d dej                  Z G d dej                  Z G d dej                  Z G d dej                  Z G d dej                  Z G d dej                  ZdS )    )	dataclass)	lru_cache)AnyListOptionalN)shard_inplaceshard_linearsum_gradients   )BaseModelArgscreate_attention_maskscaled_dot_product_attention)	SwitchGLUc                       e Zd ZU eed<   eed<   eed<   eed<   eed<   eed<   eed<   eed<   eed	<   eed
<   eed<   eed<   eed<   eed<   dZeed<   dZ	eed<   dZ
ee         ed<   dZeed<   dS )	ModelArgs
model_typehidden_sizeintermediate_sizenum_attention_headsnum_key_value_headsmax_position_embeddingsnum_experts_per_toknum_local_expertsshared_intermediate_sizenum_hidden_layersrms_norm_eps
rope_theta
rotary_dim
vocab_sizeFtie_word_embeddingssigmoidscoring_funcNhead_dimTuse_qk_norm)__name__
__module____qualname__str__annotations__intfloatr    boolr"   r#   r   r$        _/lsinfo/ai/hellotax_ai/base_platform/venv/lib/python3.11/site-packages/mlx_lm/models/minimax.pyr   r      s         OOO    !!!!OOOOOO %%%%!L#!!!"Hhsm"""Kr.   r   c                 p     t           j        d             t           j         fd            fd}|S )Nc                     |                      t          j                                                                      dd          S )NT)keepdims)astypemxfloat32squaresum)xs    r/   _cast_square_sumz*sharded_rms_norm.<locals>._cast_square_sum'   s5    xx
##**,,00d0CCCr.   c                 4   t           j                            |          }t          j        || j        d                                         z  z  |z             }|                     t           j                  |z  |z                      | j                  S )Ngroupr2   )	r5   distributedall_sumrsqrtshapesizer4   r6   dtype)r9   norm2wepsnormr=   s        r/   
_normalizez$sharded_rms_norm.<locals>._normalize+   sw    &&uE&::xuzz||!;<sBCC$$t+a/77@@@r.   c                 2     |  |           ||          S Nr-   )r9   rE   rF   r:   rH   s      r/   _inner_sharded_rms_normz1sharded_rms_norm.<locals>._inner_sharded_rms_norm4   s%    z!--a00!S999r.   )r5   compile)r=   rK   r:   rH   s   ` @@r/   sharded_rms_normrM   %   su    ZD D ZD ZA A A A ZA: : : : : : #"r.   c                        e Zd Z	 ddededeej        j                 f fdZ	d Z
d Zedd	deej        j                 fd
            Z xZS )ShardedRMSNormh㈵>NdimsrF   r=   c                    t                                                       |pt          j                                        }t          j        ||                                z  f          | _        || _        || _	        d S rJ   )
super__init__r5   r>   initonesrB   weightr=   rF   )selfrQ   rF   r=   	__class__s       r/   rT   zShardedRMSNorm.__init__;   sf     	.,,..gtuzz||3566
r.   c                 n    | j         j        d         | j                                        z   d| j         S )Nr   z, eps=)rW   rA   r=   rB   rF   rX   s    r/   _extra_reprzShardedRMSNorm._extra_reprD   s3    +#A&):)::LL$(LLLr.   c                 V     t          | j                  || d         | j                  S )NrW   )rM   r=   rF   )rX   r9   s     r/   __call__zShardedRMSNorm.__call__G   s'    +
++AtH~txHHHr.   r<   c                    | |j         j        d         |j        |          }t          j        t          j        |j         |                                d          |                                                   |_         |S )Nr   r<   r2   axis)rW   rA   rF   r5   
contiguoussplitrB   rank)clsnorm_moduler=   sns       r/   from_rms_normzShardedRMSNorm.from_rms_normJ   sm     S#)!,koUKKKMH['B???

M
 
	 	r.   )rP   N)r%   r&   r'   r*   r+   r   r5   r>   GrouprT   r\   r^   classmethodrh   __classcell__rY   s   @r/   rO   rO   :   s        TX #3;BN<P3Q     M M MI I I EI  $,R^-A$B   [    r.   rO   c            	       x     e Zd Zdef fdZ	 	 d	dej        deej                 dee         dej        fdZ	 xZ
S )
MiniMaxAttentionargsc                 V   t                                                       |j        x| _        }|j        | _        |j        | _        |j        p	||j        z  x| _        }|dz  | _        t          j	        |j        | j        |z  d          | _
        t          j	        |j        | j        |z  d          | _        t          j	        |j        | j        |z  d          | _        t          j	        | j        |z  |j        d          | _        t          |d          r|j        nd| _        | j        rPt          j        || j        z  |j                  | _        t          j        || j        z  |j                  | _        t          j        |j        d|j                  | _        d S )Ng      Fbiasr$   rF   )traditionalbase)rS   rT   r   
hidden_dimr   r   r#   scalennLinearq_projk_projv_projo_projhasattrr$   RMSNormr   q_normk_normRoPEr   r   rope)rX   ro   r   r#   rY   s       r/   rT   zMiniMaxAttention.__init__W   s   (,(88+#'#; #'#; MD[D,DD	
 t^
id6A
 
 
 id6A
 
 
 id6A
 
 
 i$x/1A
 
 
 07t]/K/KV4++QV 	*4339J  DK *4339J  DK GDOT_UUU			r.   Nr9   maskcachereturnc                    |j         \  }}}|                     |          |                     |          |                     |          }	}}| j        r*|                     |          }|                     |          }|                    ||| j        d          	                    dddd          }|                    ||| j
        d          	                    dddd          }|	                    ||| j
        d          	                    dddd          }	|R|                     ||j                  }|                     ||j                  }|                    ||	          \  }}	n*|                     |          }|                     |          }t          |||	|| j        |          }
|
	                    dddd                              ||d          }
|                     |
          S )Nr2   r      r      )offset)r   rw   r   )rA   rz   r{   r|   r$   r   r   reshaper   	transposer   r   r   update_and_fetchr   rw   r}   )rX   r9   r   r   BLDquerieskeysvaluesoutputs              r/   r^   zMiniMaxAttention.__call__{   s    '1a $AAAv 	%kk'**G;;t$$D//!Q(@"EEOOq!Q
 
 ||Aq$":B??II!QPQSTUU1d&>CCMMq!Q
 
 iii==G99T%,977D 11$??LD&&ii((G99T??D-T6djt
 
 
 !!!Q1--55aB??{{6"""r.   NNr%   r&   r'   r   rT   r5   arrayr   r   r^   rk   rl   s   @r/   rn   rn   V   s        "VY "V "V "V "V "V "VN $(#	$# $#8$# rx $# }	$#
 
$# $# $# $# $# $# $# $#r.   rn   c                   H     e Zd Zdef fdZdej        dej        fdZ xZS )MiniMaxSparseMoeBlockro   c                 B   t                                                       |j        | _        t          j        |j        |j        d          | _        t          |j        |j	        |j                  | _
        t          j        |j        f          | _        d | _        d S NFrq   )rS   rT   r   rx   ry   r   r   gater   r   
switch_mlpr5   zerose_score_correction_biassharding_grouprX   ro   rY   s     r/   rT   zMiniMaxSparseMoeBlock.__init__   s    #'#; Id.0FUSSS	#d4d6L
 
 (*x1G0I'J'J$"r.   r9   r   c                    | j          t          | j                   |          }|                     |                    t          j                            }t	          j        |          }|}|| j        z   }| j        }t	          j	        | |dz
  d          dd |f         }t	          j
        ||d          }|t	          j        |dd          dz   z  }|                    |j                  }|                     ||          }||d	         z                      d
          }| j         &t          j                            || j                   }|S )Nr   r2   )kthra   .r`   T)ra   r3   g#B;).Nr<   )r   r
   r   r4   r5   r6   r!   r   r   argpartitiontake_along_axisr8   rC   r   r>   r?   )rX   r9   gatesscoresorig_scoreskindsys           r/   r^   zMiniMaxSparseMoeBlock.__call__   s>   *2d122155A		!((2:..//E""$66$wAE;;;C!GD#KB???26&rDAAAEIJqw''OOAt$$	""''R'00*&&q0C&DDAr.   )	r%   r&   r'   r   rT   r5   r   r^   rk   rl   s   @r/   r   r      sj        	#Y 	# 	# 	# 	# 	# 	#"( rx        r.   r   c            	       x     e Zd Zdef fdZ	 	 d	dej        deej                 dee         dej        fdZ	 xZ
S )
MiniMaxDecoderLayerro   c                 ,   t                                                       t          |          | _        t	          |          | _        t          j        |j        |j	                  | _
        t          j        |j        |j	                  | _        d S )Nrs   )rS   rT   rn   	self_attnr   block_sparse_moerx   r   r   r   input_layernormpost_attention_layernormr   s     r/   rT   zMiniMaxDecoderLayer.__init__   s    )$// 5d ; ;!z$*:@QRRR(*
$"3)
 )
 )
%%%r.   Nr9   r   r   r   c                     ||                      |                     |          ||          z   }||                     |                     |                    z   }|S rJ   )r   r   r   r   )rX   r9   r   r   rs        r/   r^   zMiniMaxDecoderLayer.__call__   sW     t33A66eDDD%%d&C&CA&F&FGGGr.   r   r   rl   s   @r/   r   r      s        

Y 

 

 

 

 

 

 $(#	 8 rx  }	
 
       r.   r   c            	       x     e Zd Zdef fdZ	 	 d	dej        deej                 dee         dej        fdZ	 xZ
S )
MiniMaxModelro   c                 &   t                                                       t          j        j        j                  | _        fdt          j                  D             | _	        t          j
        j        j                  | _        d S )Nc                 0    g | ]}t                     S ))ro   )r   ).0_ro   s     r/   
<listcomp>z)MiniMaxModel.__init__.<locals>.<listcomp>   s2     
 
 
/0T***
 
 
r.   rs   )rS   rT   rx   	Embeddingr   r   embed_tokensranger   layersr   r   rG   r   s    `r/   rT   zMiniMaxModel.__init__   s    L$:JKK
 
 
 
49$:P4Q4Q
 
 
 Jt/T5FGGG			r.   Ninputsr   r   r   c                    |                      |          }|d gt          | j                  z  }t          ||d                   }t	          | j        |          D ]\  }} ||||          }|                     |          S )Nr   )r   lenr   r   ziprG   )rX   r   r   r   hlayercs          r/   r^   zMiniMaxModel.__call__   s     f%%=FS---E$Qa11DK// 	" 	"HE1aq!!AAyy||r.   r   r   rl   s   @r/   r   r      s        HY H H H H H H $(#	  rx  }	
 
       r.   r   c                        e Zd Zdef fdZ	 	 ddej        deej                 dee         fdZ	d Z
dd	eej        j                 fd
Zed             Zed             Zed             Z xZS )Modelro   c                     t                                                       || _        |j        | _        t	          |          | _        |j        s(t          j        |j	        |j
        d          | _        d S d S r   )rS   rT   ro   r   r   modelr    rx   ry   r   r   lm_headr   s     r/   rT   zModel.__init__   sq    	/!$''
' 	T9T%5tUSSSDLLL	T 	Tr.   Nr   r   r   c                     |                      |||          }| j        j        r | j         j                            |          }n|                     |          }|S )N)r   r   r   )r   ro   r    r   	as_linearr   )rX   r   r   r   outs        r/   r^   zModel.__call__  sX     jjTj??9( 	$*)33C88CC,,s##C
r.   c                    d }i }                                 D ]D\  }}d|v r2|}|                    dd          }|         } |||          }|||<   ;||vr|||<   E|dvrS t          | j        j                  D ]u}	d|	 ddd	d
}
|
                                 D ]S\  } d dv rDfdt          | j        j                  D             }t          j        |           d| d<   TvS )z3Dequantize FP8 weights and restructure MoE experts.c                    t           j        }t          j        | t           j                  } d}| j        \  }}| |z  }| |z  }t          j        | d|fd|ff          } |                     ||z   |z  |||z   |z  |f          } | |d d d d d d f         z                      ||z   ||z             } | d |d |f                             |          S )N)rC      r   )r5   bfloat16from_fp8rA   padr   r4   )rW   	scale_invrC   bsmn
pad_bottompad_sides           r/   dequantzModel.sanitize.<locals>.dequant  s    KE[r{;;;FB<DAq"JbyHVFa_q(m$DEEF^^j.R'a(lr-A2F F yD!!!T)9::CCJH F "1"bqb&>((///r.   weight_scale_inv
_scale_inv z3model.layers.0.block_sparse_moe.experts.0.w1.weightzmodel.layers.	gate_proj	down_projup_proj)w1w2w3z.block_sparse_moe.experts.0..weightc           
      L    g | ] }                      d | d d          !S )z.block_sparse_moe.experts..r   )pop)r   e	orig_nameprefixweightss     r/   r   z"Model.sanitize.<locals>.<listcomp>=  sU          %WWWWYWWW   r.   z.block_sparse_moe.switch_mlp.)itemsreplacer   ro   r   r   r5   stack)rX   r   r   new_weightsr   vr   wkrW   lmappingnew_nameto_joinr   r   s    `           @@r/   sanitizezModel.sanitize  s   	0 	0 	0" MMOO 	# 	#DAq!Q&&	YY|R00  33"(B+%%!"A AOONty233 	* 	*A(Q((F(9MMG'.}} 
* 
*#	8LL)LLLPWWW      "'ty'B!C!C	  G )) !QQQQQ
* r.   r=   c                    |pt           j                                        }|                                }|                                }| j        j        D ]}t          |j        j	        d|          |j        _	        t          |j        j
        d|          |j        _
        t          |j        j        d|          |j        _        t          |j        j        d|          |j        _        |j        j        r`t                              |j        j        |          |j        _        t                              |j        j        |          |j        _        |j        xj        |z  c_        |j        xj        |z  c_        t)          |j        j        j        d|           t)          |j        j        j        d|           t)          |j        j        j        d|           ||j        _        d S )Nzall-to-shardedr<   zsharded-to-all)r5   r>   rU   rB   rd   r   r   r	   r   rz   r{   r|   r}   r$   rO   rh   r   r   r   r   r   r   r   r   r   r   r   )rX   r=   Nrd   r   s        r/   shardzModel.shardI  s   .,,..JJLLzz||Z& )	: )	:E%1&(8& & &EO" &2&(8& & &EO" &2&(8& & &EO" &2&(8& & &EO" * )7)E)EO*% *F * *& *8)E)EO*% *F * *& O//A5//O//A5// &1;    
 &1;    
 &19    
 5:E"11S)	: )	:r.   c                     | j         j        S rJ   )r   r   r[   s    r/   r   zModel.layersx  s    z  r.   c                     d }|S )Nc                 
    d| vS )Nr   r-   )r   s    r/   	predicatez'Model.cast_predicate.<locals>.predicate~  s    ,A55r.   r-   rX   r   s     r/   cast_predicatezModel.cast_predicate|  s    	6 	6 	6 r.   c                     d }|S )Nc                 :    |                      d          rdddS dS )Nzblock_sparse_moe.gate@      )
group_sizebitsT)endswith)pathr   s     r/   r   z(Model.quant_predicate.<locals>.predicate  s*    }}455 5&(!4444r.   r-   r   s     r/   quant_predicatezModel.quant_predicate  s    	 	 	
 r.   r   rJ   )r%   r&   r'   r   rT   r5   r   r   r   r^   r   r>   ri   r   propertyr   r  r
  rk   rl   s   @r/   r   r      s!       TY T T T T T T $(#	  rx  }	   3 3 3j-: -:8BN$89 -: -: -: -:^ ! ! X!   X   X    r.   r   ) dataclassesr   	functoolsr   typingr   r   r   mlx.corecorer5   mlx.nnrx   mlx.nn.layers.distributedr   r	   r
   ru   r   r   r   switch_layersr   r   rM   ModulerO   rn   r   r   r   r   r-   r.   r/   <module>r     s2   " ! ! ! ! !       & & & & & & & & & &             P P P P P P P P P P T T T T T T T T T T $ $ $ $ $ $        * # # #(    RY   8I# I# I# I# I#ry I# I# I#X# # # # #BI # # #L    ")   0    29   <L L L L LBI L L L L Lr.   