
    )jL,                        d dl mZ d dlmZmZmZmZmZ d dlm	Z
 d dlmZ d dlmZ ddlmZ ddlmZmZmZ ddlmZmZ dd	lmZ dd
lmZ e G d de                      ZdedefdZ G d dej                   Z! G d dej                   Z" G d dej                   Z# G d deej                   Z$ G d dej                   Z%dS )    )	dataclass)AnyDictListOptionalUnionN)shard_linear   )swiglu)BaseModelArgscreate_attention_maskscaled_dot_product_attention)KVCacheRotatingKVCache)PipelineMixin)initialize_ropec                   2   e Zd ZU eed<   eed<   eed<   eed<   eed<   eed<   eed<   dZee         ed	<   dZ	ee         ed
<   dZ
ee         ed<   dZeeeeeef         f                  ed<   dZeed<   dZeee                  ed<   dZee         ed<   d ZdS )	ModelArgs
model_typehidden_sizenum_hidden_layersintermediate_sizenum_attention_headsrms_norm_eps
vocab_sizeNhead_dimmax_position_embeddingsnum_key_value_headsrope_parametersTtie_word_embeddingslayer_typessliding_windowc                 ^    | j         | j        | _         | j        dg| j        z  | _        d S d S )Nfull_attention)r   r   r!   r   selfs    b/lsinfo/ai/hellotax_ai/base_platform/venv/lib/python3.11/site-packages/mlx_lm/models/ministral3.py__post_init__zModelArgs.__post_init__"   s@    #+'+'?D$# 01D4JJD $#    )__name__
__module____qualname__str__annotations__intfloatr   r   r   r   r   r   r   r    boolr!   r   r"   r(    r)   r'   r   r      s        OOOOOO"Hhsm"""-1Xc]111)-#--->BOXd3eSj(9#9:;BBB $$$$'+K$s)$+++$(NHSM(((K K K K Kr)   r   betar   c           
      @   t          |t          j                  r|j        dk    r|d d d f         }d|t          j        dt          j        t          j        |           |z   |z            z             z  z   }|j        dk    r|d d d d d d f         S |d d d f         S )Nr   r
      )
isinstancemxarrayndimlogfloorarange)sizeoffsetr3   r   scalings        r'   _get_llama_4_attn_scaler@   *   s    &"(## !a4$	BHbioo.2IIJJJ   G |qqqq$4'((qqq$wr)   c                        e Zd Zdef fdZ	 	 d
dej        dej        deej                 dee         dej        f
d	Z	 xZ
S )	Attentionargsc                 .   t                                                       |j        }|j        x| _        }|j        x| _        }|j        p	|j        |z  x| _        }|dz  | _        t          j
        |||z  d          | _        t          j
        |||z  d          | _        t          j
        |||z  d          | _        t          j
        ||z  |d          | _        t          | j        |j        d         d|j        |j                  | _        d S )Ng      Fbias
rope_theta)super__init__r   r   n_headsr   
n_kv_headsr   scalennLinearq_projk_projv_projo_projr   r   r   rope)r&   rC   dimrJ   rK   r   	__class__s         r'   rI   zAttention.__init__8   s   !%!99w'+'??*#'=#OD4D4OOt^
iWx%7eDDDiZ(%:GGGiZ(%:GGGi( 2CeDDD#M . (
 
			r)   Nx
attn_scalemaskcachereturnc                    |j         \  }}}|                     |          |                     |          |                     |          }
}	}|                    ||| j        d                              dddd          }|	                    ||| j        d                              dddd          }	|
                    ||| j        d                              dddd          }
d}|O|j        }| 	                    ||          }| 	                    |	|          }	|
                    |	|
          \  }	}
n*| 	                    |          }| 	                    |	          }	||z  }t          ||	|
|| j        |          }|                    dddd                              ||d          }|                     |          S )Nr   r5   r
      )r>   )rY   rL   rX   )shaperO   rP   rQ   reshaperJ   	transposerK   r>   rS   update_and_fetchr   rL   rR   )r&   rV   rW   rX   rY   BLDquerieskeysvaluesr>   outputs                r'   __call__zAttention.__call__P   s    '1a $AAAv //!Qb99CCAq!QOO||Aq$/266@@Aq!LL1dor::DDQ1aPP\Fiii77G99T&911D 11$??LD&&ii((G99T??DJ&-T6djt
 
 
 !!!Q1--55aB??{{6"""r)   NN)r*   r+   r,   r   rI   r7   r8   r   r   ri   __classcell__rU   s   @r'   rB   rB   7   s        
Y 
 
 
 
 
 
8 $(## #8# H# rx 	#
 }# 
# # # # # # # #r)   rB   c                   :     e Zd Zdef fdZdej        fdZ xZS )MLPrC   c                    t                                                       |j        }|j        }t	          j        ||d          | _        t	          j        ||d          | _        t	          j        ||d          | _        d S NFrE   )	rH   rI   r   r   rM   rN   	gate_proj	down_projup_proj)r&   rC   rT   
hidden_dimrU   s       r'   rI   zMLP.__init__s   su    +
3
???:s???yju===r)   rZ   c                     |                      t          |                     |          |                     |                              S N)rr   r   rq   rs   )r&   rV   s     r'   ri   zMLP.__call__|   s4    ~~fT^^A%6%6QHHIIIr)   )	r*   r+   r,   r   rI   r7   r8   ri   rk   rl   s   @r'   rn   rn   r   sj        >Y > > > > > >JRX J J J J J J J Jr)   rn   c                        e Zd Zddedef fdZ	 	 ddej        dej        deej                 d	ee	         d
ej        f
dZ
 xZS )TransformerBlockFrC   use_slidingc                 x   t                                                       |j        | _        |j        | _        || _        t          |          | _        t          |          | _        t          j
        |j        |j                  | _        t          j
        |j        |j                  | _        || _        d S )Neps)rH   rI   r   r   ry   rB   	self_attnrn   mlprM   RMSNormr   input_layernormpost_attention_layernormrC   )r&   rC   ry   rU   s      r'   rI   zTransformerBlock.__init__   s    #'#; +&"4t99!z$*:@QRRR(*
$"3)
 )
 )
% 			r)   NrV   rW   rX   rY   rZ   c                     |                      |                     |          |||          }||z   }|                     |                     |                    }||z   }|S rv   )r}   r   r~   r   )r&   rV   rW   rX   rY   rhouts           r'   ri   zTransformerBlock.__call__   s_     NN4//22JeLLEHHT2215566!e
r)   )Frj   )r*   r+   r,   r   r1   rI   r7   r8   r   r   ri   rk   rl   s   @r'   rx   rx      s         Y T      " $(# 8 H rx 	
 } 
       r)   rx   c                   d     e Zd Zdef fdZ fdZ	 	 ddej        deej                 fdZ	 xZ
S )	LanguageModelrC   c                    t                                                       | _        j        | _        j        | _        j        | _        j        | _        t          j        j        j	                  | _
        fd| j        D             | _        t          j        j	        j                  | _        | j                            d          | _        d | _        t%          | j                  D ]\  }}|j        r
|| _         d S d S )Nc                 :    g | ]}t          |d k              S )sliding_attention)rC   ry   )rx   ).0
layer_typerC   s     r'   
<listcomp>z*LanguageModel.__init__.<locals>.<listcomp>   s<     
 
 
 $JBU4UVVV
 
 
r)   r{   r$   )rH   rI   rC   r   r   r!   r"   rM   	Embeddingr   embed_tokenslayersr   r   normindexfa_idxswa_idx	enumeratery   )r&   rC   elrU   s    `  r'   rI   zLanguageModel.__init__   s   	/!%!7+"1L$:JKK
 
 
 
".
 
 
 Jt/T5FGGG	&,,-=>>dk** 	 	DAq}  	 	r)   c                    t                                          |           d | _        d | _        t	          | j                  D ]A\  }}| j        |j        r|| _        n| j        |j        s|| _        | j        
| j         d S Bd S rv   )rH   pipeliner   r   r   pipeline_layersry   )r&   groupr   r   rU   s       r'   r   zLanguageModel.pipeline   s    d233 	 	DAq|## $Q]${&4<+C	 	r)   Ninputsinput_embeddingsc                    ||}n|                      |          }| j        }| j        }|d gt          | j                  z  }d}n|d         j        }d x}}	| j        t          ||| j                           }	| j        "t          ||| j                 | j	                  }t          |j        d         || j        j        d         | j        j        d                                       |j                  }
||dz
  k     r#t           j                            ||dz             }t'          | j        |          D ]\  }}|j        r|n|	} |||
||          } |dk    rYt           j                            ||dz
  |z            }|d         +t!          j        |d         j        |          |d         _        |dk    r2t           j                            |          d |j        d                  }|                     |          S )Nr   )window_sizer
   llama_4_scaling_beta original_max_position_embeddings)rY   r\   )r   pipeline_rankpipeline_sizelenr   r>   r   r   r   r"   r@   r^   rC   r   astypedtyper7   distributed	recv_likezipry   senddependsrf   
all_gatherr   )r&   r   rY   r   r   r   r   r>   swa_maskfa_maskrW   r   crX   s                 r'   ri   zLanguageModel.__call__   s    ' AA!!&))A**=FS!5666EFF1X_F!!7;"+AuT[/ABBG<#,5&D4G  H -LOI%&<=I%&HI	
 

 &// 	 =1,,,((]Q->@@A,e44 	0 	0DAq }988'D!ZQ///AA A##A(9]'JKKARy$!#E"INA!>!>b	 1))!,,\qwqz\:Ayy||r)   rj   )r*   r+   r,   r   rI   r   r7   r8   r   ri   rk   rl   s   @r'   r   r      s        Y      (
 
 
 
 
 /3	5 55 #28,	5 5 5 5 5 5 5 5r)   r   c                        e Zd Zdef fdZ	 	 ddej        deej                 fdZd Z	ddeej
        j                 fd	Zed
             Zd Z xZS )ModelrC   c                     t                                                       || _        |j        | _        t	          |          | _        |j        s(t          j        |j	        |j
        d          | _        d S d S rp   )rH   rI   rC   r   r   modelr    rM   rN   r   r   lm_head)r&   rC   rU   s     r'   rI   zModel.__init__   sq    	/"4((
' 	T9T%5tUSSSDLLL	T 	Tr)   Nr   r   c                     |                      |||          }| j        j        r | j         j                            |          }n|                     |          }|S rv   )r   rC   r    r   	as_linearr   )r&   r   rY   r   r   s        r'   ri   zModel.__call__   sW     jj(8999( 	$*)33C88CC,,s##C
r)   c                 8   d |                                 D             }| j        j        r|                    dd            i }|                                 D ]@\  }}d|v r)|}|                    dd          }||         }||z  ||<   2d|v r7||vr|||<   A|}|S )Nc                 "    i | ]\  }}d |v	||S )zself_attn.rotary_emb.inv_freqr2   )r   kvs      r'   
<dictcomp>z"Model.sanitize.<locals>.<dictcomp>  s1     
 
 
Q0OWX0X0XAq0X0X0Xr)   zlm_head.weightweight_scale_inv
_scale_inv activation_scale)itemsrC   r    popreplace)r&   weightsnew_weightsr   r   	scale_invwkweights           r'   sanitizezModel.sanitize  s    
 
$]]__
 
 
 9( 	0KK($///MMOO 		# 		#DAq!Q&&	YY|R00 "(9"4B#q((+%%!"Ar)   r   c                    |pt           j                                        }|                                }| j        j        D ]7}t          |j        j        d|          |j        _        t          |j        j	        d|          |j        _	        t          |j        j
        d|          |j        _
        t          |j        j        d|          |j        _        |j        xj        |z  c_        |j        xj        |z  c_        t          |j        j        d|          |j        _        t          |j        j        d|          |j        _        t          |j        j        d|          |j        _        9d S )Nzall-to-sharded)r   zsharded-to-all)r7   r   initr=   r   r   r	   r}   rO   rP   rQ   rR   rJ   rK   r~   rq   rr   rs   )r&   r   Nlayers       r'   shardzModel.shard"  s   .,,..JJLLZ& 	 	E%1&(8& & &EO" &2&(8& & &EO" &2&(8& & &EO" &2&(8& & &EO" O##)##O&&1,&& #/	#%5U# # #EI #/	#%5U# # #EI !-	!#35! ! !EI1	 	r)   c                     | j         j        S rv   )r   r   r%   s    r'   r   zModel.layersA  s    z))r)   c                 *      fd j         D             S )Nc                 n    g | ]1}|j         rt          j        j                   nt	                      2S ))max_size)ry   r   r   r"   r   )r   r   r&   s     r'   r   z$Model.make_cache.<locals>.<listcomp>F  sO     
 
 
  $)BCCCCYY	
 
 
r)   )r   r%   s   `r'   
make_cachezModel.make_cacheE  s2    
 
 
 
 
 
 
 	
r)   rj   rv   )r*   r+   r,   r   rI   r7   r8   r   ri   r   r   Groupr   propertyr   r   rk   rl   s   @r'   r   r      s        TY T T T T T T /3	  #28,	     . 8BN$89    > * * X*
 
 
 
 
 
 
r)   r   )&dataclassesr   typingr   r   r   r   r   mlx.corecorer7   mlx.nnrM   mlx.nn.layers.distributedr	   activationsr   baser   r   r   rY   r   r   r   r   
rope_utilsr   r   r0   r/   r@   ModulerB   rn   rx   r   r   r2   r)   r'   <module>r      sJ   " ! ! ! ! ! 3 3 3 3 3 3 3 3 3 3 3 3 3 3             2 2 2 2 2 2       T T T T T T T T T T + + + + + + + + # # # # # # ' ' ' ' ' ' K K K K K K K K0
  
 PS 
  
  
  
 8# 8# 8# 8# 8#	 8# 8# 8#vJ J J J J") J J J    ry   8V V V V VM29 V V VrX
 X
 X
 X
 X
BI X
 X
 X
 X
 X
r)   